# 01 - Exploration des Données: Classification Produits Rakuten

**Objectif:** Analyser le dataset et calculer les bounding boxes pour TOUTES les images

**⚠️ CRITIQUE:** Analyse ~85k images (15-20 min) - REQUIS pour reproduire F1=0.31

---

## 🔧 Configuration Environnement (Local + Colab)

In [None]:
import sys, os
from pathlib import Path

# True when the `google.colab` module is already loaded in this interpreter.
IS_COLAB = 'google.colab' in sys.modules

if not IS_COLAB:
    print("üü¢ Ex√©cution en local")
    # When started from a folder named `notebooks`, the project root is its parent.
    launch_dir = Path.cwd()
    if launch_dir.name == 'notebooks':
        PROJECT_ROOT = launch_dir.parent
    else:
        PROJECT_ROOT = launch_dir
else:
    print("üîµ Ex√©cution sur Google Colab")
    from google.colab import drive

    # The project folder is read from Google Drive, so the drive is mounted first.
    drive.mount('/content/drive', force_remount=True)
    PROJECT_ROOT = Path('/content/drive/MyDrive/CLASSIFICATION-PRODUITS-RAKUTEN')

# Both environments work from the project root, so the relative paths used
# further down (`data/`, `output/`) resolve the same way.
os.chdir(PROJECT_ROOT)

print(f"üìÅ Racine projet: {PROJECT_ROOT}")
print(f"‚úÖ R√©pertoire de travail: {os.getcwd()}")

## 📦 Import Bibliothèques

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import json
from tqdm.auto import tqdm
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings(action='ignore')

# Fixed seed applied to NumPy's global random generator.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

# Folder that receives the figures and tables written by this notebook;
# created (with parents) if it does not exist yet.
OUTPUT_DIR = Path('output') / '01_exploration'
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

print("‚úÖ Biblioth√®ques import√©es")

## 📊 Chargement des Données

In [None]:
# CSV files are read from `data/`; image paths are built under `data/images/`.
DATA_DIR = Path('data')
IMAGE_DIR = DATA_DIR / 'images'

print("Chargement des datasets...")
# The first column of each CSV becomes the row index.
X_train = pd.read_csv(DATA_DIR / 'X_train_update.csv', index_col=0)
y_train = pd.read_csv(DATA_DIR / 'Y_train_CVw08PX.csv', index_col=0)

# Index-on-index inner join: only rows present in both tables are kept.
df = pd.merge(
    X_train,
    y_train,
    how='inner',
    left_index=True,
    right_index=True,
)

print(f"‚úÖ {len(df):,} produits charg√©s")
print(f"‚úÖ {df['prdtypecode'].nunique()} classes")
# Last expression of the cell: displays the first rows in the notebook.
df.head()

## 🔍 Inspection des Données

In [None]:
# Number of null cells in each column.
missing_per_column = df.isnull().sum()

print("Valeurs manquantes:")
print(missing_per_column)
# Share of rows without a description, as a percentage of all rows.
print(f"\n% descriptions manquantes: {df['description'].isnull().sum() / len(df) * 100:.1f}%")

## 📈 Distribution des Classes

In [None]:
# Human-readable category label for each product type code.
prdtypecode_labels = {
    10: "Livre usag√©",
    40: "Jeux vid√©o et accessoires tech",
    50: "Accessoires de console",
    60: "Console de jeux vid√©o",
    1140: "Statuette / Figurine",
    1160: "Cartes collectionnables",
    1180: "Jeux de table",
    1280: "Jouets enfants et costumes",
    1281: "Jeux de soci√©t√©",
    1300: "Jouets √©lectroniques",
    1301: "Bas et chaussettes",
    1302: "Jeux ext√©rieurs et v√™tements",
    1320: "Articles pour b√©b√©",
    1560: "Meubles int√©rieurs",
    1920: "Mobilier de chambre",
    1940: "Ustensiles de cuisine",
    2060: "D√©coration int√©rieure",
    2220: "Produits pour animaux",
    2280: "Magazines et journaux",
    2403: "Livres, magazines et BD",
    2462: "Jeux d'occasion",
    2522: "Mat√©riel de bureau",
    2582: "Mobilier de jardin",
    2583: "√âquipement de piscine",
    2585: "Outillage et bricolage",
    2705: "Livre nouveau",
    2905: "Jeux pour PC",
}

# Attach the label as an extra column next to the numeric code.
df['prdtype_label'] = df['prdtypecode'].map(prdtypecode_labels)
print("‚úÖ Labels ajout√©s")

# Number of products per class, ordered by product type code.
class_counts = df['prdtypecode'].value_counts().sort_index()
print(f"\nDistribution:")
print(class_counts)

In [None]:
# Bar chart of the number of products per class, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(16, 6))
class_counts.plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')

ax.set_title('Distribution des Classes', fontsize=16, fontweight='bold')
ax.set_xlabel('Code Type Produit')
ax.set_ylabel('Nombre')
# Slant the class codes so neighbouring tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
fig.savefig(OUTPUT_DIR / 'class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Size of the largest class relative to the smallest one.
print(f"D√©s√©quilibre: {class_counts.max() / class_counts.min():.1f}x")

## 🖼️ Analyse TOUTES les Images (~15-20 min)

**⚠️ CRITIQUE pour reproduire F1=0.31**

In [None]:
# File names follow the pattern image_<imageid>_product_<productid>.jpg.
image_ids = df['imageid'].astype(str)
product_ids = df['productid'].astype(str)
df['imagefile'] = 'image_' + image_ids + '_product_' + product_ids + '.jpg'

# Paths point into the `image_train` sub-folder of IMAGE_DIR and are stored as strings.
train_image_dir = IMAGE_DIR / 'image_train'
df['imagepath'] = df['imagefile'].apply(lambda filename: str(train_image_dir / filename))
print(f"‚úÖ Chemins images cr√©√©s")

In [None]:
def find_inner_img_box(img):
    """Locate the bounding box of the non-white content of an image.

    A pixel counts as content when at least one of its values differs
    from 255.

    Args:
        img: NumPy array whose first two axes are rows and columns. Any
            further axes (e.g. colour channels) are reduced per pixel.

    Returns:
        ``(top, left, bottom, right)`` as inclusive integer indices, or
        ``None`` when the image has no pixels or contains only 255 values.
    """
    content = img != 255
    if content.ndim > 2:
        # Collapse channel axes: one non-255 channel makes the pixel content.
        content = content.any(axis=tuple(range(2, content.ndim)))

    rows_with_content = content.any(axis=1)
    if not rows_with_content.any():
        return None
    cols_with_content = content.any(axis=0)

    # argmax on a boolean vector gives the first True; scanning the reversed
    # vector gives the last one. This keeps content on the final row/column
    # and yields exact edges for single-row or single-column content.
    top = int(rows_with_content.argmax())
    bottom = int(rows_with_content.size - 1 - rows_with_content[::-1].argmax())
    left = int(cols_with_content.argmax())
    right = int(cols_with_content.size - 1 - cols_with_content[::-1].argmax())

    return (top, left, bottom, right)

print("‚úÖ Fonction bounding box d√©finie")

In [None]:
print(f"üîÑ Analyse de TOUTES les {len(df):,} images...")
print(f"‚è±Ô∏è  Dur√©e estim√©e: 15-20 minutes\n")

image_stats = []
blank_images = []
error_images = []

for idx in tqdm(df.index, desc="Traitement images"):
    try:
        img = cv2.imread(df.loc[idx, 'imagepath'], cv2.IMREAD_COLOR)
        if img is None:
            error_images.append(idx)
            continue
        
        bbox = find_inner_img_box(img)
        if bbox is None:
            blank_images.append(idx)
            continue
        
        top, left, bottom, right = bbox
        inner_w = right - left + 1
        inner_h = bottom - top + 1
        ratio = (inner_w * inner_h) / (img.shape[0] * img.shape[1])
        
        image_stats.append({
            'index': idx,
            'inner_width': inner_w,
            'inner_height': inner_h,
            'inner_ratio': ratio,
            'top': top,
            'left': left,
            'bottom': bottom,
            'right': right
        })
    except:
        error_images.append(idx)

df_img_stats = pd.DataFrame(image_stats)
print(f"\n‚úÖ {len(df_img_stats):,} images analys√©es")
print(f"‚ö†Ô∏è {len(blank_images)} images blanches")
print(f"‚ùå {len(error_images)} erreurs")

In [None]:
# Images whose content box covers at most 80% of the frame are counted as
# needing a zoom.
inner_ratios = df_img_stats['inner_ratio']
at_or_below_threshold = inner_ratios <= 0.8
needs_zoom = at_or_below_threshold.sum()
pct_zoom = needs_zoom / len(df_img_stats) * 100

print(f"\nüìä Images n√©cessitant zoom (‚â§0.8): {needs_zoom:,} ({pct_zoom:.1f}%)")
print(f"\nStatistiques ratio:")
print(inner_ratios.describe())

In [None]:
# Histogram of the content-to-frame area ratio, with the 0.8 zoom threshold
# marked as a dashed vertical line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_img_stats['inner_ratio'], bins=50, color='steelblue', edgecolor='black', alpha=0.7)
ax.axvline(0.8, color='red', linestyle='--', linewidth=2, label='Seuil (0.8)')

ax.set_title('Distribution Ratio Image Int√©rieure', fontsize=14, fontweight='bold')
ax.set_xlabel('Ratio')
ax.set_ylabel('Fr√©quence')
ax.legend()
ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
fig.savefig(OUTPUT_DIR / 'inner_ratio_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 🔗 Fusion des Statistiques

In [None]:
# Keep the cell re-runnable: 'index' is only a column before the first run.
if 'index' in df_img_stats.columns:
    df_img_stats.set_index('index', inplace=True)

# Drop stats columns left by a previous run of this cell (none on the first
# run), then left-join so products without image stats keep their row.
stale_columns = df.columns.intersection(df_img_stats.columns)
df = df.drop(columns=stale_columns).join(df_img_stats, how='left')

# Report the real coverage instead of assuming every image was analysed.
n_with_stats = df['inner_ratio'].notna().sum()
pct_with_stats = n_with_stats / len(df) * 100 if len(df) else 0.0

print(f"‚úÖ Fusion compl√©t√©e")
print(f"   Total lignes: {len(df):,}")
print(f"   Avec stats: {n_with_stats:,} ({pct_with_stats:.1f}%)")

## 💾 Sauvegarde

In [None]:
# Sauvegarder le dataframe explor√©
df.to_csv(OUTPUT_DIR / 'df_explored.csv')
print(f"‚úÖ Sauvegard√©: df_explored.csv")

# Statistiques images
df_img_stats.to_csv(OUTPUT_DIR / 'image_stats.csv')
print(f"‚úÖ Sauvegard√©: image_stats.csv")

# Distribution classes
class_counts.to_csv(OUTPUT_DIR / 'class_distribution.csv')
print(f"‚úÖ Sauvegard√©: class_distribution.csv")

# R√©sum√© JSON
summary = {
    'total_products': int(len(df)),
    'num_classes': int(df['prdtypecode'].nunique()),
    'images_analyzed': int(len(df_img_stats)),
    'images_need_zoom': int(needs_zoom),
    'pct_need_zoom': float(pct_zoom)
}

with open(OUTPUT_DIR / 'summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"‚úÖ Sauvegard√©: summary.json")

## 📋 Résumé

In [None]:
# Final recap of the exploration: dataset size, class balance, image analysis.
print("=" * 70)
print("üìä R√âSUM√â EXPLORATION")
print("=" * 70)
print(f"\nüì¶ Dataset:")
print(f"  ‚Ä¢ Produits: {len(df):,}")
print(f"  ‚Ä¢ Classes: {df['prdtypecode'].nunique()}")
print(f"  ‚Ä¢ Descriptions manquantes: {df['description'].isnull().sum() / len(df) * 100:.1f}%")

print(f"\n‚öñÔ∏è √âquilibre Classes:")
print(f"  ‚Ä¢ Plus commune: {class_counts.max():,} produits")
print(f"  ‚Ä¢ Moins commune: {class_counts.min():,} produits")
print(f"  ‚Ä¢ Ratio d√©s√©quilibre: {class_counts.max() / class_counts.min():.1f}x")

# Share of products whose image produced stats; blank and unreadable images
# are excluded, so this is computed rather than assumed to be 100%.
pct_analyzed = len(df_img_stats) / len(df) * 100 if len(df) else 0.0

print(f"\nüñºÔ∏è Images:")
print(f"  ‚Ä¢ Analys√©es: {len(df_img_stats):,} / {len(df):,} ({pct_analyzed:.1f}%)")
print(f"  ‚Ä¢ N√©cessitant zoom: {needs_zoom:,} ({pct_zoom:.1f}%)")
print(f"  ‚Ä¢ Images blanches: {len(blank_images)}")
print(f"  ‚Ä¢ Erreurs: {len(error_images)}")

print(f"\n‚úÖ TOUTES les images analys√©es - pr√™t pour notebook 02!")
print("=" * 70)