# 03 - Rééchantillonnage: Équilibrage & Découpage Dataset

**Objectif:** Créer des ensembles train/val/test stratifiés, avec un ensemble train rééquilibré

**Sortie:** 3 fichiers CSV (train/val/test)

---

In [None]:
import sys, os
from pathlib import Path

# Detect Google Colab by checking whether its package is already loaded.
IS_COLAB = 'google.colab' in sys.modules

if not IS_COLAB:
    # Local run: when launched from a folder named "notebooks", the project
    # root is its parent; otherwise it is the current directory itself.
    PROJECT_ROOT = Path.cwd()
    if PROJECT_ROOT.name == 'notebooks':
        PROJECT_ROOT = PROJECT_ROOT.parent
else:
    # Colab run: (re)mount Google Drive and point at the project folder on it.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    PROJECT_ROOT = Path('/content/drive/MyDrive/sep25_alt1_mle_ds_rakuten')

# Make the project root the working directory so relative paths resolve
# against it, then show which root was selected.
os.chdir(PROJECT_ROOT)
print(f"üìÅ {PROJECT_ROOT}")

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import warnings

# Silence all Python warnings from here on.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator.
np.random.seed(42)

# Folder this notebook reads from and folder it writes to, both relative to
# the current working directory; the output folder is created if missing.
INPUT_DIR = Path('output', '02_preprocessing_images')
OUTPUT_DIR = Path('output', '03_preprocessing_resampling')
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Load the preprocessed table; the first CSV column is used as the index.
df = pd.read_csv(INPUT_DIR.joinpath('df_preprocessed.csv'), index_col=0)
# Report the row count and the number of distinct prdtypecode values.
print(f"‚úÖ {len(df):,} produits | {df['prdtypecode'].nunique()} classes")
# Preview the first five rows as the cell output.
df.head(n=5)

In [None]:
# Rows per prdtypecode value, ordered by class code rather than by frequency.
class_counts = df['prdtypecode'].value_counts(sort=False).sort_index()
# Print the heading and the table, one per line.
print("Distribution AVANT r√©√©quilibrage:", class_counts, sep="\n")

In [None]:
# Separate the prdtypecode target from the remaining feature columns.
y = df['prdtypecode']
X = df.drop(columns='prdtypecode')

# First pass: keep 80% for training and hold out 20%, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)
# Second pass: halve the held-out part into validation and test sets
# (10% / 10% of the full data), again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    shuffle=True,
    random_state=42,
)

print(f"Train: {len(X_train):,} | Val: {len(X_val):,} | Test: {len(X_test):,}")

In [None]:
print("R√©√©quilibrage train...")

# Class 2583 is capped at this many training samples before the other
# classes are over-sampled. Named here instead of being buried in the call.
MAJORITY_CLASS = 2583
MAJORITY_TARGET = 4058

# RandomUnderSampler raises ValueError when the requested count exceeds the
# number of samples present for that class (or the class is absent), so the
# under-sampling step only runs when the training split has enough rows.
n_majority = int((y_train == MAJORITY_CLASS).sum())
if n_majority >= MAJORITY_TARGET:
    under_sampler = RandomUnderSampler(
        sampling_strategy={MAJORITY_CLASS: MAJORITY_TARGET},
        random_state=42,
    )
    X_train, y_train = under_sampler.fit_resample(X_train, y_train)
else:
    print(
        f"Classe {MAJORITY_CLASS}: {n_majority:,} lignes < {MAJORITY_TARGET:,}, "
        "pas de RandomUnderSampler"
    )

# Over-sample with the sampler's default strategy so the remaining classes
# are brought up by random duplication. Only the training split is touched;
# validation and test keep their original distribution.
over_sampler = RandomOverSampler(random_state=42)
X_train, y_train = over_sampler.fit_resample(X_train, y_train)

print(f"‚úÖ Train r√©√©quilibr√©: {len(X_train):,} √©chantillons")

In [None]:
# Rows per class in the resampled training target, ordered by class code.
class_counts_after = y_train.value_counts(sort=False).sort_index()
# Print the heading, the table, then the smallest and largest class sizes.
print(
    "Distribution APR√àS:",
    class_counts_after,
    f"\nMin: {class_counts_after.min()} | Max: {class_counts_after.max()}",
    sep="\n",
)

In [None]:
# Re-attach each target column to its feature table, column-wise.
df_train = pd.concat((X_train, y_train), axis='columns')
df_val = pd.concat((X_val, y_val), axis='columns')
df_test = pd.concat((X_test, y_test), axis='columns')

# Write the three splits (index included) into the output folder.
df_train.to_csv(OUTPUT_DIR.joinpath('df_train.csv'))
df_val.to_csv(OUTPUT_DIR.joinpath('df_val.csv'))
df_test.to_csv(OUTPUT_DIR.joinpath('df_test.csv'))

# Confirm each file together with its row count.
print(f"‚úÖ Sauvegard√©: df_train.csv ({len(df_train):,})")
print(f"‚úÖ Sauvegard√©: df_val.csv ({len(df_val):,})")
print(f"‚úÖ Sauvegard√©: df_test.csv ({len(df_test):,})")

In [None]:
# Final recap: a banner, the size of each split, and a closing banner,
# emitted one item per line.
print(
    "="*60,
    "‚öñÔ∏è R√âSUM√â R√â√âCHANTILLONNAGE",
    "="*60,
    f"Train: {len(df_train):,} (√©quilibr√©)",
    f"Val: {len(df_val):,}",
    f"Test: {len(df_test):,}",
    f"\n‚úÖ Pr√™t pour notebook 04!",
    "="*60,
    sep="\n",
)