In [7]:
# ------------------------------------------------------
# IMPORT DES LIBRAIRIES
# ------------------------------------------------------

import pandas as pd               # manipulation de tableaux (DataFrame)
import numpy as np                # op√©rations math√©matiques et tableaux
from sklearn.model_selection import train_test_split, StratifiedKFold  # split dataset + validation crois√©e
from sklearn.preprocessing import StandardScaler  # normalisation
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score  # m√©triques d'√©valuation
from xgboost import XGBClassifier  # mod√®le XGBoost (arbres boost√©s)
from imblearn.over_sampling import SMOTE  # pour r√©√©quilibrer les classes
import warnings
warnings.filterwarnings("ignore")  # ignorer les warnings pour un affichage clair

# ------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------

# CSV holding the extracted features together with the "plant_*" label columns.
FEATURES_PATH = "/workspaces/datasciencetest_reco_plante/dataset/plantvillage/csv/clean_with_features_data_plantvillage_segmented_all.csv"
SEED = 42  # random seed shared by every split / sampler / model for reproducibility

# ------------------------------------------------------
# DATASET LOADING
# ------------------------------------------------------
# The file is parsed a single time; the label columns are then detected from
# the loaded frame instead of re-reading the CSV header separately.
df = pd.read_csv(FEATURES_PATH)

# ------------------------------------------------------
# AUTOMATIC DETECTION OF THE PLANT (label) COLUMNS
# ------------------------------------------------------
plant_columns = [col for col in df.columns if col.startswith("plant_")]
print(f"üå± Colonnes plantes d√©tect√©es : {plant_columns}")
if not plant_columns:
    # Without label columns np.argmax below would fail with an opaque
    # "empty sequence" ValueError; fail early with an actionable message.
    raise ValueError(f"No 'plant_*' label column found in {FEATURES_PATH}")

# ------------------------------------------------------
# REMOVAL OF MISSING VALUES (NaN)
# ------------------------------------------------------
# Every numeric column that is not a label column is used as a feature.
# NOTE(review): if the CSV also stores other numeric target-derived columns
# (ids, other one-hot labels, ...), they end up in the features here and would
# leak label information - confirm against the CSV schema.
numeric_columns = df.drop(columns=plant_columns).select_dtypes(include=np.number).columns
df = df.dropna(subset=numeric_columns.tolist() + plant_columns)
print(f"‚úÖ Apr√®s suppression des NaN : {df.shape[0]} √©chantillons restants.")

# ------------------------------------------------------
# DATA PREPARATION
# ------------------------------------------------------
X = df[numeric_columns].values  # numeric feature matrix
# Collapse the label columns into one integer class index per row (position of
# the column holding the row maximum, in plant_columns order).
# NOTE(review): a row whose label columns are all zero silently maps to
# class 0 - presumably every row has exactly one active label; verify.
y = np.argmax(df[plant_columns].values, axis=1)
print(f"‚úÖ Dataset pr√™t : {X.shape[0]} √©chantillons, {X.shape[1]} features num√©riques.")
print(f"Nombre de classes (plantes) : {len(plant_columns)}")

# ------------------------------------------------------
# FEATURE SCALING
# ------------------------------------------------------
# Registry of the scalers to benchmark, keyed by the name shown in the logs
# and in the results table. StandardScaler standardizes each feature to zero
# mean and unit variance.
scalers = dict(StandardScaler=StandardScaler())

# ------------------------------------------------------
# XGBOOST HYPER-PARAMETER PRESETS
# ------------------------------------------------------
# Each entry maps a display name to the keyword arguments that are unpacked
# into XGBClassifier(**params) by the main loop.
configs = {
    "Baseline": dict(n_estimators=200, learning_rate=0.1, max_depth=6),
    "Deep Trees": dict(n_estimators=300, learning_rate=0.05, max_depth=10),
    "Shallow Trees": dict(n_estimators=500, learning_rate=0.01, max_depth=3),
}

# ------------------------------------------------------
# FONCTION D'ÉVALUATION
# ------------------------------------------------------
def evaluate(y_true, y_pred, dataset_name="Test"):
    """
    Print and return accuracy, weighted F1 and a weighted one-vs-rest ROC-AUC.

    The ROC-AUC is computed from hard class predictions (not probabilities),
    so it is only a coarse indicator, not a ranking-based AUC.

    Args:
        y_true: 1-D array-like of ground-truth class labels.
        y_pred: 1-D array-like of predicted class labels, same length.
        dataset_name: Label used as prefix of the printed summary line.

    Returns:
        Tuple ``(acc, f1, auc)``; ``auc`` is ``None`` when the ROC-AUC cannot
        be computed (for example a single class present in ``y_true``).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")

    # Binarize both vectors against the SAME class list. One-hot encoding each
    # vector independently yields misaligned (or differently sized) columns as
    # soon as the set of predicted classes differs from the set of true ones.
    classes = np.unique(y_true)
    true_onehot = (y_true[:, None] == classes[None, :]).astype(int)
    pred_onehot = (y_pred[:, None] == classes[None, :]).astype(int)

    try:
        auc = roc_auc_score(true_onehot, pred_onehot, average="weighted")
    except ValueError:
        # Raised by scikit-learn when the score is undefined for the input;
        # any other exception is a real bug and is allowed to propagate.
        auc = None
    else:
        # An undefined score may also be reported as NaN; treat it as missing.
        if np.isnan(auc):
            auc = None

    print(f"üìä {dataset_name} | Accuracy = {acc:.4f}, F1 = {f1:.4f}", end="")
    if auc is not None:
        print(f", ROC-AUC = {auc:.4f}")
    else:
        print(" (ROC-AUC non calculable - multi-classes)")
    return acc, f1, auc

# ------------------------------------------------------
# MAIN LOOP: SCALERS + CONFIGS + CROSS-VALIDATION
# ------------------------------------------------------
results = []

# Split the RAW features first. The scaler is then fitted on the training part
# only, so no statistic of the held-out test set leaks into preprocessing.
# The split depends only on y, the sample count and SEED, so the train/test
# membership is the same as when splitting already-scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

for scaler_name, scaler in scalers.items():
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)  # reuse the training statistics
    # NOTE(review): the CV folds below still share scaling statistics computed
    # on the whole training split; fit the scaler per fold if strictly
    # leak-free CV estimates are required.

    for config_name, params in configs.items():
        print(f"\nüöÄ Test avec {scaler_name} + Config = {config_name}")

        # NOTE(review): use_label_encoder is deprecated in recent XGBoost
        # releases - presumably kept for the installed version; confirm.
        model = XGBClassifier(
            use_label_encoder=False,
            eval_metric="mlogloss",
            random_state=SEED,
            **params
        )

        # Stratified 5-fold cross-validation on the training split
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
        f1_scores = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
            X_tr, X_val = X_train[train_idx], X_train[val_idx]
            y_tr, y_val = y_train[train_idx], y_train[val_idx]

            # Oversample with SMOTE on the training part of the fold only, so
            # the validation part keeps its original class distribution.
            smote = SMOTE(random_state=SEED)
            X_tr_bal, y_tr_bal = smote.fit_resample(X_tr, y_tr)

            # Fit on the rebalanced fold
            model.fit(X_tr_bal, y_tr_bal)

            # Score on the untouched validation part
            y_val_pred = model.predict(X_val)
            f1_fold = f1_score(y_val, y_val_pred, average="weighted")
            f1_scores.append(f1_fold)
            print(f"   Fold {fold+1} : F1 = {f1_fold:.4f}")

        print(f"   ‚û°Ô∏è Moyenne F1 CV = {np.mean(f1_scores):.4f} ¬± {np.std(f1_scores):.4f}")

        # Refit on the whole (SMOTE-rebalanced) training split
        smote = SMOTE(random_state=SEED)
        X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
        model.fit(X_train_bal, y_train_bal)

        # Final evaluation on the held-out test set
        y_test_pred = model.predict(X_test)
        acc, f1, auc = evaluate(y_test, y_test_pred, dataset_name="Test final")

        # Record one row per (scaler, config) pair
        results.append({
            "Scaler": scaler_name,
            "Config": config_name,
            "CV_F1_mean": np.mean(f1_scores),
            "CV_F1_std": np.std(f1_scores),
            "Test_Accuracy": acc,
            "Test_F1": f1,
            "Test_AUC": auc
        })

# ------------------------------------------------------
# FINAL RESULTS DISPLAY
# ------------------------------------------------------
results_df = pd.DataFrame(results)
print("\nüìä Tableau comparatif des r√©sultats :")
print(results_df)


🌱 Colonnes plantes détectées : ['plant_Apple', 'plant_Blueberry', 'plant_Cherry_(including_sour)', 'plant_Corn_(maize)', 'plant_Grape', 'plant_Orange', 'plant_Peach', 'plant_Pepper,_bell', 'plant_Potato', 'plant_Raspberry', 'plant_Soybean', 'plant_Squash', 'plant_Strawberry', 'plant_Tomato']
✅ Après suppression des NaN : 54275 échantillons restants.
✅ Dataset prêt : 54275 échantillons, 38 features numériques.
Nombre de classes (plantes) : 14

🚀 Test avec StandardScaler + Config = Baseline
   Fold 1 : F1 = 0.9988
   Fold 2 : F1 = 0.9990
   Fold 3 : F1 = 0.9979
   Fold 4 : F1 = 0.9993
   Fold 5 : F1 = 0.9982
   ➡️ Moyenne F1 CV = 0.9986 ± 0.0005
📊 Test final | Accuracy = 0.9983, F1 = 0.9983, ROC-AUC = 0.9991

🚀 Test avec StandardScaler + Config = Deep Trees
   Fold 1 : F1 = 0.9985
   Fold 2 : F1 = 0.9987
   Fold 3 : F1 = 0.9982
   Fold 4 : F1 = 0.9993
   Fold 5 : F1 = 0.9982
   ➡️ Moyenne F1 CV = 0.9986 ± 0.0004
üìä Test final | Accuracy = 0.9982, F1 = 0