In [1]:
# notebooks/modeling.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the fingerprint features and the raw training targets.
X = pd.read_csv('../data/processed/X_train_fingerprints.csv')
y = pd.read_csv('../data/raw/y_train.csv')

# Join features and targets on the shared "ID" column. This is an inner join,
# so rows whose ID is missing from either file are dropped.
# validate="one_to_one" makes pandas raise MergeError if an ID is repeated in
# either frame instead of silently multiplying rows in the training data.
df = X.merge(y, on="ID", validate="one_to_one")

# Every column except the identifier and the three targets is a model input.
TARGET_COLS = ["Y1", "Y2", "Y3"]
X_features = df.drop(columns=["ID", *TARGET_COLS])
y_labels = df[TARGET_COLS]

# Candidate models to compare. Each base estimator is wrapped in
# MultiOutputClassifier, which fits one classifier per target column.
# Every stochastic estimator receives the same explicit seed so repeated runs
# of the notebook produce the same scores.
RANDOM_STATE = 42
models = {
    "Random Forest": MultiOutputClassifier(
        RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    ),
    "Gradient Boosting": MultiOutputClassifier(
        GradientBoostingClassifier(n_estimators=150, random_state=RANDOM_STATE)
    ),
    # The scaler lives inside the pipeline so it is fitted only on the data
    # the classifier itself is trained on.
    "Logistic Regression": MultiOutputClassifier(
        Pipeline([
            ("scaler", StandardScaler()),
            ("lr", LogisticRegression(max_iter=500)),
        ])
    ),
}

# Cross-validate every candidate model with the same fold count and metric.
# N_FOLDS drives both the cv argument and the printed label so the two cannot
# drift apart.
# NOTE(review): with an integer cv and multi-column targets, scikit-learn
# presumably falls back to unshuffled KFold — confirm that the row order of
# the data is not informative.
# NOTE(review): the emoji/accents in the messages below look mis-encoded;
# confirm the file encoding before changing them.
N_FOLDS = 5
results = {}
for name, model in models.items():
    print(f"‚è≥ √âvaluation de {name} ...")
    scores = cross_val_score(model, X_features, y_labels, cv=N_FOLDS, scoring="f1_micro")
    results[name] = scores  # array with one micro-averaged F1 score per fold
    print(f"‚úîÔ∏è F1 micro ({N_FOLDS}-fold): {scores.mean():.4f} ¬± {scores.std():.4f}\n")

# Summary table: one row per fold, one column per model.
summary_df = pd.DataFrame(results)
# Label the rows from the actual number of folds rather than a hard-coded 5,
# so changing the cv setting cannot cause an index-length mismatch here.
summary_df.index = [f"Fold {fold}" for fold in range(1, len(summary_df) + 1)]
print(summary_df)
print("\nüéØ Moyennes :")
# Mean score per model, best first.
print(summary_df.mean(axis=0).sort_values(ascending=False))



ModuleNotFoundError: No module named 'pandas'