# Step 3 - Train and compare multiple algorithms

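This notebook reloads the preprocessed train/test splits, scores candidate classifiers with stratified cross-validation, logs the runs to MLflow, trains and saves a final XGBoost model, and compares the candidates on the test set.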


### 1. Load preprocessed data

The train/test features and targets written by the previous notebook are read back from Parquet files in `../data/output/`.


In [None]:
import pandas as pd

# Reload the train/test splits saved by the previous notebook.
X_train, X_test = (
    pd.read_parquet(path)
    for path in (
        "../data/output/X_train.parquet",
        "../data/output/X_test.parquet",
    )
)
# Targets come back as DataFrames; squeeze() collapses a single-column frame
# into a Series (presumably how the targets were written - verify upstream).
y_train, y_test = (
    pd.read_parquet(path).squeeze()
    for path in (
        "../data/output/y_train.parquet",
        "../data/output/y_test.parquet",
    )
)

### 2. Set up cross-validation

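A 5-fold stratified splitter with shuffling and a fixed random seed is defined once and reused for every model, so all candidates are scored with the same splitting strategy.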


In [None]:
from sklearn.model_selection import StratifiedKFold

# Shared cross-validation splitter: 5 stratified folds, shuffled with a fixed
# seed so the fold assignment is reproducible from run to run.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

### 3. Train baseline models (Logistic Regression & Random Forest)

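This cell defines the candidate models. The baselines (Logistic Regression, Random Forest and SVM) are commented out, with their recorded F1-scores kept as comments, which leaves XGBoost and LightGBM as the active candidates.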


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Candidate classifiers, keyed by the display name used when reporting results.
#
# The three baselines are disabled (kept as comments below); the scores
# recorded for them were:
#   Logistic Regression: mean F1-score = 0.1866 (+/- 0.0131)
#   Random Forest:       mean F1-score = 0.2780 (+/- 0.0157)
#   SVM:                 mean F1-score = 0.1699 (+/- 0.0059)
models = {}
# models["Logistic Regression"] = LogisticRegression(max_iter=500, class_weight='balanced')
# models["Random Forest"] = RandomForestClassifier(n_estimators=100, max_depth=5, class_weight='balanced')
# models["SVM"] = SVC(probability=True, class_weight='balanced')
models["XGBoost"] = XGBClassifier(
    eval_metric="logloss",
    scale_pos_weight=10,
    random_state=42,
)
models["LightGBM"] = LGBMClassifier(class_weight="balanced", verbose=-1)

### 4. Try stronger models (Boosting, MLP)

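The boosting candidates (XGBoost and LightGBM) are scored with the cross-validation splitter defined above, on a random sample of up to 5,000 training rows, using the F1-score of the positive class. Column names are sanitized first because LightGBM does not accept special characters in feature names. No MLP is evaluated in this notebook.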


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

def clean_column_names(df):
    """Sanitize the column labels of *df* in place and return the same frame.

    Every run of characters other than ASCII letters, digits and underscores
    is replaced by a single underscore. The frame passed in is modified
    directly (no copy is made) and is also returned for convenience.
    """
    sanitized = df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
    df.columns = sanitized
    return df

# Sanitize feature names on both splits up front: LightGBM does not support
# special characters in column names, and later cells reuse these frames.
X_train = clean_column_names(X_train)
X_test = clean_column_names(X_test)

# Cross-validate on a random subsample to keep the comparison fast. The size
# is capped at the number of available rows because DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without
# replacement). The sample inherits the already-sanitized column names from
# X_train, so no second clean-up pass is needed.
X_sample = X_train.sample(n=min(5000, len(X_train)), random_state=42)
y_sample = y_train.loc[X_sample.index]  # select the targets by the sampled index labels

# Score with F1 computed on the positive class (label 1)
scorer = make_scorer(f1_score, average="binary", pos_label=1)

# Evaluate every candidate with the shared cross-validation splitter
for name, model in models.items():
    scores = cross_val_score(model, X_sample, y_sample, cv=cv, scoring=scorer)
    print(f"{name}: F1-score moyen = {scores.mean():.4f} (+/- {scores.std():.4f})")

### 5. Handle class imbalance

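Class imbalance is handled through the models' own weighting options, set when the candidates were defined (`scale_pos_weight=10` for XGBoost, `class_weight="balanced"` for LightGBM). In the cell below, each candidate is fit on the full training set inside its own MLflow run, which records the train and test F1-scores and the fitted model.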


In [None]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# Store runs in a local file-based tracking directory, under one experiment.
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("modele_classification")

# One MLflow run per candidate: fit on the full training split, then record
# the train and test F1-scores together with the fitted model.
for name, model in models.items():
    # Artifact path derived from the display name: spaces -> "_", lower case.
    artifact_path = name.replace(" ", "_").lower()

    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)

        # F1 on the data the model was fitted on, for comparison with test F1.
        f1_train = f1_score(y_train, model.predict(X_train))

        # F1 on the held-out test split.
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred)

        mlflow.log_param("model", name)
        mlflow.log_metric("f1_train", f1_train)
        mlflow.log_metric("f1_test", score)

        # Input/output schema inferred from the test features and predictions.
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(model, artifact_path, signature=signature)

### 6. Track experiments with MLflow

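A final XGBoost model (100 trees, maximum depth 5) is trained on the full training set and logged to MLflow with its hyperparameters, test F1-score, input/output signature and traceability tags. It is also saved locally with joblib.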


In [None]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import joblib

from pathlib import Path

# Hyperparameters are defined once so the values used to build the model and
# the values logged to MLflow cannot drift apart.
# `use_label_encoder` is intentionally not passed: the flag is deprecated in
# recent XGBoost releases, and the XGBoost candidate defined earlier in this
# notebook already omits it.
xgb_params = {"n_estimators": 100, "max_depth": 5, "eval_metric": "logloss"}
best_model = XGBClassifier(**xgb_params)
best_model.fit(X_train, y_train)

# Predictions and F1-score on the held-out test split
y_pred = best_model.predict(X_test)
score = f1_score(y_test, y_pred)

# MLflow tracking
with mlflow.start_run(run_name="XGBoost_best_model"):
    # Log hyperparameters (the same dict that configured the model)
    mlflow.log_params(xgb_params)
    mlflow.log_param("model_type", "XGBoost")

    # Log metrics
    mlflow.log_metric("f1_score", score)

    # Input/output signature inferred from the test features and predictions
    signature = infer_signature(X_test, y_pred)

    # Log the fitted model
    mlflow.sklearn.log_model(best_model, "xgboost_best_model", signature=signature)

    # Optional: add tags for traceability
    mlflow.set_tags({
        "stage": "final_model",
        "author": "David Worsley-Tonks",
        "model": "XGBoost",
        "version": "v1"
    })

# Local save: create the target directory first so joblib.dump does not fail
# when "../models" does not exist yet.
model_path = "../models/credit_scoring_xgb.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)

### 7. Compare performance and select a model

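Each candidate is refit on the training set and evaluated on the test set. Accuracy, recall, F1-score and AUC are collected into a table sorted by F1-score, which is the basis for selecting a model.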


In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score

# Compare the candidates' performance on the test set.
scores = []

for name, model in models.items():
    # Refit so this cell does not depend on a model fitted in an earlier cell.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # AUC is computed from the second predict_proba column; models without
    # predict_proba get None instead.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
    else:
        y_proba = None
        auc = None

    row = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred),
        "AUC": auc,
    }
    scores.append(row)

import pandas as pd

# One row per model, best F1-score first.
df_scores = pd.DataFrame(scores)
df_scores = df_scores.sort_values(by="F1-score", ascending=False)
display(df_scores)

### 8. Explore results in the MLflow UI

The cell below opens the MLflow UI in the default web browser. It expects an MLflow UI server to be reachable at `http://localhost:8889`.


In [None]:
import webbrowser

# Open the local MLflow UI in the default browser.
# NOTE(review): this only opens the URL; it assumes an MLflow UI server is
# already listening on port 8889 - confirm how the server is started.
mlflow_ui_url = "http://localhost:8889"
webbrowser.open(mlflow_ui_url)

### 9. Conclusion

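XGBoost (100 trees, maximum depth 5) was retained as the final model. It is logged to MLflow as `xgboost_best_model` and saved locally to `../models/credit_scoring_xgb.pkl`. The candidate comparison and the logged runs can be reviewed in the test-set table and in the MLflow UI above.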
