<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 40px; border-radius: 15px;'>
    <h1 style='color: white; text-align: center; font-size: 48px; margin: 0; font-weight: 800;'>MODELISATION FINALE</h1>
    <h2 style='color: #e0e7ff; text-align: center; font-size: 28px; margin-top: 15px;'>Comparaison et Sélection du Champion</h2>
    <p style='color: white; text-align: center; font-size: 16px; margin-top: 20px;'>Projet CLF02 - Phase 4 Finale</p>
</div>

<div class='section-header'>
    <h2 style='margin: 0;'>1. CONFIGURATION</h2>
</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import joblib
import json
from datetime import datetime
from collections import Counter

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, f1_score, precision_score, recall_score, ConfusionMatrixDisplay

import lightgbm as lgb
import mlflow
import mlflow.sklearn
import mlflow.lightgbm

# Constants shared by the later cells (seed, CV folds, feature budget).
RANDOM_STATE, N_FOLDS, N_FEATURES_SELECTED = 42, 5, 150

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Display and plotting defaults.
pd.options.display.max_columns = None
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_palette('husl')

print('Configuration terminee')

In [None]:
# Local file-based tracking store and the experiment used by this notebook.
MLFLOW_TRACKING_URI = 'file:../mlruns'
MLFLOW_EXPERIMENT = 'credit_scoring_final'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT)
print('MLflow configure')

<div class='section-header'>
    <h2 style='margin: 0;'>2. CHARGEMENT DES DONNEES</h2>
</div>

In [None]:
DATA_PATH = Path('../data/processed')

print('Chargement des donnees...')
train, test = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train_final.csv', 'test_final.csv')
)

# Separate the TARGET column from the predictors; X_test is an independent copy of the test frame.
y = train['TARGET']
X = train.drop(columns='TARGET')
X_test = test.copy()

print(f'Train: {train.shape}')
print(f'X: {X.shape}')
print(f'Distribution classe 1: {y.mean()*100:.2f}%')

In [None]:
# Keep numeric columns only
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
_numeric_set = set(numeric_cols)
non_numeric = [col for col in X.columns if col not in _numeric_set]

# Both frames are only re-indexed when something actually has to be dropped.
if non_numeric:
    print(f'Colonnes non-numeriques: {len(non_numeric)}')
    X, X_test = X[numeric_cols], X_test[numeric_cols]

print(f'Donnees filtrees: {X.shape}')

In [None]:
# Two-stage stratified split: first hold out 15% of the rows as the final test
# set, then take 0.176 of the remaining 85% (about 15% of all rows) for validation.
X_temp, X_test_final, y_temp, y_test_final = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.176,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

print('Split des donnees:')
for split_name, split_X in (('Train', X_train), ('Validation', X_val), ('Test', X_test_final)):
    print(f'  {split_name}: {split_X.shape}')

<div class='section-header'>
    <h2 style='margin: 0;'>3. FONCTIONS UTILITAIRES</h2>
</div>

In [None]:
def calculate_business_cost(y_true, y_pred_proba, threshold=0.5):
    """
    Compute the (negated) business cost of thresholded predictions.

    Only errors cost something: a false negative costs 1.0 and a false positive
    costs ``1 / ratio`` where ``ratio = n_negative / n_positive`` is measured on
    ``y_true``. The result is ``-(FN * 1.0) - (FP / ratio)``, so it is <= 0 and
    values closer to 0 are better.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 are positives, 0 negatives.
    y_pred_proba : array-like
        Scores; ``score >= threshold`` is predicted as class 1.
    threshold : float, default 0.5
        Decision threshold.

    Returns
    -------
    tuple
        ``(total_cost, counts)`` where ``counts`` is a dict with the keys
        'TP', 'TN', 'FP', 'FN'.
    """
    y_true = np.asarray(y_true)
    predicted_pos = np.asarray(y_pred_proba) >= threshold

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    n_positive = int(actual_pos.sum())
    n_negative = int(actual_neg.sum())

    # Class ratio drives the false-positive cost. Fall back to 1 when either
    # class is absent so that 1 / ratio can never divide by zero.
    ratio = n_negative / n_positive if n_positive > 0 and n_negative > 0 else 1.0

    # Only errors cost something.
    cost_fn = 1.0
    cost_fp = 1.0 / ratio

    # Count the four confusion cells directly so the result is always complete,
    # even when only one class shows up in the labels and the predictions.
    tp = int((actual_pos & predicted_pos).sum())
    fn = int((actual_pos & ~predicted_pos).sum())
    fp = int((actual_neg & predicted_pos).sum())
    tn = int((actual_neg & ~predicted_pos).sum())

    # Total cost (FN and FP only), negated so that higher is better.
    total_cost = -(fn * cost_fn) - (fp * cost_fp)

    return total_cost, {'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn}

def evaluate_model(y_true, y_pred_proba, threshold=0.5):
    """Score predicted probabilities at a given decision threshold.

    Returns a dict with the keys 'auc', 'precision', 'recall', 'f1',
    'business_cost' and 'threshold', followed by the confusion counts
    'TP', 'TN', 'FP', 'FN' produced by calculate_business_cost.
    """
    hard_labels = (y_pred_proba >= threshold).astype(int)

    # AUC is computed on the raw scores; the other metrics on thresholded labels.
    report = {
        'auc': roc_auc_score(y_true, y_pred_proba),
        'precision': precision_score(y_true, hard_labels, zero_division=0),
        'recall': recall_score(y_true, hard_labels, zero_division=0),
        'f1': f1_score(y_true, hard_labels, zero_division=0),
    }

    cost, counts = calculate_business_cost(y_true, y_pred_proba, threshold)
    report['business_cost'] = cost
    report['threshold'] = threshold
    report.update(counts)
    return report


print('Fonctions definies')

<div class='section-header'>
    <h2 style='margin: 0;'>4. FEATURE SELECTION</h2>
</div>

In [None]:
print('Feature selection multi-methodes...')


def _top_k_features(scores, columns, k):
    """Return the names of the k best-scoring columns, best first.

    NaN scores are ranked last. A plain ascending argsort places NaN at the end
    of the order, which would otherwise make them look like the top scores
    (F-scores can be NaN, e.g. for a constant column).
    """
    scores = np.asarray(scores, dtype=float)
    scores = np.where(np.isnan(scores), -np.inf, scores)
    best_first = np.argsort(scores, kind='stable')[::-1][:k]
    return columns[best_first].tolist()


# Mutual Information
mi_scores = mutual_info_classif(X_train, y_train, random_state=RANDOM_STATE)
mi_features = _top_k_features(mi_scores, X_train.columns, N_FEATURES_SELECTED)
print(f'1. Mutual Information: {len(mi_features)} features')

# Random Forest importances. The selector gets its own name so it cannot be
# confused with the baseline model that a later cell stores in `rf`.
rf_selector = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1)
rf_selector.fit(X_train, y_train)
rf_features = _top_k_features(rf_selector.feature_importances_, X_train.columns, N_FEATURES_SELECTED)
print(f'2. Random Forest: {len(rf_features)} features')

# F-score
f_scores, _ = f_classif(X_train, y_train)
f_features = _top_k_features(f_scores, X_train.columns, N_FEATURES_SELECTED)
print(f'3. F-score: {len(f_features)} features')

# Consensus: keep features chosen by at least two of the three methods, ordered
# by number of votes (3 before 2). sorted() is stable, so ties keep the
# first-seen order, which starts with the best mutual-information features.
feature_counts = Counter(mi_features + rf_features + f_features)
consensus = [feat for feat, votes in feature_counts.items() if votes >= 2]
selected_features = sorted(consensus, key=lambda feat: -feature_counts[feat])

# Too few consensus features: pad with the strongest remaining MI features.
already_selected = set(selected_features)
for feat in mi_features:
    if len(selected_features) >= N_FEATURES_SELECTED:
        break
    if feat not in already_selected:
        selected_features.append(feat)
        already_selected.add(feat)

# Too many: the truncation drops the features with the fewest votes first.
selected_features = selected_features[:N_FEATURES_SELECTED]
print(f'\nFeatures selectionnees: {len(selected_features)}')

In [None]:
# Restrict every split to the selected columns
X_train_selected, X_val_selected, X_test_selected = (
    split[selected_features] for split in (X_train, X_val, X_test_final)
)

print(f'Shapes apres selection:')
for split_name, split_X in (('Train', X_train_selected), ('Val', X_val_selected), ('Test', X_test_selected)):
    print(f'  {split_name}: {split_X.shape}')

# Persist the selected column names next to the other model artifacts
FEATURES_PATH = Path('../artifacts/final_model')
FEATURES_PATH.mkdir(parents=True, exist_ok=True)
joblib.dump(selected_features, FEATURES_PATH / 'selected_features.pkl')
print(f'Features sauvegardees')

<div class='section-header'>
    <h2 style='margin: 0;'>5. BASELINE COMPARISON</h2>
</div>

### 5.1 Logistic Regression

In [None]:
print('Logistic Regression Baseline')

# Standardize with statistics fitted on the training split only.
scaler_lr = StandardScaler()
X_train_scaled = scaler_lr.fit_transform(X_train_selected)
X_val_scaled = scaler_lr.transform(X_val_selected)

logreg = LogisticRegression(
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
logreg.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of class 1.
y_pred_val_lr = logreg.predict_proba(X_val_scaled)[:, 1]
metrics_val_lr = evaluate_model(y_val, y_pred_val_lr)

for caption, key, spec in (('AUC', 'auc', '.4f'), ('F1', 'f1', '.4f'), ('Cout', 'business_cost', '.0f')):
    print(f"{caption}: {metrics_val_lr[key]:{spec}}")

### 5.2 LightGBM

In [None]:
print('LightGBM Baseline')

# Positive-class weight = number of negatives / number of positives in the training labels.
n_train_neg = (y_train == 0).sum()
n_train_pos = (y_train == 1).sum()
scale_pos_weight = n_train_neg / n_train_pos

lgbm_baseline_params = dict(
    objective='binary',
    metric='auc',
    scale_pos_weight=scale_pos_weight,
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=-1,
)
lgbm = lgb.LGBMClassifier(**lgbm_baseline_params)
lgbm.fit(X_train_selected, y_train)

# Column 1 of predict_proba is the probability of class 1.
y_pred_val_lgbm = lgbm.predict_proba(X_val_selected)[:, 1]
metrics_val_lgbm = evaluate_model(y_val, y_pred_val_lgbm)

for caption, key, spec in (('AUC', 'auc', '.4f'), ('F1', 'f1', '.4f'), ('Cout', 'business_cost', '.0f')):
    print(f"{caption}: {metrics_val_lgbm[key]:{spec}}")

### 5.3 Random Forest

In [None]:
print('Random Forest Baseline')

rf_baseline_params = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_baseline_params)
rf.fit(X_train_selected, y_train)

# Column 1 of predict_proba is the probability of class 1.
y_pred_val_rf = rf.predict_proba(X_val_selected)[:, 1]
metrics_val_rf = evaluate_model(y_val, y_pred_val_rf)

for caption, key, spec in (('AUC', 'auc', '.4f'), ('F1', 'f1', '.4f'), ('Cout', 'business_cost', '.0f')):
    print(f"{caption}: {metrics_val_rf[key]:{spec}}")

<div class='section-header'>
    <h2 style='margin: 0;'>6. COMPARAISON MULTI-CRITERES</h2>
</div>

In [None]:
# One row per baseline with the three criteria used for the ranking.
baseline_results = [
    {
        'model': model_name,
        'auc': model_metrics['auc'],
        'f1': model_metrics['f1'],
        'business_cost': model_metrics['business_cost'],
    }
    for model_name, model_metrics in (
        ('Logistic Regression', metrics_val_lr),
        ('LightGBM', metrics_val_lgbm),
        ('Random Forest', metrics_val_rf),
    )
]

df = pd.DataFrame(baseline_results)

# AUC and F1 already live in [0, 1] and are used as-is.
df['auc_norm'] = df['auc']
df['f1_norm'] = df['f1']

# business_cost is a negated cost (<= 0), so the LARGEST value is the best one.
# Min-max scaling therefore maps the best model to 1 and the worst to 0 with no
# "1 - ..." inversion. The epsilon avoids 0/0 when all costs are equal.
cost_min = df['business_cost'].min()
cost_max = df['business_cost'].max()
df['cost_norm'] = (df['business_cost'] - cost_min) / (cost_max - cost_min + 1e-10)

df['final_score'] = 0.40 * df['auc_norm'] + 0.40 * df['cost_norm'] + 0.20 * df['f1_norm']
df = df.sort_values('final_score', ascending=False)

print('Comparaison (40% AUC + 40% Cout + 20% F1):')
print(df[['model', 'auc', 'f1', 'business_cost', 'final_score']].to_string(index=False))

# The first row after sorting has the highest combined score.
champion_name = df.iloc[0]['model']
print(f'\nChampion: {champion_name}')

<div class='section-header'>
    <h2 style='margin: 0;'>7. OPTIMISATION DU CHAMPION</h2>
</div>

In [None]:
# Bind the winning baseline and the matrices it must be fed with.
# Only logistic regression was trained on standardized inputs.
needs_scaling = bool(champion_name == 'Logistic Regression')

if needs_scaling:
    champion_model = logreg
    X_train_champion, X_val_champion = X_train_scaled, X_val_scaled
    X_test_champion = scaler_lr.transform(X_test_selected)
else:
    # Any name other than 'LightGBM' falls back to the random forest.
    champion_model = lgbm if champion_name == 'LightGBM' else rf
    X_train_champion = X_train_selected
    X_val_champion = X_val_selected
    X_test_champion = X_test_selected

print(f'Champion: {champion_name}')
print(f'Scaling: {needs_scaling}')

In [None]:
print('Optimisation hyperparametres...')

# Search space, base estimator and sampling budget depend on which baseline won.
if champion_name == 'LightGBM':
    param_distributions = {
        'n_estimators': [100, 200, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [5, 7, 10, 15],
        'num_leaves': [31, 63, 127],
        'min_child_samples': [10, 20, 30],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0]
    }
    # subsample_freq=1 enables bagging at every iteration. With LightGBM's
    # default subsample_freq=0 bagging is disabled and the 'subsample' values
    # sampled above would have no effect on the fitted models.
    base_model = lgb.LGBMClassifier(
        objective='binary', metric='auc', scale_pos_weight=scale_pos_weight,
        subsample_freq=1, random_state=RANDOM_STATE, n_jobs=-1, verbose=-1
    )
    n_iter = 50
elif champion_name == 'Logistic Regression':
    param_distributions = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'class_weight': ['balanced', None]
    }
    base_model = LogisticRegression(random_state=RANDOM_STATE, n_jobs=-1, max_iter=2000)
    n_iter = 30
else:
    # Any other champion name is treated as the random forest.
    param_distributions = {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [10, 15, 20, 25],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 5, 10],
        'max_features': ['sqrt', 'log2']
    }
    base_model = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced')
    n_iter = 40

# Randomized search scored by ROC AUC on shuffled, stratified folds.
random_search = RandomizedSearchCV(
    base_model, param_distributions, n_iter=n_iter,
    cv=StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc', n_jobs=-1, verbose=1, random_state=RANDOM_STATE
)

random_search.fit(X_train_champion, y_train)
print(f'\nOptimisation terminee')
print(f'Meilleur score CV: {random_search.best_score_:.4f}')

In [None]:
# Score the best estimator found by the search on the validation split.
best_model = random_search.best_estimator_
val_proba = best_model.predict_proba(X_val_champion)
y_pred_val_opt = val_proba[:, 1]  # probability of class 1
metrics_val_opt = evaluate_model(y_val, y_pred_val_opt)

print('Resultats apres optimisation:')
for caption, key, spec in (('AUC', 'auc', '.4f'), ('F1', 'f1', '.4f'), ('Cout', 'business_cost', '.0f')):
    print(f"  {caption}: {metrics_val_opt[key]:{spec}}")

<div class='section-header'>
    <h2 style='margin: 0;'>8. THRESHOLD TUNING</h2>
</div>

In [None]:
print('Optimisation threshold...')

# Evaluate the validation predictions on a grid of thresholds from 0.3 (step 0.02, 0.8 excluded).
thresholds = np.arange(0.3, 0.8, 0.02)
threshold_results = [
    evaluate_model(y_val, y_pred_val_opt, threshold=candidate)
    for candidate in thresholds
]
threshold_df = pd.DataFrame(threshold_results)

# business_cost is a negated cost, so the best threshold is the one that maximizes it.
best_threshold_idx = threshold_df['business_cost'].idxmax()
best_threshold = threshold_df.at[best_threshold_idx, 'threshold']

print(f'Threshold optimal: {best_threshold:.3f}')
print(f"Cout optimal: {threshold_df.at[best_threshold_idx, 'business_cost']:.0f}")

<div class='section-header'>
    <h2 style='margin: 0;'>9. VALIDATION FINALE</h2>
</div>

In [None]:
# Score the held-out test split once, using the threshold tuned on validation.
test_proba = best_model.predict_proba(X_test_champion)
y_pred_test = test_proba[:, 1]  # probability of class 1
metrics_test = evaluate_model(y_test_final, y_pred_test, threshold=best_threshold)

print('VALIDATION FINALE - TEST SET')
for caption, key in (('AUC', 'auc'), ('Precision', 'precision'), ('Recall', 'recall'), ('F1', 'f1')):
    print(f"  {caption}: {metrics_test[key]:.4f}")
print(f"  Cout: {metrics_test['business_cost']:.0f}")
print(f"  Threshold: {best_threshold:.3f}")

In [None]:
# ROC curve of the tuned champion on the held-out test split
fpr, tpr, _ = roc_curve(y_test_final, y_pred_test)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, linewidth=2, label=f"{champion_name} (AUC = {metrics_test['auc']:.4f})")
# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')
ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
ax.set_title('ROC Curve - Test Set', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

<div class='section-header'>
    <h2 style='margin: 0;'>10. SAUVEGARDE</h2>
</div>

In [None]:
FINAL_PATH = Path('../artifacts/final_model')
FINAL_PATH.mkdir(parents=True, exist_ok=True)

print('Sauvegarde du modele final...')

# Fitted estimator, plus the scaler only when the champion was trained on scaled inputs.
joblib.dump(best_model, FINAL_PATH / 'best_model.pkl')
if needs_scaling:
    joblib.dump(scaler_lr, FINAL_PATH / 'scaler.pkl')

# Decision threshold selected in the threshold-tuning cell.
with open(FINAL_PATH / 'best_threshold.json', 'w') as threshold_file:
    json.dump({'threshold': float(best_threshold)}, threshold_file, indent=4)

# Test-set metrics are cast to plain floats before being written as JSON.
reported_metrics = ('auc', 'precision', 'recall', 'f1', 'business_cost')
performance = {
    'model_name': champion_name,
    'best_params': random_search.best_params_,
    'test_metrics': {key: float(metrics_test[key]) for key in reported_metrics},
    'timestamp': datetime.now().isoformat()
}

with open(FINAL_PATH / 'performance_metrics.json', 'w') as metrics_file:
    json.dump(performance, metrics_file, indent=4)

print(f'Sauvegarde terminee dans {FINAL_PATH}')

<div class='section-header'>
    <h2 style='margin: 0;'>11. RESUME FINAL</h2>
</div>

In [None]:
# Final recap of the champion and its test-set metrics.
print('RESUME FINAL')
print('='*60)
print(f'Modele champion: {champion_name}')
print(f'\nPerformances Test Set:')
print(f"  AUC:       {metrics_test['auc']:.4f}")
print(f"  Precision: {metrics_test['precision']:.4f}")
print(f"  Recall:    {metrics_test['recall']:.4f}")
print(f"  F1:        {metrics_test['f1']:.4f}")
print(f"  Cout:      {metrics_test['business_cost']:.0f}")
print(f'\nConfiguration:')
# Report the number of features actually selected; it can be lower than
# N_FEATURES_SELECTED when the data has fewer candidate columns.
print(f'  Features:  {len(selected_features)}')
print(f'  Threshold: {best_threshold:.3f}')
print('='*60)

<div style='background: linear-gradient(135deg, #10b981 0%, #059669 100%); padding: 30px; border-radius: 10px; margin-top: 30px;'>
    <h2 style='color: white; text-align: center; margin: 0;'>MODELISATION FINALE TERMINEE</h2>
    <p style='color: white; text-align: center; margin-top: 15px;'>Le modèle champion est prêt pour la production</p>
</div>