<a href="https://colab.research.google.com/github/nadaimani789/cc-datasience-2025/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 0) Install dependencies (comment out the next line if they are already installed)
!pip install -q category_encoders xgboost shap joblib

# 1) Imports
import os
import zipfile
import io
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
from sklearn.calibration import CalibratedClassifierCV
import category_encoders as ce
import joblib
import shap

# Detect Google Colab: its `files` helper is used further down for upload/download.
try:
    from google.colab import files
    _IS_COLAB = True
except Exception:
    _IS_COLAB = False

# Single seed reused by the split, the models and the searches below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ---------- 2) Chargement du dataset ----------
# Le script tente plusieurs méthodes :
# - chercher un fichier CSV local 'credit_scoring.csv' ou 'creditscoring-data.zip'
# - sinon, proposer d'uploader le fichier via l'interface Colab

def _read_csv_bytes(raw):
    """Parse raw CSV bytes, trying ';' first and falling back to ','.

    A file read with the wrong delimiter comes back as a single column; in
    that case the comma-separated parse is used when it yields more columns.
    """
    frame = pd.read_csv(io.BytesIO(raw), sep=';')
    if frame.shape[1] <= 1:
        try:
            alt = pd.read_csv(io.BytesIO(raw), sep=',')
        except pd.errors.ParserError:
            # The ';' parse succeeded, so keep it rather than fail on the retry.
            return frame
        if alt.shape[1] > frame.shape[1]:
            return alt
    return frame


def _read_first_csv_in_zip(archive):
    """Return the first '.csv' member of an open ZipFile as a DataFrame, or None."""
    for member in archive.namelist():
        if member.lower().endswith('.csv'):
            print(f"  -> lecture de {member} depuis l'archive")
            return _read_csv_bytes(archive.read(member))
    return None


def load_dataset():
    """Locate and load the dataset as a DataFrame.

    Lookup order:
      1. a CSV with one of the known names in the current directory;
      2. the first CSV inside one of the known ZIP archives;
      3. on Colab only, a CSV or ZIP uploaded interactively.

    Files are parsed with ';' as delimiter, with an automatic fallback to ','
    when the ';' parse produces a single column.

    Returns:
        pandas.DataFrame: the loaded dataset.

    Raises:
        FileNotFoundError: when no dataset could be found by any method.
    """
    possible_csv_names = ['credit_scoring.csv', 'creditscoring.csv', 'credit_scoring_data.csv', 'credit_scoring-data.csv']
    # 1) look in the current directory
    for name in possible_csv_names:
        if os.path.exists(name):
            print(f"Chargement depuis le fichier local : {name}")
            with open(name, 'rb') as fh:
                return _read_csv_bytes(fh.read())
    # 2) look for a known archive (e.g. creditscoring-data.zip)
    zip_names = ['creditscoring-data.zip', 'credit_scoring.zip', 'creditscoring_data.zip']
    for z in zip_names:
        if os.path.exists(z):
            print(f"Extraction puis chargement depuis l'archive : {z}")
            with zipfile.ZipFile(z, 'r') as archive:
                frame = _read_first_csv_in_zip(archive)
            if frame is not None:
                return frame
    # 3) on Colab, offer an interactive upload
    if _IS_COLAB:
        print("Aucun fichier trouvé automatiquement. Veuillez uploader votre fichier CSV ou ZIP contenant le dataset.")
        uploaded = files.upload()
        for fn, payload in uploaded.items():
            lowered = fn.lower()
            if lowered.endswith('.csv'):
                print(f"Chargé : {fn}")
                return _read_csv_bytes(payload)
            if lowered.endswith('.zip'):
                print(f"Extraction depuis : {fn}")
                with zipfile.ZipFile(io.BytesIO(payload), 'r') as archive:
                    frame = _read_first_csv_in_zip(archive)
                if frame is not None:
                    return frame
    # 4) nothing usable was found
    raise FileNotFoundError("Aucun dataset trouvé. Déposez un fichier CSV/ZIP dans l'environnement ou nommez votre fichier 'credit_scoring.csv'.")

# Load the dataset, then show its shape and first five rows
df = load_dataset()
print("Taille du dataset :", df.shape)
display(df.head(5))

# ---------- 3) Data dictionary & quick exploration ----------
def describe_dataset(df):
    """Display a per-column overview of `df`, then a sample of its values.

    The overview lists, for every column, its dtype (as text), its number of
    distinct values and its number of missing values. The sample shows the
    first rows of the first ten columns. Nothing is returned.
    """
    print("---- Aperçu des colonnes ----")
    overview = pd.DataFrame({
        'dtype': df.dtypes.astype(str),
        'n_unique': df.nunique(),
        'n_missing': df.isna().sum()
    })
    display(overview)
    print("\nExemples de valeurs (quelques colonnes) :")
    leading_columns = df.iloc[:, :10]
    display(leading_columns.head())
describe_dataset(df)

# Automatically locate the most likely target column ('default' when present)
if 'default' in df.columns:
    target_col = 'default'
else:
    # Heuristic: name contains 'default', 'defaut' or 'target', or is exactly 'y'.
    # 'y' must match exactly: a substring test would also pick 'day', 'monthly_income', ...
    candidates = [
        c for c in df.columns
        if str(c).lower() == 'y' or any(x in str(c).lower() for x in ['default', 'defaut', 'target'])
    ]
    target_col = candidates[0] if candidates else None

if target_col is None:
    print("Aucune colonne target trouvée automatiquement. Merci de renommer la colonne target en 'default' ou indiquer le nom de la target dans le code.")
    # Stop here: nothing below can run without a target
    raise ValueError("Target non trouvée. Renommez la colonne target en 'default' ou modifiez la variable target_col.")
else:
    print(f"Target détectée : '{target_col}'")
    # Convert textual labels to 0/1 (case- and whitespace-insensitive)
    if df[target_col].dtype == object:
        label_map = {'yes': 1, 'no': 0, 'y': 1, 'n': 0}
        normalized = df[target_col].astype(str).str.strip().str.lower()
        # Values outside the map (e.g. '0' / '1' stored as text) are kept for to_numeric below
        df[target_col] = normalized.map(label_map).fillna(df[target_col])
    numeric_target = pd.to_numeric(df[target_col], errors='coerce')
    n_bad = int(numeric_target.isna().sum())
    if n_bad:
        # Rows whose label is missing or unrecognised are dropped instead of being
        # silently relabelled 0, which would corrupt the class distribution.
        print(f"Attention : {n_bad} ligne(s) avec une target manquante ou non reconnue ont été supprimées.")
        df = df.loc[numeric_target.notna()].reset_index(drop=True)
        numeric_target = numeric_target.dropna().reset_index(drop=True)
    df[target_col] = numeric_target.astype(int)
    print(df[target_col].value_counts())

TARGET = target_col

# ---------- 4) Initial preprocessing ----------
# 4.1 Basic cleaning: remove exact duplicate rows
df = df.drop_duplicates().reset_index(drop=True)
# Keep an untouched copy of the de-duplicated data
df_orig = df.copy()

# 4.2 Separate X / y, then stratified train/test split
X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)
print("Taille train:", X_train.shape, "Taille test:", X_test.shape)
print("Distribution target train:\n", y_train.value_counts(normalize=True))

# 4.3 Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category', 'bool']).columns.tolist()


def _is_id_like(values):
    """Return True when a numeric column looks like a row identifier.

    An identifier has one distinct value per row AND only whole-number values.
    The second condition keeps continuous float features (which are often all
    distinct too) from being discarded.
    """
    if values.nunique() != len(values):
        return False
    if pd.api.types.is_integer_dtype(values):
        return True
    return bool((values % 1 == 0).all())


# Drop numeric columns that are really identifiers (heuristic, evaluated on train only)
to_drop = [col for col in numeric_cols if _is_id_like(X_train[col])]
if to_drop:
    print("Colonnes identifiées comme ID (trop d'unicité) -> suppression :", to_drop)
    X_train = X_train.drop(columns=to_drop)
    X_test = X_test.drop(columns=to_drop)
    numeric_cols = [c for c in numeric_cols if c not in to_drop]

# Refresh the column lists from what is actually left in X_train
numeric_cols = [c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])]
cat_cols = [c for c in X_train.columns if c not in numeric_cols]

print("Numériques:", numeric_cols)
print("Catégoriques:", cat_cols)

# ---------- 5) Feature engineering (generic examples) ----------
def add_basic_features(df):
    """Return a copy of `df` enriched with a few generic derived features.

    Added columns:
      - 'debt_to_income': loan_amount / monthly_income, NaN when the income is 0
        (only when both source columns exist);
      - 'age_group': age binned into five labelled ranges (only when 'age' exists);
      - 'n_missing_row': number of missing values per row among the INPUT columns.

    The input frame is not modified.
    """
    out = df.copy()
    # Count missing inputs before adding derived columns, so that NaNs produced
    # here (zero income, age outside the bins) are not reported as missing data.
    n_missing = out.isna().sum(axis=1)
    if 'loan_amount' in out.columns and 'monthly_income' in out.columns:
        # A zero income becomes NaN to avoid a division by zero
        out['debt_to_income'] = out['loan_amount'] / (out['monthly_income'].replace({0: np.nan}))
    if 'age' in out.columns:
        out['age_group'] = pd.cut(out['age'], bins=[0,25,35,50,70,120], labels=['<=25','26-35','36-50','51-70','70+'])
    out['n_missing_row'] = n_missing
    return out

X_train = add_basic_features(X_train)
X_test = add_basic_features(X_test)

# Refresh the column lists now that derived features exist
numeric_cols = [c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])]
cat_cols = [c for c in X_train.columns if c not in numeric_cols]

print("Après feature engineering, num:", numeric_cols)
print("Après feature engineering, cat:", cat_cols)

# ---------- 6) Preprocessing pipeline ----------
# Choices:
# - numeric columns: IterativeImputer (MICE) + StandardScaler
# - low-cardinality categoricals: constant imputation + OneHotEncoder
# - high-cardinality categoricals: target-encoded (category_encoders) before this
#   preprocessor runs, then handled like any other numeric column

# Split categoricals into low / high cardinality (threshold: 10 distinct values)
low_card = [c for c in cat_cols if X_train[c].nunique() <= 10]
high_card = [c for c in cat_cols if X_train[c].nunique() > 10]

print("Cat low_card:", low_card)
print("Cat high_card:", high_card)

# High-cardinality columns reach the preprocessor already target-encoded, i.e.
# numeric. They must be listed with the numeric columns, otherwise
# `remainder='drop'` silently discards them and the models never see them.
numeric_cols = numeric_cols + high_card

numeric_transformer = Pipeline([
    ('imputer', IterativeImputer(random_state=RANDOM_STATE, max_iter=10)),
    ('scaler', StandardScaler())
])

# `sparse` was renamed `sparse_output` in scikit-learn 1.2 and removed in 1.4
try:
    _onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    _onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

lowcard_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('onehot', _onehot)
])

# ColumnTransformer: numeric (incl. target-encoded) + one-hot low-cardinality columns
preprocessor_basic = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('lowcat', lowcard_transformer, low_card)
], remainder='drop', verbose_feature_names_out=False)

# ---------- 7) Baseline model training function ----------
def evaluate_model_with_target_encoder(model, X_train, y_train, X_test, y_test, random_state=RANDOM_STATE, use_calibration=False):
    """Fit `model` on the training data and score it on the test data.

    High-cardinality columns (module-level `high_card`) are target-encoded with
    an encoder fitted on the training rows only, then the module-level
    `preprocessor_basic` is re-fitted on the training rows and applied to both
    sets. Note that `model` and `preprocessor_basic` are fitted in place.

    Args:
        model: estimator exposing fit/predict (and ideally predict_proba).
        X_train, y_train: training features and labels.
        X_test, y_test: evaluation features and labels.
        random_state: currently unused; kept for interface compatibility.
        use_calibration: wrap `model` in CalibratedClassifierCV(cv=3) when True.

    Returns:
        dict with 'accuracy', 'f1', 'precision', 'recall', 'roc_auc', the
        predictions ('y_pred', 'y_proba'), the fitted 'model' and the
        preprocessed test matrix 'Xte_prep'.
    """
    Xtr = X_train.copy()
    Xte = X_test.copy()
    if high_card:
        # Encoder is fitted on train only to avoid leaking test labels
        te = ce.TargetEncoder(cols=high_card, smoothing=0.3)
        te.fit(Xtr[high_card], y_train)
        Xtr_he = te.transform(Xtr[high_card])
        Xte_he = te.transform(Xte[high_card])
        # Replace the raw high-cardinality columns by their encoded version
        Xtr = pd.concat([Xtr.drop(columns=high_card), Xtr_he], axis=1)
        Xte = pd.concat([Xte.drop(columns=high_card), Xte_he], axis=1)

    Xtr_prep = preprocessor_basic.fit_transform(Xtr)
    Xte_prep = preprocessor_basic.transform(Xte)

    clf = model
    if use_calibration:
        # Positional argument: the keyword was renamed from `base_estimator`
        # to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
        clf = CalibratedClassifierCV(model, cv=3)

    clf.fit(Xtr_prep, y_train)
    y_pred = clf.predict(Xte_prep)
    try:
        y_proba = clf.predict_proba(Xte_prep)[:, 1]
    except (AttributeError, NotImplementedError):
        # No probabilities available: min-max scale the decision scores to [0, 1]
        try:
            scores = np.asarray(clf.decision_function(Xte_prep), dtype=float)
        except (AttributeError, NotImplementedError):
            y_proba = np.zeros_like(y_pred, dtype=float)
        else:
            span = scores.max() - scores.min()
            # Constant scores carry no ranking information; avoid dividing by zero
            y_proba = (scores - scores.min()) / span if span > 0 else np.full_like(scores, 0.5)

    results = {
        'accuracy': accuracy_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba),
        'y_pred': y_pred,
        'y_proba': y_proba,
        'model': clf,
        'Xte_prep': Xte_prep
    }
    return results

# ---------- 8) Models under test: Logistic, RandomForest, XGBoost ----------
# Scalar metrics reported for each baseline, in the order they are printed
_HEADLINE_METRICS = ('accuracy', 'f1', 'precision', 'recall', 'roc_auc')

# 8.1 LogisticRegression baseline
print("\n--- Entraînement : LogisticRegression (baseline) ---")
log_clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_STATE)
res_log = evaluate_model_with_target_encoder(log_clf, X_train, y_train, X_test, y_test)
print("Logistic metrics:", {name: round(res_log[name], 4) for name in _HEADLINE_METRICS})

# 8.2 RandomForest baseline
print("\n--- Entraînement : RandomForest (baseline) ---")
rf_clf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)
res_rf = evaluate_model_with_target_encoder(rf_clf, X_train, y_train, X_test, y_test)
print("RandomForest metrics:", {name: round(res_rf[name], 4) for name in _HEADLINE_METRICS})

# 8.3 XGBoost baseline
print("\n--- Entraînement : XGBoost (baseline) ---")
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
res_xgb = evaluate_model_with_target_encoder(xgb_clf, X_train, y_train, X_test, y_test)
print("XGBoost metrics:", {name: round(res_xgb[name], 4) for name in _HEADLINE_METRICS})

# ---------- 9) Cross-validation (StratifiedKFold) for a more robust comparison ----------
def cross_validate_pipeline(model, X, y, n_splits=5):
    """Return the mean of each metric over a stratified, shuffled K-fold.

    Every fold goes through `evaluate_model_with_target_encoder`, so the target
    encoding is re-fitted on the fold's training rows only (no leakage).
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    metric_names = ('accuracy', 'f1', 'precision', 'recall', 'roc_auc')
    fold_scores = {name: [] for name in metric_names}
    for fit_rows, held_rows in splitter.split(X, y):
        fold_result = evaluate_model_with_target_encoder(
            model,
            X.iloc[fit_rows].copy(),
            y.iloc[fit_rows],
            X.iloc[held_rows].copy(),
            y.iloc[held_rows],
        )
        for name in metric_names:
            fold_scores[name].append(fold_result[name])
    return {name: np.mean(values) for name, values in fold_scores.items()}

print("\n--- Cross-validation 5-fold (estimation) ---")
# One line per baseline model, each scored on the training split only
for _cv_label, _cv_model in (("Logistic CV:", log_clf), ("RF CV:", rf_clf), ("XGB CV:", xgb_clf)):
    print(_cv_label, cross_validate_pipeline(_cv_model, X_train, y_train))

# ---------- 10) Hyper-parameter tuning (RandomizedSearchCV) for RandomForest and XGBoost ----------
# NOTE: the per-fold target encoding used above is not directly compatible with
# sklearn's CV utilities. For simplicity and reproducibility, high-cardinality
# columns are target-encoded once on the whole training set (acceptable, not
# ideal), and the searches then run on the resulting preprocessed arrays.

# Build the encoded train / test matrices once (target encoding fitted on train)
def prepare_encoded_arrays(X_train, y_train, X_test):
    """Return the preprocessed (train, test) matrices used for tuning.

    High-cardinality columns are target-encoded with an encoder fitted on the
    training set, then `preprocessor_basic` is fitted on train and applied to
    both sets.
    """
    train_frame, test_frame = X_train.copy(), X_test.copy()
    if high_card:
        encoder = ce.TargetEncoder(cols=high_card, smoothing=0.3)
        encoder.fit(train_frame[high_card], y_train)
        encoded_train = encoder.transform(train_frame[high_card])
        encoded_test = encoder.transform(test_frame[high_card])
        train_frame = pd.concat([train_frame.drop(columns=high_card), encoded_train], axis=1)
        test_frame = pd.concat([test_frame.drop(columns=high_card), encoded_test], axis=1)
    train_matrix = preprocessor_basic.fit_transform(train_frame)
    test_matrix = preprocessor_basic.transform(test_frame)
    return train_matrix, test_matrix

print("\nPréparation des arrays encodés pour tuning (target-encoding sur train)...")
Xtr_p, Xte_p = prepare_encoded_arrays(X_train, y_train, X_test)

# 10.1 RandomForest: randomized search over 20 sampled configurations, 3-fold CV, ROC-AUC
from scipy.stats import randint, uniform
rf_param_dist = {
    'n_estimators': [100,200,400],
    'max_depth': [None, 6, 10, 20],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,2,4],
}
print("Recherche hyperparamètres RandomForest (RandomizedSearch)")
rf_base = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)
rs_rf = RandomizedSearchCV(rf_base, rf_param_dist, n_iter=20, scoring='roc_auc', cv=3, random_state=RANDOM_STATE, n_jobs=-1)
rs_rf.fit(Xtr_p, y_train)
print("Meilleurs params RF:", rs_rf.best_params_)
best_rf = rs_rf.best_estimator_
# Positive-class probability on the held-out test matrix
y_proba_rf = best_rf.predict_proba(Xte_p)[:,1]
print("RF test ROC-AUC:", round(roc_auc_score(y_test, y_proba_rf),4))

# 10.2 XGBoost: randomized search over 25 sampled configurations, 3-fold CV, ROC-AUC
xgb_param_dist = {
    'n_estimators': [100,200,400],
    'max_depth': [3,6,10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6,0.8,1.0],
    'colsample_bytree': [0.6,0.8,1.0]
}
print("Recherche hyperparamètres XGBoost (RandomizedSearch)")
xgb_base = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
rs_xgb = RandomizedSearchCV(xgb_base, xgb_param_dist, n_iter=25, scoring='roc_auc', cv=3, random_state=RANDOM_STATE, n_jobs=-1)
rs_xgb.fit(Xtr_p, y_train)
print("Meilleurs params XGB:", rs_xgb.best_params_)
best_xgb = rs_xgb.best_estimator_
y_proba_xgb = best_xgb.predict_proba(Xte_p)[:,1]
print("XGB test ROC-AUC:", round(roc_auc_score(y_test, y_proba_xgb),4))

# 10.3 LogisticRegression: exhaustive grid over the regularisation strength C
from sklearn.model_selection import GridSearchCV
log_param_grid = {'C':[0.01,0.1,1,10,100]}
log_pipe = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_STATE)
gs_log = GridSearchCV(log_pipe, log_param_grid, scoring='roc_auc', cv=3, n_jobs=-1)
gs_log.fit(Xtr_p, y_train)
best_log = gs_log.best_estimator_
y_proba_log = best_log.predict_proba(Xte_p)[:,1]
print("Log best C:", gs_log.best_params_, "ROC-AUC:", round(roc_auc_score(y_test, y_proba_log),4))

# ---------- 11) Final evaluation & reports ----------
def print_eval(y_true, y_pred, y_proba, model_name="model"):
    """Print the headline metrics of a classifier and draw its diagnostic plots.

    Output, in order: accuracy / precision / recall / F1 / ROC-AUC (4 decimals),
    the classification report, a confusion-matrix heatmap, the ROC curve and
    the precision-recall curve. Nothing is returned.
    """
    def _draw_curve(xs, ys, legend, xlabel, ylabel, title, diagonal=False):
        # Shared layout of the ROC and precision-recall figures
        plt.figure(figsize=(5,4))
        plt.plot(xs, ys, label=legend)
        if diagonal:
            plt.plot([0,1],[0,1],'k--')
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend()
        plt.show()

    print(f"\n--- Évaluation: {model_name} ---")
    scorers = (
        ("Accuracy:", accuracy_score, y_pred),
        ("Precision:", precision_score, y_pred),
        ("Recall:", recall_score, y_pred),
        ("F1:", f1_score, y_pred),
        ("ROC-AUC:", roc_auc_score, y_proba),
    )
    for label, scorer, estimate in scorers:
        print(label, round(scorer(y_true, estimate),4))
    print("\nClassification report:\n", classification_report(y_true, y_pred))

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Matrice de confusion - {model_name}")
    plt.xlabel("Pred")
    plt.ylabel("True")
    plt.show()

    # ROC curve, with the chance diagonal
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    roc_auc_val = auc(fpr, tpr)
    _draw_curve(fpr, tpr, f"AUC = {roc_auc_val:.4f}",
                "False Positive Rate", "True Positive Rate",
                f"ROC - {model_name}", diagonal=True)

    # Precision-recall curve
    prec, rec, _ = precision_recall_curve(y_true, y_proba)
    pr_auc = auc(rec, prec)
    _draw_curve(rec, prec, f"PR AUC = {pr_auc:.4f}",
                "Recall", "Precision",
                f"Precision-Recall - {model_name}")

# Evaluate the tuned models on the prepared test matrix Xte_p
y_pred_rf = best_rf.predict(Xte_p)
y_proba_rf = best_rf.predict_proba(Xte_p)[:,1]
print_eval(y_test, y_pred_rf, y_proba_rf, "RandomForest (tuned)")

y_pred_xgb = best_xgb.predict(Xte_p)
y_proba_xgb = best_xgb.predict_proba(Xte_p)[:,1]
print_eval(y_test, y_pred_xgb, y_proba_xgb, "XGBoost (tuned)")

y_pred_log = best_log.predict(Xte_p)
y_proba_log = best_log.predict_proba(Xte_p)[:,1]
print_eval(y_test, y_pred_log, y_proba_log, "LogisticRegression (tuned)")

# ---------- 12) Interpretability: importances + SHAP (for XGBoost when available) ----------
def show_feature_importance(model, X_processed, feature_names=None, top_n=20):
    """Plot the `top_n` largest `feature_importances_` of `model` as a bar chart.

    Args:
        model: fitted estimator; a message is printed if it has no
            `feature_importances_` attribute.
        X_processed: unused; kept for interface compatibility.
        feature_names: optional names, one per importance. Generic names
            ('f0', 'f1', ...) are used when omitted or when the count does not
            match the number of importances.
        top_n: number of features to display.
    """
    if not hasattr(model, "feature_importances_"):
        print("Le modèle n'a pas d'attribut feature_importances_.")
        return
    imp = model.feature_importances_
    if feature_names is not None and len(feature_names) != len(imp):
        # Mismatched names would make the DataFrame constructor fail with an
        # opaque length error; fall back to generic names instead.
        print(f"Attention : {len(feature_names)} noms fournis pour {len(imp)} importances -> noms génériques utilisés.")
        feature_names = None
    if feature_names is None:
        feature_names = [f"f{i}" for i in range(len(imp))]
    df_imp = pd.DataFrame({'feature':feature_names, 'importance':imp}).sort_values('importance', ascending=False).head(top_n)
    plt.figure(figsize=(8,6))
    sns.barplot(data=df_imp, x='importance', y='feature')
    plt.title("Feature importances")
    plt.show()

# Rebuild approximate feature names (preprocessor_basic outputs plain numpy arrays)
def get_feature_names_from_preprocessor(preprocessor, X_sample):
    """Return output feature names: numeric columns first, then one-hot columns.

    One-hot names are built as '<column>__<category>' from the fitted encoder
    found at preprocessor.named_transformers_['lowcat'].named_steps['onehot'].
    When that encoder cannot be reached or is not fitted, the raw
    low-cardinality column names are used instead.

    Args:
        preprocessor: the ColumnTransformer holding the 'lowcat' pipeline.
        X_sample: unused; kept for interface compatibility.

    Returns:
        list[str]: a new list (module-level `numeric_cols` is not modified).
    """
    num_names = list(numeric_cols)
    cat_names = []
    if low_card:
        try:
            ohe = preprocessor.named_transformers_['lowcat'].named_steps['onehot']
            categories = ohe.categories_
        except (AttributeError, KeyError):
            # Preprocessor not fitted (or laid out differently): use raw column names
            cat_names = list(low_card)
        else:
            for catcol, levels in zip(low_card, categories):
                cat_names.extend(f"{catcol}__{c}" for c in levels)
    return num_names + cat_names

feature_names = get_feature_names_from_preprocessor(preprocessor_basic, X_train)
print("Noms features approximatifs (extrait):", feature_names[:30])

print("\nImportances RF (tuned):")
show_feature_importance(best_rf, Xtr_p, feature_names=feature_names)

# SHAP values for the tuned XGBoost model (can be slow); any failure is reported, not raised
try:
    print("\nCalcul des valeurs SHAP pour XGBoost (peut prendre du temps)...")
    explainer = shap.TreeExplainer(best_xgb)
    shap_values = explainer.shap_values(Xtr_p)
    shap.summary_plot(shap_values, Xtr_p, feature_names=feature_names, show=True)
except Exception as e:
    print("SHAP échoué ou trop lent dans cet environnement:", e)

# ---------- 13) Calibration & saving of the final model ----------
# Example: calibrate the best model if needed (here XGBoost), 3-fold
calibrated = CalibratedClassifierCV(best_xgb, cv=3)
calibrated.fit(Xtr_p, y_train)
y_proba_cal = calibrated.predict_proba(Xte_p)[:,1]
print("XGB calibré ROC-AUC:", round(roc_auc_score(y_test, y_proba_cal),4))

# Persist everything needed at inference time in a single file:
# the fitted preprocessor, the target encoder (when used) and the calibrated model.
# NOTE: add_basic_features and the ID-column removal are NOT part of the saved
# artifacts; they must be re-applied to new data before using them.
artifacts = {}
# preprocessor_basic is already fitted at this point
artifacts['preprocessor_basic'] = preprocessor_basic
if high_card:
    # Target encoder fitted on the whole training set
    te_final = ce.TargetEncoder(cols=high_card, smoothing=0.3)
    te_final.fit(X_train[high_card], y_train)
    artifacts['target_encoder'] = te_final
# Calibrated model
artifacts['model'] = calibrated

joblib.dump(artifacts, 'credit_model_artifacts.joblib')
print("Artifacts sauvegardés dans 'credit_model_artifacts.joblib'")

# On Colab, offer the file for download
if _IS_COLAB:
    try:
        files.download('credit_model_artifacts.joblib')
    except Exception as err:
        # Best effort: the file is already saved locally, so only report the failure
        print("Téléchargement automatique impossible :", err)

# ---------- 14) Final summary and suggestions for the report ----------
# Static recap printed at the end of the run (user-facing text, left in French)
print("""
--- FIN du script ---
Résumé :
 - 3 modèles testés : LogisticRegression, RandomForest, XGBoost.
 - Hyperparam tuning réalisé pour RF et XGB (RandomizedSearch).
 - Évaluations affichées (ROC-AUC, F1, matrice de confusion, courbes).
 - Artifacts sauvegardés : préprocesseur, target encoder (si présent), modèle calibré.

Suggestions pour le rapport scientifique (à inclure):
 - Décrire le dataset : taille, variables, target.
 - Justifier les choix d'imputation (IterativeImputer), d'encodage (OneHot vs TargetEncoding).
 - Expliquer la stratégie de validation (StratifiedKFold).
 - Présenter les métriques clés (ROC-AUC, F1, recall) et discuter des faux négatifs.
 - Limites : fuite de données possible si TargetEncoding mal appliqué (éviter d'encoder sur l'ensemble).
 - Pistes d'amélioration : collecte features comportementales, pipeline complet sklearn-compatible, calibrage des probabilités, tests coûts/avantages business.
""")

Aucun fichier trouvé automatiquement. Veuillez uploader votre fichier CSV ou ZIP contenant le dataset.


In [4]:
import pandas as pd
import os

file_name = 'bank.csv'

# Probe 'bank.csv' with ';' first, then ',' when the first parse yields one column
if not os.path.exists(file_name):
    print(f"Le fichier '{file_name}' n'a pas été trouvé dans le répertoire courant. Veuillez vous assurer qu'il est téléchargé.")
else:
    print(f"Tentative de chargement du fichier '{file_name}' pour vérifier le délimiteur...")
    try:
        # First attempt: semicolon
        df_bank = pd.read_csv(file_name, sep=';')
        if df_bank.shape[1] <= 1:
            # A single column means the delimiter was most likely wrong
            print(f"'{file_name}' chargé avec délimiteur ';' a résulté en {df_bank.shape[1]} colonne. Essai avec délimiteur ','.")
            # Second attempt: comma
            df_bank = pd.read_csv(file_name, sep=',')
            if df_bank.shape[1] > 1:
                print(f"'{file_name}' chargé avec succès en utilisant le délimiteur ','. Forme: {df_bank.shape}")
            else:
                print(f"'{file_name}' chargé avec le délimiteur ',' a également résulté en {df_bank.shape[1]} colonne. Inspection manuelle nécessaire.")
        else:
            print(f"'{file_name}' chargé avec succès en utilisant le délimiteur ';'. Forme: {df_bank.shape}")
        # Every outcome ends by showing the first rows of whatever was loaded
        display(df_bank.head())
    except Exception as e:
        print(f"Erreur lors du chargement de '{file_name}': {e}")

# If 'bank.csv' is the main file you want to use, you may need to re-run the
# dataset loading cell (section 2) or adapt the code to use `df_bank`
# instead of `df`.


Tentative de chargement du fichier 'bank.csv' pour vérifier le délimiteur...
'bank.csv' chargé avec succès en utilisant le délimiteur ';'. Forme: (4521, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
