In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score, log_loss,
    brier_score_loss, confusion_matrix
)

import joblib
import os


In [2]:
PROCESSED_PATH = "../data/processed/"

X_train = pd.read_csv(PROCESSED_PATH + "X_train.csv")
X_val   = pd.read_csv(PROCESSED_PATH + "X_val.csv")
y_train = pd.read_csv(PROCESSED_PATH + "y_train.csv").squeeze("columns")
y_val   = pd.read_csv(PROCESSED_PATH + "y_val.csv").squeeze("columns")

print("X_train:", X_train.shape, "X_val:", X_val.shape)
print("y_train:", y_train.shape, "y_val:", y_val.shape)


X_train: (24000, 24) X_val: (6000, 24)
y_train: (24000,) y_val: (6000,)


In [3]:
def ks_statistic(y_true, y_prob):
    # KS = max difference between CDFs of positives and negatives
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    order = np.argsort(y_prob)
    y_true_sorted = y_true[order]

    pos = (y_true_sorted == 1).astype(int)
    neg = (y_true_sorted == 0).astype(int)

    cdf_pos = np.cumsum(pos) / max(pos.sum(), 1)
    cdf_neg = np.cumsum(neg) / max(neg.sum(), 1)
    return float(np.max(np.abs(cdf_pos - cdf_neg)))

def expected_calibration_error(y_true, y_prob, n_bins=10):
    # Simple ECE
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        lo, hi = bins[i], bins[i + 1]
        mask = (y_prob >= lo) & (y_prob < hi) if i < n_bins - 1 else (y_prob >= lo) & (y_prob <= hi)
        if mask.sum() == 0:
            continue
        acc = y_true[mask].mean()
        conf = y_prob[mask].mean()
        ece += (mask.sum() / len(y_prob)) * abs(acc - conf)
    return float(ece)

def evaluate_model(name, model, X_tr, y_tr, X_va, y_va):
    model.fit(X_tr, y_tr)
    prob_va = model.predict_proba(X_va)[:, 1]

    metrics = {
        "model": name,
        "roc_auc": float(roc_auc_score(y_va, prob_va)),
        "pr_auc": float(average_precision_score(y_va, prob_va)),
        "log_loss": float(log_loss(y_va, prob_va)),
        "brier": float(brier_score_loss(y_va, prob_va)),
        "ks": ks_statistic(y_va, prob_va),
        "ece_10bin": expected_calibration_error(y_va, prob_va, n_bins=10),
    }
    return metrics, model


In [4]:
models = []

# 1) Logistic Regression (strong baseline)
models.append(("LogReg", LogisticRegression(max_iter=5000, class_weight="balanced", n_jobs=None)))

# 2) Random Forest (robust, non-linear)
models.append(("RandomForest", RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=2,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1
)))

# 3) Gradient boosting (sklearn) as reliable fallback
models.append(("HistGB", HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.07,
    max_iter=300,
    random_state=42
)))

# 4) Optional: XGBoost if installed
try:
    from xgboost import XGBClassifier
    models.append(("XGBoost", XGBClassifier(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1
    )))
except Exception as e:
    print("XGBoost not available, skipping.", type(e).__name__, "-", e)

# 5) Optional: LightGBM if installed
try:
    import lightgbm as lgb
    models.append(("LightGBM", lgb.LGBMClassifier(
        n_estimators=800,
        learning_rate=0.03,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        n_jobs=-1
    )))
except Exception as e:
    print("LightGBM not available, skipping.", type(e).__name__, "-", e)

print("Models to train:", [m[0] for m in models])


XGBoost not available, skipping. ModuleNotFoundError - No module named 'xgboost'
LightGBM not available, skipping. ModuleNotFoundError - No module named 'lightgbm'
Models to train: ['LogReg', 'RandomForest', 'HistGB']


In [5]:
results = []
trained_models = {}

for name, model in models:
    metrics, fitted = evaluate_model(name, model, X_train, y_train, X_val, y_val)
    results.append(metrics)
    trained_models[name] = fitted
    print(name, "=> ROC-AUC:", round(metrics["roc_auc"], 4), "| PR-AUC:", round(metrics["pr_auc"], 4), "| ECE:", round(metrics["ece_10bin"], 4))


LogReg => ROC-AUC: 0.7085 | PR-AUC: 0.4901 | ECE: 0.2338
RandomForest => ROC-AUC: 0.7698 | PR-AUC: 0.5526 | ECE: 0.0418
HistGB => ROC-AUC: 0.7784 | PR-AUC: 0.5565 | ECE: 0.0075


In [6]:
leaderboard = pd.DataFrame(results).sort_values(
    by=["roc_auc", "log_loss", "ece_10bin"],
    ascending=[False, True, True]
).reset_index(drop=True)

leaderboard


Unnamed: 0,model,roc_auc,pr_auc,log_loss,brier,ks,ece_10bin
0,HistGB,0.778389,0.5565,0.430233,0.135259,0.427061,0.007453
1,RandomForest,0.769837,0.552647,0.440167,0.13884,0.41413,0.041784
2,LogReg,0.708464,0.490074,0.607468,0.20884,0.358291,0.233846


In [7]:
winner_name = leaderboard.loc[0, "model"]
winner_model = trained_models[winner_name]
print("Winner:", winner_name)


Winner: HistGB


In [8]:
os.makedirs("../artifacts/models", exist_ok=True)
os.makedirs("../artifacts/reports", exist_ok=True)
os.makedirs("../artifacts/metrics", exist_ok=True)

leaderboard.to_csv("../artifacts/reports/leaderboard.csv", index=False)
joblib.dump(winner_model, f"../artifacts/models/best_model_{winner_name}.joblib")

print("Saved leaderboard.csv and best model joblib")


Saved leaderboard.csv and best model joblib


In [9]:
from sklearn.calibration import CalibratedClassifierCV

calibrator = CalibratedClassifierCV(winner_model, method="isotonic", cv=3)
calibrator.fit(X_train, y_train)

prob_cal = calibrator.predict_proba(X_val)[:, 1]

cal_metrics = {
    "model": f"{winner_name}+Calibrated(Isotonic)",
    "roc_auc": float(roc_auc_score(y_val, prob_cal)),
    "pr_auc": float(average_precision_score(y_val, prob_cal)),
    "log_loss": float(log_loss(y_val, prob_cal)),
    "brier": float(brier_score_loss(y_val, prob_cal)),
    "ks": ks_statistic(y_val, prob_cal),
    "ece_10bin": expected_calibration_error(y_val, prob_cal, n_bins=10),
}

cal_metrics


{'model': 'HistGB+Calibrated(Isotonic)',
 'roc_auc': 0.7779336021148604,
 'pr_auc': 0.555210255695134,
 'log_loss': 0.43035102264405684,
 'brier': 0.13523625917416685,
 'ks': 0.42346991350365126,
 'ece_10bin': 0.007188521587524174}

In [10]:
joblib.dump(calibrator, "../artifacts/models/calibrator.joblib")
pd.DataFrame([cal_metrics]).to_csv("../artifacts/metrics/calibrated_winner_metrics.csv", index=False)

print("Saved calibrator.joblib and calibrated_winner_metrics.csv")


Saved calibrator.joblib and calibrated_winner_metrics.csv


In [11]:
import json, os
from datetime import datetime

os.makedirs("../artifacts/metrics", exist_ok=True)

run_summary = {
    "timestamp_utc": datetime.utcnow().isoformat() + "Z",
    "dataset": "UCI Default of Credit Card Clients",
    "models_trained": leaderboard["model"].tolist(),
    "leaderboard": leaderboard.to_dict(orient="records"),
    "winner": winner_name,
}

# add calibrated metrics if you computed them
try:
    run_summary["calibrated_winner_metrics"] = cal_metrics
except NameError:
    run_summary["calibrated_winner_metrics"] = None

with open("../artifacts/metrics/run_summary.json", "w") as f:
    json.dump(run_summary, f, indent=2)

print("Saved../artifacts/metrics/run_summary.json")


Saved../artifacts/metrics/run_summary.json


  "timestamp_utc": datetime.utcnow().isoformat() + "Z",


In [12]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss, brier_score_loss
import json, os
from datetime import datetime


In [13]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

def summarize_search(search, model_name):
    best = search.best_estimator_
    best_params = search.best_params_
    best_cv = float(search.best_score_)
    return {
        "model": model_name,
        "best_cv_roc_auc": best_cv,
        "best_params": best_params
    }, best


In [14]:
search_spaces = []

# 1) Logistic Regression
logreg = LogisticRegression(max_iter=8000, class_weight="balanced")
logreg_space = {
    "C": np.logspace(-3, 2, 40),
    "penalty": ["l2"],
    "solver": ["lbfgs"]
}
search_spaces.append(("LogReg_Tuned", logreg, logreg_space, 30))

# 2) Random Forest
rf = RandomForestClassifier(class_weight="balanced_subsample", random_state=42, n_jobs=-1)
rf_space = {
    "n_estimators": [200, 400, 600, 800],
    "max_depth": [None, 4, 6, 8, 12, 16],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["sqrt", "log2", None],
}
search_spaces.append(("RandomForest_Tuned", rf, rf_space, 25))

# 3) HistGradientBoosting
hgb = HistGradientBoostingClassifier(random_state=42)
hgb_space = {
    "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1],
    "max_depth": [3, 4, 5, 6, 8],
    "max_iter": [200, 300, 500, 800],
    "min_samples_leaf": [20, 50, 100]
}
search_spaces.append(("HistGB_Tuned", hgb, hgb_space, 25))

print([s[0] for s in search_spaces])


['LogReg_Tuned', 'RandomForest_Tuned', 'HistGB_Tuned']


In [15]:
tuning_results = []
tuned_models = {}

for name, estimator, space, n_iter in search_spaces:
    print("\n=== Tuning:", name, "===")

    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=space,
        n_iter=n_iter,
        scoring="roc_auc",
        cv=cv,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    search.fit(X_train, y_train)
    summary, best_model = summarize_search(search, name)

    # Evaluate best model on validation set
    prob_val = best_model.predict_proba(X_val)[:, 1]
    summary.update({
        "val_roc_auc": float(roc_auc_score(y_val, prob_val)),
        "val_pr_auc": float(average_precision_score(y_val, prob_val)),
        "val_log_loss": float(log_loss(y_val, prob_val)),
        "val_brier": float(brier_score_loss(y_val, prob_val)),
    })

    tuning_results.append(summary)
    tuned_models[name] = best_model

    print("Best CV ROC-AUC:", round(summary["best_cv_roc_auc"], 4))
    print("Val ROC-AUC:", round(summary["val_roc_auc"], 4), "| Val PR-AUC:", round(summary["val_pr_auc"], 4))
    print("Best params:", summary["best_params"])



=== Tuning: LogReg_Tuned ===
Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best CV ROC-AUC: 0.7262
Val ROC-AUC: 0.7085 | Val PR-AUC: 0.4901
Best params: {'solver': 'lbfgs', 'penalty': 'l2', 'C': np.float64(0.2728333376486767)}

=== Tuning: RandomForest_Tuned ===
Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best CV ROC-AUC: 0.7838
Val ROC-AUC: 0.777 | Val PR-AUC: 0.5544
Best params: {'n_estimators': 600, 'min_samples_leaf': 8, 'max_features': 'log2', 'max_depth': 12}

=== Tuning: HistGB_Tuned ===
Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best CV ROC-AUC: 0.7827
Val ROC-AUC: 0.7809 | Val PR-AUC: 0.5547
Best params: {'min_samples_leaf': 50, 'max_iter': 200, 'max_depth': 8, 'learning_rate': 0.03}


In [16]:
import pandas as pd
import joblib
import os
from datetime import datetime

os.makedirs("../artifacts/reports", exist_ok=True)
os.makedirs("../artifacts/models", exist_ok=True)
os.makedirs("../artifacts/metrics", exist_ok=True)

tuning_df = pd.DataFrame(tuning_results).sort_values(
    by=["val_roc_auc", "val_log_loss"],
    ascending=[False, True]
).reset_index(drop=True)

tuning_df.to_csv("../artifacts/reports/tuning_leaderboard.csv", index=False)

run = {
    "timestamp": datetime.now().isoformat(),
    "tuning_results": tuning_results
}

with open("../artifacts/metrics/tuning_run.json", "w") as f:
    json.dump(run, f, indent=2)

best_tuned_name = tuning_df.loc[0, "model"]
best_tuned_model = tuned_models[best_tuned_name]

joblib.dump(best_tuned_model, f"../artifacts/models/best_tuned_model_{best_tuned_name}.joblib")

print("Saved tuning_leaderboard.csv, tuning_run.json, and best tuned model:", best_tuned_name)
tuning_df.head()


Saved tuning_leaderboard.csv, tuning_run.json, and best tuned model: HistGB_Tuned


Unnamed: 0,model,best_cv_roc_auc,best_params,val_roc_auc,val_pr_auc,val_log_loss,val_brier
0,HistGB_Tuned,0.782726,"{'min_samples_leaf': 50, 'max_iter': 200, 'max...",0.780908,0.554696,0.428975,0.134902
1,RandomForest_Tuned,0.783761,"{'n_estimators': 600, 'min_samples_leaf': 8, '...",0.777017,0.554428,0.513692,0.166692
2,LogReg_Tuned,0.726165,"{'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.27...",0.708451,0.490059,0.607487,0.208848


In [17]:
import pandas as pd
import numpy as np
import joblib

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss, brier_score_loss

# Load processed data
X_train = pd.read_csv("../data/processed/X_train.csv")
X_val   = pd.read_csv("../data/processed/X_val.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze("columns")
y_val   = pd.read_csv("../data/processed/y_val.csv").squeeze("columns")

# Load tuned best model
best_tuned = joblib.load("../artifacts/models/best_tuned_model_HistGB_Tuned.joblib")

# Calibrate
calibrator_tuned = CalibratedClassifierCV(best_tuned, method="isotonic", cv=3)
calibrator_tuned.fit(X_train, y_train)

# Validate calibrated PDs
pd_val_tuned = calibrator_tuned.predict_proba(X_val)[:, 1]

print("Calibrated tuned model metrics:")
print("ROC-AUC:", roc_auc_score(y_val, pd_val_tuned))
print("PR-AUC:", average_precision_score(y_val, pd_val_tuned))
print("LogLoss:", log_loss(y_val, pd_val_tuned))
print("Brier:", brier_score_loss(y_val, pd_val_tuned))

# Save calibrator
joblib.dump(calibrator_tuned, "../artifacts/models/calibrator_tuned.joblib")
print("Saved ../artifacts/models/calibrator_tuned.joblib")


Calibrated tuned model metrics:
ROC-AUC: 0.7792445853305017
PR-AUC: 0.5549157543663917
LogLoss: 0.43016152223295673
Brier: 0.13520135189985455
Saved ../artifacts/models/calibrator_tuned.joblib
