In [1]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    log_loss,
    brier_score_loss
)
from sklearn.calibration import CalibratedClassifierCV


In [2]:
PROCESSED_PATH = "../data/processed/"

# File names of the pre-split feature matrices and target vectors,
# listed in train / val / test order so they unpack positionally below.
feature_files = ("X_train.csv", "X_val.csv", "X_test.csv")
target_files = ("y_train.csv", "y_val.csv", "y_test.csv")

# Features are kept as DataFrames.
X_train, X_val, X_test = [
    pd.read_csv(PROCESSED_PATH + fname) for fname in feature_files
]

# Targets are squeezed along the column axis (a one-column frame becomes a Series).
y_train, y_val, y_test = [
    pd.read_csv(PROCESSED_PATH + fname).squeeze("columns") for fname in target_files
]

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)


Train: (18000, 24) Val: (6000, 24) Test: (6000, 24)


In [3]:
def evaluate(model, X, y):
    """Score a fitted probabilistic classifier on one data split.

    Takes column 1 of ``model.predict_proba(X)`` as the score and passes it,
    together with ``y``, to each of the four metric functions.

    Parameters
    ----------
    model : object exposing ``predict_proba(X)`` whose result supports ``[:, 1]``.
    X : feature data handed to ``predict_proba`` unchanged.
    y : true labels handed to every metric unchanged.

    Returns
    -------
    dict
        Keys ``"roc_auc"``, ``"pr_auc"``, ``"log_loss"`` and ``"brier"``
        (in that insertion order), each mapped to the metric's return value.
    """
    positive_scores = model.predict_proba(X)[:, 1]

    # (result key, metric function) pairs; order fixes the dict's key order.
    scorers = (
        ("roc_auc", roc_auc_score),
        ("pr_auc", average_precision_score),
        ("log_loss", log_loss),
        ("brier", brier_score_loss),
    )
    return {label: scorer(y, positive_scores) for label, scorer in scorers}


In [4]:
# Candidate classifiers, keyed by the label used in the leaderboard.
models = {
    "LogReg": LogisticRegression(max_iter=5000, class_weight="balanced"),
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        min_samples_leaf=2,
        class_weight="balanced_subsample",
        random_state=42,
        n_jobs=-1,
    ),
    "HistGB": HistGradientBoostingClassifier(
        max_depth=6, learning_rate=0.07, max_iter=300, random_state=42
    ),
}

# results: one metrics record per model; trained: label -> fitted estimator.
results, trained = [], {}

for name, model in models.items():
    # fit() returns the estimator itself, so the fitted object is stored directly.
    trained[name] = model.fit(X_train, y_train)

    # Validation metrics first, then the model label, preserving column order.
    metrics = {**evaluate(model, X_val, y_val), "model": name}
    results.append(metrics)

    print(name, "VAL ROC-AUC:", round(metrics["roc_auc"], 4))


LogReg VAL ROC-AUC: 0.704
RandomForest VAL ROC-AUC: 0.7682
HistGB VAL ROC-AUC: 0.772


In [5]:
# Rank the candidates by validation ROC-AUC, best first, with a fresh 0..n-1 index.
leaderboard = (
    pd.DataFrame(results)
    .sort_values(by="roc_auc", ascending=False)
    .reset_index(drop=True)
)

# After the descending sort, row 0 holds the top-ranked model.
winner_name = leaderboard.loc[0, "model"]
winner_model = trained[winner_name]
print("Winner:", winner_name)

# Only the LAST expression of a cell is rendered. The table used to sit in the
# middle of the cell and was therefore never shown; keep it as the final line.
leaderboard


Winner: HistGB


In [6]:
# Score the selected (still uncalibrated) model on the TEST split;
# the metrics dict is rendered as the cell's last expression.
print("TEST metrics (uncalibrated):")
test_metrics = evaluate(winner_model, X_test, y_test)
test_metrics


TEST metrics (uncalibrated):


{'roc_auc': 0.7863257008345816,
 'pr_auc': 0.5688285349753347,
 'log_loss': 0.42487507262869895,
 'brier': 0.1330326739020595}

In [7]:
# The winner was already fitted on (X_train, y_train) in the training loop, so it
# is NOT refitted here: a second fit on the same data only costs time and mutates
# the very object whose uncalibrated TEST metrics were just reported.

# Calibrate the already-fitted winner on VAL using isotonic regression.
# NOTE(review): cv="prefit" is deprecated in newer scikit-learn releases in favour
# of wrapping the estimator in FrozenEstimator — confirm the pinned version before
# upgrading.
calibrator = CalibratedClassifierCV(
    winner_model,
    method="isotonic",
    cv="prefit"
)
calibrator.fit(X_val, y_val)

# Calibrated positive-class probabilities on TEST (kept for later inspection).
pd_test = calibrator.predict_proba(X_test)[:, 1]

# Reuse the notebook's evaluate() helper so calibrated and uncalibrated metrics
# are computed by exactly the same code path.
calibrated_test_metrics = evaluate(calibrator, X_test, y_test)

print("TEST metrics (calibrated):")
print("ROC-AUC:", calibrated_test_metrics["roc_auc"])
print("PR-AUC:", calibrated_test_metrics["pr_auc"])
print("LogLoss:", calibrated_test_metrics["log_loss"])
print("Brier:", calibrated_test_metrics["brier"])


TEST metrics (calibrated):
ROC-AUC: 0.7846422819541979
PR-AUC: 0.5504181514096691
LogLoss: 0.426253592982231
Brier: 0.13340796495975535




In [8]:
# Make sure both output directories exist (no error if they already do).
for out_dir in ("../artifacts/models", "../artifacts/reports"):
    os.makedirs(out_dir, exist_ok=True)

# (object, destination) pairs: the selected model, then the calibrator fitted on it.
artifacts = (
    (winner_model, f"../artifacts/models/best_model_{winner_name}.joblib"),
    (calibrator, "../artifacts/models/calibrator_final.joblib"),
)
for artifact, destination in artifacts:
    joblib.dump(artifact, destination)

# Validation leaderboard as CSV, without the positional index column.
leaderboard.to_csv("../artifacts/reports/leaderboard.csv", index=False)

print("Saved model, calibrator, and leaderboard.")


Saved model, calibrator, and leaderboard.
