#### Imports

In [1]:
import joblib
import json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, precision_recall_curve, auc

#### Ensure folders exist

In [2]:
# Output locations, relative to the notebook's working directory:
# preprocessed inputs are read from ARTIFACTS_DIR, trained models and their
# metrics are written to MODELS_DIR.
ARTIFACTS_DIR, MODELS_DIR = Path("artifacts"), Path("models")

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
ARTIFACTS_DIR.mkdir(exist_ok=True)
MODELS_DIR.mkdir(exist_ok=True)

#### Load preprocessed data

In [3]:
# Load the train/validation/test splits saved under ARTIFACTS_DIR.
# NOTE(review): joblib.load is pickle-based, so only load files you trust.
data = joblib.load(ARTIFACTS_DIR / "data_splits.pkl")

# Training split: used to fit the models.
X_train = data["X_train"]
y_train = data["y_train"]

# Validation split: used for the metrics reported in this notebook.
X_val = data["X_val"]
y_val = data["y_val"]

# Test split: loaded here but not used by the cells shown in this notebook.
X_test = data["X_test"]
y_test = data["y_test"]


In [4]:
# Container intended to collect the metrics of the tree-based models.
# NOTE(review): no cell visible here writes to it; the per-model metrics
# live in `metrics_rf` / `metrics_xgb` instead.
metrics_tree = dict()

In [5]:
# -----------------------------
# Helper: area under the precision-recall curve
# -----------------------------
def compute_pr_auc(y_true, y_scores):
    """Return the area under the precision-recall curve.

    The curve is built with ``precision_recall_curve`` and integrated with
    ``auc`` (trapezoidal rule), using recall as the x-axis and precision as
    the y-axis.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded to ``precision_recall_curve``.
    y_scores : array-like
        Scores or probabilities for the positive class, forwarded to
        ``precision_recall_curve``.

    Returns
    -------
    float
        Area under the precision-recall curve.
    """
    # The third return value (decision thresholds) is not needed for the area.
    precisions, recalls, _thresholds = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(recalls, precisions)
    return pr_auc

## Random Forest

In [6]:
# Fit a 200-tree random forest on the training split.
# class_weight='balanced' reweights classes to counter label imbalance;
# random_state pins the result, n_jobs=-1 uses all available cores.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Validation outputs: column 1 of predict_proba feeds the ranking metrics
# (presumably the positive class; confirm against rf.classes_), and the hard
# labels feed the threshold-based metrics.
y_val_proba_rf = rf.predict_proba(X_val)[:, 1]
y_val_pred_rf = rf.predict(X_val)

# Ranking metrics come first, then label-based ones, so the key order stays
# roc_auc, pr_auc, f1, precision, recall.
metrics_rf = {
    "roc_auc": roc_auc_score(y_val, y_val_proba_rf),
    "pr_auc": compute_pr_auc(y_val, y_val_proba_rf),
}
metrics_rf.update(
    (metric_name, scorer(y_val, y_val_pred_rf))
    for metric_name, scorer in (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
)

# Persist the fitted model and its validation metrics side by side.
joblib.dump(rf, MODELS_DIR / "random_forest.pkl")
joblib.dump(metrics_rf, MODELS_DIR / "metrics_rf.pkl")
print("Random Forest metrics:", metrics_rf)

Random Forest metrics: {'roc_auc': 0.9364573317424665, 'pr_auc': 0.6461686000960438, 'f1': 0.6067864271457086, 'precision': 0.6238030095759234, 'recall': 0.5906735751295337}


## XGBoost

In [7]:
# Ratio of negative to positive training labels, passed to XGBoost as
# scale_pos_weight to handle class imbalance (assumes labels are 0/1).
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter and only emits a warning for it.
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    scale_pos_weight=neg_pos_ratio,  # handle imbalance
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train, y_train)

# Validation outputs: hard labels feed the threshold-based metrics, and
# column 1 of predict_proba feeds the ranking metrics (presumably the
# positive class; confirm against xgb.classes_).
y_val_pred_xgb = xgb.predict(X_val)
y_val_proba_xgb = xgb.predict_proba(X_val)[:, 1]

# Same metric set and key order as the Random Forest cell, so the two
# models can be compared directly.
metrics_xgb = {
    "roc_auc": roc_auc_score(y_val, y_val_proba_xgb),
    "pr_auc": compute_pr_auc(y_val, y_val_proba_xgb),
    "f1": f1_score(y_val, y_val_pred_xgb),
    "precision": precision_score(y_val, y_val_pred_xgb),
    "recall": recall_score(y_val, y_val_pred_xgb)
}

# Persist the fitted model and its validation metrics side by side.
joblib.dump(xgb, MODELS_DIR / "xgboost.pkl")
joblib.dump(metrics_xgb, MODELS_DIR / "metrics_xgb.pkl")
print("XGBoost metrics:", metrics_xgb)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost metrics: {'roc_auc': 0.9374408777536856, 'pr_auc': 0.6344953842659459, 'f1': 0.5834482758620689, 'precision': 0.6238938053097345, 'recall': 0.5479274611398963}


In [8]:
# Final status message: earlier cells dumped both fitted models and their
# validation metrics into MODELS_DIR ("models").
print("✅ Tree models trained and metrics saved to 'models/' folder.")

✅ Tree models trained and metrics saved to 'models/' folder.
