In [None]:
"""
Universal XGBoost training script
- Reads any CSV dataset
- Uses all columns except [Participant ID, status, Region] as features
- England only for 5-fold CV
- Best fold chosen based on AUC
- Best fold's validation set = Internal Validation
- Scotland + Wales = External Validation
- Saves best model
"""

import pandas as pd
import numpy as np
import time
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier

In [None]:
def _roc_with_auc(model, X, y):
    """Score ``X`` with ``model`` and return ``(fpr, tpr, auc)`` against labels ``y``.

    The second column of ``predict_proba`` is used as the positive-class score.
    """
    pos_prob = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, pos_prob)
    return fpr, tpr, auc(fpr, tpr)


def _plot_roc_curves(curves):
    """Draw one ROC line per ``(label, fpr, tpr, auc)`` tuple plus the chance diagonal."""
    plt.figure(figsize=(10, 8))
    for label, fpr, tpr, auc_value in curves:
        plt.plot(fpr, tpr, label=f"{label} (AUC={auc_value:.3f})", lw=2)
    plt.plot([0, 1], [0, 1], "k--")

    plt.xlabel("False Positive Rate", fontsize=16)
    plt.ylabel("True Positive Rate", fontsize=16)
    plt.title("ROC Curve", fontsize=20)
    plt.legend(fontsize=14)
    plt.tight_layout()
    plt.show()


def train_model(
        data_path: str,
        label_col: str = "status",
        id_col: str = "Participant ID",
        output_model_name: str = "best_model.joblib",
        region_col: str = "Region",
        n_splits: int = 5,
        random_state: int = 42,
) -> dict:
    """Train XGBoost with K-fold CV on England rows and validate on Scotland + Wales.

    Workflow:
      1. Read ``data_path`` as CSV; every column other than ``label_col``,
         ``id_col`` and ``region_col`` is used as a feature.
      2. Run shuffled K-fold CV on rows whose region is ``"England"``, fitting
         one ``XGBClassifier`` per fold with ``scale_pos_weight`` set to the
         negative/positive ratio of that fold's training labels.
      3. Keep the fold model with the highest validation AUC and save it with
         ``joblib``.
      4. Report ROC/AUC on that fold's validation rows ("internal") and on rows
         whose region is ``"Scotland"`` or ``"Wales"`` ("external"), and plot both.

    Args:
        data_path: Path of the CSV file to read.
        label_col: Name of the binary target column; labels are compared
            against ``0`` (negative) and ``1`` (positive).
        id_col: Identifier column excluded from the features. It is not
            required to exist in the file.
        output_model_name: File the selected model is written to.
        region_col: Column holding the region name used for the split.
        n_splits: Number of CV folds.
        random_state: Seed for the K-fold shuffle only (not passed to XGBoost).

    Returns:
        dict with keys ``"best_fold"`` (1-based), ``"internal_auc"``,
        ``"external_auc"`` and ``"model_path"``.

    Raises:
        KeyError: ``label_col`` or ``region_col`` is not a column of the CSV.
        ValueError: there are no England rows, no Scotland/Wales rows, or a
            training fold lacks one of the two classes.

    Note:
        The internal AUC is measured on the same validation fold that was used
        to pick the best model, so it is optimistically biased; the external
        AUC is the unbiased estimate. The splitter is not stratified, so with a
        rare outcome a fold can end up with very few (or no) positives.
    """
    print("\n===============================")
    print("  UNIVERSAL MODEL TRAINING")
    print("===============================\n")
    # ------------------------------------------------------------
    # Load data
    # ------------------------------------------------------------
    df = pd.read_csv(data_path)

    # Fail with one clear message instead of a bare KeyError deep in the split.
    missing = [c for c in (label_col, region_col) if c not in df.columns]
    if missing:
        raise KeyError(f"Required column(s) missing from {data_path!r}: {missing}")

    # Detect feature columns automatically
    ignore_cols = {label_col, id_col, region_col}
    feature_cols = [c for c in df.columns if c not in ignore_cols]

    # Region-based splitting (rows from any other region are used nowhere)
    eng_mask = df[region_col] == "England"
    ext_mask = df[region_col].isin(["Scotland", "Wales"])

    X_eng, y_eng = df.loc[eng_mask, feature_cols], df.loc[eng_mask, label_col]
    X_ext, y_ext = df.loc[ext_mask, feature_cols], df.loc[ext_mask, label_col]

    print(f"Total samples: {len(df)}")
    print(f"England samples: {len(X_eng)}")
    print(f"External samples: {len(X_ext)}")
    print(f"Features: {len(feature_cols)}\n")

    # Check both cohorts before spending time on training: an empty external
    # set would otherwise only fail after every fold has been fitted.
    if X_eng.empty:
        raise ValueError(f"No rows with {region_col} == 'England'; nothing to train on.")
    if X_ext.empty:
        raise ValueError(
            f"No rows with {region_col} in ['Scotland', 'Wales']; "
            "external validation is impossible."
        )

    # ------------------------------------------------------------
    # K-fold CV on England data
    # ------------------------------------------------------------
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_models = []
    fold_auc = []
    fold_indices = []

    # 'alpha' / 'lambda' are XGBoost's native names for the L1 / L2 penalties.
    params = {
        'alpha': 1.5,
        'lambda': 1.5,
        'colsample_bytree': 0.75,
        'n_estimators': 400,
        'learning_rate': 0.01,
        'max_depth': 3,
        'subsample': 0.7,
        'min_child_weight': 3,
        'gamma': 0.9,
        'eval_metric': 'auc'
    }

    print(f"=========== START {n_splits}-FOLD CV ===========")

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_eng), 1):
        print(f"\n===== Fold {fold} =====")

        X_tr, X_val = X_eng.iloc[train_idx], X_eng.iloc[val_idx]
        y_tr, y_val = y_eng.iloc[train_idx], y_eng.iloc[val_idx]

        # Handle class imbalance: weight positives by the neg/pos ratio.
        n_pos = int((y_tr == 1).sum())
        n_neg = int((y_tr == 0).sum())
        if n_pos == 0 or n_neg == 0:
            # Without this guard the ratio is a division by zero (inf) or 0.
            raise ValueError(
                f"Fold {fold}: training labels in '{label_col}' must contain both "
                f"0 and 1 (found {n_neg} negatives, {n_pos} positives)."
            )
        params_fold = {**params, "scale_pos_weight": n_neg / n_pos}

        model = XGBClassifier(**params_fold)

        start = time.perf_counter()
        print("Training...")
        # No early stopping is configured, so eval_set does not shorten
        # training here; all n_estimators rounds are fitted.
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
        print(f"Training done: {time.perf_counter() - start:.2f}s")

        y_val_prob = model.predict_proba(X_val)[:, 1]
        auc_val = roc_auc_score(y_val, y_val_prob)

        print(f"Fold {fold} Val AUC = {auc_val:.4f}")

        fold_models.append(model)
        fold_auc.append(auc_val)
        fold_indices.append((train_idx, val_idx))

    # ------------------------------------------------------------
    # Select best fold
    # ------------------------------------------------------------
    best_fold = int(np.argmax(fold_auc))
    best_model = fold_models[best_fold]
    _, best_val_idx = fold_indices[best_fold]

    print("\n===============================")
    print(f"Best fold = {best_fold + 1}")
    print(f"Best AUC  = {fold_auc[best_fold]:.4f}")
    print("===============================\n")

    # ------------------------------------------------------------
    # Save model
    # ------------------------------------------------------------
    # Persist before evaluating so a failure in scoring or plotting cannot
    # throw away the trained model.
    joblib.dump(best_model, output_model_name)
    print(f"Best model saved → {output_model_name}\n")

    # ------------------------------------------------------------
    # Internal validation (best fold val set)
    # ------------------------------------------------------------
    fpr_int, tpr_int, auc_int = _roc_with_auc(
        best_model, X_eng.iloc[best_val_idx], y_eng.iloc[best_val_idx]
    )

    # ------------------------------------------------------------
    # External validation (Scotland + Wales)
    # ------------------------------------------------------------
    fpr_ext, tpr_ext, auc_ext = _roc_with_auc(best_model, X_ext, y_ext)

    # ------------------------------------------------------------
    # Plot ROC
    # ------------------------------------------------------------
    _plot_roc_curves([
        ("Internal", fpr_int, tpr_int, auc_int),
        ("External", fpr_ext, tpr_ext, auc_ext),
    ])

    print(f"Internal Validation AUC = {auc_int:.4f}")
    print(f"External Validation AUC = {auc_ext:.4f}")

    return {
        "best_fold": best_fold + 1,
        "internal_auc": auc_int,
        "external_auc": auc_ext,
        "model_path": output_model_name
    }

In [None]:
# ---------- Demo invocation ----------
# Runs when this code is executed directly (script or notebook cell), not when
# it is imported as a module. "your_data.csv" is a placeholder path.
if __name__ == "__main__":
    result = train_model("your_data.csv", output_model_name="best_xgboost_model.joblib")
    print(result)