In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
import joblib
import os


import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# --- Filesystem configuration (paths are relative to the notebook's cwd) ---

# Directory that receives the serialized model, and the file written into it.
MODEL_DIR = "../models"
MODEL_FILE = os.path.join(MODEL_DIR, "model_xgboost.bin")

# CSV read by the data-loading cell.
DATA_PATH = "../data/processed/credit_data_train_processed.csv"

In [3]:
# Name of the label column: it is pulled out of the loaded frame as `y` and
# dropped from the feature matrix `X`.
# NOTE(review): presumably a binary 0/1 flag (the notebook scores it with
# ROC-AUC on the positive-class probability) — confirm against the dataset.
TARGET = "SeriousDlqin2yrs"

In [4]:
# Read the processed training table, then separate it into the feature matrix
# (every column except TARGET) and the label vector (the TARGET column).
# `drop` returns a new frame, so `df` itself keeps all columns.
df = pd.read_csv(DATA_PATH)

X, y = df.drop(columns=[TARGET]), df[TARGET]

In [5]:
# Hold out 20% of the rows for validation. The split is stratified on the
# label so both parts keep the same class proportions, and seeded so that a
# re-run produces the same partition.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaling statistics are learned from the
# training split only and then applied to both splits, so nothing from the
# validation rows leaks into the fit.
# NOTE(review): in the cells visible in this notebook the scaled arrays are
# never consumed — the grid search fits and predicts on the unscaled
# X_train / X_val. Confirm whether this cell is still needed.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [7]:
# Tune an XGBoost classifier with an exhaustive grid search (3-fold CV on the
# training split, ranked by ROC-AUC), then report the refit best model's
# ROC-AUC on the held-out validation split.

# Base model; every grid candidate is a clone of this estimator.
# NOTE(review): `use_label_encoder` is kept exactly as in the original call;
# it is a deprecated XGBoost argument and may be ignored by the installed
# version — confirm and drop it if so.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Hyperparameter grid: 2 values for each of 5 parameters -> 32 candidates.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# ROC-AUC scoring.
# Use scikit-learn's built-in "roc_auc" scorer name rather than
# make_scorer(roc_auc_score, needs_proba=True): the `needs_proba` argument was
# deprecated in scikit-learn 1.4 and subsequently removed, so the old call
# raises on current releases. The built-in scorer computes the same metric
# from the estimator's predicted scores for the positive class.
roc_auc = "roc_auc"

# Grid search: 3-fold cross-validation, all available cores.
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=roc_auc,
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Fit grid search on the (unscaled) training features.
grid.fit(X_train, y_train)

# Best model: the estimator refit on the full training split with the
# best-scoring parameter combination.
best_xgb = grid.best_estimator_

# Predict on validation: column 1 of predict_proba is the probability of the
# positive class, which is what roc_auc_score expects as the score.
y_pred_xgb = best_xgb.predict_proba(X_val)[:,1]
auc_xgb = roc_auc_score(y_val, y_pred_xgb)

print(f"Tuned XGBoost ROC-AUC: {auc_xgb:.4f}")
print(f"Best parameters: {grid.best_params_}")


Fitting 3 folds for each of 32 candidates, totalling 96 fits


Tuned XGBoost ROC-AUC: 0.8662
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}


In [8]:
# Persist the tuned estimator to disk, creating the output directory first
# (no error if it already exists).
# NOTE(review): despite the ".bin" extension this file is a joblib pickle,
# not XGBoost's native model format — it has to be read back with
# joblib.load, and like any pickle should only be loaded from trusted sources.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(value=best_xgb, filename=MODEL_FILE)
print(f"Model saved to {MODEL_FILE}")

Model saved to ../models/model_xgboost.bin
