In [1]:
import pandas as pd


# Read the dataset from disk (path is relative to the notebook's working directory).
df = pd.read_csv("../data/creditcard.csv")

# "Class" is the target; every remaining column is used as a feature.
y = df["Class"]
X = df.drop(columns="Class")

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [6]:
import xgboost as xgb
from xgboost import XGBClassifier

# Count the two classes in the training labels.
is_positive = y_train == 1
is_negative = y_train == 0
pos = is_positive.sum()
neg = is_negative.sum()

# Ratio of negatives to positives; used below as XGBoost's scale_pos_weight.
scale_pos_weight = neg / pos
print("neg:", neg, "pos:", pos, "scale_pos_weight:", scale_pos_weight)


neg: 227451 pos: 394 scale_pos_weight: 577.2868020304569


In [8]:
# Hyper-parameters for the boosted-tree classifier, gathered in one place.
xgb_params = dict(
    objective="binary:logistic",        # probabilistic binary classification
    eval_metric="logloss",              # metric tracked on the eval sets
    n_estimators=300,                   # number of boosting trees
    learning_rate=0.05,                 # shrinkage applied to each tree
    max_depth=4,                        # depth limit per tree
    subsample=0.8,                      # row subsampling ratio
    colsample_bytree=0.8,               # column subsampling ratio
    scale_pos_weight=scale_pos_weight,  # positive-class weight to offset imbalance
    n_jobs=-1,                          # use all CPU cores
    random_state=42,                    # fixed seed for reproducibility
)
xgb_clf = XGBClassifier(**xgb_params)

# Both splits are passed as eval sets so the metric is tracked on train and
# validation data; verbose=False silences the per-round training log.
eval_sets = [(X_train, y_train), (X_valid, y_valid)]
xgb_clf.fit(X_train, y_train, eval_set=eval_sets, verbose=False)



In [9]:
from sklearn.metrics import (
    roc_auc_score, 
    average_precision_score, 
    precision_score, 
    recall_score, 
    f1_score,
    confusion_matrix
)

# Column 1 of predict_proba is taken as the score for class 1.
y_valid_proba_xgb = xgb_clf.predict_proba(X_valid)[:, 1]
# Hard 0/1 labels come from a 0.5 cut-off on that score.
y_valid_pred_xgb = (y_valid_proba_xgb >= 0.5).astype(int)

# Threshold-free metrics are computed from the scores ...
auc_xgb = roc_auc_score(y_valid, y_valid_proba_xgb)
pra_xgb = average_precision_score(y_valid, y_valid_proba_xgb)

# ... while these depend on the thresholded labels.
cm_xgb = confusion_matrix(y_valid, y_valid_pred_xgb)
precision_xgb = precision_score(y_valid, y_valid_pred_xgb)
recall_xgb = recall_score(y_valid, y_valid_pred_xgb)
f1_xgb = f1_score(y_valid, y_valid_pred_xgb)

print("XGBoost Metrics:")
for label, value in (
    ("AUC:", auc_xgb),
    ("PR-AUC:", pra_xgb),
    ("Precision:", precision_xgb),
    ("Recall:", recall_xgb),
    ("F1:", f1_xgb),
):
    print(label, value)
print("Confusion Matrix:\n", cm_xgb)


XGBoost Metrics:
AUC: 0.9859501869121312
PR-AUC: 0.8564979811327577
Precision: 0.5576923076923077
Recall: 0.8877551020408163
F1: 0.6850393700787402
Confusion Matrix:
 [[56795    69]
 [   11    87]]


In [10]:
import os

# Make sure the output folder exists before writing into it.
os.makedirs("../results", exist_ok=True)

# One-row summary table; dict insertion order fixes the column order.
xgb_results = pd.DataFrame(
    {
        'Model': ['XGBoost'],
        'AUC': [auc_xgb],
        'PR-AUC': [pra_xgb],
        'Recall': [recall_xgb],
        'Precision': [precision_xgb],
        'F1': [f1_xgb],
    }
)

xgb_results.to_csv("../results/xgb_results.csv", index=False)
xgb_results


Unnamed: 0,Model,AUC,PR-AUC,Recall,Precision,F1
0,XGBoost,0.98595,0.856498,0.887755,0.557692,0.685039


In [11]:
import joblib
# Persist the fitted classifier with joblib, creating the target folder first.
model_dir = "../models"
model_path = "../models/xgb_model.pkl"

os.makedirs(model_dir, exist_ok=True)
joblib.dump(xgb_clf, model_path)
print("Saved XGBoost model to models/xgb_model.pkl")


Saved XGBoost model to models/xgb_model.pkl


In [12]:
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

def save_curve_plot(display_cls, y_true, y_score, title, out_path):
    """Draw one scikit-learn curve display on a 6x6 inch figure and save it.

    Parameters
    ----------
    display_cls : class
        A display class exposing ``from_predictions(y_true, y_score, name=, ax=)``,
        e.g. ``RocCurveDisplay`` or ``PrecisionRecallDisplay``.
    y_true : array-like
        Ground-truth labels.
    y_score : array-like
        Predicted scores for the positive class.
    title : str
        Title placed on the axes.
    out_path : str
        File the figure is written to (300 dpi).
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    # ax= is required here: without it from_predictions opens its own
    # default-sized figure, so the figsize above is ignored and the empty
    # 6x6 figure is left open.
    display_cls.from_predictions(y_true, y_score, name="XGBoost", ax=ax)
    ax.set_title(title)
    fig.savefig(out_path, dpi=300)
    plt.close(fig)  # release the figure once it is on disk


# only XGBoost
save_curve_plot(
    RocCurveDisplay,
    y_valid,
    y_valid_proba_xgb,
    "ROC Curve - XGBoost",
    "../results/xgb_roc.png",
)
save_curve_plot(
    PrecisionRecallDisplay,
    y_valid,
    y_valid_proba_xgb,
    "Precision-Recall Curve - XGBoost",
    "../results/xgb_pr.png",
)

print("Saved XGBoost ROC & PR curves.")


Saved XGBoost ROC & PR curves.


<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>