# Quick model training

This notebook lets you quickly train and evaluate a model using only the training data, with cross-validation.

All parameters are defined in the next cell; it is the only cell that should need to be modified.

In [None]:
# Keyword arguments unpacked into the XGBoost classifier constructor.
xgb_params = {
    "device": "gpu",  # train on the GPU
    "eta": 0.3,  # learning rate
    "max_depth": 6,  # maximum depth of each tree
    "subsample": 0.5,  # fraction of rows sampled per tree
    "colsample_bytree": 0.8,  # fraction of columns sampled per tree
    "lambda": 1,  # L2 regularisation weight
    "alpha": 0,  # L1 regularisation weight
    "eval_metric": "aucpr",  # area under the precision-recall curve
    "objective": "binary:logistic",  # binary classification, outputs probabilities
    "verbosity": 0,  # silent
}

# Keyword arguments unpacked into the stratified K-fold splitter.
k_fold_params = dict(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Column names to train on; leave empty to keep every feature.
features_to_keep = []


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    classification_report,
)
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from xgboost import XGBClassifier


In [None]:
# Load the training split and separate the target column from the features.
train = pd.read_csv("../data/split/train.csv")
y_train = train["Class"]
X_train = train.drop(["Class"], axis=1)
if features_to_keep:  # an empty list means "use every feature"
    X_train = X_train[features_to_keep]

labels = ["Not Fraud", "Fraud"]
model = XGBClassifier(**xgb_params)
k_fold = StratifiedKFold(**k_fold_params)

# Run the cross-validation only once. Requesting probabilities is enough:
# the hard predictions are derived from them below, which avoids fitting
# every fold a second time and guarantees that labels and probabilities
# come from exactly the same fitted models.
train_proba = cross_val_predict(
    model, X_train, y_train, cv=k_fold, method="predict_proba"
)
# Most probable class per row (column index == class label).
# NOTE(review): assumes "Class" is encoded as 0/1, as the
# "binary:logistic" objective expects - confirm against the data.
train_preds = train_proba.argmax(axis=1)

# Out-of-fold metrics and plots; both displays use the same class names.
print(classification_report(y_train, train_preds, target_names=labels))
ConfusionMatrixDisplay.from_predictions(
    y_train, train_preds, display_labels=labels
)
PrecisionRecallDisplay.from_predictions(y_train, train_proba[:, 1])
plt.show()