# 1. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb

import os
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning, message='.*use_label_encoder.*')

In [2]:
os.getcwd()

'/Users/nicolaskossacoff/Documents/Projects/data-science-library/Notebooks/Model Evaluation'

# 2. Data

## 2.1. Access Kaggle's API

First we are going to use Kaggle's API to access to the [Credit Card Frad Detection](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud) dataset. This dataset contains transactions made by credit cards in September 2023 by European cardholders. We have 492 fraud transactions out of 284,807.

In [None]:
!kaggle datasets list -s 'fraud detection'

Download the dataset as a `.zip` file.

In [None]:
!kaggle datasets download -d 'mlg-ulb/creditcardfraud'

Finally, we unzip the dataset and we save it in the `Data/` folder.

In [None]:
!unzip creditcardfraud.zip -d Data/

## 2.2. Load Data

In [3]:
df = pd.read_csv('Data/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


We are going to remove columns that are not useful and then rename target feature.

In [4]:
# Lower feature names
df.columns = df.columns.str.lower()

# Rename features
df.rename(columns={'class': 'is_fraud'}, inplace=True)

In [5]:
df.head()

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,is_fraud
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   time      284807 non-null  float64
 1   v1        284807 non-null  float64
 2   v2        284807 non-null  float64
 3   v3        284807 non-null  float64
 4   v4        284807 non-null  float64
 5   v5        284807 non-null  float64
 6   v6        284807 non-null  float64
 7   v7        284807 non-null  float64
 8   v8        284807 non-null  float64
 9   v9        284807 non-null  float64
 10  v10       284807 non-null  float64
 11  v11       284807 non-null  float64
 12  v12       284807 non-null  float64
 13  v13       284807 non-null  float64
 14  v14       284807 non-null  float64
 15  v15       284807 non-null  float64
 16  v16       284807 non-null  float64
 17  v17       284807 non-null  float64
 18  v18       284807 non-null  float64
 19  v19       284807 non-null  float64
 20  v20 

In [7]:
df.is_fraud.value_counts() / df.is_fraud.shape[0]

is_fraud
0    0.998273
1    0.001727
Name: count, dtype: float64

In [8]:
# Split features and target
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

## 2.3. Train/Test sets

We first split the data into train and test sets without using stratification.

In [9]:
# Split data into training and test sets without stratification
X_train_no_strat, X_test_no_strat, y_train_no_strat, y_test_no_strat = train_test_split(X, y, test_size=0.3)

# Check the distribution of the target in the training and test sets
df_no_strat = pd.DataFrame({'train': y_train_no_strat.value_counts() / y_train_no_strat.shape[0], 'test': y_test_no_strat.value_counts() / y_test_no_strat.shape[0]})
df_no_strat.style.format("{:.2%}")

Unnamed: 0_level_0,train,test
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1
0,99.83%,99.82%
1,0.17%,0.18%


Now we split the data into train and test sets using stratification, which mantains the fraud ratio in both samples.

In [10]:
# Split data into training and test sets without stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Check the distribution of the target in the training and test sets
df = pd.DataFrame({'train': y_train.value_counts() / y_train.shape[0], 'test': y_test.value_counts() / y_test.shape[0]})
df.style.format("{:.2%}")

Unnamed: 0_level_0,train,test
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1
0,99.83%,99.83%
1,0.17%,0.17%


I remove the features that I don't think makes sense for training the model.

In [11]:
X_train.drop(columns=['time', 'amount'], inplace=True)
X_test.drop(columns=['time', 'amount'], inplace=True)

## 2.4. Feature Selection

We are going to use the `RFECV` method for feature selection, which recursively removes features based on their feature importance. However, we need to define a clasifier first. In this case we are going to use an XGBoost classifier.

In [12]:
# The 'objective' argument defines the specific learning task. The 'eval_metric' defines the loss function to be minimized.
# The 'logloss' is the Binary Cross-Entropy loss function.
classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

Now we define an evaluation metric (using the `make_scrorer` method) and we set-up a Cross-Validation strategy. Since we are working with an imbalanced dataset, we are going to use the F1-score as our evaluation metric.

In [13]:
scorer = metrics.make_scorer(metrics.f1_score)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

Finally, we run the feature selection method (I recommend you collapse the output since it's quite large)

In [14]:
rfe = RFECV(
    estimator=classifier,
    step=1,
    cv=cv,
    scoring=scorer,
    verbose=1,
    n_jobs=-1
)

# Fit the Recursive Feature Elimination method
rfe.fit(X_train, y_train)

Fitting estimator with 28 features.Fitting estimator with 28 features.

Fitting estimator with 28 features.Fitting estimator with 28 features.

Fitting estimator with 28 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 27 features.
Fitting estimator with 27 features.
Fitting estimator with 27 features.
Fitting estimator with 27 features.
Fitting estimator with 27 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 26 features.
Fitting estimator with 26 features.
Fitting estimator with 26 features.
Fitting estimator with 26 features.
Fitting estimator with 26 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 25 features.
Fitting estimator with 25 features.
Fitting estimator with 25 features.
Fitting estimator with 25 features.
Fitting estimator with 25 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 24 features.
Fitting estimator with 24 features.
Fitting estimator with 24 features.
Fitting estimator with 24 features.
Fitting estimator with 24 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 23 features.
Fitting estimator with 23 features.
Fitting estimator with 23 features.
Fitting estimator with 23 features.
Fitting estimator with 23 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 22 features.
Fitting estimator with 22 features.
Fitting estimator with 22 features.
Fitting estimator with 22 features.
Fitting estimator with 22 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 21 features.
Fitting estimator with 21 features.
Fitting estimator with 21 features.
Fitting estimator with 21 features.
Fitting estimator with 21 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 20 features.
Fitting estimator with 20 features.
Fitting estimator with 20 features.
Fitting estimator with 20 features.
Fitting estimator with 20 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 19 features.
Fitting estimator with 19 features.
Fitting estimator with 19 features.
Fitting estimator with 19 features.
Fitting estimator with 19 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 18 features.
Fitting estimator with 18 features.
Fitting estimator with 18 features.
Fitting estimator with 18 features.
Fitting estimator with 18 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 17 features.
Fitting estimator with 17 features.
Fitting estimator with 17 features.
Fitting estimator with 17 features.
Fitting estimator with 17 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 15 features.
Fitting estimator with 15 features.
Fitting estimator with 15 features.
Fitting estimator with 15 features.
Fitting estimator with 15 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 14 features.
Fitting estimator with 14 features.
Fitting estimator with 14 features.
Fitting estimator with 14 features.
Fitting estimator with 14 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 13 features.
Fitting estimator with 13 features.
Fitting estimator with 13 features.
Fitting estimator with 13 features.
Fitting estimator with 13 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 12 features.
Fitting estimator with 12 features.
Fitting estimator with 12 features.
Fitting estimator with 12 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 12 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 11 features.
Fitting estimator with 11 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 11 features.
Fitting estimator with 11 features.
Fitting estimator with 11 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 10 features.
Fitting estimator with 10 features.
Fitting estimator with 10 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 10 features.
Fitting estimator with 10 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 9 features.
Fitting estimator with 9 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 9 features.
Fitting estimator with 9 features.
Fitting estimator with 9 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 8 features.
Fitting estimator with 8 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 8 features.
Fitting estimator with 8 features.
Fitting estimator with 8 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 7 features.
Fitting estimator with 7 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 6 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 6 features.
Fitting estimator with 6 features.
Fitting estimator with 6 features.
Fitting estimator with 6 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 5 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 5 features.
Fitting estimator with 5 features.
Fitting estimator with 5 features.
Fitting estimator with 5 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 4 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 4 features.
Fitting estimator with 4 features.
Fitting estimator with 4 features.
Fitting estimator with 4 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 3 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 3 features.
Fitting estimator with 3 features.
Fitting estimator with 3 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 2 features.
Fitting estimator with 2 features.
Fitting estimator with 2 features.
Fitting estimator with 2 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.


In [15]:
print(f'Optimal number of features: {rfe.n_features_}')

Optimal number of features: 20


In [16]:
X_train_fs = pd.DataFrame(rfe.transform(X_train), columns=X_train.columns[rfe.support_])
X_test_fs = pd.DataFrame(rfe.transform(X_test), columns=X_test.columns[rfe.support_])

# 3. Train model

## 3.1. Hyperparameter Tuning

We tune hyperparameters to find the model that fits the data the best. I didn't run this job on my computer, I used a Google Colab notebook.

In [None]:
classifier = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss' # Binary Cross-Entropy loss function
)

# Hyperparameter distributions
param_dist = {
    'learning_rate': np.arange(0.001, 1, 0.01), # Step used to update the weights
    'n_estimators': np.arange(100, 1050, 50), # Numbers of boosting rounds
    'growth_policy': ['depthwise', 'lossguide'], # Control the way the tree grows
    'min_child_weight': np.arange(1, 10, 1), # Minimum child weight
    'reg_alpha': np.arange(0, 1, 0.1), # L1 regularization term on weights
    'reg_lambda': np.arange(0, 1, 0.1), # L2 regularization term on weights
    'early_stopping_rounds': np.arange(10, 60, 10), # Activates early stopping
}

# Define the evaluation metric 
scorer = metrics.make_scorer(metrics.f1_score)

# Define the cross-validation strategy
cv = GridSearchCV(
    estimator=classifier,
    param_grid=param_dist,
    scoring=scorer,
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Fit the model
cv.fit(X_train, y_train)

# Best parameters
print(f'Best F1-score: {cv.best_score_}')
print(f'Best parameters: {cv.best_params_}')

In [14]:
# Create the Dmatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Set hyperparameters
params = {
    'objective': 'binary:logistic',
    'eta': 0.001,
    'gamma': 0.01,
    'lambda': 1,
    'max_depth': 1
}

# Create the model
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100
)

In [None]:
# Predictions for the test set
y_pred = model.predict(dtest)
# Threshold the predictions
y_pred = np.where(y_pred > 0.5, 1, 0)

# Calculate the accuracy
f_score = metrics.f1_score(y_test, y_pred)
print(f'F1-score: {f_score:.2f}')

In [None]:
precision, recall, _ = metrics.precision_recall_curve(y_test, y_pred)
avg_precision = metrics.average_precision_score(y_test, y_pred)

# Plot the precision-recall curve
fig, ax = plt.subplots(figsize=(12, 6))
ax.step(recall, precision, where='post', label=f'Avg. Precision = {avg_precision:.2f}%')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve', fontdict={'fontsize': 15, 'fontweight': 'bold'}, loc='left')
ax.legend(loc='best')