# 1. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
import lightgbm as lgb

import os
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning, message='.*use_label_encoder.*')

In [2]:
os.chdir('/Users/nicolaskossacoff/Documents/Projects/data-science-library/notebooks')
from utils.feature_selection.strategies import *

In [3]:
os.chdir('/Users/nicolaskossacoff/Documents/Projects/data-science-library/notebooks/model-evaluation')

# 2. Data

## 2.1. Access Kaggle's API

First we are going to use Kaggle's API to access to the [Credit Card Frad Detection](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud) dataset. This dataset contains transactions made by credit cards in September 2023 by European cardholders. We have 492 fraud transactions out of 284,807.

In [4]:
!kaggle datasets list -s 'fraud detection'

ref                                                         title                                              size  lastUpdated          downloadCount  voteCount  usabilityRating  
----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
mlg-ulb/creditcardfraud                                     Credit Card Fraud Detection                        66MB  2018-03-23 01:17:27         806222      11826  0.85294116       
whenamancodes/fraud-detection                               Fraud Detection                                    66MB  2022-09-12 11:54:40           9486        113  1.0              
ealaxi/paysim1                                              Synthetic Financial Datasets For Fraud Detection  178MB  2017-04-03 08:40:34         102949       1424  0.88235295       
mishra5001/credit-card                                      Credit Card Fraud Detection   

Download the dataset as a `.zip` file.

In [5]:
!kaggle datasets download -d 'mlg-ulb/creditcardfraud'

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
License(s): DbCL-1.0
Downloading creditcardfraud.zip to /Users/nicolaskossacoff/Documents/Projects/data-science-library/notebooks/model-evaluation
 99%|█████████████████████████████████████▍| 65.0M/66.0M [00:05<00:00, 13.5MB/s]
100%|██████████████████████████████████████| 66.0M/66.0M [00:05<00:00, 11.7MB/s]


Finally, we unzip the dataset and we save it in the `Data/` folder.

In [7]:
!unzip creditcardfraud.zip -d data/

Archive:  creditcardfraud.zip
  inflating: data/creditcard.csv     


Load the CSV file and save it as a Parquet file (it's more efficient and consumes less memory).

In [4]:
df = pd.read_csv('data/creditcard.csv')
df.to_parquet('data/creditcard.parquet')

Delete both `.zip` and `.csv` files:

In [5]:
!rm creditcardfraud.zip
!rm data/creditcard.csv

## 2.2. Load Data

In [6]:
# Removes previous DataFrame
del df

In [7]:
df = pd.read_parquet('data/creditcard.parquet')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Convert column names to lowercase and rename the target variable.

In [8]:
# Lower feature names
df.columns = df.columns.str.lower()

# Rename features
df.rename(columns={'class': 'is_fraud'}, inplace=True)

In [9]:
df.head()

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,is_fraud
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   time      284807 non-null  float64
 1   v1        284807 non-null  float64
 2   v2        284807 non-null  float64
 3   v3        284807 non-null  float64
 4   v4        284807 non-null  float64
 5   v5        284807 non-null  float64
 6   v6        284807 non-null  float64
 7   v7        284807 non-null  float64
 8   v8        284807 non-null  float64
 9   v9        284807 non-null  float64
 10  v10       284807 non-null  float64
 11  v11       284807 non-null  float64
 12  v12       284807 non-null  float64
 13  v13       284807 non-null  float64
 14  v14       284807 non-null  float64
 15  v15       284807 non-null  float64
 16  v16       284807 non-null  float64
 17  v17       284807 non-null  float64
 18  v18       284807 non-null  float64
 19  v19       284807 non-null  float64
 20  v20 

In [11]:
df.is_fraud.value_counts() / df.is_fraud.shape[0]

is_fraud
0    0.998273
1    0.001727
Name: count, dtype: float64

In [12]:
# Split features and target
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

## 2.3. Train/Test sets

We first split the data into train and test sets without using stratification.

In [13]:
# Split data into training and test sets without stratification
X_train_no_strat, X_test_no_strat, y_train_no_strat, y_test_no_strat = train_test_split(X, y, test_size=0.3)

# Check the distribution of the target in the training and test sets
df_no_strat = pd.DataFrame({'train': y_train_no_strat.value_counts() / y_train_no_strat.shape[0], 'test': y_test_no_strat.value_counts() / y_test_no_strat.shape[0]})
df_no_strat.style.format("{:.2%}")

Unnamed: 0_level_0,train,test
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1
0,99.83%,99.82%
1,0.17%,0.18%


Now we split the data into train and test sets using stratification, which mantains the fraud ratio in both samples.

In [14]:
# Split data into training and test sets without stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Check the distribution of the target in the training and test sets
df = pd.DataFrame({'train': y_train.value_counts() / y_train.shape[0], 'test': y_test.value_counts() / y_test.shape[0]})
df.style.format("{:.2%}")

Unnamed: 0_level_0,train,test
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1
0,99.83%,99.83%
1,0.17%,0.17%


I remove the features that I don't think makes sense for training the model.

In [15]:
X_train.drop(columns=['time', 'amount'], inplace=True)
X_test.drop(columns=['time', 'amount'], inplace=True)

Reset index for both train and test sample.

In [16]:
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

## 2.4. Feature Selection

We are going to use a recursive method for feature selection, which removes features based on their feature importance. However, we need to define a clasifier first. In this case we are going to use an XGBoost classifier.

In [17]:
# The 'objective' argument defines the specific learning task. The 'eval_metric' defines the loss function to be minimized.
# The 'logloss' is the Binary Cross-Entropy loss function.
classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

Now we define an evaluation metric (using the `make_scrorer` method). Since we are working with an imbalanced dataset, we are going to use the F1-score as our evaluation metric.

In [18]:
scorer = metrics.make_scorer(metrics.f1_score)

Finally, we create a `RecursiveFeatureSelection` object (this class can be found in `notebooks/utils/feature_selection/strategies.py`). We then run the strategy (I recommend you collapse the output since it's quite large)

In [20]:
feature_selection = RecursiveFeatureSelection(
    estimator=classifier,
    score=scorer,
    step=1,
    folds=5,
    n_jobs=-1,
    X_train=X_train,
    y_train=y_train
)

# Run strategy
feature_selection.run_strategy()

Fitting estimator with 28 features.
Fitting estimator with 28 features.
Fitting estimator with 28 features.
Fitting estimator with 28 features.
Fitting estimator with 28 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 27 features.
Fitting estimator with 27 features.
Fitting estimator with 27 features.
Fitting estimator with 27 features.
Fitting estimator with 27 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 26 features.
Fitting estimator with 26 features.
Fitting estimator with 26 features.
Fitting estimator with 26 features.
Fitting estimator with 26 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 25 features.
Fitting estimator with 25 features.
Fitting estimator with 25 features.
Fitting estimator with 25 features.
Fitting estimator with 25 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 24 features.
Fitting estimator with 24 features.
Fitting estimator with 24 features.
Fitting estimator with 24 features.
Fitting estimator with 24 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 23 features.
Fitting estimator with 23 features.
Fitting estimator with 23 features.
Fitting estimator with 23 features.
Fitting estimator with 23 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 22 features.
Fitting estimator with 22 features.
Fitting estimator with 22 features.
Fitting estimator with 22 features.
Fitting estimator with 22 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 21 features.
Fitting estimator with 21 features.
Fitting estimator with 21 features.
Fitting estimator with 21 features.
Fitting estimator with 21 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 20 features.
Fitting estimator with 20 features.
Fitting estimator with 20 features.
Fitting estimator with 20 features.
Fitting estimator with 20 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 19 features.
Fitting estimator with 19 features.
Fitting estimator with 19 features.
Fitting estimator with 19 features.
Fitting estimator with 19 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 18 features.
Fitting estimator with 18 features.
Fitting estimator with 18 features.
Fitting estimator with 18 features.
Fitting estimator with 18 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 17 features.
Fitting estimator with 17 features.
Fitting estimator with 17 features.
Fitting estimator with 17 features.
Fitting estimator with 17 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.
Fitting estimator with 16 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 15 features.
Fitting estimator with 15 features.
Fitting estimator with 15 features.
Fitting estimator with 15 features.
Fitting estimator with 15 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 14 features.
Fitting estimator with 14 features.
Fitting estimator with 14 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 14 features.
Fitting estimator with 14 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 13 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 13 features.
Fitting estimator with 13 features.
Fitting estimator with 13 features.
Fitting estimator with 13 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 12 features.
Fitting estimator with 12 features.
Fitting estimator with 12 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 12 features.
Fitting estimator with 12 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 11 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 11 features.
Fitting estimator with 11 features.
Fitting estimator with 11 features.
Fitting estimator with 11 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 10 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 10 features.
Fitting estimator with 10 features.
Fitting estimator with 10 features.
Fitting estimator with 10 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 9 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 9 features.
Fitting estimator with 9 features.
Fitting estimator with 9 features.
Fitting estimator with 9 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 8 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 8 features.
Fitting estimator with 8 features.
Fitting estimator with 8 features.
Fitting estimator with 8 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 7 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.
Fitting estimator with 7 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 6 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 6 features.
Fitting estimator with 6 features.
Fitting estimator with 6 features.
Fitting estimator with 6 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 5 features.


Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 5 features.
Fitting estimator with 5 features.
Fitting estimator with 5 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 4 features.
Fitting estimator with 4 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 4 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 3 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 3 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 3 features.
Fitting estimator with 2 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 2 features.
Fitting estimator with 2 features.
Fitting estimator with 2 features.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.


In [21]:
print(f'Optimal number of features: {feature_selection.rfe.n_features_}')
print(f'Features selected: {feature_selection.get_feature_names()}')

Optimal number of features: 20
Features selected: ['v1', 'v3', 'v4', 'v7', 'v9', 'v10', 'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20', 'v21', 'v24', 'v26', 'v27', 'v28']


Save the filtered DataFrame for later

In [22]:
df_train = feature_selection.get_dataframe()
df_train

Unnamed: 0,v1,v3,v4,v7,v9,v10,v12,v13,v14,v15,...,v17,v18,v19,v20,v21,v24,v26,v27,v28,is_fraud
0,-0.012102,0.163334,-0.756498,0.844608,-0.184399,-0.222719,0.386075,-0.650025,0.418040,-1.009056,...,-0.738861,-0.068285,0.329703,-0.038329,-0.231385,-0.360882,0.142657,0.235922,0.083758,0
1,1.776151,-2.204096,1.191668,0.919254,-0.318277,0.517022,0.247935,-1.055373,1.242616,-0.433066,...,-0.720880,0.328234,-0.016297,-0.030437,0.281190,-0.342096,-0.427682,-0.075228,-0.056755,0
2,-1.083391,-1.399530,0.469764,1.601441,-1.288745,0.086419,0.191335,1.046362,0.420874,0.805190,...,0.059491,0.915477,-1.087435,2.189260,0.346463,0.505926,0.870190,-0.316982,0.227833,0
3,-0.518847,-0.614624,-0.780959,0.046111,-0.322448,0.128383,-0.341516,0.004210,-0.287098,1.385535,...,-0.299877,0.403600,0.929925,0.225957,-0.193099,0.944217,0.039995,0.010804,0.254309,0
4,-0.640421,0.283341,-1.786916,-0.198897,0.474428,-0.283699,-0.180028,-0.254641,-0.388456,0.414223,...,-0.710405,0.007241,-0.458141,0.154309,0.167939,0.767971,0.612936,0.074029,-0.033344,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199359,-0.415022,1.221551,-2.108216,1.118548,0.463473,-0.999876,1.319156,0.299893,0.161817,0.098311,...,-0.783303,0.392973,0.106964,0.075377,0.132886,0.558730,-0.871689,0.239482,-0.020422,0
199360,1.993864,-0.620118,0.129845,-0.822358,0.995898,0.212619,0.584204,-0.251486,0.062302,0.013795,...,-0.906442,0.739436,0.043943,-0.174051,0.262526,0.275689,0.623598,-0.032455,-0.058552,0
199361,-1.497933,1.581568,-0.024286,0.609212,0.452745,0.108640,0.373473,-0.533295,-0.401809,0.771781,...,1.241541,-2.623211,-1.241768,-0.225079,-0.072452,-0.615980,0.263968,-0.448445,0.045178,0
199362,1.069777,0.496540,1.505318,0.100551,0.319684,-0.131553,0.445453,-0.547450,0.166727,-0.073930,...,0.404795,-1.033061,-0.500426,-0.149402,-0.061991,0.400171,-0.335160,0.031014,0.024886,0


In [23]:
df_test = X_test[feature_selection.get_feature_names()]
df_test['is_fraud'] = y_test
df_test

Unnamed: 0,v1,v3,v4,v7,v9,v10,v12,v13,v14,v15,...,v17,v18,v19,v20,v21,v24,v26,v27,v28,is_fraud
0,-2.537331,-0.840555,-1.102759,-2.737844,-0.483147,0.112724,0.635855,0.406871,0.854371,0.505753,...,0.557331,1.546638,-0.789047,-1.959611,4.328165,-0.241143,-0.475427,-0.177157,-0.082691,0
1,-0.250839,0.206089,0.417324,1.369769,-0.957100,-0.261170,0.418555,-0.475202,0.658159,-1.274781,...,-0.397771,0.178173,0.604366,-0.128120,0.133048,-0.393364,-0.490717,0.143897,0.136810,0
2,0.568980,-1.114138,1.670652,0.190403,0.840595,0.099172,-0.251312,-2.279548,0.893064,0.503413,...,-0.824311,0.837538,-0.555301,1.039287,0.318037,0.595866,-1.048927,-0.096970,0.088686,0
3,1.328892,-0.308968,0.381772,0.029460,0.227173,-0.267319,-0.578758,-0.188827,-0.236694,1.270649,...,-0.475997,0.241545,0.329519,-0.050933,-0.379270,-1.074464,0.182045,-0.031732,0.017501,0
4,1.124863,1.337053,1.030033,-0.825094,2.301808,-0.641129,-2.299609,1.066235,1.177166,-0.218823,...,0.870825,-0.336065,-0.600662,-0.196966,-0.216512,0.045625,0.308617,0.001142,0.017673,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85438,-1.808185,-0.991449,-2.473986,-1.030373,0.182992,-0.099751,0.013435,-0.311592,0.259128,0.579725,...,-1.077630,-0.021182,-0.496123,0.745500,-1.441887,1.014602,0.672870,-0.701644,-0.338981,0
85439,-0.442905,1.256775,0.860048,0.888914,-0.172991,0.594082,0.534272,0.398728,-0.408602,-0.370914,...,-0.290372,0.078126,1.474090,0.310200,-0.154031,-1.125678,-0.233577,-0.013664,-0.274549,0
85440,2.092418,-1.553089,-1.131369,-0.390388,-0.702381,0.946786,-0.084962,-0.093172,0.216543,-0.520426,...,-0.175598,-1.121103,1.200952,0.120353,-0.008183,0.257477,-0.458271,-0.056360,-0.053928,0
85441,1.293234,-0.209304,0.329036,-0.203271,0.217831,-0.041285,-0.742165,-2.064189,0.318913,0.742388,...,-0.542123,0.871911,0.546941,-0.186409,-0.340111,-0.956668,0.158010,-0.044116,0.003539,0


In [24]:
df_train.to_parquet('data/creditcard-train.parquet', index=False)
df_test.to_parquet('data/creditcard-test.parquet', index=False)

# 3. Modeling

## 3.1. Hyperparameter Tuning

We tune hyperparameters to find the model that fits the data the best. I didn't run this job on my computer, I used a Google Colab notebook.

The best parameters are:


In [None]:
classifier = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss' # Binary Cross-Entropy loss function
)

# Hyperparameter distributions
param_dist = {
    'learning_rate': np.arange(0.001, 1, 0.01), # Step used to update the weights
    'n_estimators': np.arange(100, 1050, 50), # Numbers of boosting rounds
    'growth_policy': ['depthwise', 'lossguide'], # Control the way the tree grows
    'min_child_weight': np.arange(1, 10, 1), # Minimum child weight
    'reg_alpha': np.arange(0, 1, 0.1), # L1 regularization term on weights
    'reg_lambda': np.arange(0, 1, 0.1), # L2 regularization term on weights
    'early_stopping_rounds': np.arange(10, 60, 10), # Activates early stopping
}

# Define the evaluation metric 
scorer = metrics.make_scorer(metrics.f1_score)

# Define the cross-validation strategy
cv = GridSearchCV(
    estimator=classifier,
    param_grid=param_dist,
    scoring=scorer,
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Fit the model
cv.fit(X_train, y_train)

# Best parameters
print(f'Best F1-score: {cv.best_score_}')
print(f'Best parameters: {cv.best_params_}')

## 3.2. Train (without Cross-Validation)