# Setup

In [66]:
import gc
import numpy as np
import pandas as pd
from xgboost import (
    XGBClassifier
)
from sklearn.model_selection import (
    train_test_split
)
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    recall_score
)

In [25]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

  pd.set_option('max_colwidth', -1)


# Data

In [9]:
df = pd.read_csv(
    "./data/dataset.csv"
)
df.describe()

Unnamed: 0,Time,P1,P2,P3,P4,P5,P6,P7,P8,P9,...,P21,P22,P23,P24,P25,P26,P27,P28,Dollar_amount,Outcome
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284795.0,284807.0
mean,94813.859575,1.759061e-12,-8.25113e-13,-9.654937e-13,8.321385e-13,1.649999e-13,4.248366e-13,-3.0546e-13,8.777971e-14,-1.179749e-12,...,-3.405756e-13,-5.723197e-13,-9.725856e-13,1.46415e-12,-6.987102e-13,-5.617874e-13,3.332082e-12,-3.518874e-12,88.351353,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.124968,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.16,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


## Rows with NaN

Only the Dollor Amount column has Nan. Replace it with 0.

In [26]:
df[df.isna().any(axis=1)]

Unnamed: 0,Time,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,P15,P16,P17,P18,P19,P20,P21,P22,P23,P24,P25,P26,P27,P28,Dollar_amount,Outcome
200,132.0,-1.571359,1.687508,0.73467,1.29335,-0.217532,-0.002677,0.147364,0.515362,-0.372442,0.078021,-0.592495,0.997941,1.109328,0.060048,0.086141,-1.149893,0.765198,-0.810589,0.73755,-0.093614,0.048549,0.377256,-0.030436,0.117608,-0.06052,-0.29655,-0.48157,-0.167897,,0
520,386.0,1.213136,0.462143,0.664599,1.301135,-0.407416,-0.994125,0.180626,-0.279035,-0.216489,0.016012,0.12478,1.049709,1.42461,0.034876,0.794658,0.039519,-0.455407,-0.358972,-0.605013,-0.065426,0.069834,0.318994,-0.048026,0.758936,0.616221,-0.354057,0.032492,0.030264,,0
530,394.0,1.293053,0.457969,-1.94045,0.173149,2.60957,3.014117,-0.269415,0.75442,-0.221009,-0.6208,0.348748,-0.296105,-0.118736,-1.192582,1.278393,0.923268,0.395379,1.039038,-0.252924,0.02502,-0.121126,-0.427753,-0.159336,0.857135,0.850055,-0.311685,0.037536,0.050618,,0
1074,821.0,-1.026206,-0.454773,2.745089,-1.533086,-1.091166,-0.085628,0.062351,-0.06582,-0.886331,0.231772,0.97079,0.300761,0.135392,-0.683825,-0.643731,-0.933018,-0.49121,1.74626,-1.012147,-0.401264,-0.22904,0.025355,-0.014196,0.583596,0.07328,0.97451,-0.242234,-0.193198,,0
1075,822.0,0.860733,-0.802727,1.105443,0.390424,-1.24468,-0.03719,-0.468192,0.083195,1.032531,-0.590352,-0.131767,1.231443,0.780181,-0.644361,-0.151085,-0.529319,0.497742,-1.270436,-0.069039,0.234768,-0.085445,-0.244728,0.009459,0.505358,-0.005593,0.933611,-0.038996,0.041718,,0
1076,823.0,-1.060119,0.697025,0.523657,-0.270607,-0.367703,-0.627989,1.18826,0.093292,0.027599,-0.477586,-0.533336,-0.227888,-1.202125,0.338096,-0.324453,-0.443188,0.174474,-0.943135,-0.960178,-0.220494,0.056744,0.464838,0.272573,0.429093,-0.46742,0.277149,0.020102,-0.111751,,0
1077,823.0,-2.220124,2.522457,-0.219905,0.516665,-0.202546,0.940743,-1.193313,-3.283544,-0.198646,0.093703,-1.435011,-0.246084,-0.726153,0.980986,1.299293,-0.16637,0.371468,0.143836,0.314943,-1.069662,4.003921,-0.901312,0.407891,-0.847506,-0.157341,-0.268754,-0.322953,-0.167473,,0
1760,1358.0,-1.265956,1.292896,0.244323,-1.193612,0.335996,0.288527,0.135064,0.738729,0.295977,-0.053675,-0.923443,-0.37195,-1.335333,0.453693,-0.719704,0.840314,-0.934544,0.660887,0.663043,0.133753,-0.289444,-0.761086,-0.226403,-1.416745,0.058551,0.398471,0.371641,0.208464,,0
1761,1358.0,-0.368093,0.193261,2.094644,-0.39808,-0.746666,-0.08817,0.035217,0.201767,0.326741,-0.692418,0.959856,1.348787,0.377247,-0.531343,-1.401818,-0.131868,-0.086881,-0.078261,0.801954,0.07089,0.006147,0.198106,0.153753,0.617063,-0.994866,0.758272,0.128474,0.175824,,0
1762,1358.0,-0.589153,0.756574,1.34856,-1.48967,0.046295,-0.804692,0.780205,-0.330465,1.068251,0.431671,-0.996525,-1.231343,-1.624514,-0.340315,0.828247,0.676048,-1.017637,-0.080779,-0.599915,0.317143,-0.324497,-0.486022,-0.130672,-0.165254,-0.189635,0.713832,0.046514,-0.262555,,0


In [27]:
df.fillna(0, inplace=True)
df[df.isna().any(axis=1)]

Unnamed: 0,Time,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,P15,P16,P17,P18,P19,P20,P21,P22,P23,P24,P25,P26,P27,P28,Dollar_amount,Outcome


## Data imbalance
Most data (99.83%)is fraud-negative as expected.

In [43]:
num_negatives= df[df['Outcome'] == 0].shape[0]
num_positives = df[df['Outcome'] == 1].shape[0] 
ratio = num_negatives / (num_negatives+num_positives)
print(f"positives {num_positives} negatives {num_negatives} ratio {np.round(ratio * 100, 2)}% negatives")

positives 492 negatives 284315 ratio 99.83% negatives


--- 
## Data and label split

In [45]:
X = df.loc[:, df.columns != 'Outcome']
Y = df.loc[:, ['Outcome']]

In [46]:
Y[~Y['Outcome'].isin([0,1])]

Unnamed: 0,Outcome


## Train and test split

In [57]:
seed = 7
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, 
    test_size=test_size, 
    random_state=seed,
    stratify=None,
    shuffle=True
)

In [65]:
del df, X, Y

---
# Model (imbalanced)

In [49]:
model = XGBClassifier()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Performance (imbalanced)

In [50]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

### Accuracy

In [55]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.96%


### Recall

In [56]:
recall = recall_score(y_test, predictions)
print("Recall: %.2f%%" % (recall * 100.0))

Recall: 81.00%


### ROC/AUC

In [54]:
auc = roc_auc_score(y_test, predictions)
print("AUC: %.2f%%" % (auc * 100.0))

AUC: 90.50%


In [None]:
del model

---
# Model (SMOTE)

Oversample the data to balance the labels.

In [60]:
from imblearn.over_sampling import (
    SMOTE, 
    ADASYN
)

In [63]:
X_train_smote, y_train_smote = SMOTE().fit_resample(X_train, y_train)

In [64]:
model_smote = XGBClassifier()
model_smote.fit(X_train_smote, y_train_smote)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Performance (SMOTE)

In [67]:
y_pred_smote = model_smote.predict(X_test)
predictions_smote = [round(value) for value in y_pred_smote]

### Accuracy

In [68]:
accuracy = accuracy_score(y_test, predictions_smote)
print("Accuracy (SMOTE): %.2f%%" % (accuracy * 100.0))

Accuracy (SMOTE): 99.94%


### Recall

In [69]:
recall = recall_score(y_test, predictions_smote)
print("Recall (SMOTE): %.2f%%" % (recall * 100.0))

Recall (SMOTE): 86.00%


### ROC/AUC

In [70]:
auc = roc_auc_score(y_test, predictions_smote)
print("AUC (SMOTE): %.2f%%" % (auc * 100.0))

AUC (SMOTE): 92.98%
