In [83]:
import pandas as pd
import numpy as np
import itertools

import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

plt.style.use('seaborn')

In [69]:
import pickle
with open('./data/cleaned_df.pickle', 'rb') as file:
    df = pickle.load(file)
    

## Preview the class imbalance

In [70]:
# Count the number of fraudulent/infraudulent purchases
df['default_next_month'].value_counts(normalize=True)

0    0.778594
1    0.221406
Name: default_next_month, dtype: float64

## Define the predictor and target variables

In [152]:
# Your code here
y = df['default_next_month']
X = df.drop(columns=['default_next_month'], axis=1)

#scaler = StandardScaler()
#scaled_df = scaler.fit_transform(X)

SEED = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

Find the class imbalance in the training and test sets: 

In [153]:
# Training set
print(y_train.value_counts(normalize=True))
print('\n')
# Test set
print(y_test.value_counts(normalize=True))

0    0.778398
1    0.221602
Name: default_next_month, dtype: float64


0    0.779052
1    0.220948
Name: default_next_month, dtype: float64


In [154]:
rus = RandomOverSampler()
#rus = RandomUnderSampler(random_state=3)

# Fit SMOTE to training data
#X_train_resampled, y_train_resampled = ADASYN().fit_sample(X_train, y_train) 
X_train_resampled, y_train_resampled = rus.fit_sample(X_train, y_train)

## Train a vanilla classifier and show accuracy

In [155]:

#clf = xgb.XGBClassifier(scale_pos_weight=1)
clf = xgb.XGBClassifier(scale_pos_weight=5)
clf.fit(X_train, y_train)
training_preds = clf.predict(X_train)
val_preds = clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 69.53%
Validation accuracy: 67.94%


In [156]:
print(classification_report(y_test, val_preds))

              precision    recall  f1-score   support

           0       0.90      0.66      0.76      6999
           1       0.38      0.73      0.50      1985

    accuracy                           0.68      8984
   macro avg       0.64      0.70      0.63      8984
weighted avg       0.78      0.68      0.71      8984



## Tuning XGBoost

In [75]:
param_grid = {
    "learning_rate": [0.1],
    'max_depth': [6],
    'min_child_weight': [10],
    'subsample': [ 0.7],
    'n_estimators': [5, 30, 100, 250],
}

In [170]:
# Gridsearch did not work when used with gridsearch so it wasnt used.

# Create the samplers
#rus = RandomOverSampler(random_state=3)
#rus = RandomUnderSampler(random_state=3)

#X_train_resampled, y_train_resampled = ADASYN().fit_sample(X_train, y_train) 
#X_train_resampled, y_train_resampled = rus.fit_sample(X_train, y_train)

In [167]:
grid_clf = GridSearchCV(clf, param_grid, scoring='f1', cv=None, n_jobs=1)
grid_clf.fit(X, y)

best_parameters = grid_clf.best_params_

print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

training_preds = grid_clf.predict(X_train)
val_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Grid Search found the following optimal parameters: 
learning_rate: 0.1
max_depth: 6
min_child_weight: 10
n_estimators: 250
subsample: 0.7

Training Accuracy: 78.88%
Validation accuracy: 78.57%


In [168]:
print(confusion_matrix(y_test, val_preds))

[[5322 1677]
 [ 248 1737]]


In [169]:
print(classification_report(y_test, val_preds))

              precision    recall  f1-score   support

           0       0.96      0.76      0.85      6999
           1       0.51      0.88      0.64      1985

    accuracy                           0.79      8984
   macro avg       0.73      0.82      0.75      8984
weighted avg       0.86      0.79      0.80      8984

