<h3>Fraud Detection using XGBoost</h3>
<p>Kaggle Credit Card Fraud Data Set - https://www.kaggle.com/mlg-ulb/creditcardfraud - 
<br>sklearn, XGBoost</p>

In [1]:
import sys, os, time, gc

import pandas as pd
import numpy as np

from collections import Counter
from scipy.stats import ks_2samp

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score,        \
                            recall_score,           \
                            fbeta_score,            \
                            f1_score,               \
                            confusion_matrix,       \
                            precision_recall_curve, \
                            roc_curve,              \
                            auc

from xgboost.sklearn import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Path of the Kaggle credit-card fraud CSV, relative to the notebook's working directory.
input_file = 'data/creditcard.csv'
# Read the whole file into `df`; later cells add columns to this frame and read from it.
df = pd.read_csv(input_file)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Preprocessing

In [3]:
# Make a column 'A' holding 'Amount' standardized by StandardScaler
# (zero mean, unit variance -- the values are NOT bounded to (-1, 1)).
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split is made -- confirm this is acceptable for the evaluation.
sc = StandardScaler()
df['A']= sc.fit_transform(df.Amount.values.reshape(-1, 1))

# Add columns for minutes and hours.
# 'Time' is read as a number of seconds. `.dt.components` splits each timedelta
# into days / hours / minutes / ..., so Time_min is the minutes component and
# Time_hour the hours component, not the total elapsed minutes or hours.
timedelta = pd.to_timedelta(df['Time'], unit='s')
df['Time_min'] = (timedelta.dt.components.minutes).astype(int)
df['Time_hour'] = (timedelta.dt.components.hours).astype(int)

In [4]:
# Inspect the column labels after 'A', 'Time_min' and 'Time_hour' were added.
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class', 'A', 'Time_min', 'Time_hour'], dtype='object')

In [5]:
# Feature list: the 28 columns V1..V28 followed by the three engineered
# columns 'A', 'Time_min' and 'Time_hour'.
cols = ["V%d" % idx for idx in range(1, 29)]
cols.extend(['A', 'Time_min', 'Time_hour'])
print(cols)
# Expected printout:
# ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
# 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
# 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',
# 'A', 'Time_min', 'Time_hour']

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'A', 'Time_min', 'Time_hour']


### Pick the 10 variables whose distributions differ most between the two classes (KS test)

In [6]:
# Goal: find the columns whose value distribution shifts the most between
# rows with Class == 0 and rows with Class == 1.
# The two samples are compared with the two-sample Kolmogorov-Smirnov test:
#  - https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
# Note - this is just one of many possible tests; see for example this discussion:
#  - https://stats.stackexchange.com/questions/1001/is-spearmans-correlation-coefficient-usable-to-compare-distributions
#
# ks_2samp(array1, array2) compares the distributions of the values in the
# two arrays; element [1] of its result is the p_value:
#    large p_value (e.g. > 0.4) - no evidence that the distributions differ
#    very small p_value         - the distributions differ

# Split the rows by class once; every column is then read from these two subsets.
class0_rows = df[df.Class == 0]
class1_rows = df[df.Class == 1]

separator = '-'*40
print("small p_values indicate different distributions")
print(separator)
print("label => p_value")
print(separator)

ks = []  # (column label, p_value) pairs, in the order of `cols`
for col in cols:
    p_val = ks_2samp(class0_rows[col].values, class1_rows[col].values)[1]
    ks.append((col, p_val))
    print("%s => %.6f" % (col,p_val))

small p_values indicate different distributions
----------------------------------------
label => p_value
----------------------------------------
V1 => 0.000000
V2 => 0.000000
V3 => 0.000000
V4 => 0.000000
V5 => 0.000000
V6 => 0.000000
V7 => 0.000000
V8 => 0.000000
V9 => 0.000000
V10 => 0.000000
V11 => 0.000000
V12 => 0.000000
V13 => 0.000053
V14 => 0.000000
V15 => 0.036886
V16 => 0.000000
V17 => 0.000000
V18 => 0.000000
V19 => 0.000000
V20 => 0.000000
V21 => 0.000000
V22 => 0.071807
V23 => 0.000000
V24 => 0.000000
V25 => 0.000910
V26 => 0.000570
V27 => 0.000000
V28 => 0.000000
A => 0.000000
Time_min => 0.646457
Time_hour => 0.000000


In [7]:
# Keep the 10 variables whose distributions change the most between
# Class 0 and Class 1, i.e. the 10 smallest KS p-values.

# Tabulate the collected (label, p_val) pairs.
ks_df = pd.DataFrame(ks, columns=['label', 'p_val'])

# An ascending sort puts the smallest p-values first; keep the first 10 rows.
df_sorted_by_pvals = ks_df.sort_values('p_val').head(10)
df_sorted_by_pvals

Unnamed: 0,label,p_val
9,V10,0.0
13,V14,0.0
11,V12,8.607e-321
3,V4,2.040642e-302
10,V11,4.347507e-292
16,V17,1.387411e-281
2,V3,9.92758e-245
15,V16,9.366543000000001e-231
6,V7,6.7768620000000005e-211
1,V2,2.1638950000000002e-191


In [8]:
# Labels of the 10 selected columns. This is a pandas Series that keeps the row
# index of ks_df; it is later passed as the `columns` argument of run_cv / run_cv_smote.
ser_corr10_cols = df_sorted_by_pvals['label']  # type - pandas Series, values = [V10, V14, etc.]
ser_corr10_cols

9     V10
13    V14
11    V12
3      V4
10    V11
16    V17
2      V3
15    V16
6      V7
1      V2
Name: label, dtype: object

<h3>Create Cross-Validation Folds</h3>
<p>We randomly split all data rows into 4 groups (folds) numbered 0, 1, 2, 3.
<br>For modeling, each fold in turn serves as the test data,
<br>and the combination of the other three folds as the training data.
<br>Thus we can repeat the modeling 4 times (if there are 4 folds).
<br>Below we do that. We also repeat everything 3 times.
<br>So we make 12 calculations, and then calculate the mean and standard deviation.
<br>(Note: run_cv() and run_cv_smote() below make a single pass over the folds, i.e. 4 calculations per configuration.)</p>

In [9]:
n_cv = 4           # number of cross-validation folds
df_len = len(df)   # length of data

# Randomly assign every row to one of the folds 0 .. n_cv-1.
# The upper bound comes from n_cv (it used to be a hard-coded 4), and a
# dedicated seeded generator makes the split identical on every
# Restart & Run All without touching NumPy's global random state.
fold_rng = np.random.RandomState(42)
df['cv_fold'] = fold_rng.randint(0, n_cv, df_len)
print("values in column 'cv_fold':", np.unique(df['cv_fold']))

# Sanity check: with a uniform assignment the mean fold index of each class
# should be close to (0 + 1 + ... + (n_cv-1)) / n_cv.
print("check that classes (0 or 1) are approx evenly distributed")
print("between folds: (%s)/%d = %.1f"
      % ('+'.join(str(k) for k in range(n_cv)), n_cv, (n_cv - 1) / 2.0))
print ("0 => %.4f" % np.mean(df.cv_fold[df.Class == 0]))
print ("1 => %.4f" % np.mean(df.cv_fold[df.Class == 1]))

values in column 'cv_fold': [0 1 2 3]
check that classes (0 or 1) are approx evenly distributed
between folds: (0+1+2+3)/4 = 1.5
0 => 1.5027
1 => 1.5833


In [10]:
def print_results(ytest, ypredict):
    """Print precision, recall, F1 and F2 of a set of predictions.

    ytest    : true class labels
    ypredict : predicted class labels, aligned with ytest

    Returns None; the four scores are written to stdout.
    """
    print ("Precision : %.6f" %  precision_score(ytest, ypredict))
    print ("Recall    : %.6f" %  recall_score   (ytest, ypredict))
    print ("F1-score  : %.6f" %  fbeta_score    (ytest, ypredict, beta=1))
    # F2 needs beta=2 (recall weighted higher than precision); with beta=1
    # this line would just print the F1 score a second time.
    print ("F2-score  : %.6f" %  fbeta_score    (ytest, ypredict, beta=2))

In [11]:
def f1score_comp(recall, precision):
    """Return the list of F1 scores for paired recall / precision values.

    The two sequences are walked in parallel (zip stops at the shorter one).
    A pair in which both values are zero yields 0.0 instead of a division
    by zero.
    """
    def _f1(rec, prec):
        # harmonic mean of the pair, guarded against 0/0
        if rec == 0 and prec == 0:
            return 0.0
        return 2.0 * rec * prec / (rec + prec)

    return [_f1(rec, prec) for rec, prec in zip(recall, precision)]

<h3>run_cv() - XGBClassifier without oversampling the minority class</h3>

In [12]:
def run_cv(model, label, columns):
    """
    Cross-validate `model` on the module-level DataFrame `df`.

    For every fold number n in 0 .. n_cv-1 (n_cv = number of distinct values
    in df['cv_fold']) the rows of fold n are the test set and all other rows
    the training set:
        X => df[columns]
        Y => df['Class']

    Parameters
        model   : classifier offering fit(X, y, eval_set=..., verbose=...)
                  and predict(X), e.g. XGBClassifier
        label   : name of the run, used only in the progress printout
        columns : labels of the df columns used as features

    Returns dictionary "res"
        keys   ['precision', 'recall', 'f1score']
        values as tuples (mean, std_deviation) over the folds
    """
    print("running ", label)
    t1=time.time()

    precisions = []
    recalls = []
    f1scores = []
    n_cv = len(df.cv_fold.unique())

    for n in range(n_cv):
        print("(fold=%d), " % (n), end='')
        # build the fold mask once and split the rows once per fold
        in_test = df.cv_fold == n
        train, test = df[~in_test], df[in_test]
        X_train, Y_train = train[columns], train['Class']
        X_test,  Y_test  = test[columns],  test['Class']

        # NOTE(review): the test fold is the last eval_set entry. If early
        # stopping is active in `model`, xgboost picks the stopping round from
        # that last entry, so test data would influence training -- confirm.
        model.fit(X_train, Y_train,
                  eval_set=[(X_train, Y_train), (X_test, Y_test)],
                  verbose = False)

        Y_predict = model.predict(X_test)
        recalls.append    (recall_score    (Y_test, Y_predict))
        precisions.append (precision_score (Y_test, Y_predict))
        f1scores.append   (f1_score        (Y_test, Y_predict))

    print()
    # (mean, std) per metric; computed once and reused for the printout
    res = {
        'precision': (np.mean(precisions), np.std(precisions)),
        'recall'   : (np.mean(recalls)   , np.std(recalls)   ),
        'f1score'  : (np.mean(f1scores)  , np.std(f1scores)  ),
    }

    print("\nfinished %d calculations" % (n_cv))
    print ("Precision : %.6f +- %.6f" % res['precision'])
    print ("Recall    : %.6f +- %.6f" % res['recall'])
    print ("F1 score  : %.6f +- %.6f" % res['f1score'])
    print("Elapsed %.2f sec" % (time.time()-t1) )
    print('-'*65,"\n")

    return res

<h3>run_cv_smote() - XGBClassifier with oversampling of the minority class (SMOTE / ADASYN)</h3>

In [13]:
def run_cv_smote(model, label, columns, resampler):
    """
    Cross-validate `model` on the module-level DataFrame `df`, resampling
    every training fold with `resampler` before fitting (SMOTE or ADASYN,
    to decrease the imbalance between the classes). The test fold is used
    as it is.

    For every fold number n in 0 .. n_cv-1 (n_cv = number of distinct values
    in df['cv_fold']) the rows of fold n are the test set and all other rows
    the training set:
        X => df[columns]
        Y => df['Class']

    Parameters
        model     : classifier offering fit(X, y, eval_set=..., verbose=...)
                    and predict(X), e.g. XGBClassifier
        label     : name of the run, used only in the progress printout
        columns   : labels of the df columns used as features
        resampler : object offering fit_resample(X, y) (or the older
                    fit_sample(X, y)) that returns the resampled (X, y)

    Returns dictionary "res"
        keys   ['precision', 'recall', 'f1score']
        values as tuples (mean, std_deviation) over the folds
    """
    print("running ", label)
    t1=time.time()

    # imbalanced-learn renamed fit_sample() to fit_resample();
    # use the new name when the resampler has it, else the old one.
    resample = getattr(resampler, 'fit_resample', None)
    if resample is None:
        resample = resampler.fit_sample

    precisions = []
    recalls = []
    f1scores = []
    n_cv = len(df.cv_fold.unique())

    for n in range(n_cv):
        print("(fold=%d), " % (n), end='')
        # build the fold mask once and split the rows once per fold
        in_test = df.cv_fold == n
        train, test = df[~in_test], df[in_test]
        X_train, Y_train = train[columns], train['Class']
        X_test,  Y_test  = test[columns],  test['Class']

        # only the training rows are resampled
        x_train_res, y_train_res = resample(X_train, Y_train)
        # put the feature labels back so train and test frames have the same columns
        X_train_res = pd.DataFrame(data = x_train_res, columns = X_train.columns)

        # NOTE(review): the test fold is the last eval_set entry. If early
        # stopping is active in `model`, xgboost picks the stopping round from
        # that last entry, so test data would influence training -- confirm.
        model.fit(X_train_res, y_train_res,
                  eval_set=[(X_train_res, y_train_res), (X_test, Y_test)],
                  verbose = False)

        Y_predict = model.predict(X_test)
        recalls.append    (recall_score    (Y_test, Y_predict))
        precisions.append (precision_score (Y_test, Y_predict))
        f1scores.append   (f1_score        (Y_test, Y_predict))

    print()
    # (mean, std) per metric; computed once and reused for the printout
    res = {
        'precision': (np.mean(precisions), np.std(precisions)),
        'recall'   : (np.mean(recalls)   , np.std(recalls)   ),
        'f1score'  : (np.mean(f1scores)  , np.std(f1scores)  ),
    }

    print("\nfinished %d calculations" % (n_cv))
    print ("Precision : %.6f +- %.6f" % res['precision'])
    print ("Recall    : %.6f +- %.6f" % res['recall'])
    print ("F1 score  : %.6f +- %.6f" % res['f1score'])
    print("Elapsed %.2f sec" % (time.time()-t1) )
    print('-'*65,"\n")

    return res

In [14]:
# Collects one entry per experiment: label -> dict returned by run_cv / run_cv_smote.
# Re-running this cell discards everything collected so far.
results = {}

In [15]:
# Two sets of XGBoost parameters; each one is unpacked into XGBClassifier(**...).

# ------------------------------
# xgb1: trees up to depth 15
params = {
    'eta': 0.03,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 15,
    'early_stopping_rounds': 100,
    # switched-off tuning knobs, kept for reference:
    # 'min_child_weight': 6,
    # 'subsample': 0.7,
    # 'colsample_bytree': 0.3,
}

# ------------------------------
# xgb2: trees up to depth 5, plus gamma
params2 = {
    'eta': 0.03,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 5,
    'gamma': 0.5,
    'early_stopping_rounds': 100,
    # switched-off tuning knobs, kept for reference:
    # 'min_child_weight': 6,
    # 'subsample': 0.7,
    # 'colsample_bytree': 0.3,
}

<h3>Functions to run different cases</h3>

In [16]:
# --------------------------------------------------------------
def run_xgb1():
    """Cross-validate XGBClassifier(**params) on the columns in `cols`; store the scores in results['xgb1']."""
    label = 'xgb1'
    results[label] = run_cv(XGBClassifier(**params), label, cols)

# --------------------------------------------------------------
def run_xgb2():
    """Cross-validate XGBClassifier(**params2) on the columns in `cols`; store the scores in results['xgb2']."""
    label = 'xgb2'
    results[label] = run_cv(XGBClassifier(**params2), label, cols)

# --------------------------------------------------------------
def run_xgb1_corr_cols():
    """Cross-validate XGBClassifier(**params) on the columns in `ser_corr10_cols`; store the scores in results['xgb1_corr_cols']."""
    label = 'xgb1_corr_cols'
    results[label] = run_cv(XGBClassifier(**params), label, ser_corr10_cols)

# --------------------------------------------------------------
def run_xgb1_smote():
    """Cross-validate XGBClassifier(**params) on `cols` with SMOTE applied to each training fold; store the scores in results['xgb1_smote']."""
    xgb_model = XGBClassifier(**params)
    try:
        # current imbalanced-learn releases call this argument `sampling_strategy`
        sm = SMOTE(random_state = 12, sampling_strategy = 'minority')
    except TypeError:
        # older releases only accept the original `ratio` keyword
        sm = SMOTE(random_state = 12, ratio = 'minority')
    label = 'xgb1_smote'
    results[label] = run_cv_smote(xgb_model, label, cols, sm)

# --------------------------------------------------------------
def run_xgb1_ada():
    """Cross-validate XGBClassifier(**params) on `cols` with ADASYN applied to each training fold; store the scores in results['xgb1_ada']."""
    xgb_model = XGBClassifier(**params)
    try:
        # current imbalanced-learn releases call this argument `sampling_strategy`
        ada = ADASYN(random_state=42, sampling_strategy = 'minority')
    except TypeError:
        # older releases only accept the original `ratio` keyword
        ada = ADASYN(random_state=42, ratio = 'minority')
    label = 'xgb1_ada'
    results[label] = run_cv_smote(xgb_model, label, cols, ada)

# --------------------------------------------------------------
def run_xgb1_smote_corr_cols():
    """Cross-validate XGBClassifier(**params) on `ser_corr10_cols` with SMOTE applied to each training fold; store the scores in results['xgb1_smote_corr_cols']."""
    xgb_model = XGBClassifier(**params)
    try:
        # current imbalanced-learn releases call this argument `sampling_strategy`
        sm = SMOTE(random_state = 12, sampling_strategy = 'minority')
    except TypeError:
        # older releases only accept the original `ratio` keyword
        sm = SMOTE(random_state = 12, ratio = 'minority')
    label = 'xgb1_smote_corr_cols'
    results[label] = run_cv_smote(xgb_model, label, ser_corr10_cols, sm)

<h3>Now we can run all cases as needed</h3>

In [18]:
# Runs on the training folds as they are (no resampling); each call adds one entry to `results`.
run_xgb1()
run_xgb2()
run_xgb1_corr_cols()

# Runs where each training fold is first resampled with SMOTE or ADASYN.
run_xgb1_smote()
run_xgb1_ada()
run_xgb1_smote_corr_cols()

<h3>Prepare Summary Table of All Results</h3>

In [None]:
# Snapshot of the results dict. dict.copy() is shallow: the per-run dicts are shared with `results`.
results_orig = results.copy()

In [None]:
# One row per experiment label, one column per metric; every cell holds a (mean, std) tuple.
results_df = pd.DataFrame.from_dict(results).transpose()
# rx: first tuple element of every cell (the means);
# re: second tuple element of every cell (the standard deviations).
# NOTE(review): `re` is also the name of Python's standard regex module, and
# DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map.
rx = results_df.applymap(lambda x: x[0])
re = results_df.applymap(lambda x: x[1])
rx

In [None]:
def make_graph(xx, yy, ee, label):
    """Draw one error-bar chart with square markers on a new 20x10 figure and show it.

    xx    : x positions (or category labels)
    yy    : values plotted at xx
    ee    : error-bar sizes, handed to errorbar as yerr
    label : legend entry for the series
    """
    font_size = 22
    plt.figure(figsize=(20, 10))
    plt.errorbar(xx, yy, yerr=ee, fmt="s", label=label)
    plt.xticks(fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.legend(fontsize=font_size)
    plt.show()

In [None]:
# make_graph(rx.index, rx.f1score,   re.f1score,   'f1score'   )
# make_graph(rx.index, rx.recall,    re.recall,    'recall'    )
# make_graph(rx.index, rx.precision, re.precision, 'precision' )
