# Analysis using XGB classifier with and without oversampling methods
### Author: Marlene Marchena

### Importing Libraries

In [1]:
#Import data manipulation libraries
import pandas as pd
import numpy as np
import time
import multiprocessing as mp

#Import visualization libraries
import matplotlib.pyplot as plt 

from sklearn.preprocessing import MinMaxScaler

# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# over sampling functions
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

# metrics
from sklearn.metrics import f1_score, recall_score, precision_score, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

#set random seed and percentage of test data
random_seed = 12345678

Loading the train and test data saved in the data processing notebook

In [3]:
x_train1 = np.loadtxt('x_train1_outlierTreatment.csv', delimiter=',')
y_train1 = np.loadtxt('y_train1_outlierTreatment.csv', dtype='int32', delimiter=',')
y_train1 = y_train1.reshape(-1,1)
x_test1 = np.loadtxt('x_test_fold1.csv', delimiter=',')
y_test1 = np.loadtxt('y_test_fold1.csv', dtype='int32', delimiter=',')
y_test1 = y_test1.reshape(-1,1)

x_train2 = np.loadtxt('x_train2_outlierTreatment.csv', delimiter=',')
y_train2 = np.loadtxt('y_train2_outlierTreatment.csv', dtype='int32', delimiter=',')
y_train2 = y_train2.reshape(-1,1)
x_test2 = np.loadtxt('x_test_fold2.csv', delimiter=',')
y_test2 = np.loadtxt('y_test_fold2.csv', dtype='int32', delimiter=',') 
y_test2 = y_test2.reshape(-1,1)

x_train3 = np.loadtxt('x_train3_outlierTreatment.csv', delimiter=',')
y_train3 = np.loadtxt('y_train3_outlierTreatment.csv', dtype='int32', delimiter=',')
y_train3 = y_train3.reshape(-1,1)
x_test3 = np.loadtxt('x_test_fold3.csv', delimiter=',')
y_test3 = np.loadtxt('y_test_fold3.csv', dtype='int32', delimiter=',') 
y_test3 = y_test3.reshape(-1,1)

x_train4 = np.loadtxt('x_train4_outlierTreatment.csv', delimiter=',')
y_train4 = np.loadtxt('y_train4_outlierTreatment.csv', dtype='int32', delimiter=',')
y_train4 = y_train4.reshape(-1,1)
x_test4 = np.loadtxt('x_test_fold4.csv', delimiter=',')
y_test4 = np.loadtxt('y_test_fold4.csv', dtype='int32', delimiter=',') 
y_test4 = y_test4.reshape(-1,1)

x_train5 = np.loadtxt('x_train5_outlierTreatment.csv', delimiter=',')
y_train5 = np.loadtxt('y_train5_outlierTreatment.csv', dtype='int32', delimiter=',')
y_train5 = y_train5.reshape(-1,1)
x_test5 = np.loadtxt('x_test_fold5.csv', delimiter=',')
y_test5 = np.loadtxt('y_test_fold5.csv', dtype='int32', delimiter=',') 
y_test5 = y_test5.reshape(-1,1)

### Working with the original dataset

No oversampling method is used

In [4]:
def Evaluate_Top100(model,x_train,y_train,x_test,y_test):
    '''
    This fuction fits a model with the original data and evaluate it with the top 100 alerts
    model to be used
    Returns the confusion matrix and performance measures over the top 100 alerts
    '''   
    clf = model.fit(x_train,np.ravel(y_train))

    # Prediction on the test dataset
    predicted = clf.predict(x_test)
    pred_prob = clf.predict_proba(x_test) 
    #selecting only probabilities of frauds
    pred_prob1 = pred_prob[:,1]
    
    #Sorting in descending order by the probability of class 1  
    pred_prob_sorted_idx = np.argsort(-pred_prob1)
    pred_prob_sorted_idx = pred_prob_sorted_idx[:100]
        
    y_pred_top100 =predicted[pred_prob_sorted_idx]
    y_test_top100 = y_test[pred_prob_sorted_idx]
  
    f  = round(f1_score(y_test_top100,y_pred_top100),6)  # f1: 2 tp / (2 tp + fp + fn)
    recall  = round(recall_score(y_test_top100,y_pred_top100),6) # recall: tp / (tp + fn)
    pre  = round(precision_score(y_test_top100,y_pred_top100),6) # precision: tp / (tp + fp)
    p, r, _ = precision_recall_curve(y_test_top100,y_pred_top100)

    auprc = round(auc(r, p),6) if not np.isnan(auc(r, p)) else None


    tn, fp, fn, tp = confusion_matrix(y_test_top100,y_pred_top100).ravel()

    results = {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp,'precision': pre, 'recall': recall, 
               'f1_score': f, 'auprc': auprc }

    return results

In [5]:
#Using multiprocessing to speed the running time
if __name__ == '__main__':
    
    start = time.time()
    
    # Setting the models to be used
    xgb = XGBClassifier(random_state=random_seed)   
    
    values = ((xgb , x_train1,y_train1,x_test1,y_test1), 
              (xgb , x_train2,y_train2,x_test2,y_test2), 
              (xgb , x_train3,y_train3,x_test3,y_test3),   
              (xgb , x_train4,y_train4,x_test4,y_test4),   
              (xgb , x_train5,y_train5,x_test5,y_test5)   
             )   
    pool = mp.Pool()
    p = pool.starmap(Evaluate_Top100, values)
    print(p) 
    df = pd.DataFrame(p, index =['fold 1', 'fold 2', 'fold 3', 'fold 4', 'fold 5'])
    print(df)
    xgb_df_mean = df.iloc[:,4:].mean()
    print(xgb_df_mean)
    end = time.time()
    total = round(end - start,1)
    print('Time taken = {} minutes'.format(total/60))

[{'tn': 11, 'fp': 3, 'fn': 9, 'tp': 77, 'precision': 0.9625, 'recall': 0.895349, 'f1_score': 0.927711, 'auprc': 0.973924}, {'tn': 16, 'fp': 4, 'fn': 7, 'tp': 73, 'precision': 0.948052, 'recall': 0.9125, 'f1_score': 0.929936, 'auprc': 0.965276}, {'tn': 20, 'fp': 4, 'fn': 2, 'tp': 74, 'precision': 0.948718, 'recall': 0.973684, 'f1_score': 0.961039, 'auprc': 0.971201}, {'tn': 14, 'fp': 7, 'fn': 8, 'tp': 71, 'precision': 0.910256, 'recall': 0.898734, 'f1_score': 0.904459, 'auprc': 0.944495}, {'tn': 15, 'fp': 6, 'fn': 6, 'tp': 73, 'precision': 0.924051, 'recall': 0.924051, 'f1_score': 0.924051, 'auprc': 0.954051}]
        tn  fp  fn  tp  precision    recall  f1_score     auprc
fold 1  11   3   9  77   0.962500  0.895349  0.927711  0.973924
fold 2  16   4   7  73   0.948052  0.912500  0.929936  0.965276
fold 3  20   4   2  74   0.948718  0.973684  0.961039  0.971201
fold 4  14   7   8  71   0.910256  0.898734  0.904459  0.944495
fold 5  15   6   6  73   0.924051  0.924051  0.924051  0.954051

## Traditional Oversampling Techniques

Evaluating Traditional Oversampling Techniques:
* Random Oversampling - ROS
* Synthetic Minority Over-sampling Technique - SMOTE
* BorderlineSMOTE
* ADASYN

In [6]:
def Oversampling_Evaluate_Top100(model,resample_tech,x_train,y_train,x_test,y_test):
    '''
    This fuction does an oversampling to balance the training set, then it 
    fits a model with the balanced data and evaluate the prediction with the top 100 
    risky transactions.
    model: the model to be used
    resample_tech : resample method to be used
    Returns the confusion matrix and performance measures over the top 100 alerts
    '''   
    # Perform resampling
    x_over, y_over = resample_tech.fit_resample(x_train,y_train)

    clf = model.fit(x_over,np.ravel(y_over))

    # Prediction on the test dataset
    predicted = clf.predict(x_test)
    pred_prob = clf.predict_proba(x_test) 
    #selecting only probabilities of frauds
    pred_prob1 = pred_prob[:,1]
    
    #Sorting in descending order by the probability of class 1  
    pred_prob_sorted_idx = np.argsort(-pred_prob1)
    pred_prob_sorted_idx = pred_prob_sorted_idx[:100]
        
    y_pred_top100 =predicted[pred_prob_sorted_idx]
    y_test_top100 = y_test[pred_prob_sorted_idx]
  
    f  = round(f1_score(y_test_top100,y_pred_top100),6)  # f1: 2 tp / (2 tp + fp + fn)
    recall  = round(recall_score(y_test_top100,y_pred_top100),6) # recall: tp / (tp + fn)
    pre  = round(precision_score(y_test_top100,y_pred_top100),6) # precision: tp / (tp + fp)
    p, r, _ = precision_recall_curve(y_test_top100,y_pred_top100)

    auprc = round(auc(r, p),6) if not np.isnan(auc(r, p)) else None


    tn, fp, fn, tp = confusion_matrix(y_test_top100,y_pred_top100).ravel()

    results = {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp,'precision': pre, 'recall': recall, 
               'f1_score': f, 'auprc': auprc }

    return results

**ROS**

In [7]:
#Using multiprocessing to speed the running time
if __name__ == '__main__':
    
    start = time.time()
    
    # Setting the models to be used
    ros = RandomOverSampler(random_state=random_seed)
    xgb = XGBClassifier(random_state=random_seed) 
    
    values = ((xgb, ros , x_train1,y_train1,x_test1,y_test1), 
              (xgb, ros , x_train2,y_train2,x_test2,y_test2), 
              (xgb, ros , x_train3,y_train3,x_test3,y_test3),   
              (xgb, ros , x_train4,y_train4,x_test4,y_test4),   
              (xgb, ros , x_train5,y_train5,x_test5,y_test5)   
             )   
    pool = mp.Pool()
    p = pool.starmap(Oversampling_Evaluate_Top100, values)
    print(p) 
    df = pd.DataFrame(p, index =['fold 1', 'fold 2', 'fold 3', 'fold 4', 'fold 5'])
    print(df)
    xgb_ros_df_mean = df.iloc[:,4:].mean()
    print(xgb_ros_df_mean)
    end = time.time()
    total = round(end - start,1)
    print('Time taken = {} minutes'.format(total/60))

[{'tn': 5, 'fp': 9, 'fn': 0, 'tp': 86, 'precision': 0.905263, 'recall': 1.0, 'f1_score': 0.950276, 'auprc': 0.952632}, {'tn': 14, 'fp': 7, 'fn': 4, 'tp': 75, 'precision': 0.914634, 'recall': 0.949367, 'f1_score': 0.931677, 'auprc': 0.952001}, {'tn': 15, 'fp': 9, 'fn': 1, 'tp': 75, 'precision': 0.892857, 'recall': 0.986842, 'f1_score': 0.9375, 'auprc': 0.94485}, {'tn': 14, 'fp': 8, 'fn': 6, 'tp': 72, 'precision': 0.9, 'recall': 0.923077, 'f1_score': 0.911392, 'auprc': 0.941538}, {'tn': 15, 'fp': 6, 'fn': 2, 'tp': 77, 'precision': 0.927711, 'recall': 0.974684, 'f1_score': 0.950617, 'auprc': 0.961197}]
        tn  fp  fn  tp  precision    recall  f1_score     auprc
fold 1   5   9   0  86   0.905263  1.000000  0.950276  0.952632
fold 2  14   7   4  75   0.914634  0.949367  0.931677  0.952001
fold 3  15   9   1  75   0.892857  0.986842  0.937500  0.944850
fold 4  14   8   6  72   0.900000  0.923077  0.911392  0.941538
fold 5  15   6   2  77   0.927711  0.974684  0.950617  0.961197
precision

**SMOTE**

In [8]:
#Using multiprocessing to speed the running time
if __name__ == '__main__':
    
    start = time.time()
    
    # Setting the models to be used
    sm = SMOTE(random_state=random_seed)
    xgb = XGBClassifier(random_state=random_seed) 
    
    values = ((xgb, sm  , x_train1,y_train1,x_test1,y_test1), 
              (xgb, sm  , x_train2,y_train2,x_test2,y_test2), 
              (xgb, sm  , x_train3,y_train3,x_test3,y_test3),   
              (xgb, sm  , x_train4,y_train4,x_test4,y_test4),   
              (xgb, sm  , x_train5,y_train5,x_test5,y_test5)   
             )   
    pool = mp.Pool()
    p = pool.starmap(Oversampling_Evaluate_Top100, values)
    print(p) 
    df = pd.DataFrame(p, index =['fold 1', 'fold 2', 'fold 3', 'fold 4', 'fold 5'])
    print(df)
    xgb_sm_df_mean = df.iloc[:,4:].mean()
    print(xgb_sm_df_mean)
    end = time.time()
    total = round(end - start,1)
    print('Time taken = {} minutes'.format(total/60))

[{'tn': 0, 'fp': 19, 'fn': 0, 'tp': 81, 'precision': 0.81, 'recall': 1.0, 'f1_score': 0.895028, 'auprc': 0.905}, {'tn': 0, 'fp': 22, 'fn': 0, 'tp': 78, 'precision': 0.78, 'recall': 1.0, 'f1_score': 0.876404, 'auprc': 0.89}, {'tn': 0, 'fp': 26, 'fn': 0, 'tp': 74, 'precision': 0.74, 'recall': 1.0, 'f1_score': 0.850575, 'auprc': 0.87}, {'tn': 0, 'fp': 26, 'fn': 0, 'tp': 74, 'precision': 0.74, 'recall': 1.0, 'f1_score': 0.850575, 'auprc': 0.87}, {'tn': 0, 'fp': 21, 'fn': 0, 'tp': 79, 'precision': 0.79, 'recall': 1.0, 'f1_score': 0.882682, 'auprc': 0.895}]
        tn  fp  fn  tp  precision  recall  f1_score  auprc
fold 1   0  19   0  81       0.81     1.0  0.895028  0.905
fold 2   0  22   0  78       0.78     1.0  0.876404  0.890
fold 3   0  26   0  74       0.74     1.0  0.850575  0.870
fold 4   0  26   0  74       0.74     1.0  0.850575  0.870
fold 5   0  21   0  79       0.79     1.0  0.882682  0.895
precision    0.772000
recall       1.000000
f1_score     0.871053
auprc        0.886000


**Bordeline SMOTE**

In [9]:
#Using multiprocessing to speed the running time
if __name__ == '__main__':
    
    start = time.time()
    
    # Setting the models to be used
    blSMOTE = BorderlineSMOTE(random_state=random_seed)
    xgb = XGBClassifier(random_state=random_seed) 
    
    values = ((xgb, blSMOTE , x_train1,y_train1,x_test1,y_test1), 
              (xgb, blSMOTE , x_train2,y_train2,x_test2,y_test2), 
              (xgb, blSMOTE , x_train3,y_train3,x_test3,y_test3),   
              (xgb, blSMOTE , x_train4,y_train4,x_test4,y_test4),   
              (xgb, blSMOTE , x_train5,y_train5,x_test5,y_test5)   
             )   
    pool = mp.Pool()
    p = pool.starmap(Oversampling_Evaluate_Top100, values)
    print(p) 
    df = pd.DataFrame(p, index =['fold 1', 'fold 2', 'fold 3', 'fold 4', 'fold 5'])
    print(df)
    xgb_blSMOTE_df_mean = df.iloc[:,4:].mean()
    print(xgb_blSMOTE_df_mean)
    end = time.time()
    total = round(end - start,1)
    print('Time taken = {} minutes'.format(total/60))

[{'tn': 3, 'fp': 13, 'fn': 0, 'tp': 84, 'precision': 0.865979, 'recall': 1.0, 'f1_score': 0.928177, 'auprc': 0.93299}, {'tn': 7, 'fp': 13, 'fn': 1, 'tp': 79, 'precision': 0.858696, 'recall': 0.9875, 'f1_score': 0.918605, 'auprc': 0.928098}, {'tn': 11, 'fp': 14, 'fn': 0, 'tp': 75, 'precision': 0.842697, 'recall': 1.0, 'f1_score': 0.914634, 'auprc': 0.921348}, {'tn': 12, 'fp': 11, 'fn': 1, 'tp': 76, 'precision': 0.873563, 'recall': 0.987013, 'f1_score': 0.926829, 'auprc': 0.935288}, {'tn': 12, 'fp': 12, 'fn': 1, 'tp': 75, 'precision': 0.862069, 'recall': 0.986842, 'f1_score': 0.920245, 'auprc': 0.929456}]
        tn  fp  fn  tp  precision    recall  f1_score     auprc
fold 1   3  13   0  84   0.865979  1.000000  0.928177  0.932990
fold 2   7  13   1  79   0.858696  0.987500  0.918605  0.928098
fold 3  11  14   0  75   0.842697  1.000000  0.914634  0.921348
fold 4  12  11   1  76   0.873563  0.987013  0.926829  0.935288
fold 5  12  12   1  75   0.862069  0.986842  0.920245  0.929456
preci

**ADASYN**

In [10]:
#Using multiprocessing to speed the running time
if __name__ == '__main__':
    
    start = time.time()
    
    # Setting the models to be used
    adasyn = ADASYN(random_state=random_seed)
    xgb = XGBClassifier(random_state=random_seed) 
    
    values = ((xgb, adasyn , x_train1,y_train1,x_test1,y_test1), 
              (xgb, adasyn , x_train2,y_train2,x_test2,y_test2), 
              (xgb, adasyn , x_train3,y_train3,x_test3,y_test3),   
              (xgb, adasyn , x_train4,y_train4,x_test4,y_test4),   
              (xgb, adasyn , x_train5,y_train5,x_test5,y_test5)   
             )   
    pool = mp.Pool()
    p = pool.starmap(Oversampling_Evaluate_Top100, values)
    print(p) 
    df = pd.DataFrame(p, index =['fold 1', 'fold 2', 'fold 3', 'fold 4', 'fold 5'])
    print(df)
    xgb_adasyn_df_mean = df.iloc[:,4:].mean()
    print(xgb_adasyn_df_mean)
    end = time.time()
    total = round(end - start,1)
    print('Time taken = {} minutes'.format(total/60))

[{'tn': 0, 'fp': 22, 'fn': 0, 'tp': 78, 'precision': 0.78, 'recall': 1.0, 'f1_score': 0.876404, 'auprc': 0.89}, {'tn': 0, 'fp': 22, 'fn': 0, 'tp': 78, 'precision': 0.78, 'recall': 1.0, 'f1_score': 0.876404, 'auprc': 0.89}, {'tn': 0, 'fp': 25, 'fn': 0, 'tp': 75, 'precision': 0.75, 'recall': 1.0, 'f1_score': 0.857143, 'auprc': 0.875}, {'tn': 0, 'fp': 24, 'fn': 0, 'tp': 76, 'precision': 0.76, 'recall': 1.0, 'f1_score': 0.863636, 'auprc': 0.88}, {'tn': 0, 'fp': 25, 'fn': 0, 'tp': 75, 'precision': 0.75, 'recall': 1.0, 'f1_score': 0.857143, 'auprc': 0.875}]
        tn  fp  fn  tp  precision  recall  f1_score  auprc
fold 1   0  22   0  78       0.78     1.0  0.876404  0.890
fold 2   0  22   0  78       0.78     1.0  0.876404  0.890
fold 3   0  25   0  75       0.75     1.0  0.857143  0.875
fold 4   0  24   0  76       0.76     1.0  0.863636  0.880
fold 5   0  25   0  75       0.75     1.0  0.857143  0.875
precision    0.764000
recall       1.000000
f1_score     0.866146
auprc        0.882000
