In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
import math
import warnings
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC


from IPython.display import display

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier

from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2, f_classif

from timeit import default_timer as timer

import pydotplus
from sklearn import tree
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

In [3]:
# Upload dataset
base_fraud_data = pd.read_csv('creditcard.csv').dropna()

# Sample smaller chunks to speed up testing models
half_data = base_fraud_data.sample(frac=.5, random_state=2)
quarter_data = base_fraud_data.sample(frac=.25, random_state=2)
tenth_data = base_fraud_data.sample(frac=.1, random_state=2)


In [4]:
# create a report function that can be used for any model

def accuracy_report(testing_X, testing_Y, model):
    predictions = model.predict(testing_X)
    print('Model score:')
    print(model.score(testing_X, testing_Y))
    print(" ")
    print('Confusion Matrix (UL: True Negative, UR: False Positive, BL: False Negative, BR: True Positive)')
    print(confusion_matrix(testing_Y.values, predictions))
    print(" ")
    print('Accuracy:')
    print(accuracy_score(testing_Y.values, predictions))
    print(" ")
    print('Recall/Sensitivity (% of late arrivals predicted):')
    print(recall_score(testing_Y.values, predictions))
    print(" ")
    print('Precision (% of on-time arrivals predicted accurately):')
    print(precision_score(testing_Y.values, predictions))
    print(" ")
    auc = roc_auc_score(testing_Y.values, predictions)
    print('AUC score:%.3f'% auc)
    print(" ")
    print('Model cross-valuation:')
    print(sklearn.model_selection.cross_val_score(model, testing_X, testing_Y, cv = 5))
    return

In [5]:
working_df = base_fraud_data
training_fraction = .8

training_X = working_df.sample(frac=training_fraction, random_state=10)
testing_X = working_df.drop(training_X.index)

# separate the Class feature out into Y datasets
training_Y = training_X['Class']
testing_Y = testing_X['Class']

# dropping the Class feature from the X datasets so that the model isn't able to cheat
training_X.drop('Class', axis=1, inplace=True)
testing_X.drop('Class', axis=1, inplace=True)
print(" ")

 


In [8]:
training_X.head(1)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
168367,119191.0,2.052,-0.133,-1.741,0.278,0.338,-0.827,0.335,-0.227,0.268,0.294,0.428,0.504,-0.989,0.762,-0.985,-0.296,-0.337,-0.232,0.544,-0.269,-0.007,0.121,0.038,-0.356,0.148,0.569,-0.104,-0.092,8.73


In [26]:
# Gridsearch CV for the KNN algorithm

neighbors = KNeighborsClassifier(weights='uniform', n_neighbors=6, algorithm= 'ball_tree')

grid_param_KNN = {
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree'],
    'n_neighbors' : [1,2,3,4,5,6,7]
}

grid_search_KNN = GridSearchCV(estimator = neighbors,  
                              param_grid = grid_param_KNN,
                              scoring = 'neg_mean_squared_error',
                              cv = 5)

grid_search_KNN.fit(testing_X, testing_Y)
print('KNN recommended parameters:')
print(grid_search_KNN.best_params_)
print(' ')

KNN recommended parameters:
{'algorithm': 'ball_tree', 'n_neighbors': 2, 'weights': 'uniform'}
 


In [27]:
# Gridsearch CV for the Random Forest algorithm
rfc = ensemble.RandomForestClassifier(max_depth = None, max_features = 0.3, n_estimators = 5)

grid_param_RFC = {
    'max_features': [.1,.3,.5,.7,.9,'sqrt','log2'],
    'max_depth': [None,1,2,3,4,5],
    'n_estimators' : [2,5,8,9,10,11,12,15,18,20]
}

grid_search_RFC = GridSearchCV(estimator = rfc,  
                              param_grid = grid_param_RFC,
                              scoring = 'neg_mean_squared_error',
                              cv = 5)

grid_search_RFC.fit(testing_X, testing_Y)
print('Random Forest recommended parameters:')
print(grid_search_RFC.best_params_)
print(' ')

KNN recommended parameters:
{'max_depth': None, 'max_features': 0.3, 'n_estimators': 5}
 


In [8]:
# Gridsearch CV for the Logistic Regression algorithm

lr = LogisticRegression(penalty='l1', C=.9, solver='liblinear')

grid_param_LR = {
    'penalty': ['l1', 'l2'],
    'C': [1,.1,.3,.5,.7,.9]
#   , 'solver' : ['liblinear','saga']
# Repeated warnings about iter limits being exceeded resulted from including the previous line in the code
}

grid_search_LR = GridSearchCV(estimator = lr,  
                              param_grid = grid_param_LR,
                              scoring = 'neg_mean_squared_error',
                              cv = 5)

grid_search_LR.fit(testing_X, testing_Y)
print('Logistic Regression recommended parameters:')
print(grid_search_LR.best_params_)
print(' ')

Logistic Regression recommended parameters:
{'C': 1, 'penalty': 'l1'}
 


In [None]:
# Gridsearch CV for the Gradient Boosting Classifier algorithm
# Warning, this takes a LONG time to run...

clf = ensemble.GradientBoostingClassifier(n_estimators=100,max_depth=2)

grid_param_GBC = {
    'n_estimators': [100,75,200],
    'learning_rate': [.1,.08,.12], 
    'criterion' : ['friedman_mse','mse','mae'],
    'max_depth': [1,2,3,4,5]
}

grid_search_GBC = GridSearchCV(estimator = clf,  
                              param_grid = grid_param_GBC,
                              scoring = 'neg_mean_squared_error',
                              cv = 5)

grid_search_GBC.fit(testing_X, testing_Y)
print('Gradient Boosting Classifier recommended parameters:')
print(grid_search_GBC.best_params_)
print(' ')

In [None]:
# Gridsearch CV for the SVM algorithm
# Warning, this takes a LONG time to run...

svm = SVC(kernel = 'linear')

grid_param_SVM = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1, 'auto']
}

grid_search_SVM = GridSearchCV(estimator = clf,  
                              param_grid = grid_param_SVM,
                              scoring = 'neg_mean_squared_error',
                              cv = 5)

grid_search_SVM.fit(testing_X, testing_Y)
print('Gradient Boosting Classifier recommended parameters:')
print(grid_search_SVM.best_params_)
print(' ')


In [69]:
# Gridsearch CV for the Decision Tree algorithm
dtree = tree.DecisionTreeClassifier()

grid_param_DT = {
    'max_features': [None, 'auto', 'sqrt', 'log2', 2, .1,.3,.5,.7,.9], 
    'criterion' : ['entropy','gini'],
    'max_depth': [1,2,3,4,5]
}

grid_search_DT = GridSearchCV(estimator = dtree,  
                              param_grid = grid_param_DT,
                              scoring = 'neg_mean_squared_error',
                              cv = 5)

grid_search_DT.fit(testing_X, testing_Y)
print('Decision Tree Classifier recommended parameters:')
print(grid_search_DT.best_params_)
print(' ')


Decision Tree Classifier recommended parameters:
{'criterion': 'entropy', 'max_depth': 3, 'max_features': 0.7}
 


In [51]:
# KNN
neighbors = KNeighborsClassifier(weights='uniform', n_neighbors=2, algorithm= 'ball_tree')
neighbors.fit(training_X,training_Y)
print('KNN results:')
print(' ')
accuracy_report(testing_X, testing_Y, neighbors)

KNN results:
 
Model score:
0.9984375274310493
 
Confusion Matrix (UL: True Negative, UR: False Positive, BL: False Negative, BR: True Positive)
[[56863     1]
 [   88     9]]
 
Accuracy:
0.9984375274310493
 
Recall/Sensitivity (% of late arrivals predicted):
0.09278350515463918
 
Precision (% of on-time arrivals predicted accurately):
0.9
 
AUC score:0.546
 
Model cross-valuation:
[0.03941016 0.2878083  0.64870084 0.99833216 0.99833202]


In [52]:
# RFC
rfc = ensemble.RandomForestClassifier(max_depth = None, max_features = 0.3, n_estimators = 5)
rfc.fit(training_X,training_Y)
print('RFC results:')
print(' ')
accuracy_report(testing_X, testing_Y, rfc)

RFC results:
 
Model score:
0.9995084355962852
 
Confusion Matrix (UL: True Negative, UR: False Positive, BL: False Negative, BR: True Positive)
[[56852    12]
 [   16    81]]
 
Accuracy:
0.9995084355962852
 
Recall/Sensitivity (% of late arrivals predicted):
0.8350515463917526
 
Precision (% of on-time arrivals predicted accurately):
0.8709677419354839
 
AUC score:0.917
 
Model cross-valuation:
[0.99526025 0.99929781 0.99912219 0.99903441 0.99938548]


In [53]:
# Logistic Regression
lr = LogisticRegression(penalty='l1', C=.9)
lr.fit(training_X,training_Y)
print('Logistic Regression results:')
print(' ')
accuracy_report(testing_X, testing_Y, lr)



Logistic Regression results:
 
Model score:
0.9991046505503766
 
Confusion Matrix (UL: True Negative, UR: False Positive, BL: False Negative, BR: True Positive)
[[56848    16]
 [   35    62]]
 
Accuracy:
0.9991046505503766
 
Recall/Sensitivity (% of late arrivals predicted):
0.6391752577319587
 
Precision (% of on-time arrivals predicted accurately):
0.7948717948717948
 
AUC score:0.819
 
Model cross-valuation:




[0.99859563 0.99929781 0.99894663 0.99868329 0.99885875]


In [54]:
# Gradient Boosting Classifier

clf = ensemble.GradientBoostingClassifier(n_estimators=300,max_depth=2)
clf.fit(training_X,training_Y)
print('GBC results:')
print(' ')
accuracy_report(testing_X, testing_Y, clf)

GBC results:
 
Model score:
0.9989466477063254
 
Confusion Matrix (UL: True Negative, UR: False Positive, BL: False Negative, BR: True Positive)
[[56855     9]
 [   51    46]]
 
Accuracy:
0.9989466477063254
 
Recall/Sensitivity (% of late arrivals predicted):
0.4742268041237113
 
Precision (% of on-time arrivals predicted accurately):
0.8363636363636363
 
AUC score:0.737
 
Model cross-valuation:
[0.28824717 0.99956113 0.99806882 0.99894663 0.99929769]


In [57]:
# SVM

svm = SVC(kernel = 'linear')
svm.fit(training_X,training_Y)
print('SVM results:')
print(' ')
accuracy_report(testing_X, testing_Y, svm)

SVM results:
 
Model score:
0.998490195045733
 
Confusion Matrix (UL: True Negative, UR: False Positive, BL: False Negative, BR: True Positive)
[[56839    25]
 [   61    36]]
 
Accuracy:
0.998490195045733
 
Recall/Sensitivity (% of late arrivals predicted):
0.3711340206185567
 
Precision (% of on-time arrivals predicted accurately):
0.5901639344262295
 
AUC score:0.685
 
Model cross-valuation:
[0.9878873  0.99877118 0.99833216 0.99833216 0.99833202]


In [6]:
# Decision Tree

dtree = tree.DecisionTreeClassifier(criterion='entropy', max_features=.7, max_depth=3)
dtree.fit(training_X,training_Y)
print('Decision Tree results:')
print(' ')
accuracy_report(testing_X, testing_Y, dtree)

Decision Tree results:
 
Model score:
0.999280209265989
 
Confusion Matrix (UL: True Negative, UR: False Positive, BL: False Negative, BR: True Positive)
[[56838    26]
 [   15    82]]
 
Accuracy:
0.999280209265989
 
Recall/Sensitivity (% of late arrivals predicted):
0.845360824742268
 
Precision (% of on-time arrivals predicted accurately):
0.7592592592592593
 
AUC score:0.922
 
Model cross-valuation:
[0.99833231 0.99885895 0.99920997 0.99912219 0.9992099 ]


In [56]:
# Naive Bayes

bnb = BernoulliNB()
bnb.fit(training_X,training_Y)
print('Naive Bayes results:')
print(' ')
accuracy_report(testing_X, testing_Y, bnb)

Naive Bayes results:
 
Model score:
0.9989993153210092
 
Confusion Matrix (UL: True Negative, UR: False Positive, BL: False Negative, BR: True Positive)
[[56839    25]
 [   32    65]]
 
Accuracy:
0.9989993153210092
 
Recall/Sensitivity (% of late arrivals predicted):
0.6701030927835051
 
Precision (% of on-time arrivals predicted accurately):
0.7222222222222222
 
AUC score:0.835
 
Model cross-valuation:
[0.99824454 0.99964891 0.99885885 0.99885885 0.99947327]


#### KNN
Recall: 0.092

AUC: .546

#### Random Forest Classifier
Recall: .794

AUC: .897

#### Logistic Regression
Recall: .639

AUC: .819

#### Gradient Boosted Classifier
Recall: .474

AUC: .737

#### Linear SVM
Recall: .371

AUC: .685

#### Decision Tree
Recall: .84

AUC: .922

#### Naive Bayes
Recall: .67

AUC: .835

Random Forest and Decision Tree algorithms resulted in moderately high accuracy within the bounds of the dataset provided.  Naive Bayes and Logistic Regression resulted in mediocre but still above-average accuracy.  All other algorithms result in unacceptably high false negative results.  

Random Forest and Decision Tree both benefitted from being ensemble models, which was likely what gave them the ability to figure out the signs that a transaction was fraudulent.  

Logistic Regression benefitted from the fact that it has a built-in mathematical boost towards modeling binary solutions.  However, judging by the fact that Naive Bayes, the simplest model in this selection, performed better than all the others except for RFC and DT, there's likely a relatively simple means of calculating whether a transaction is fraudulent, but the more complicated models (except for DT & RF) got distracted by other variables, lowering their accuracy.