In [26]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

%matplotlib inline
import time
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('creditcard.csv')
df.sample(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
23286,32673.0,-0.529618,0.725155,1.047404,0.740384,0.263997,-0.367547,0.616825,0.124157,-0.576075,...,0.16317,0.418526,-0.067885,-0.024396,-0.201015,-0.311315,0.38373,0.216041,46.86,0
140506,83755.0,1.18785,-0.412885,1.134586,0.117393,-1.11064,-0.022826,-0.832832,0.22768,1.211235,...,-0.104282,-0.09323,0.106181,0.127574,0.014481,1.023644,-0.021363,0.009866,1.0,0
207450,136679.0,1.94664,-0.325712,-0.966408,0.389814,-0.106747,-0.359471,-0.159013,-0.081225,0.495162,...,0.01082,0.090104,0.115069,-0.462295,-0.172069,0.231497,-0.048982,-0.061179,42.9,0
210766,138114.0,1.954918,-0.14338,-1.808844,0.30712,0.485033,-0.673555,0.474469,-0.250511,0.281957,...,0.074285,0.292888,0.018333,0.797739,0.335862,-0.282048,-0.052115,-0.061012,49.99,0
277257,167552.0,0.629452,-0.707467,0.629494,-1.942313,-0.772135,-0.575656,-0.093094,-0.488101,-1.615828,...,-0.042167,0.632069,0.092358,0.018898,-1.64259,-0.296532,-0.146872,-0.180971,10.0,0


In [5]:
X = df.drop(['Class'], axis=1)
y = df[['Class']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print("Train: {}\nTest: {}".format(X_train.shape, X_test.shape))

In [22]:
# Establish and fit the model, with a single, 1000 perceptron layer.
start_time = time.time()

mlp = MLPClassifier(hidden_layer_sizes=(1000, 100, 10), verbose=False, random_state=42)
mlp.fit(X_train, y_train)

print("Took %s seconds" % (time.time() - start_time))

Took 696.9698178768158 seconds


In [23]:
start_time = time.time()
mlp_results = cross_val_score(mlp, X_train, y_train, cv=3)
print(mlp_results)
print("3-fold cross validation average accuracy: %.4f" % (mlp_results.mean()))
print("Took %s seconds" % (time.time() - start_time))

[0.99593148 0.99827514 0.99827514]
3-fold cross validation average accuracy: 0.9975
Took 1187.7164778709412 seconds


In [27]:
start_time = time.time()
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_results = cross_val_score(rfc, X_train, y_train, cv=3)
print(rfc_results)
print("3-fold cross validation average accuracy: %.4f" % (rfc_results.mean()))
print("Took %s seconds" % (time.time() - start_time))

[0.99947333 0.99942066 0.99957866]
3-fold cross validation average accuracy: 0.9995
Took 52.73906207084656 seconds


In [28]:
y_pred_rfc = rfc.predict(X_test)
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.76      0.85        98

   micro avg       1.00      1.00      1.00     56962
   macro avg       0.99      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962



In [30]:
y_pred_mlp = mlp.predict(X_test)
print(classification_report(y_test, y_pred_mlp))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.00      0.00      0.00        98

   micro avg       1.00      1.00      1.00     56962
   macro avg       0.50      0.50      0.50     56962
weighted avg       1.00      1.00      1.00     56962



**Conclusion**  
The simple perceptron neural network predicts credit card fraud (classification) with near perfect accuracy (99.8%), while a random forest classifier (with no parameter tuning) achieves an accuracy of 80.5%. However, it is important to note that random forests are less prone to overfitting than the 3-layer perceptron network, and for this amount of data (~300k) training a perceptron network took about 30 minutes (with 3 fold cross-validation) while training the random forest model took less than 2 minutes. 