In [1]:
# Core Libraries:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
from io import BytesIO
from zipfile import ZipFile
import requests

# Modelling Libraries:

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Models:

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from keras.models import Model, load_model
from keras.layers import Input, Dense

In [2]:
# Loading Credit Card Fraud Dataset:

url_fraud = 'https://github.com/manugaco/ML_User_Cases/blob/master/Datasets/fraud.zip?raw=true'
zip_fraud = ZipFile(BytesIO(requests.get(url_fraud).content), 'r')
df_fraud = pd.read_csv(zip_fraud.open(zip_fraud.namelist()[0]), low_memory=False)

In [3]:
df_fraud.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df_fraud['Class'].value_counts().reset_index().apply(lambda x: x/x.sum())

Unnamed: 0,index,Class
0,0.0,0.998273
1,1.0,0.001727


In [5]:
df_fraud = df_fraud.drop([ 'Time'], axis = 1)
df_fraud['Amount'] = StandardScaler().fit_transform(df_fraud['Amount'].values.reshape(-1, 1))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_fraud.drop('Class',1) , df_fraud['Class'], test_size=0.2, random_state=0)

In [7]:
y_test.value_counts().reset_index().apply(lambda x: x/x.sum())

Unnamed: 0,index,Class
0,0.0,0.998227
1,1.0,0.001773


In [8]:
y_train.value_counts().reset_index().apply(lambda x: x/x.sum())

Unnamed: 0,index,Class
0,0.0,0.998284
1,1.0,0.001716


In [9]:
X_train_nofraud = X_train[y_train==0]
X_train_fraud = X_train[y_train==1]

In [10]:
input_layer = Input(shape=(X_train.shape[1], ))
encoded = Dense(10, activation='relu')(input_layer)
decoded = Dense(X_train.shape[1], activation='relu')(encoded)
autoencoder = Model(input_layer,decoded)

In [11]:
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [12]:
autoencoder.fit(X_train_nofraud, X_train_nofraud, epochs = 10, batch_size=128, validation_data=(X_train_nofraud,X_train_nofraud))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe8d3691610>

In [13]:
predictions = autoencoder.predict(X_train)
mse = np.mean(np.power(X_train - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse,
                        'true_class': y_train})
error_df.groupby('true_class').describe()

Unnamed: 0_level_0,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
true_class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,227454.0,0.745711,3.232892,0.038295,0.256246,0.405271,0.637952,512.212162
1,391.0,28.612698,40.956755,0.11065,4.598432,10.198034,27.414136,287.808595


In [14]:
test_predictions = autoencoder.predict(X_test)
mse = np.mean((X_test - test_predictions)**2, axis=1)
y_pred = [(lambda er: 1 if er>=11.078922  else 0)(er) for er in mse]

In [15]:
acc = metrics.accuracy_score(y_test, y_pred)
print('Accuracy score =', acc)
rec = metrics.recall_score(y_test, y_pred)
print('Recall score =', rec)
prec  =metrics.precision_score(y_test, y_pred)
print('Precision score =', prec)
f1 = metrics.f1_score(y_test, y_pred)
print('F1 score =', f1)

Accuracy score = 0.9943119974719988
Recall score = 0.5247524752475248
Precision score = 0.16109422492401215
F1 score = 0.24651162790697675
