In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from sklearn.preprocessing import StandardScaler
import uuid 

import adm_load as load
import adm_preproc as pre
import adm_datagen as dg

RANDOM_SEED = 498
# load all transaction data from csv
df = load.transactions()


Using TensorFlow backend.
  if self.run_code(code, result):


In [2]:
sus_idx = load.get_sus(df)
fraud_idx = load.get_fraud(df)
normal_idx = load.get_norm(df)

In [3]:
amts_sus = dg.gen_post_amounts(df,sus_idx)
df.update(amts_sus)

In [4]:

n_sus = len(sus_idx.tolist())
ones_flags = np.ones(n_sus, dtype='int64')
sus_ff = pd.DataFrame({'FraudFlag': ones_flags},index=sus_idx)
df.update(sus_ff)

In [5]:
dfn = pre.preproc(df)
dfn['POSTAMOUNT'] = StandardScaler().fit_transform(dfn['POSTAMOUNT'].values.reshape(-1, 1))

In [6]:
dfn.describe()

Unnamed: 0,PROCESSORACCOUNT,POSTAMOUNT,POSTSUCCESS,HOLDACTION,DUPLICATEFLAG,TRANSACTIONCODE,RESPONSECODEOUT,REVERSALFLAG,ADJUSTMENTFLAG,ORIGINALAMOUNT,TRANSUBTYPE,OURPREAUTHCODE,OURRESPONSECODE,FORCEPOST,CURRENCYAMOUNT,ISAFEEAMOUNT,CASHBACKAMOUNT,FraudFlag
count,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0,90017.0
mean,240.864514,-2.525897e-18,0.555128,1.532111,1.1e-05,31941.382606,0.19613,0.005754,0.010709,0.429333,9.533477,1.530411,30.367331,0.401169,0.39155,0.008103,0.320509,0.000689
std,139.778094,1.000006,0.496954,1.234365,0.003333,140531.938709,7.360145,0.07564,0.10293,14.787008,4.441568,1.206131,242.753371,0.490138,117.476106,0.30841,3.869722,0.026235
min,3.0,-3.930804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.95,0.0,0.0
25%,128.0,-0.05182492,0.0,1.0,0.0,2000.0,0.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,241.0,-0.04343282,1.0,1.0,0.0,2000.0,0.0,0.0,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,357.0,-0.02236279,1.0,3.0,0.0,3000.0,0.0,0.0,0.0,0.0,13.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0
max,492.0,142.6454,1.0,6.0,1.0,911000.0,2012.0,1.0,1.0,2233.09,13.0,3.0,2076.0,1.0,35246.16,58.53,300.0,1.0


In [7]:
# split the data into training set (80%) and test set (20%)

normal = dfn.loc[dfn.FraudFlag == 0]
frauds = dfn.loc[dfn.FraudFlag == 1]

X_train, X_test = train_test_split(normal, test_size=0.2, random_state=RANDOM_SEED)
X_train, X_valid = train_test_split(X_train, test_size=0.2, random_state=RANDOM_SEED)

In [8]:
# add the fraud cases to test set and validation set
X_test = X_test.append(frauds)[0:30]
X_valid = X_valid.append(frauds)[30:]

# drop the class feature on the test data
y_test = X_test['FraudFlag']
y_valid = X_valid['FraudFlag']

X_train = X_train.drop(['FraudFlag'], axis=1)
X_test = X_test.drop(['FraudFlag'], axis=1)
X_valid = X_valid.drop(['FraudFlag'], axis=1)



In [9]:
X_train = X_train.values
X_test = X_test.values
X_valid = X_valid.values




In [10]:
input_dim = X_train.shape[1] # number of features
input_dim 


17

In [11]:
# num neurons in first encoding layer
encoding_dim = 12 

# input layer
input_layer = Input(shape=(input_dim, ))

# 12 neuron encoding layer
encoder = Dense(encoding_dim, activation='tanh',
                activity_regularizer=regularizers.l1(10e-5))(input_layer)
# 6 neuron encoding layer
encoder = Dense(int(encoding_dim / 2), activation='relu')(encoder)

# 6 neuron decoding layer
decoder = Dense(int(encoding_dim / 2), activation='tanh')(encoder)

# 17 layer decoding layer (output layer, predicts all features)
decoder = Dense(input_dim, activation='relu')(decoder)


In [14]:

# put it all together
autoencoder = Model(inputs=input_layer, outputs=decoder)

model_id = str(uuid.uuid4())
model_filepath = './checkpoints/autoencoder_' + model_id + '.h5'



In [15]:

nb_epoch = 10 # number of iterations over the entire training data
batch_size = 32 # number of samples per gradient update


In [16]:
autoencoder.compile(optimizer='adam',
                    loss='mean_squared_error',
                    metrics=['accuracy'])


In [17]:
checkpointer = ModelCheckpoint(filepath='./checkpoints/autoencoder.h5',
                               verbose=0,
                               save_best_only=True)

In [18]:
tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

In [19]:
# fit the model to itself
history = autoencoder.fit(X_train, X_train,
                         epochs=nb_epoch,
                         batch_size=batch_size,
                         shuffle=True,
                         validation_data=(X_valid,X_valid),
                         verbose=1,
                         callbacks=[checkpointer, tensorboard]).history

Train on 57571 samples, validate on 14425 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
history_filepath = './checkpoints/autoencoder_history_' + model_id
with open(history_filepath, 'wb') as file_pi:
    pickle.dump(history, file_pi)

In [None]:
%matplotlib inline

sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 14, 8

RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

In [None]:
plt.plot(history['loss'])
plt.plot(history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper right');

In [None]:
predictions = autoencoder.predict(X_test)

In [None]:
mse = np.mean(np.power(X_test - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse,
                        'true_class': y_test})

In [None]:
error_df.describe()


In [None]:
# reconstruct error without fraud

fig = plt.figure()
ax = fig.add_subplot(111)
normal_error_df = error_df[(error_df['true_class']== 0)]
_ = ax.hist(normal_error_df.reconstruction_error.values, bins=10)

In [None]:

fig = plt.figure()
ax = fig.add_subplot(111)
fraud_error_df = error_df[error_df['true_class'] == 1]
_ = ax.hist(fraud_error_df.reconstruction_error.values, bins=10)

In [None]:
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

In [None]:
precision, recall, th = precision_recall_curve(error_df.true_class, error_df.reconstruction_error)
plt.plot(recall, precision, 'b', label='Precision-Recall curve')
plt.title('Recall vs Precision')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [None]:
plt.plot(th, precision[1:], 'b', label='Threshold-Precision curve')
plt.title('Precision for different threshold values')
plt.xlabel('Threshold')
plt.ylabel('Precision')
plt.show()

In [None]:
threshold = 2.9

groups = error_df.groupby('true_class')
fig, ax = plt.subplots()

for name, group in groups:
    ax.plot(group.index, group.reconstruction_error, marker='o', ms=3.5, linestyle='',
            label= "Fraud" if name == 1 else "Normal")
ax.hlines(threshold, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
ax.legend()
plt.title("Reconstruction error for different classes")
plt.ylabel("Reconstruction error")
plt.xlabel("Data point index")
plt.ylim([])
plt.show();

In [None]:
error_df.reconstruction_error


In [None]:
y_pred = [1 if e > threshold else 0 for e in error_df.reconstruction_error.values]
conf_matrix = confusion_matrix(error_df.true_class, y_pred)

plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

In [27]:
df.loc['POSTAMOUNT'] = np.log(np.abs(df['POSTAMOUNT'] + 1))

In [28]:
neg = df.index[df['POSTAMOUNT'] < 0]
df

Unnamed: 0,PROCESSORACCOUNT,OURACCOUNT,POSTAMOUNT,POSTSUCCESS,HOLDACTION,DUPLICATEFLAG,TRANSACTIONCODE,RESPONSECODEIN,RESPONSECODEOUT,AUTHIDRESPONSE,...,CCAFEEAMOUNT,ICAFEEAMOUNT,CASHBACKAMOUNT,CARDACCEPTORCITY,CARDACCEPTORCOUNTRY,CARDACCEPTORIDCODE,CARDACCEPTORNAME,CARDACCEPTORSTATE,CARDACCEPTORSTREET,FraudFlag
0,4.888889e+15,3.243617e+09,3.931826,0.0,0.0,0.0,2000.0,,2012.0,012461,...,0.0,0.0,0.0,FORT WRIGHT,US,7660002867,WAL Wal-Mart Store 441780,KY,3450 VALLEY PLAZA,1.0
1,4.888884e+15,6.585870e+09,3.931826,1.0,0.0,0.0,2000.0,,0.0,012461,...,0.0,0.0,0.0,FORT WRIGHT,US,7660002867,WAL Wal-Mart Store 441780,KY,3450 VALLEY PLAZA,1.0
2,4.888887e+15,1.898540e+09,6.061457,0.0,1.0,0.0,3000.0,,0.0,042316,...,0.0,0.0,0.0,DEWITT,US,4445000548644,US MEIJER #209,MI,MEIJER #209,1.0
3,4.888887e+15,1.898540e+09,5.257704,0.0,1.0,0.0,3000.0,,0.0,042316,...,0.0,0.0,0.0,DEWITT,US,4445000548644,US MEIJER #209,MI,MEIJER #209,1.0
4,4.888887e+15,1.898540e+09,3.574590,0.0,1.0,0.0,3000.0,,0.0,042316,...,0.0,0.0,0.0,MASON,US,542929809022346,US SPEEDWAY 02234 N C,MI,SPEEDWAY 02234 N C,1.0
5,4.888889e+15,3.243617e+09,4.615121,1.0,0.0,0.0,12000.0,,0.0,050544,...,0.0,0.0,0.0,EAST LANSING,US,MICHIGAN STATE,MICHIGAN STATE UNIV FCU,MI,3775 COOLIDGE ROAD,0.0
6,4.888889e+15,3.243617e+09,3.713572,1.0,0.0,0.0,12000.0,,0.0,082055,...,0.0,0.0,0.0,HOLT,US,CO-OP NETWORK,7ELEVEN-FC,MI,1997 AURELIUS RD,0.0
7,4.888889e+15,3.243617e+09,5.707110,1.0,0.0,0.0,12000.0,,0.0,064202,...,0.0,0.0,0.0,EAST LANSING,US,MICHIGAN STATE,MICHIGAN STATE UNIV FCU,MI,3775 COOLIDGE ROAD,0.0
8,4.888889e+15,3.243617e+09,4.615121,1.0,0.0,0.0,12000.0,,0.0,012246,...,0.0,0.0,0.0,EAST LANSING,US,MICHIGAN STATE,MICHIGAN STATE UNIV FCU,MI,3777 WEST ROAD,0.0
9,4.888889e+15,3.243617e+09,5.707110,1.0,0.0,0.0,12000.0,,0.0,014831,...,0.0,0.0,0.0,MASON,US,MICHIGAN STATE,MICHIGAN STATE UNIV FCU,MI,1133 S CEDAR ST,0.0


  """Entry point for launching an IPython kernel.


-inf