Install package

In [13]:
pip install py2neo



Load libraries

In [14]:
import numpy
import pandas
from py2neo import Graph
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.decomposition import PCA

Load Dataset

In [15]:
bankData = pandas.read_csv("data.csv")

Dataset Info

In [16]:
bankData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   step         594643 non-null  int64  
 1   customer     594643 non-null  object 
 2   age          594643 non-null  object 
 3   gender       594643 non-null  object 
 4   zipcodeOri   594643 non-null  object 
 5   merchant     594643 non-null  object 
 6   zipMerchant  594643 non-null  object 
 7   category     594643 non-null  object 
 8   amount       594643 non-null  float64
 9   fraud        594643 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 45.4+ MB


In [17]:
# Number of unique values per column in the banksim dataset
bankData.nunique()

step             180
customer        4112
age                8
gender             4
zipcodeOri         1
merchant          50
zipMerchant        1
category          15
amount         23767
fraud              2
dtype: int64

In [18]:
bankData['fraud'].value_counts()

fraud
0    587443
1      7200
Name: count, dtype: int64

In [19]:
refinedData = bankData.drop(['step', 'customer', 'zipcodeOri', 'zipMerchant', 'fraud'], axis=1)

In [20]:
refinedData.head()

Unnamed: 0,age,gender,merchant,category,amount
0,'4','M','M348934600','es_transportation',4.55
1,'2','M','M348934600','es_transportation',39.68
2,'4','F','M1823072687','es_transportation',26.89
3,'3','M','M348934600','es_transportation',17.25
4,'5','M','M348934600','es_transportation',35.72


In [21]:
# Hot encoding the categorical variables

refinedData = pandas.get_dummies(refinedData, columns=['age', 'gender', 'category', 'merchant'])
refinedData.head()

Unnamed: 0,amount,age_'0',age_'1',age_'2',age_'3',age_'4',age_'5',age_'6',age_'U',gender_'E',...,merchant_'M732195782',merchant_'M78078399',merchant_'M840466850',merchant_'M855959430',merchant_'M857378720',merchant_'M85975013',merchant_'M923029380',merchant_'M933210764',merchant_'M97925176',merchant_'M980657600'
0,4.55,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,39.68,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,26.89,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,17.25,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,35.72,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
# Standardizing the features
standard_scaler = StandardScaler()
scaledData = pandas.DataFrame(standard_scaler.fit_transform(refinedData), columns = refinedData.columns)

scaledData.head()

Unnamed: 0,amount,age_'0',age_'1',age_'2',age_'3',age_'4',age_'5',age_'6',age_'U',gender_'E',...,merchant_'M732195782',merchant_'M78078399',merchant_'M840466850',merchant_'M855959430',merchant_'M857378720',merchant_'M85975013',merchant_'M923029380',merchant_'M933210764',merchant_'M97925176',merchant_'M980657600'
0,-0.299276,-0.064347,-0.329165,-0.678119,-0.57339,2.110495,-0.343144,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624
1,0.016067,-0.064347,-0.329165,1.474668,-0.57339,-0.473822,-0.343144,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624
2,-0.098742,-0.064347,-0.329165,-0.678119,-0.57339,2.110495,-0.343144,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624
3,-0.185275,-0.064347,-0.329165,-0.678119,1.744015,-0.473822,-0.343144,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624
4,-0.01948,-0.064347,-0.329165,-0.678119,-0.57339,-0.473822,2.914227,-0.217136,-0.044553,-0.044553,...,-0.031992,-0.052072,-0.048562,-0.10179,-0.014325,-0.214919,-0.023313,-0.010773,-0.031754,-0.054624


In [23]:
# Performing dimensionality reduction using PCA

# Limiting the number of components such that 95% of the variance is explained
pca = PCA(0.95, svd_solver='full')
scaledData = pca.fit_transform(scaledData)

scaledData.shape

(594643, 55)

Training Using Intrintic features from Dataset

In [24]:
# Retrieving the class attribute from the dataframe
Y_before_smote = bankData['fraud']
Y_before_smote.head()

0    0
1    0
2    0
3    0
4    0
Name: fraud, dtype: int64

Logistic Regression Classifier

In [25]:
k_fold = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

random_forest = RandomForestClassifier(max_depth=20, n_estimators=150)
svm = SVC(gamma="auto")
logistic_regression = LogisticRegression(solver='lbfgs', max_iter=5000)
labels = Y_before_smote

In [26]:
for train_index, test_index in k_fold.split(scaledData, labels):

    X_train, X_test = scaledData[train_index], scaledData[test_index]
    y_train, y_test = labels[train_index], labels[test_index]


    clf = logistic_regression.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.77      0.74      0.76      1440

    accuracy                           0.99    118929
   macro avg       0.89      0.87      0.88    118929
weighted avg       0.99      0.99      0.99    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.86      0.74      0.80      1440

    accuracy                           1.00    118929
   macro avg       0.93      0.87      0.90    118929
weighted avg       1.00      1.00      1.00    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.89      0.75      0.82      1440

    accuracy                           1.00    118929
   macro avg       0.95      0.87      0.91    118929
weighted avg       1.00      1.00      1.00    118929

              preci

In [27]:
# Testing the logistic regression classifier after performing oversampling on the training data using SMOTE

X_train, X_test, y_train, y_test = train_test_split(scaledData, labels, test_size=0.20)#, random_state=42, stratify='array-like')


print('Original dataset shape %s' % Counter(Y_before_smote))

sm = SMOTE()

# Applying smote to dataset, result is nparray
X_after_smote, Y_after_smote = sm.fit_resample(X_train, y_train)


print('Dataset shape after smote %s' % Counter(Y_after_smote))

clf = logistic_regression.fit(X_after_smote, Y_after_smote)

predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

Original dataset shape Counter({0: 587443, 1: 7200})
Dataset shape after smote Counter({0: 469971, 1: 469971})
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    117472
           1       0.29      0.99      0.44      1457

    accuracy                           0.97    118929
   macro avg       0.64      0.98      0.71    118929
weighted avg       0.99      0.97      0.98    118929



Random Forest Classifier

In [28]:

X_train, X_test, y_train, y_test = train_test_split(scaledData, labels, test_size=0.20)
print('Original dataset shape %s' % Counter(Y_before_smote))

sm = SMOTE()

# Applying smote to dataset, result is nparray
X_after_smote, Y_after_smote = sm.fit_resample(X_train, y_train)


print('Dataset shape after smote %s' % Counter(Y_after_smote))

clf = random_forest.fit(X_after_smote, Y_after_smote)

predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

Original dataset shape Counter({0: 587443, 1: 7200})
Dataset shape after smote Counter({0: 469955, 1: 469955})
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    117488
           1       0.36      0.95      0.52      1441

    accuracy                           0.98    118929
   macro avg       0.68      0.97      0.75    118929
weighted avg       0.99      0.98      0.98    118929



SVM Classifier

In [29]:
X_train, X_test, y_train, y_test = train_test_split(scaledData, labels, test_size=0.20)

clf = svm.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117507
           1       0.86      0.70      0.77      1422

    accuracy                           1.00    118929
   macro avg       0.93      0.85      0.88    118929
weighted avg       0.99      1.00      0.99    118929

