In [2]:
import numpy as np
import pandas as pd
from py2neo import Graph
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from imblearn.over_sampling import SMOTE 
from collections import Counter
from sklearn.decomposition import PCA

# Extracting features from graph network

In [2]:
graph = Graph(password="password")

# Query to fetch the network features from Neo4j
query = """
MATCH (p:Placeholder)
RETURN p.id AS id, p.degree AS degree, p.pagerank as pagerank, p.community AS community 
"""

data = graph.run(query)

records = {}

for record in data:
    records[record['id']] = {'degree': record['degree'], 'pagerank': record['pagerank'], 'community': record['community']}

In [3]:
banksim_df = pd.read_csv("../DATA/bs140513_032310.csv")

In [4]:
def load_degree(record):
    return records[record.split("'")[1]]['degree']
def load_community(record):
    return str(records[record.split("'")[1]]['community'])
def load_pagerank(record):
    return records[record.split("'")[1]]['pagerank']

In [5]:
banksim_df['merchant_degree'] = banksim_df['merchant'].apply(load_degree)
banksim_df['customer_degree'] = banksim_df['customer'].apply(load_degree)
banksim_df['merchant_pagerank'] = banksim_df['merchant'].apply(load_pagerank)
banksim_df['customer_pagerank'] = banksim_df['customer'].apply(load_pagerank)
banksim_df['merchant_community'] = banksim_df['merchant'].apply(load_community)
banksim_df['customer_community'] = banksim_df['customer'].apply(load_community)

In [6]:
banksim_df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud,merchant_degree,customer_degree,merchant_pagerank,customer_pagerank,merchant_community,customer_community
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0,7858,14,46.540997,0.15,600005,600005
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0,7858,24,46.540997,0.15,600005,600005
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0,7146,28,41.39083,0.15,600006,600005
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0,7858,22,46.540997,0.15,600005,600005
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0,7858,28,46.540997,0.15,600005,600005


In [7]:
banksim_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   step                594643 non-null  int64  
 1   customer            594643 non-null  object 
 2   age                 594643 non-null  object 
 3   gender              594643 non-null  object 
 4   zipcodeOri          594643 non-null  object 
 5   merchant            594643 non-null  object 
 6   zipMerchant         594643 non-null  object 
 7   category            594643 non-null  object 
 8   amount              594643 non-null  float64
 9   fraud               594643 non-null  int64  
 10  merchant_degree     594643 non-null  int64  
 11  customer_degree     594643 non-null  int64  
 12  merchant_pagerank   594643 non-null  float64
 13  customer_pagerank   594643 non-null  float64
 14  merchant_community  594643 non-null  object 
 15  customer_community  594643 non-nul

# Data Cleaning and Encoding

In [8]:
labels = banksim_df['fraud']

# Dropping the unnecessary columns
feature_df = banksim_df.drop(['step', 'age', 'gender', 'customer', 'zipcodeOri', 'zipMerchant', 'fraud'], axis=1)

# One hot encoding the categorical variables
feature_df = pd.get_dummies(feature_df, columns=['category', 'merchant', 'merchant_community', 'customer_community'])

In [9]:
# Standardizing the features
standard_scaler = StandardScaler()
scaled_df = pd.DataFrame(standard_scaler.fit_transform(feature_df), columns = feature_df.columns)

scaled_df.head()

Unnamed: 0,amount,merchant_degree,customer_degree,merchant_pagerank,customer_pagerank,category_'es_barsandrestaurants',category_'es_contents',category_'es_fashion',category_'es_food',category_'es_health',...,merchant_community_600051,merchant_community_600052,merchant_community_600053,merchant_community_600054,customer_community_600005,customer_community_600006,customer_community_600007,customer_community_600008,customer_community_600011,customer_community_600012
0,-0.299276,0.627143,-1.451389,0.686701,-1.665335e-16,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,-0.017449,-0.019238,-0.010773,-0.002246,0.123083,-0.104175,-0.059075,-0.019369,-0.017497,-0.005347
1,0.016067,0.627143,0.203358,0.686701,-1.665335e-16,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,-0.017449,-0.019238,-0.010773,-0.002246,0.123083,-0.104175,-0.059075,-0.019369,-0.017497,-0.005347
2,-0.098742,0.102754,0.865257,0.060981,-1.665335e-16,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,-0.017449,-0.019238,-0.010773,-0.002246,0.123083,-0.104175,-0.059075,-0.019369,-0.017497,-0.005347
3,-0.185275,0.627143,-0.127592,0.686701,-1.665335e-16,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,-0.017449,-0.019238,-0.010773,-0.002246,0.123083,-0.104175,-0.059075,-0.019369,-0.017497,-0.005347
4,-0.01948,0.627143,0.865257,0.686701,-1.665335e-16,-0.104084,-0.038607,-0.10475,-0.214919,-0.166994,...,-0.017449,-0.019238,-0.010773,-0.002246,0.123083,-0.104175,-0.059075,-0.019369,-0.017497,-0.005347


In [10]:
scaled_df = scaled_df.values
labels = labels.values

# Training the models using intrinsic and graph-network based features

In [11]:
k_fold = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

random_forest = RandomForestClassifier(max_depth=20, n_estimators=150)
svm = SVC(gamma="auto")
logistic_regression = LogisticRegression(solver='lbfgs', max_iter=5000)

In [17]:
# Logistic Regression Classifier

for train_index, test_index in k_fold.split(scaled_df, labels):

    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    clf = logistic_regression.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.81      0.79      0.80      1440

    accuracy                           1.00    118929
   macro avg       0.90      0.89      0.90    118929
weighted avg       1.00      1.00      1.00    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.87      0.80      0.84      1440

    accuracy                           1.00    118929
   macro avg       0.94      0.90      0.92    118929
weighted avg       1.00      1.00      1.00    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.90      0.79      0.84      1440

    accuracy                           1.00    118929
   macro avg       0.95      0.89      0.92    118929
weighted avg       1.00      1.00      1.00    118929

              preci

In [18]:
# Random Forest Classifier
    
X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, test_size=0.20)

clf = random_forest.fit(X_train, y_train)
predictions = clf.predict(X_test)
    
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117525
           1       0.89      0.79      0.84      1404

    accuracy                           1.00    118929
   macro avg       0.94      0.90      0.92    118929
weighted avg       1.00      1.00      1.00    118929



In [12]:
# SVM Classifier
    
X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, test_size=0.20)

clf = svm.fit(X_train, y_train)
predictions = clf.predict(X_test)
    
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117536
           1       0.89      0.76      0.82      1393

    accuracy                           1.00    118929
   macro avg       0.94      0.88      0.91    118929
weighted avg       1.00      1.00      1.00    118929

