In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import scatter_matrix
import seaborn as sns
import plotly.express as px

import scipy.stats as stats

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve, auc as pr_auc, PrecisionRecallDisplay, roc_curve, roc_auc_score, f1_score, precision_score, recall_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import optuna

import lime
from lime.lime_tabular import LimeTabularExplainer
import shap 

import pymc as pm

In [3]:
df = pd.read_csv("dataset/fraud_detection_data.csv")

In [4]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            4000 non-null   int64  
 1   type            4000 non-null   string 
 2   amount          4000 non-null   float64
 3   nameOrig        4000 non-null   string 
 4   oldbalanceOrg   4000 non-null   float64
 5   newbalanceOrig  4000 non-null   float64
 6   nameDest        4000 non-null   string 
 7   oldbalanceDest  4000 non-null   float64
 8   newbalanceDest  4000 non-null   float64
 9   isFraud         4000 non-null   int64  
dtypes: float64(5), int64(2), string(3)
memory usage: 312.6 KB


In [6]:
# No nulls to handle but dtype 'object' requires handling

In [7]:
df = df[:4000] # For computational ease to significantly reduce df size 

In [8]:
round(df["isFraud"].value_counts()/df.shape[0]*100,2) # Check that dataset reduction has maintained some the minority class

isFraud
0    99.4
1     0.6
Name: count, dtype: float64

In [9]:
obj_cols = df.select_dtypes(include="object")
df[obj_cols.columns] = obj_cols.astype("string")

In [10]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,1.3925,97284.46,975732.9,998635.7,804087.4,972664.7,0.006,0.0
std,0.614441,224977.3,2178887.0,2226924.0,2327466.0,2793190.0,0.077237,0.0
min,1.0,6.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,3980.838,457.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,10694.76,25681.9,17211.29,0.0,0.0,0.0,0.0
75%,2.0,108054.4,270091.3,268555.7,354971.7,267966.8,0.0,0.0
max,3.0,3776389.0,12225880.0,12458650.0,19516120.0,19169200.0,1.0,0.0


In [11]:
df = df.drop("isFlaggedFraud", axis=1)

In [12]:
## Data Encoding

In [60]:
X = df.iloc[:, :-1]
y = df["isFraud"]

In [61]:
# For reference as encode:

X.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0


In [62]:
X["type"].value_counts()

type
PAYMENT     2077
CASH_IN      818
CASH_OUT     513
TRANSFER     360
DEBIT        232
Name: count, dtype: Int64

In [63]:
# No ordinality observed so to one-hot encode 'type' feature
X_encoded = pd.get_dummies(X, columns=["type"])

In [64]:
# Check 'nameOrig' to determine uniquness

X_encoded["nameOrig"].is_unique

True

In [65]:
# Check whether there's any 'nameOrig' entries that don't involve numbers prefixed with a 'C' followed by a number 
# between 1-9 (ensures that if 'C' is dropped then won't lose any leading zeroes as dtype int) and then the rest numbers

X_encoded[
    ~X_encoded["nameOrig"].str.contains("^C[1-9][0-9]*$", regex=True)
]  

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER


In [66]:
# Remove 'C' prefix

X_encoded["nameOrig"] = X_encoded["nameOrig"].str.replace("C", "").astype("int")

In [67]:
# Get all letter prefixes and related counts from "nameDest"

X_encoded["nameDest"].str.findall("[A-Za-z]").value_counts()

nameDest
[M]    2077
[C]    1923
Name: count, dtype: int64

In [68]:
# Remove prefixes and one-hot encode information as no ordinality present

X_encoded["nameDestLabel"] = X_encoded["nameDest"].str[0]
X_encoded = pd.get_dummies(X_encoded, columns=["nameDestLabel"])
X_encoded["nameDest"] = X_encoded["nameDest"].str.replace("[MC]", "", regex=True).astype("int")

In [69]:
X_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   step             4000 non-null   int64  
 1   amount           4000 non-null   float64
 2   nameOrig         4000 non-null   int32  
 3   oldbalanceOrg    4000 non-null   float64
 4   newbalanceOrig   4000 non-null   float64
 5   nameDest         4000 non-null   int32  
 6   oldbalanceDest   4000 non-null   float64
 7   newbalanceDest   4000 non-null   float64
 8   type_CASH_IN     4000 non-null   bool   
 9   type_CASH_OUT    4000 non-null   bool   
 10  type_DEBIT       4000 non-null   bool   
 11  type_PAYMENT     4000 non-null   bool   
 12  type_TRANSFER    4000 non-null   bool   
 13  nameDestLabel_C  4000 non-null   bool   
 14  nameDestLabel_M  4000 non-null   bool   
dtypes: bool(7), float64(5), int32(2), int64(1)
memory usage: 246.2 KB
