Phase 1 - Dataset Preprocessing

In [20]:
import pandas as pd
import numpy as np

In [21]:
df = pd.read_csv("fraudDataset.csv")
df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [22]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [23]:
df.dropna(inplace=True)
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [24]:
duplicated = df.duplicated()
print(duplicated)

0          False
1          False
2          False
3          False
4          False
           ...  
6362615    False
6362616    False
6362617    False
6362618    False
6362619    False
Length: 6362620, dtype: bool


In [25]:
#Checking the class imbalances 

imbalance = df["isFraud"].value_counts()
print(imbalance)

isFraud
0    6354407
1       8213
Name: count, dtype: int64


From the above, we have found out that the dataset is highly unbalanced and thus we are going to prepare the dataset for making it balanced

In [26]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

x = df[['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']]
y = df["isFraud"]

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42, stratify=y)


In [27]:
x["type"].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

type_train_encoded = encoder.fit_transform(x_train[['type']])
type_test_encoded = encoder.transform(x_test[['type']])

type_columns = encoder.get_feature_names_out(['type'])

type_train_df = pd.DataFrame(type_train_encoded, columns=type_columns, index=x_train.index)
type_test_df = pd.DataFrame(type_test_encoded, columns=type_columns, index=x_test.index)

x_train = x_train.drop(columns=['type'])
x_test = x_test.drop(columns=['type'])

x_train = pd.concat([x_train, type_train_df], axis=1)
x_test = pd.concat([x_test, type_test_df], axis=1)


In [29]:
x_train

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
4310249,308,260485.23,32371.00,292856.23,4633560.35,4373075.12,0,1.0,0.0,0.0,0.0,0.0
318938,16,1387060.19,432673.78,0.00,0.00,1033440.62,0,0.0,0.0,0.0,0.0,1.0
3375139,254,1140661.98,59947.00,0.00,15000.85,1155662.83,0,0.0,0.0,0.0,0.0,1.0
5492781,380,169390.39,80047.10,249437.49,1467039.66,1297649.27,0,1.0,0.0,0.0,0.0,0.0
807263,40,4867.26,5989226.14,5994093.40,1609385.28,1604518.02,0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1422253,139,175520.68,977.00,0.00,91159.52,266680.20,0,0.0,0.0,0.0,0.0,1.0
5834821,402,347110.99,103785.00,0.00,87871.75,434982.74,0,0.0,1.0,0.0,0.0,0.0
4182953,304,13259.63,0.00,0.00,0.00,0.00,0,0.0,0.0,0.0,1.0,0.0
3985280,298,24122.92,0.00,0.00,0.00,0.00,0,0.0,0.0,0.0,1.0,0.0


In [None]:
#Applying the SMOTE for oversampling and balancing 

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

