Phase 1 - Dataset Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("fraudDataset.csv")
df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [4]:
df.dropna(inplace=True)
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [5]:
duplicated = df.duplicated()
print(duplicated)

0          False
1          False
2          False
3          False
4          False
           ...  
6362615    False
6362616    False
6362617    False
6362618    False
6362619    False
Length: 6362620, dtype: bool


In [6]:
#Checking the class imbalances 

imbalance = df["isFraud"].value_counts()
print(imbalance)

isFraud
0    6354407
1       8213
Name: count, dtype: int64


From the above, we have found out that the dataset is highly unbalanced and thus we are going to prepare the dataset for making it balanced

In [7]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

x = df[['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']]
y = df["isFraud"]

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42, stratify=y)

columns_to_drop = ['oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
x_train = x_train.drop(columns=columns_to_drop)
x_test = x_test.drop(columns=columns_to_drop)


In [8]:
x["type"].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [9]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

type_train_encoded = encoder.fit_transform(x_train[['type']])
type_test_encoded = encoder.transform(x_test[['type']])

type_columns = encoder.get_feature_names_out(['type'])

type_train_df = pd.DataFrame(type_train_encoded, columns=type_columns, index=x_train.index)
type_test_df = pd.DataFrame(type_test_encoded, columns=type_columns, index=x_test.index)

x_train = x_train.drop(columns=['type'])
x_test = x_test.drop(columns=['type'])

x_train = pd.concat([x_train, type_train_df], axis=1)
x_test = pd.concat([x_test, type_test_df], axis=1)



In [10]:
x_train

Unnamed: 0,step,amount,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
4310249,308,260485.23,0,1.0,0.0,0.0,0.0,0.0
318938,16,1387060.19,0,0.0,0.0,0.0,0.0,1.0
3375139,254,1140661.98,0,0.0,0.0,0.0,0.0,1.0
5492781,380,169390.39,0,1.0,0.0,0.0,0.0,0.0
807263,40,4867.26,0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1422253,139,175520.68,0,0.0,0.0,0.0,0.0,1.0
5834821,402,347110.99,0,0.0,1.0,0.0,0.0,0.0
4182953,304,13259.63,0,0.0,0.0,0.0,1.0,0.0
3985280,298,24122.92,0,0.0,0.0,0.0,1.0,0.0


In [11]:
#Applying the SMOTE for oversampling and balancing 

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)



In [12]:
df['DayOfWeek'] = (df['step'] // 24) % 7 

In [13]:
df['hourOfDay'] = df['step'] % 24

Phase 2 - Models

In [14]:
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV


parameters_grid = {
    'n_estimators':[100,50,20],
    'max_depth':[None,20,10],
    'bootstrap':[True, False],
    'min_samples_leaf':[1 ,2]
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(), 
    param_distributions=parameters_grid,
    n_iter=10,  
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
random_search.fit(x_train_resampled, y_train_resampled)

print("Best Parameters:", random_search.best_params_)
print("Best Estimator:", random_search.best_estimator_)


model_rf = RandomForestClassifier(n_estimators=20, random_state=42, max_depth=20, n_jobs=-1)
model_rf.fit(x_train_resampled, y_train_resampled)

y_pred = model_rf.predict(x_test)

acc = accuracy_score(y_test, y_pred)
classifiy_report = classification_report(y_test, y_pred)

print(acc)
print(classifiy_report)

"""

'\n\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, classification_report\nfrom sklearn.model_selection import RandomizedSearchCV\n\n\nparameters_grid = {\n    \'n_estimators\':[100,50,20],\n    \'max_depth\':[None,20,10],\n    \'bootstrap\':[True, False],\n    \'min_samples_leaf\':[1 ,2]\n}\n\nrandom_search = RandomizedSearchCV(\n    RandomForestClassifier(), \n    param_distributions=parameters_grid,\n    n_iter=10,  \n    cv=3,\n    verbose=2,\n    random_state=42,\n    n_jobs=-1\n)\nrandom_search.fit(x_train_resampled, y_train_resampled)\n\nprint("Best Parameters:", random_search.best_params_)\nprint("Best Estimator:", random_search.best_estimator_)\n\n\nmodel_rf = RandomForestClassifier(n_estimators=20, random_state=42, max_depth=20, n_jobs=-1)\nmodel_rf.fit(x_train_resampled, y_train_resampled)\n\ny_pred = model_rf.predict(x_test)\n\nacc = accuracy_score(y_test, y_pred)\nclassifiy_report = classification_report(y_test, y_pred)\n\n

In [15]:
"""
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
import numpy as np

# Convert your training data to float32 if not already
x_train_resampled = x_train_resampled.astype(np.float32)

params = {
    'n_estimators': [20, 50, 100],
    'max_depth': [3, 4, 6, 7],
    'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],            # Use a fraction of data per tree
    'colsample_bytree': [0.6, 0.8, 1.0]      # Use a fraction of features per tree
}

xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    tree_method='gpu_hist',  # Use GPU acceleration
    gpu_id=0
)

random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=params,
    n_iter=10,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(x_train_resampled, y_train_resampled)

print("Best parameters found: ", random_search.best_params_)
"""

'\nfrom sklearn.model_selection import RandomizedSearchCV\nimport xgboost as xgb\nimport numpy as np\n\n# Convert your training data to float32 if not already\nx_train_resampled = x_train_resampled.astype(np.float32)\n\nparams = {\n    \'n_estimators\': [20, 50, 100],\n    \'max_depth\': [3, 4, 6, 7],\n    \'learning_rate\': [0.01, 0.05, 0.07, 0.1, 0.2],\n    \'subsample\': [0.6, 0.8, 1.0],            # Use a fraction of data per tree\n    \'colsample_bytree\': [0.6, 0.8, 1.0]      # Use a fraction of features per tree\n}\n\nxgb_clf = xgb.XGBClassifier(\n    objective=\'binary:logistic\',\n    tree_method=\'gpu_hist\',  # Use GPU acceleration\n    gpu_id=0\n)\n\nrandom_search = RandomizedSearchCV(\n    estimator=xgb_clf,\n    param_distributions=params,\n    n_iter=10,\n    scoring=\'f1\',\n    cv=3,\n    verbose=2,\n    n_jobs=-1,\n    random_state=42\n)\n\nrandom_search.fit(x_train_resampled, y_train_resampled)\n\nprint("Best parameters found: ", random_search.best_params_)\n'

LIGHTGBM Model

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, classification_report

params = {
    'objective':'binary',
    'boosting_type':'gbdt',
    'learning_rate':0.05,
    'max_depth':-1,
    'num_leaves':31,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'device':'cpu',
    'verbose':-1
}

train_data = lgb.Dataset(x_train_resampled, label=y_train_resampled)
valid_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=100)]

LGB_model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    num_boost_round=1000,
    callbacks=callbacks,
    
)

y_pred = LGB_model.predict(x_test)
print(y_pred)

y_pred = (y_pred >= 0.5).astype(int)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.150761
[200]	valid_0's binary_logloss: 0.135855
[300]	valid_0's binary_logloss: 0.130629
[400]	valid_0's binary_logloss: 0.127441
[500]	valid_0's binary_logloss: 0.125089
[600]	valid_0's binary_logloss: 0.123577
[700]	valid_0's binary_logloss: 0.122289
[800]	valid_0's binary_logloss: 0.121236
[900]	valid_0's binary_logloss: 0.120385
[1000]	valid_0's binary_logloss: 0.119675
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.119675
[6.86322983e-07 2.23134541e-06 1.60954424e-06 ... 7.87078492e-02
 5.46166721e-01 4.60088125e-01]
Accuracy: 0.9555
F1 Score: 0.0442
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98   1906322
           1       0.02      0.80      0.04      2464

    accuracy                           0.96   1908786
   macro avg       0.51      0.88      0.51   1908786
weighted avg