Phase 1 - Dataset Preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("fraudDataset.csv")
df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [5]:
df.dropna(inplace=True)
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [6]:
duplicated = df.duplicated()
print(duplicated)

0          False
1          False
2          False
3          False
4          False
           ...  
6362615    False
6362616    False
6362617    False
6362618    False
6362619    False
Length: 6362620, dtype: bool


In [7]:
#Checking the class imbalances 

imbalance = df["isFraud"].value_counts()
print(imbalance)

isFraud
0    6354407
1       8213
Name: count, dtype: int64


From the above, we have found out that the dataset is highly unbalanced and thus we are going to prepare the dataset for making it balanced

In [8]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

x = df[['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']]
y = df["isFraud"]

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42, stratify=y)

columns_to_drop = ['oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
x_train = x_train.drop(columns=columns_to_drop)
x_test = x_test.drop(columns=columns_to_drop)


In [9]:
x["type"].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [10]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

type_train_encoded = encoder.fit_transform(x_train[['type']])
type_test_encoded = encoder.transform(x_test[['type']])

type_columns = encoder.get_feature_names_out(['type'])

type_train_df = pd.DataFrame(type_train_encoded, columns=type_columns, index=x_train.index)
type_test_df = pd.DataFrame(type_test_encoded, columns=type_columns, index=x_test.index)

x_train = x_train.drop(columns=['type'])
x_test = x_test.drop(columns=['type'])

x_train = pd.concat([x_train, type_train_df], axis=1)
x_test = pd.concat([x_test, type_test_df], axis=1)



In [11]:
x_train

Unnamed: 0,step,amount,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
4310249,308,260485.23,0,1.0,0.0,0.0,0.0,0.0
318938,16,1387060.19,0,0.0,0.0,0.0,0.0,1.0
3375139,254,1140661.98,0,0.0,0.0,0.0,0.0,1.0
5492781,380,169390.39,0,1.0,0.0,0.0,0.0,0.0
807263,40,4867.26,0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1422253,139,175520.68,0,0.0,0.0,0.0,0.0,1.0
5834821,402,347110.99,0,0.0,1.0,0.0,0.0,0.0
4182953,304,13259.63,0,0.0,0.0,0.0,1.0,0.0
3985280,298,24122.92,0,0.0,0.0,0.0,1.0,0.0


In [12]:
#Applying the SMOTE for oversampling and balancing 

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)



In [13]:
df['DayOfWeek'] = (df['step'] // 24) % 7 

In [14]:
df['hourOfDay'] = df['step'] % 24

Phase 2 - Models

LIGHTGBM Model

In [15]:
neg_count = np.sum(y_train_resampled==0)
pos_count = np.sum(y_train_resampled==1)

scale_pos_weight_value = neg_count/pos_count


In [17]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, classification_report

params = {
    'objective':'binary',
    'boosting_type':'gbdt',
    'learning_rate':0.05,
    'scale_pos_weight':scale_pos_weight_value,
    'max_depth':-1,
    'num_leaves':31,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'device':'cpu',
    'verbose':-1
}

train_data = lgb.Dataset(x_train_resampled, label=y_train_resampled)
valid_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=100)]

LGB_model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    num_boost_round=1000,
    callbacks=callbacks,
    
)

y_pred = LGB_model.predict(x_test)
print(y_pred)

y_pred = (y_pred >= 0.2).astype(int)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.150761
[200]	valid_0's binary_logloss: 0.135855
[300]	valid_0's binary_logloss: 0.130629
[400]	valid_0's binary_logloss: 0.127441
[500]	valid_0's binary_logloss: 0.125089
[600]	valid_0's binary_logloss: 0.123577
[700]	valid_0's binary_logloss: 0.122289
[800]	valid_0's binary_logloss: 0.121236
[900]	valid_0's binary_logloss: 0.120385
[1000]	valid_0's binary_logloss: 0.119675
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.119675
[6.86322983e-07 2.23134541e-06 1.60954424e-06 ... 7.87078492e-02
 5.46166721e-01 4.60088125e-01]
Accuracy: 0.8690
F1 Score: 0.0175
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.87      0.93   1906322
           1       0.01      0.90      0.02      2464

    accuracy                           0.87   1908786
   macro avg       0.50      0.89      0.47   1908786
weighted avg

In [20]:
#Using Random Search cv for checking the best model
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
import numpy as np
import scipy

model = lgb.LGBMClassifier(
    objective='binary',
    random_state=42,
    scale_pos_weight=scale_pos_weight_value,
    device='cpu'
)
params_dist ={
    'num_leaves':np.arange(20,100,10),
    'max_depth':np.arange(3,15,2),
    'learning_rate':np.linspace(0.01, 0.3, 10),
    'n_estimators':np.arange(50,200,25),
    'subsample': np.linspace(0.5, 1.0, 6),
    'colsample_bytree': np.linspace(0.5, 1.0, 6),
    'min_child_samples': np.arange(20, 100, 20),
    'reg_alpha': np.linspace(0, 1, 5),
    'reg_lambda': np.linspace(0, 1, 5)
}
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params_dist,
    n_iter=20,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=1,
    random_state=42
)

random_search.fit(x_train_resampled,y_train_resampled)

print("Best parameters:", random_search.best_params_)
print("Best F1 score: ", random_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END colsample_bytree=0.8, learning_rate=0.23555555555555557, max_depth=5, min_child_samples=20, n_estimators=50, num_leaves=80, reg_alpha=0.5, reg_lambda=1.0, subsample=0.9; total time=   5.3s
[CV] END colsample_bytree=0.8, learning_rate=0.23555555555555557, max_depth=5, min_child_samples=20, n_estimators=50, num_leaves=80, reg_alpha=0.5, reg_lambda=1.0, subsample=0.9; total time=   2.5s
[CV] END colsample_bytree=0.8, learning_rate=0.23555555555555557, max_depth=5, min_child_samples=20, n_estimators=50, num_leaves=80, reg_alpha=0.5, reg_lambda=1.0, subsample=0.9; total time=   3.9s
[CV] END colsample_bytree=0.8, learning_rate=0.23555555555555557, max_depth=5, min_child_samples=20, n_estimators=50, num_leaves=80, reg_alpha=0.5, reg_lambda=1.0, subsample=0.9; total time=   4.0s
[CV] END colsample_bytree=0.8, learning_rate=0.23555555555555557, max_depth=5, min_child_samples=20, n_estimators=50, num_leaves=80, reg_alpha=0.5

In [21]:
best_params = {
    'subsample': 0.7,
    'reg_lambda': 0.5,
    'reg_alpha': 1.0,
    'num_leaves': 80,
    'n_estimators': 150,
    'min_child_samples': 40,
    'max_depth': 13,
    'learning_rate': 0.23555555555555557,
    'colsample_bytree': 0.8,
    'objective': 'binary',  # or your specific objective
    'random_state': 42,
    'n_jobs': -1
}


best_model = lgb.LGBMClassifier(**best_params)
best_model.fit(x_train_resampled, y_train_resampled)


0,1,2
,boosting_type,'gbdt'
,num_leaves,80
,max_depth,13
,learning_rate,0.23555555555555557
,n_estimators,150
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
from sklearn.metrics import accuracy_score, f1_score
y_pred = best_model.predict(x_test)

y_pred = (y_pred >= 0.2).astype(int)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.9569
F1 Score: 0.0449
[0 0 0 ... 0 1 1]


In [31]:
sample_input = np.array([[150, 0, 2000.5, 1, 0.7, 45, 0.2, 0]])
predicted = best_model.predict(sample_input)

print(predicted[0])

0


