# ONLINE FRAUD DETECTION

Importing all the libraries

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

In [2]:
df=pd.read_csv('../data/onlinefraud.csv')

In [3]:
df.drop(columns=['isFlaggedFraud'], inplace=True)

In [4]:
rename_map = {
    'step': 'elapsed_hours',
    'type': 'type',
    'amount': 'amount',
    'nameOrig': 'sender_id',
    'oldbalanceOrg': 'sender_old_balance',
    'newbalanceOrig': 'sender_new_balance',
    'nameDest': 'recipient_id',
    'oldbalanceDest': 'recipient_old_balance',
    'newbalanceDest': 'recipient_new_balance',
    'isFraud': 'isFraud'
}
df.rename(columns=rename_map, inplace=True)

In [5]:
X = df.drop(columns=['isFraud'])
y = df['isFraud']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42
)

In [7]:
X_train.shape, X_test.shape

((5090096, 9), (1272524, 9))

In [8]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [9]:
train = X_train.copy()
test = X_test.copy()

In [10]:
print("Train fraud count:", y_train.value_counts())
print("Test fraud count:", y_test.value_counts())

Train fraud count: isFraud
0    5083526
1       6570
Name: count, dtype: int64
Test fraud count: isFraud
0    1270881
1       1643
Name: count, dtype: int64


In [11]:
print("Train fraud rate:", y_train.mean())
print("Test fraud rate:", y_test.mean())

Train fraud rate: 0.0012907418642005967
Test fraud rate: 0.0012911347840983745


In [12]:
numeric_cols=['amount', 'sender_old_balance', 'sender_new_balance', 'recipient_old_balance', 'recipient_new_balance']
cat_col = ['type']

In [13]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

ohe.fit(X_train[cat_col])

train_type_encoded=ohe.transform(X_train[cat_col])
test_type_encoded=ohe.transform(X_test[cat_col])

ohe_cols=ohe.get_feature_names_out(cat_col)

train_type_df=pd.DataFrame(train_type_encoded, columns=ohe_cols)
test_type_df = pd.DataFrame(test_type_encoded, columns=ohe_cols)

X_train = pd.concat([X_train.drop(columns=cat_col), train_type_df], axis=1)
X_test = pd.concat([X_test.drop(columns=cat_col), test_type_df], axis=1)

In [15]:
X_train.shape, X_test.shape

((5090096, 13), (1272524, 13))

In [16]:
X_train.head()

Unnamed: 0,elapsed_hours,amount,sender_id,sender_old_balance,sender_new_balance,recipient_id,recipient_old_balance,recipient_new_balance,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,15,9914.74,C482751146,44248.0,34333.26,M1651188591,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20,6854.53,C188264521,0.0,0.0,M1469015863,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,231,361211.8,C593201095,0.0,0.0,C1985763166,489745.16,850956.95,0.0,1.0,0.0,0.0,0.0
3,236,7083.51,C1617277615,0.0,0.0,M1529547196,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,143,218019.51,C1705563354,13045685.58,13263705.09,C2121401221,2438123.98,2220104.47,1.0,0.0,0.0,0.0,0.0


In [23]:
scaler = RobustScaler()

scaler.fit(X_train[numeric_cols])

X_train_scaled = scaler.transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=numeric_cols, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=numeric_cols, index=X_test.index)

X_train[numeric_cols] = X_train_scaled_df
X_test[numeric_cols] = X_test_scaled_df

In [20]:
X_train.shape, X_test.shape

((5090096, 13), (1272524, 13))

In [26]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
elapsed_hours,5090096.0,243.362623,142.319824,1.0,155.0,239.0,335.0,743.0
amount,5090096.0,0.538006,3.084902,-0.383226,-0.314657,0.0,0.685343,472.986559
sender_old_balance,5090096.0,7.635068,26.901949,-0.132404,-0.132404,0.0,0.867596,554.983588
sender_new_balance,5090096.0,5.930178,20.277537,0.0,0.0,0.0,1.0,343.937572
recipient_old_balance,5090096.0,1.025902,3.608656,-0.140805,-0.140805,0.0,0.859195,377.333342
recipient_new_balance,5090096.0,0.907637,3.304977,-0.193264,-0.193264,0.0,0.806736,319.957234
type_CASH_IN,5090096.0,0.219844,0.414141,0.0,0.0,0.0,0.0,1.0
type_CASH_OUT,5090096.0,0.351724,0.477508,0.0,0.0,0.0,1.0,1.0
type_DEBIT,5090096.0,0.006512,0.080434,0.0,0.0,0.0,0.0,1.0
type_PAYMENT,5090096.0,0.338154,0.473081,0.0,0.0,0.0,1.0,1.0


In [24]:
X_train

Unnamed: 0,elapsed_hours,amount,sender_id,sender_old_balance,sender_new_balance,recipient_id,recipient_old_balance,recipient_new_balance,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,15,-0.332457,C482751146,0.279826,0.238146,M1651188591,-0.140805,-0.193264,0.0,0.0,0.0,1.0,0.0
1,20,-0.348127,C188264521,-0.132404,0.000000,M1469015863,-0.140805,-0.193264,0.0,0.0,0.0,1.0,0.0
2,231,1.466369,C593201095,-0.132404,0.000000,C1985763166,0.378458,0.571616,0.0,1.0,0.0,0.0,0.0
3,236,-0.346954,C1617277615,-0.132404,0.000000,M1529547196,-0.140805,-0.193264,0.0,0.0,0.0,1.0,0.0
4,143,0.733149,C1705563354,121.405966,92.001267,C2121401221,2.444272,1.802269,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5090091,153,-0.373517,C1302053063,-0.132404,0.000000,M1494049570,-0.140805,-0.193264,0.0,0.0,0.0,1.0,0.0
5090092,402,1.394166,C788162540,0.834495,0.000000,C1771727447,-0.047637,0.197719,0.0,1.0,0.0,0.0,0.0
5090093,304,-0.315329,C545341724,-0.132404,0.000000,M253734860,-0.140805,-0.193264,0.0,0.0,0.0,1.0,0.0
5090094,298,-0.259703,C1831600144,-0.132404,0.000000,M1382629737,-0.140805,-0.193264,0.0,0.0,0.0,1.0,0.0


In [27]:
X_train['hour_of_day'] = X_train['elapsed_hours'] % 24
X_test['hour_of_day'] = X_test['elapsed_hours'] % 24

In [28]:
X_train['day_of_week'] = (X_train['elapsed_hours'] // 24) % 7
X_test['day_of_week'] = (X_test['elapsed_hours'] // 24) % 7

In [29]:
X_train['is_weekend'] = X_train['day_of_week'].isin([5, 6]).astype(int)
X_test['is_weekend'] = X_test['day_of_week'].isin([5, 6]).astype(int)

In [30]:
X_train['is_night'] = (X_train['hour_of_day'] < 6).astype(int)
X_test['is_night'] = (X_test['hour_of_day'] < 6).astype(int)

In [31]:
median_hour = X_train['elapsed_hours'].median()

In [32]:
X_train['is_late_phase'] = (X_train['elapsed_hours'] > median_hour).astype(int)
X_test['is_late_phase'] = (X_test['elapsed_hours'] > median_hour).astype(int)

In [35]:
print(X_train[['elapsed_hours', 'hour_of_day', 'day_of_week', 'is_weekend', 'is_night', 'is_late_phase']].head())

   elapsed_hours  hour_of_day  day_of_week  is_weekend  is_night  \
0             15           15            0           0         0   
1             20           20            0           0         0   
2            231           15            2           0         0   
3            236           20            2           0         0   
4            143           23            5           1         0   

   is_late_phase  
0              0  
1              0  
2              0  
3              0  
4              0  


In [34]:
X_train.shape

(5090096, 18)

In [None]:
X_train['sender_balance_change'] = X_train['sender_new_balance'] - X_train['sender_old_balance']
X_test['sender_balance_change'] = X_test['sender_new_balance'] - X_test['sender_old_balance']

In [37]:
X_train['recipient_balance_change'] = X_train['recipient_new_balance'] - X_train['recipient_old_balance']
X_test['recipient_balance_change'] = X_test['recipient_new_balance'] - X_test['recipient_old_balance']

In [38]:
X_train['is_sender_drained'] = (X_train['sender_new_balance'] == 0).astype(int)
X_test['is_sender_drained'] = (X_test['sender_new_balance'] == 0).astype(int)

In [39]:
X_train['amount_ratio_sender'] = X_train['amount'] / (X_train['sender_old_balance'] + 1)
X_test['amount_ratio_sender'] = X_test['amount'] / (X_test['sender_old_balance'] + 1)

In [40]:
X_train['amount_ratio_recipient'] = X_train['amount'] / (X_train['recipient_old_balance'] + 1)
X_test['amount_ratio_recipient'] = X_test['amount'] / (X_test['recipient_old_balance'] + 1)

In [41]:
X_train['recipient_balance_jump'] = (X_train['recipient_balance_change'] > X_train['amount']).astype(int)
X_test['recipient_balance_jump'] = (X_test['recipient_balance_change'] > X_test['amount']).astype(int)

In [42]:
X_train['sender_old_zero'] = (X_train['sender_old_balance'] == 0).astype(int)
X_test['sender_old_zero'] = (X_test['sender_old_balance'] == 0).astype(int)

In [46]:
X_train['is_high_risk_type'] = (X_train['type_TRANSFER']==1) | (X_train['type_CASH_OUT']==1)
X_test['is_high_risk_type'] = (X_train['type_TRANSFER']==1) | (X_train['type_CASH_OUT']==1)

In [47]:
X_train.shape

(5090096, 26)

In [49]:
X_train.head()

Unnamed: 0,elapsed_hours,amount,sender_id,sender_old_balance,sender_new_balance,recipient_id,recipient_old_balance,recipient_new_balance,type_CASH_IN,type_CASH_OUT,...,is_night,is_late_phase,sender_balance_change,recipient_balance_change,is_sender_drained,amount_ratio_sender,amount_ratio_recipient,recipient_balance_jump,sender_old_zero,is_high_risk_type
0,15,-0.332457,C482751146,0.279826,0.238146,M1651188591,-0.140805,-0.193264,0.0,0.0,...,0,0,-0.04168,-0.052458,0,-0.259767,-0.38694,1,0,False
1,20,-0.348127,C188264521,-0.132404,0.0,M1469015863,-0.140805,-0.193264,0.0,0.0,...,0,0,0.132404,-0.052458,1,-0.401254,-0.405178,1,0,False
2,231,1.466369,C593201095,-0.132404,0.0,C1985763166,0.378458,0.571616,0.0,1.0,...,0,0,0.132404,0.193157,1,1.690153,1.063775,0,0,True
3,236,-0.346954,C1617277615,-0.132404,0.0,M1529547196,-0.140805,-0.193264,0.0,0.0,...,0,0,0.132404,-0.052458,1,-0.399903,-0.403813,1,0,False
4,143,0.733149,C1705563354,121.405966,92.001267,C2121401221,2.444272,1.802269,1.0,0.0,...,0,0,-29.404699,-0.642003,0,0.005989,0.21286,0,0,False


In [50]:
X_train.columns

Index(['elapsed_hours', 'amount', 'sender_id', 'sender_old_balance',
       'sender_new_balance', 'recipient_id', 'recipient_old_balance',
       'recipient_new_balance', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT',
       'type_PAYMENT', 'type_TRANSFER', 'hour_of_day', 'day_of_week',
       'is_weekend', 'is_night', 'is_late_phase', 'sender_balance_change',
       'recipient_balance_change', 'is_sender_drained', 'amount_ratio_sender',
       'amount_ratio_recipient', 'recipient_balance_jump', 'sender_old_zero',
       'is_high_risk_type'],
      dtype='object')

In [51]:
X_train.drop(columns=['sender_id', 'recipient_id'], inplace=True)
X_test.drop(columns=['sender_id', 'recipient_id'], inplace=True)

In [53]:
from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=y_train
)

weights

array([  0.50064621, 387.37412481])