### Preprocessing steps

In [1]:
# import libraries 
import pandas as pd
import numpy as np
import calendar
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer  # (kept for template; you said no nulls)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, classification_report

# High-cardinality encoder (fast, memory-safe)
from category_encoders.hashing import HashingEncoder
#from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import OneHotEncoder
# Convert numeric numpy array into a sparse matrix (required for hstack with OHE/HashingEncoder)
from scipy.sparse import csr_matrix,hstack

In [2]:
#!pip install category_encoders

In [3]:
data = pd.read_csv('eda_dataset.csv')

In [4]:
data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,amt_clean,distance_cust_merchant_km
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,2019,1,1,0,0,31,adult,Tuesday,4.97,78.6
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,2019,1,1,0,0,41,adult,Tuesday,107.23,30.21
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,2019,1,1,0,0,57,senior,Tuesday,47.45,108.21
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,2019,1,1,0,1,52,senior,Tuesday,45.0,95.67
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,2019,1,1,0,3,33,adult,Tuesday,41.96,77.56


In [5]:
data.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'tr_year', 'tr_month', 'tr_day', 'tr_hour', 'tr_minute', 'age',
       'age_group', 'tr_day_name', 'amt_clean', 'distance_cust_merchant_km'],
      dtype='object')

In [6]:
#Number of transactions per card
data['card_txn_count'] = data.groupby('cc_num')['cc_num'].transform('count')

In [7]:
#data['full_name'] = data['first'] + " " + data['last']

In [8]:
# drop amt_clean because i have removed outlier from it and in fraud cases amount is important so instead of removing im capping upper limit here with fic amount 
data = data.drop(columns = ['amt_clean'])

# will create new clean_amount column after splitting data to prevent data leakage (means we nor use test data for capping so that we can get realistic model performance)

In [9]:
data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,distance_cust_merchant_km,card_txn_count
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,2019,1,1,0,0,31,adult,Tuesday,78.6,2927
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,2019,1,1,0,0,41,adult,Tuesday,30.21,4362
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,2019,1,1,0,0,57,senior,Tuesday,108.21,735
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,2019,1,1,0,1,52,senior,Tuesday,95.67,743
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,2019,1,1,0,3,33,adult,Tuesday,77.56,2922


In [10]:
# data_cat = data.select_dtypes(include=['object'])

# # category columns cardinality number
# for c in data_cat:
#     print( c ,':',data_cat[c].nunique())

In [11]:
# data_num = data.select_dtypes(include=['number'])

# for c in data_num:
#     print( c ,':',data_num[c].nunique())

In [12]:
#data.columns

In [13]:
# Credit card number is identifier it does not show any trend so i dropped it
#'merch_lat','merch_long', 'merch_lat','merch_long' alredy derived distance from it so decided to drop it 
# dob , extraxted age from it so we can drop it
# trans_num is unique identifier so delete it
# droped unix time because difference between tras_date_time is too large 2557 days it might be due to error in synthetic data 
## droped 'trans_date_trans_time' because already calculated age and extracted tr_hr,tr_year,tr_month, tr_minute etc.
# claculated age so dropping age group column
# first, last,'street has high cordinality(many distinct values) so its do not have any pattern in it so deleting it???????????????
# dropped 'tr_day_name' because already we have tr_day meaning is same. so keep numerical column because most ml model works on numerical value only.
# should i drop tr_day_name and age_group : yes you can drop because we already have age and day number

data = data.drop(columns = ['first','last','street','cc_num','lat','long','dob','trans_num',
                            'unix_time','merch_lat', 'merch_long','age_group','tr_day_name',"age_group"], axis=1)

In [14]:
data.columns

Index(['merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip',
       'city_pop', 'job', 'is_fraud', 'tr_year', 'tr_month', 'tr_day',
       'tr_hour', 'tr_minute', 'age', 'distance_cust_merchant_km',
       'card_txn_count'],
      dtype='object')

In [15]:
data.shape

(1852394, 18)

In [16]:
data.head()

Unnamed: 0,merchant,category,amt,gender,city,state,zip,city_pop,job,is_fraud,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,distance_cust_merchant_km,card_txn_count
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,28654,3495,"Psychologist, counselling",0,2019,1,1,0,0,31,78.6,2927
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,99160,149,Special educational needs teacher,0,2019,1,1,0,0,41,30.21,4362
2,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,83252,4154,Nature conservation officer,0,2019,1,1,0,0,57,108.21,735
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,59632,1939,Patent attorney,0,2019,1,1,0,1,52,95.67,743
4,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,24433,99,Dance movement psychotherapist,0,2019,1,1,0,3,33,77.56,2922


In [17]:
data_c = data.select_dtypes(include=['object'])

In [18]:
for c in data_c:
    print( c ,':',data_c[c].nunique())

merchant : 693
category : 14
gender : 2
city : 906
state : 51
job : 497


In [19]:
data_n = data.select_dtypes(include=['number'])

In [20]:
for c in data_n:
    print( c ,':',data_n[c].nunique())

amt : 60616
zip : 985
city_pop : 891
is_fraud : 2
tr_year : 2
tr_month : 12
tr_day : 7
tr_hour : 24
tr_minute : 60
age : 83
distance_cust_merchant_km : 14547
card_txn_count : 140


In [21]:
# can not cap amount column because amount with outlier has fraud and it is 7318 if i will cap it, model will not abke to catch the fraud properly.
# Q1 = data['amt'].quantile(0.25)
# Q3 = data['amt'].quantile(0.75)
# IQR = Q3 - Q1

# upper_lim = Q3 + 1.5 * IQR
# lower_lim = Q1 - 1.5 * IQR

In [22]:
# data['amt_clean'] = data['amt'].clip(lower=lower_lim, upper=upper_lim)

# data = data.drop(columns=['amt'])

In [23]:
data.head()

Unnamed: 0,merchant,category,amt,gender,city,state,zip,city_pop,job,is_fraud,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,distance_cust_merchant_km,card_txn_count
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,28654,3495,"Psychologist, counselling",0,2019,1,1,0,0,31,78.6,2927
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,99160,149,Special educational needs teacher,0,2019,1,1,0,0,41,30.21,4362
2,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,83252,4154,Nature conservation officer,0,2019,1,1,0,0,57,108.21,735
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,59632,1939,Patent attorney,0,2019,1,1,0,1,52,95.67,743
4,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,24433,99,Dance movement psychotherapist,0,2019,1,1,0,3,33,77.56,2922


In [24]:
# target = "is_fraud"

# cat_low  = ["state", "category","gender"]      # OHE (<= ~100 uniques)
# cat_high = ["merchant", "city", "job",'street']                             # Hashing (693/906/497 uniques)
# # 'gender' is low-card too; treat as binary OHE with drop='if_binary'

# num_cols = ["city_pop", "tr_month", "tr_day", "tr_hour", "tr_minute","age", "amt", "distance_cust_merchant_km"]

In [25]:

# MONTH (1–12)
data['tr_month_sin'] = np.sin(2 * np.pi * data['tr_month'] / 12)
data['tr_month_cos'] = np.cos(2 * np.pi * data['tr_month'] / 12)

# DAY OF MONTH (1–31)
data['tr_day_sin'] = np.sin(2 * np.pi * data['tr_day'] / 31)
data['tr_day_cos'] = np.cos(2 * np.pi * data['tr_day'] / 31)

# HOUR (0–23)
data['tr_hour_sin'] = np.sin(2 * np.pi * data['tr_hour'] / 24)
data['tr_hour_cos'] = np.cos(2 * np.pi * data['tr_hour'] / 24)

# MINUTE (0–59) – optional
data['tr_minute_sin'] = np.sin(2 * np.pi * data['tr_minute'] / 60)
data['tr_minute_cos'] = np.cos(2 * np.pi * data['tr_minute'] / 60)

In [26]:
data.drop(['tr_month', 'tr_day', 'tr_hour', 'tr_minute'], axis=1, inplace=True)

In [27]:
target = "is_fraud"
cat_low  = ["state", "category", "gender"]
cat_high = ["merchant", "city", "job","street"]   # remove 'street' from model side

num_cols = ["tr_month_sin", "tr_month_cos","tr_day_sin", "tr_day_cos","tr_hour_sin", "tr_hour_cos",
    "tr_minute_sin", "tr_minute_cos"]
scale_cols = ['amt', 'distance_cust_merchant_km', 'age', 'city_pop']

In [28]:
# ========= 3) Split FIRST (avoid leakage) =========
# Keep the original rare class ratio by stratifying
# stratify = y This forces all splits to have the same fraud ratio as the original dataset
# Stratify creates splits that keep the same class distribution as the target variable y.

X = data.drop(columns=[target])
y = data[target].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify = y,random_state=42)

In [29]:
#X_train.columns

Always fit and fit_transform on training data only.
Always transform on validation and test data.

In [30]:
scale_cols = ['amt', 'distance_cust_merchant_km', 'age', 'city_pop','card_txn_count']

In [31]:
from sklearn.preprocessing import StandardScaler

# -NUMERIC SCALER ----------
# with_mean=False ,Because subtracting mean creates non-zero values,turning the sparse matrix into a dense matrix → RAM explodes → model becomes extremely slow.

num_scaler = StandardScaler(with_mean=False)# keep sparse-friendly

# # Fit ONLY on numeric columns of TRAIN
X_train_num_scaled = num_scaler.fit_transform(X_train[scale_cols])

# Transform VAL and TEST using same scaler

X_test_num_scaled = num_scaler.transform(X_test[scale_cols])

In [32]:
type(X_train_num_scaled)

numpy.ndarray

In [33]:
# handle_unknown="ignore" If test data contains a category not seen during training Instead of error, ignore it (give all zeros)
#  sparse_output=True ,It does not print full matrix,Only shows non-zero positions,Very memory friendly, good for large data set
# sparse_output=False, → Output is a normal dense NumPy array, sparse_output=True → don’t store zeros sparse=False → store everything
cat_low  = ["state", "category", "gender",'tr_year']     # OHE (<= ~100 uniques)
ohe_low = OneHotEncoder(handle_unknown="ignore", sparse_output=True,dtype=np.float32)

X_train_cat_low = ohe_low.fit_transform(X_train[cat_low])
#X_val_cat_low   = ohe_low.transform(X_val[cat_low])
X_test_cat_low  = ohe_low.transform(X_test[cat_low])

In [34]:
type(X_train_cat_low )

scipy.sparse._csr.csr_matrix

In [35]:
# High-card categorical (Hashing)
X_train['zip'] = X_train['zip'].astype(str)
X_test['zip']  = X_test['zip'].astype(str)
cat_high = ["merchant", "city", "job",'zip']  # Hashing (693/906/497 uniques)
from category_encoders.hashing import HashingEncoder

hash_high = HashingEncoder(n_components=64)

X_train_cat_high = hash_high.fit_transform(X_train[cat_high])
#X_val_cat_high   = hash_high.transform(X_val[cat_high])
X_test_cat_high  = hash_high.transform(X_test[cat_high])
X_test_cat_high.columns

Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7',
       'col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_13', 'col_14',
       'col_15', 'col_16', 'col_17', 'col_18', 'col_19', 'col_20', 'col_21',
       'col_22', 'col_23', 'col_24', 'col_25', 'col_26', 'col_27', 'col_28',
       'col_29', 'col_30', 'col_31', 'col_32', 'col_33', 'col_34', 'col_35',
       'col_36', 'col_37', 'col_38', 'col_39', 'col_40', 'col_41', 'col_42',
       'col_43', 'col_44', 'col_45', 'col_46', 'col_47', 'col_48', 'col_49',
       'col_50', 'col_51', 'col_52', 'col_53', 'col_54', 'col_55', 'col_56',
       'col_57', 'col_58', 'col_59', 'col_60', 'col_61', 'col_62', 'col_63'],
      dtype='object')

In [36]:
type(X_train_cat_high)

pandas.core.frame.DataFrame

In [37]:
cols = cat_low + cat_high + scale_cols
X_train_rest = X_train.drop(columns =cols )
X_test_rest = X_test.drop(columns =cols )
X_train_rest.columns

Index(['tr_month_sin', 'tr_month_cos', 'tr_day_sin', 'tr_day_cos',
       'tr_hour_sin', 'tr_hour_cos', 'tr_minute_sin', 'tr_minute_cos'],
      dtype='object')

In [38]:
X_test_rest.columns

Index(['tr_month_sin', 'tr_month_cos', 'tr_day_sin', 'tr_day_cos',
       'tr_hour_sin', 'tr_hour_cos', 'tr_minute_sin', 'tr_minute_cos'],
      dtype='object')

In [39]:
A = csr_matrix(X_train_num_scaled.astype(np.float32))
B = X_train_cat_low                                
C = csr_matrix(X_train_cat_high.astype(np.float32))
D = csr_matrix(X_train_rest.astype(np.float32))

print(type(A), type(B),type(C),type(D))

X_train_final = hstack([A, B, C, D]).tocsr()

<class 'scipy.sparse._csr.csr_matrix'> <class 'scipy.sparse._csr.csr_matrix'> <class 'scipy.sparse._csr.csr_matrix'> <class 'scipy.sparse._csr.csr_matrix'>


In [40]:
A1 = csr_matrix(X_test_num_scaled.astype(np.float32))
B1 = X_test_cat_low                                
C1 = csr_matrix(X_test_cat_high.astype(np.float32))
D1 = csr_matrix(X_test_rest.astype(np.float32))
print(type(A), type(B),type(C),type(D))

X_test_final = hstack([A1, B1, C1, D1]).tocsr()


<class 'scipy.sparse._csr.csr_matrix'> <class 'scipy.sparse._csr.csr_matrix'> <class 'scipy.sparse._csr.csr_matrix'> <class 'scipy.sparse._csr.csr_matrix'>


In [41]:
X_train_final.shape

(1296675, 146)

In [42]:
X_test_final.shape

(555719, 146)

In [43]:
type(X_train_final)

scipy.sparse._csr.csr_matrix

In [44]:
# train and test dataset without scalling numeric features 

cols_woscaling = cat_low + cat_high
X_train_woscaling = X_train.drop(columns =cols_woscaling )
X_test_woscaling = X_test.drop(columns =cols_woscaling )
print(X_test_woscaling)

B = X_train_cat_low                                
C = csr_matrix(X_train_cat_high.astype(np.float32))
D = csr_matrix(X_train_woscaling.astype(np.float32))

X_train_final_wo_scaling = hstack( [B, C, D]).tocsr()

B1 = X_test_cat_low                                
C1 = csr_matrix(X_test_cat_high.astype(np.float32))
D1 = csr_matrix(X_test_woscaling.astype(np.float32))
print(type(B),type(C),type(D))

X_test_final_wo_scaling = hstack([B1, C1, D1]).tocsr()


            amt  city_pop  age  distance_cust_merchant_km  card_txn_count  \
1503399  112.65      3996   43                      73.25            2917   
158457     5.11     59705   36                      93.87             739   
201199     7.74      1523   35                      75.86            4380   
1691368    7.53       516   54                      86.24            2917   
1620696   98.67    192805   57                     107.76            1466   
...         ...       ...  ...                        ...             ...   
1219419    3.55     67858   25                      59.73             734   
962937    72.50       566   45                     124.39            1472   
212242    18.79     19408   26                      77.86            2922   
1004172    1.10     18182   45                       6.22            2920   
99989     15.06      1523   35                      55.36            4380   

         tr_month_sin  tr_month_cos  tr_day_sin  tr_day_cos  tr_hour_sin  \

In [45]:
# from sklearn.linear_model import LogisticRegression

In [46]:

# log_reg = LogisticRegression(
#     solver='saga',          # BEST for big sparse data
#     penalty='l2',           # stable, fast, recommended
#     C=0.1,                  # stronger regularization = faster convergence
#     class_weight='balanced',
#     max_iter=1000,          # required
#     n_jobs=-1
# )
# log_reg.fit(X_train_final, y_train)
 

In [47]:
# from sklearn.metrics import classification_report, roc_auc_score

# # 1. Predict test labels
# y_pred = log_reg.predict(X_test_final)

# # 2. Predict probability for fraud class
# y_prob = log_reg.predict_proba(X_test_final)[:, 1]

# # 3. Evaluate correctly
# print("ROC_AUC:", roc_auc_score(y_test, y_prob))
# print(classification_report(y_test, y_pred))

# giving memory issue error

In [48]:
# here we are using smote for oversampling because our data is imbalance so we can do it here 1:1 ration because its not pratical at all so here we are using 
# 0.1 means 10 % for fraud cased and 90 % for no fraud cased
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy=0.1, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_final, y_train)

# Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
rf_basic  = RandomForestClassifier(n_estimators= 10 ,max_depth = 20,min_samples_split =20, 
                   min_samples_leaf =10, max_features='sqrt',n_jobs =-1, random_state=42, class_weight='balanced')
rf_basic.fit(X_train_final_wo_scaling,y_train)
y_pred = rf_basic.predict(X_test_final_wo_scaling)
y_proba = rf_basic.predict_proba(X_test_final_wo_scaling)[:,1]
print(classification_report(y_test,y_pred))
print('Roc_auc_score', roc_auc_score(y_test,y_proba))


              precision    recall  f1-score   support

           0       1.00      0.99      1.00    552824
           1       0.40      0.84      0.54      2895

    accuracy                           0.99    555719
   macro avg       0.70      0.91      0.77    555719
weighted avg       1.00      0.99      0.99    555719

Roc_auc_score 0.9826958066176251
