### Preprocessing steps

In [1]:
# import libraries 
import pandas as pd
import numpy as np
import calendar
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer  # (kept for template; you said no nulls)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, classification_report

# High-cardinality encoder (fast, memory-safe)
from category_encoders.hashing import HashingEncoder
#from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import OneHotEncoder

In [2]:
#!pip install category_encoders

In [3]:
data = pd.read_csv('eda_dataset.csv')

In [4]:
data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,amt_clean,distance_cust_merchant_km
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,2019,1,1,0,0,31,adult,Tuesday,4.97,78.6
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,2019,1,1,0,0,41,adult,Tuesday,107.23,30.21
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,2019,1,1,0,0,57,senior,Tuesday,47.45,108.21
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,2019,1,1,0,1,52,senior,Tuesday,45.0,95.67
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,2019,1,1,0,3,33,adult,Tuesday,41.96,77.56


In [5]:
data.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'tr_year', 'tr_month', 'tr_day', 'tr_hour', 'tr_minute', 'age',
       'age_group', 'tr_day_name', 'amt_clean', 'distance_cust_merchant_km'],
      dtype='object')

In [6]:
# drop amt_clean because i have removed outlier from it and in fraud cases amount is important so instead of removing im capping upper limit here with fic amount 
data = data.drop(columns = ['amt_clean'])

# will create new clean_amount column after splitting data to prevent data leakage (means we nor use test data for capping so that we can get realistic model performance)

In [7]:
data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,is_fraud,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,distance_cust_merchant_km
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,0,2019,1,1,0,0,31,adult,Tuesday,78.6
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,0,2019,1,1,0,0,41,adult,Tuesday,30.21
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,0,2019,1,1,0,0,57,senior,Tuesday,108.21
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,0,2019,1,1,0,1,52,senior,Tuesday,95.67
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,0,2019,1,1,0,3,33,adult,Tuesday,77.56


In [8]:
data_cat = data.select_dtypes(include=['object'])

In [9]:
# category columns cardinality number
for c in data_cat:
    print( c ,':',data_cat[c].nunique())

merchant : 693
category : 14
first : 355
last : 486
gender : 2
street : 999
city : 906
state : 51
job : 497
dob : 984
trans_num : 1852394
unix_time : 1819583
age_group : 3
tr_day_name : 7


In [10]:
data_num = data.select_dtypes(include=['number'])

In [11]:
data_num.columns

Index(['cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop', 'merch_lat',
       'merch_long', 'is_fraud', 'tr_year', 'tr_month', 'tr_day', 'tr_hour',
       'tr_minute', 'age', 'distance_cust_merchant_km'],
      dtype='object')

In [12]:
for c in data_num:
    print( c ,':',data_num[c].nunique())

cc_num : 999
amt : 60616
zip : 985
lat : 983
long : 983
city_pop : 891
merch_lat : 1754157
merch_long : 1809753
is_fraud : 2
tr_year : 2
tr_month : 12
tr_day : 7
tr_hour : 24
tr_minute : 60
age : 83
distance_cust_merchant_km : 14547


In [13]:
data.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'tr_year', 'tr_month', 'tr_day', 'tr_hour', 'tr_minute', 'age',
       'age_group', 'tr_day_name', 'distance_cust_merchant_km'],
      dtype='object')

In [14]:
# Credit card number is identifier it does not show any trend so i dropped it
#'merch_lat','merch_long', 'merch_lat','merch_long' alredy derived distance from it so decided to drop it 
# dob , extraxted age from it so we can drop it
# trans_num is unique identifier so delete it
# droped unix time because difference between tras_date_time is too large 2557 days it might be due to error in synthetic data 
## droped 'trans_date_trans_time' because already calculated age and extracted tr_hr,tr_year,tr_month, tr_minute etc.
# claculated age so dropping age group column
# first, last,'street has high cordinality(many distinct values) so its do not have any pattern in it so deleting it???????????????
# dropped 'tr_day_name' because already we have tr_day meaning is same. so keep numerical column because most ml model works on numerical value only.
# should i drop tr_day_name and age_group or not ??????????????????????????????

data = data.drop(columns = ['cc_num','first', 'last','street','lat', 
                            'long','dob','trans_num','unix_time','merch_lat', 'merch_long'])

In [15]:
data.columns

Index(['merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip',
       'city_pop', 'job', 'is_fraud', 'tr_year', 'tr_month', 'tr_day',
       'tr_hour', 'tr_minute', 'age', 'age_group', 'tr_day_name',
       'distance_cust_merchant_km'],
      dtype='object')

In [16]:
data.shape

(1852394, 19)

In [17]:
data.head()

Unnamed: 0,merchant,category,amt,gender,city,state,zip,city_pop,job,is_fraud,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,distance_cust_merchant_km
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,28654,3495,"Psychologist, counselling",0,2019,1,1,0,0,31,adult,Tuesday,78.6
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,99160,149,Special educational needs teacher,0,2019,1,1,0,0,41,adult,Tuesday,30.21
2,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,83252,4154,Nature conservation officer,0,2019,1,1,0,0,57,senior,Tuesday,108.21
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,59632,1939,Patent attorney,0,2019,1,1,0,1,52,senior,Tuesday,95.67
4,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,24433,99,Dance movement psychotherapist,0,2019,1,1,0,3,33,adult,Tuesday,77.56


In [18]:
data_c = data.select_dtypes(include=['object'])

In [19]:
for c in data_c:
    print( c ,':',data_c[c].nunique())

merchant : 693
category : 14
gender : 2
city : 906
state : 51
job : 497
age_group : 3
tr_day_name : 7


In [20]:
data_n = data.select_dtypes(include=['number'])

In [21]:
for c in data_n:
    print( c ,':',data_n[c].nunique())

amt : 60616
zip : 985
city_pop : 891
is_fraud : 2
tr_year : 2
tr_month : 12
tr_day : 7
tr_hour : 24
tr_minute : 60
age : 83
distance_cust_merchant_km : 14547


In [22]:
Q1 = data['amt'].quantile(0.25)
Q3 = data['amt'].quantile(0.75)
IQR = Q3 - Q1

upper_lim = Q3 + 1.5 * IQR
lower_lim = Q1 - 1.5 * IQR

In [23]:
data['amt_clean'] = data['amt'].clip(lower=lower_lim, upper=upper_lim)

data = data.drop(columns=['amt'])

In [24]:
data.head()

Unnamed: 0,merchant,category,gender,city,state,zip,city_pop,job,is_fraud,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,distance_cust_merchant_km,amt_clean
0,"fraud_Rippin, Kub and Mann",misc_net,F,Moravian Falls,NC,28654,3495,"Psychologist, counselling",0,2019,1,1,0,0,31,adult,Tuesday,78.6,4.97
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,F,Orient,WA,99160,149,Special educational needs teacher,0,2019,1,1,0,0,41,adult,Tuesday,30.21,107.23
2,fraud_Lind-Buckridge,entertainment,M,Malad City,ID,83252,4154,Nature conservation officer,0,2019,1,1,0,0,57,senior,Tuesday,108.21,193.29
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,M,Boulder,MT,59632,1939,Patent attorney,0,2019,1,1,0,1,52,senior,Tuesday,95.67,45.0
4,fraud_Keeling-Crist,misc_pos,M,Doe Hill,VA,24433,99,Dance movement psychotherapist,0,2019,1,1,0,3,33,adult,Tuesday,77.56,41.96


In [25]:
target = "is_fraud"

cat_low  = ["state", "category", "age_group", "tr_day_name","gender"]      # OHE (<= ~100 uniques)
cat_high = ["merchant", "city", "job"]                             # Hashing (693/906/497 uniques)
# 'gender' is low-card too; treat as binary OHE with drop='if_binary'

num_cols = ["zip", "city_pop", "tr_year", "tr_month", "tr_day", "tr_hour", "tr_minute","age", "amt_clean", "distance_cust_merchant_km"]

In [26]:
# ========= 3) Split FIRST (avoid leakage) =========
# Keep the original rare class ratio by stratifying
# stratify = y This forces all splits to have the same fraud ratio as the original dataset
# Stratify creates splits that keep the same class distribution as the target variable y.

X = data.drop(columns=[target])
y = data[target].astype(int)

X_train, X_temp, y_train, y_temp = train_test_split(X,y, test_size=0.25, stratify =y,random_state=42)

In [27]:
# Sizes: Train 75%, Val 12.5%, Test 12.5%
X_val, X_test, y_val, y_test = train_test_split(X_temp,y_temp , test_size=0.5, stratify= y_temp, random_state=42)


In [28]:
print(X_train.shape)
print(X_test.shape,X_val.shape)

(1389295, 18)
(231550, 18) (231549, 18)


In [29]:
data['is_fraud'].value_counts()
data['is_fraud'].sum()
fraud_percent = data['is_fraud'].sum()/len(data['is_fraud'])*100
print(f'farud percent is {fraud_percent:.2f}%')

farud percent is 0.52%


In [30]:
# Create imputer , fir only on train and transform on test and validation.
#num_imputer = SimpleImputer(strategy="median")

# ---- FIT ONLY ON TRAIN ----
#X_train_num_imputed = num_imputer.fit_transform(X_train[num_cols])

# ---- TRANSFORM VAL & TEST ----
#X_val_num_imputed  = num_imputer.transform(X_val[num_cols])
#X_test_num_imputed = num_imputer.transform(X_test[num_cols])

First understand: What is data leakage?
Leakage happens when test/validation information is used while training.
Examples of leakage:
Computing mean/median using full dataset
Computing outlier thresholds using full dataset
Scaling using full dataset
Target encoding using full dataset
This lets your model “peek” at future data → giving fake high accuracy.

In [31]:
#X_train.head()

Always fit and fit_transform on training data only.
Always transform on validation and test data.

In [32]:
X_train[num_cols].head()

Unnamed: 0,zip,city_pop,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,amt_clean,distance_cust_merchant_km
1049265,56592,516,2020,3,1,20,35,54,41.17,109.21
518201,79539,178,2019,8,6,22,12,36,1.58,56.36
1291723,29127,8333,2020,6,4,13,48,26,38.19,65.27
466743,46765,5341,2019,7,2,23,41,25,148.76,67.34
1461770,38761,2870,2020,8,0,23,47,21,32.52,99.79


In [33]:
# -NUMERIC SCALER ----------
# with_mean=False ,Because subtracting mean creates non-zero values,turning the sparse matrix into a dense matrix → RAM explodes → model becomes extremely slow.

num_scaler = StandardScaler(with_mean=False)# keep sparse-friendly

# Fit ONLY on numeric columns of TRAIN
X_train_num_scaled = num_scaler.fit_transform(X_train[num_cols])

# Transform VAL and TEST using same scaler
X_val_num_scaled  = num_scaler.transform(X_val[num_cols])
X_test_num_scaled = num_scaler.transform(X_test[num_cols])

In [34]:
#from sklearn.preprocessing import OneHotEncoder

In [35]:
# cat_low  = ["state", "category", "age_group", "tr_day_name"]      # OHE (<= ~100 uniques)
# cat_high = ["merchant", "city", "job"]                             # Hashing (693/906/497 uniques)

In [36]:
# handle_unknown="ignore" If test data contains a category not seen during training Instead of error, ignore it (give all zeros)
#  sparse_output=True ,It does not print full matrix,Only shows non-zero positions,Very memory friendly, good for large data set
# sparse_output=False, → Output is a normal dense NumPy array, sparse_output=True → don’t store zeros sparse=False → store everything

ohe_low = OneHotEncoder(handle_unknown="ignore", sparse_output=True,dtype=np.float32)

X_train_cat_low = ohe_low.fit_transform(X_train[cat_low])
X_val_cat_low   = ohe_low.transform(X_val[cat_low])
X_test_cat_low  = ohe_low.transform(X_test[cat_low])

In [43]:
# High-card categorical (Hashing)

from category_encoders.hashing import HashingEncoder

hash_high = HashingEncoder(n_components=64)

X_train_cat_high = hash_high.fit_transform(X_train[cat_high])
X_val_cat_high   = hash_high.transform(X_val[cat_high])
X_test_cat_high  = hash_high.transform(X_test[cat_high])


In [38]:
# We use hstack (horizontal stack) to combine encoded/scaled feature matrices.
# merge() is only for DataFrames with a key column (like SQL join).
# After encoding/scaling, features become sparse matrices → they have no index/key.
# hstack simply puts matrices side-by-side (column-wise) while keeping sparsity.
# Therefore:
#    merge() = join DataFrames by key
#    hstack() = join matrices by columns (used in ML feature engineering)

In [44]:
#Combine all parts
from scipy.sparse import hstack

X_train_final = hstack([X_train_num_scaled,X_train_cat_low, X_train_cat_high])

X_val_final = hstack([X_val_num_scaled,X_val_cat_low,X_val_cat_high])

X_test_final = hstack([X_test_num_scaled, X_test_cat_low, X_test_cat_high])

In [45]:
X_train_final.shape

(1389295, 151)

In [46]:
X_val_final.shape

(231549, 151)

In [47]:
type(X_train_final)

scipy.sparse._coo.coo_matrix