### Preprocessing steps

In [39]:
# import libraries 
import pandas as pd
import numpy as np
import calendar
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer  # (kept for template; you said no nulls)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, classification_report

# High-cardinality encoder (fast, memory-safe)
from category_encoders.hashing import HashingEncoder
#from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import OneHotEncoder

In [40]:
#!pip install category_encoders

In [41]:
data = pd.read_csv('eda_dataset.csv')

In [42]:
data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,amt_clean,distance_cust_merchant_km
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,2019,1,1,0,0,31,adult,Tuesday,4.97,78.6
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,2019,1,1,0,0,41,adult,Tuesday,107.23,30.21
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,2019,1,1,0,0,57,senior,Tuesday,47.45,108.21
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,2019,1,1,0,1,52,senior,Tuesday,45.0,95.67
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,2019,1,1,0,3,33,adult,Tuesday,41.96,77.56


In [43]:
data.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'tr_year', 'tr_month', 'tr_day', 'tr_hour', 'tr_minute', 'age',
       'age_group', 'tr_day_name', 'amt_clean', 'distance_cust_merchant_km'],
      dtype='object')

In [44]:
data['full_name'] = data['first'] + " " + data['last']

In [45]:
# drop amt_clean because i have removed outlier from it and in fraud cases amount is important so instead of removing im capping upper limit here with fic amount 
data = data.drop(columns = ['amt_clean'])

# will create new clean_amount column after splitting data to prevent data leakage (means we nor use test data for capping so that we can get realistic model performance)

In [46]:
data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,distance_cust_merchant_km,full_name
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,2019,1,1,0,0,31,adult,Tuesday,78.6,Jennifer Banks
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,2019,1,1,0,0,41,adult,Tuesday,30.21,Stephanie Gill
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,2019,1,1,0,0,57,senior,Tuesday,108.21,Edward Sanchez
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,2019,1,1,0,1,52,senior,Tuesday,95.67,Jeremy White
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,2019,1,1,0,3,33,adult,Tuesday,77.56,Tyler Garcia


In [47]:
data_cat = data.select_dtypes(include=['object'])

In [48]:
# category columns cardinality number
for c in data_cat:
    print( c ,':',data_cat[c].nunique())

merchant : 693
category : 14
first : 355
last : 486
gender : 2
street : 999
city : 906
state : 51
job : 497
dob : 984
trans_num : 1852394
unix_time : 1819583
age_group : 3
tr_day_name : 7
full_name : 989


In [49]:
data_num = data.select_dtypes(include=['number'])

In [50]:
data_num.columns

Index(['cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop', 'merch_lat',
       'merch_long', 'is_fraud', 'tr_year', 'tr_month', 'tr_day', 'tr_hour',
       'tr_minute', 'age', 'distance_cust_merchant_km'],
      dtype='object')

In [51]:
for c in data_num:
    print( c ,':',data_num[c].nunique())

cc_num : 999
amt : 60616
zip : 985
lat : 983
long : 983
city_pop : 891
merch_lat : 1754157
merch_long : 1809753
is_fraud : 2
tr_year : 2
tr_month : 12
tr_day : 7
tr_hour : 24
tr_minute : 60
age : 83
distance_cust_merchant_km : 14547


In [52]:
data.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'tr_year', 'tr_month', 'tr_day', 'tr_hour', 'tr_minute', 'age',
       'age_group', 'tr_day_name', 'distance_cust_merchant_km', 'full_name'],
      dtype='object')

In [53]:
# Credit card number is identifier it does not show any trend so i dropped it
#'merch_lat','merch_long', 'merch_lat','merch_long' alredy derived distance from it so decided to drop it 
# dob , extraxted age from it so we can drop it
# trans_num is unique identifier so delete it
# droped unix time because difference between tras_date_time is too large 2557 days it might be due to error in synthetic data 
## droped 'trans_date_trans_time' because already calculated age and extracted tr_hr,tr_year,tr_month, tr_minute etc.
# claculated age so dropping age group column
# first, last,'street has high cordinality(many distinct values) so its do not have any pattern in it so deleting it???????????????
# dropped 'tr_day_name' because already we have tr_day meaning is same. so keep numerical column because most ml model works on numerical value only.
# should i drop tr_day_name and age_group : yes you can drop because we already have age and day number

data = data.drop(columns = ['cc_num','zip','lat','long','dob','trans_num','unix_time','merch_lat', 'merch_long','age_group','tr_year','tr_day_name'], axis=1)

In [54]:
data.columns

Index(['merchant', 'category', 'amt', 'first', 'last', 'gender', 'street',
       'city', 'state', 'city_pop', 'job', 'is_fraud', 'tr_month', 'tr_day',
       'tr_hour', 'tr_minute', 'age', 'distance_cust_merchant_km',
       'full_name'],
      dtype='object')

In [55]:
data.shape

(1852394, 19)

In [56]:
data.head()

Unnamed: 0,merchant,category,amt,first,last,gender,street,city,state,city_pop,job,is_fraud,tr_month,tr_day,tr_hour,tr_minute,age,distance_cust_merchant_km,full_name
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,3495,"Psychologist, counselling",0,1,1,0,0,31,78.6,Jennifer Banks
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,149,Special educational needs teacher,0,1,1,0,0,41,30.21,Stephanie Gill
2,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,4154,Nature conservation officer,0,1,1,0,0,57,108.21,Edward Sanchez
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,1939,Patent attorney,0,1,1,0,1,52,95.67,Jeremy White
4,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,99,Dance movement psychotherapist,0,1,1,0,3,33,77.56,Tyler Garcia


In [57]:
data_c = data.select_dtypes(include=['object'])

In [58]:
for c in data_c:
    print( c ,':',data_c[c].nunique())

merchant : 693
category : 14
first : 355
last : 486
gender : 2
street : 999
city : 906
state : 51
job : 497
full_name : 989


In [59]:
data_n = data.select_dtypes(include=['number'])

In [60]:
for c in data_n:
    print( c ,':',data_n[c].nunique())

amt : 60616
city_pop : 891
is_fraud : 2
tr_month : 12
tr_day : 7
tr_hour : 24
tr_minute : 60
age : 83
distance_cust_merchant_km : 14547


In [61]:
# plt.figure(figsize= (40,20))
# sns.heatmap(data_n.corr(), annot = True)
# plt.show

In [62]:
# can not cap amount column because amount with outlier has fraud and it is 7318 if i will cap it, model will not abke to catch the fraud properly.
# Q1 = data['amt'].quantile(0.25)
# Q3 = data['amt'].quantile(0.75)
# IQR = Q3 - Q1

# upper_lim = Q3 + 1.5 * IQR
# lower_lim = Q1 - 1.5 * IQR

In [63]:
# data['amt_clean'] = data['amt'].clip(lower=lower_lim, upper=upper_lim)

# data = data.drop(columns=['amt'])

In [64]:
data.head()

Unnamed: 0,merchant,category,amt,first,last,gender,street,city,state,city_pop,job,is_fraud,tr_month,tr_day,tr_hour,tr_minute,age,distance_cust_merchant_km,full_name
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,3495,"Psychologist, counselling",0,1,1,0,0,31,78.6,Jennifer Banks
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,149,Special educational needs teacher,0,1,1,0,0,41,30.21,Stephanie Gill
2,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,4154,Nature conservation officer,0,1,1,0,0,57,108.21,Edward Sanchez
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,1939,Patent attorney,0,1,1,0,1,52,95.67,Jeremy White
4,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,99,Dance movement psychotherapist,0,1,1,0,3,33,77.56,Tyler Garcia


In [65]:
target = "is_fraud"

cat_low  = ["state", "category","gender"]      # OHE (<= ~100 uniques)
cat_high = ["merchant", "city", "job",'street']                             # Hashing (693/906/497 uniques)
# 'gender' is low-card too; treat as binary OHE with drop='if_binary'

num_cols = ["city_pop", "tr_month", "tr_day", "tr_hour", "tr_minute","age", "amt", "distance_cust_merchant_km"]

In [66]:

# MONTH (1–12)
data['tr_month_sin'] = np.sin(2 * np.pi * data['tr_month'] / 12)
data['tr_month_cos'] = np.cos(2 * np.pi * data['tr_month'] / 12)

# DAY OF MONTH (1–31)
data['tr_day_sin'] = np.sin(2 * np.pi * data['tr_day'] / 31)
data['tr_day_cos'] = np.cos(2 * np.pi * data['tr_day'] / 31)

# HOUR (0–23)
data['tr_hour_sin'] = np.sin(2 * np.pi * data['tr_hour'] / 24)
data['tr_hour_cos'] = np.cos(2 * np.pi * data['tr_hour'] / 24)

# MINUTE (0–59) – optional
data['tr_minute_sin'] = np.sin(2 * np.pi * data['tr_minute'] / 60)
data['tr_minute_cos'] = np.cos(2 * np.pi * data['tr_minute'] / 60)

In [67]:
data.drop(['tr_month', 'tr_day', 'tr_hour', 'tr_minute'], axis=1, inplace=True)

In [68]:
target = "is_fraud"
cat_low  = ["state", "category", "gender"]
cat_high = ["merchant", "city", "job","street"]   # remove 'street' from model side

num_cols = ["tr_month_sin", "tr_month_cos","tr_day_sin", "tr_day_cos","tr_hour_sin", "tr_hour_cos",
    "tr_minute_sin", "tr_minute_cos"]
scale_cols = ['amt', 'distance_cust_merchant_km', 'age', 'city_pop']

In [69]:
# ========= 3) Split FIRST (avoid leakage) =========
# Keep the original rare class ratio by stratifying
# stratify = y This forces all splits to have the same fraud ratio as the original dataset
# Stratify creates splits that keep the same class distribution as the target variable y.

X = data.drop(columns=[target])
y = data[target].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, stratify = y,random_state=42)

In [70]:
# Sizes: Train 75%, Val 12.5%, Test 12.5%
#X_val, X_test, y_val, y_test = train_test_split(X_temp,y_temp , test_size=0.5, stratify= y_temp, random_state=42)


In [71]:
# print(X_train.shape)
# print(X_test.shape,X_val.shape)

In [72]:
data['is_fraud'].value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

In [73]:
data['is_fraud'].value_counts()
data['is_fraud'].sum()
fraud_percent = data['is_fraud'].sum()/len(data['is_fraud'])*100
print(f'farud percent is {fraud_percent:.2f}%')

farud percent is 0.52%


In [74]:
# Create imputer , fir only on train and transform on test and validation.
#num_imputer = SimpleImputer(strategy="median")

# ---- FIT ONLY ON TRAIN ----
#X_train_num_imputed = num_imputer.fit_transform(X_train[num_cols])

# ---- TRANSFORM VAL & TEST ----
#X_val_num_imputed  = num_imputer.transform(X_val[num_cols])
#X_test_num_imputed = num_imputer.transform(X_test[num_cols])

First understand: What is data leakage?
Leakage happens when test/validation information is used while training.
Examples of leakage:
Computing mean/median using full dataset
Computing outlier thresholds using full dataset
Scaling using full dataset
Target encoding using full dataset
This lets your model “peek” at future data → giving fake high accuracy.

In [75]:
#X_train.head()

Always fit and fit_transform on training data only.
Always transform on validation and test data.

In [76]:
X_train[num_cols].head()

Unnamed: 0,tr_month_sin,tr_month_cos,tr_day_sin,tr_day_cos,tr_hour_sin,tr_hour_cos,tr_minute_sin,tr_minute_cos
1049265,1.0,6.123234000000001e-17,0.201299,0.97953,-0.866025,0.5,-0.5,-0.866025
518201,-0.8660254,-0.5,0.937752,0.347305,-0.5,0.866025,0.951057,0.309017
1291723,1.224647e-16,-1.0,0.724793,0.688967,-0.258819,-0.965926,-0.951057,0.309017
466743,-0.5,-0.8660254,0.394356,0.918958,-0.258819,0.965926,-0.913545,-0.406737
1461770,-0.8660254,-0.5,0.0,1.0,-0.258819,0.965926,-0.978148,0.207912


In [77]:
# -NUMERIC SCALER ----------
# with_mean=False ,Because subtracting mean creates non-zero values,turning the sparse matrix into a dense matrix → RAM explodes → model becomes extremely slow.

# num_scaler = StandardScaler(with_mean=False)# keep sparse-friendly

# # Fit ONLY on numeric columns of TRAIN
# X_train_num_scaled = num_scaler.fit_transform(X_train[num_cols])

# # Transform VAL and TEST using same scaler
# #X_val_num_scaled  = num_scaler.transform(X_val[num_cols])
# X_test_num_scaled = num_scaler.transform(X_test[num_cols])

#not needed----------------------

In [78]:
scale_cols = ['amt', 'distance_cust_merchant_km', 'age', 'city_pop']

In [79]:
from sklearn.preprocessing import StandardScaler
#from scipy.sparse import csr_matrix

# -NUMERIC SCALER ----------
# with_mean=False ,Because subtracting mean creates non-zero values,turning the sparse matrix into a dense matrix → RAM explodes → model becomes extremely slow.

num_scaler = StandardScaler(with_mean=False)# keep sparse-friendly

# # Fit ONLY on numeric columns of TRAIN
X_train_num_scaled = num_scaler.fit_transform(X_train[scale_cols])

# Transform VAL and TEST using same scaler

X_test_num_scaled = num_scaler.transform(X_test[scale_cols])

In [80]:
#from sklearn.preprocessing import OneHotEncoder

In [81]:
# cat_low  = ["state", "category", "age_group", "tr_day_name"]      # OHE (<= ~100 uniques)
# cat_high = ["merchant", "city", "job"]                             # Hashing (693/906/497 uniques)

In [82]:
# handle_unknown="ignore" If test data contains a category not seen during training Instead of error, ignore it (give all zeros)
#  sparse_output=True ,It does not print full matrix,Only shows non-zero positions,Very memory friendly, good for large data set
# sparse_output=False, → Output is a normal dense NumPy array, sparse_output=True → don’t store zeros sparse=False → store everything

ohe_low = OneHotEncoder(handle_unknown="ignore", sparse_output=True,dtype=np.float32)

X_train_cat_low = ohe_low.fit_transform(X_train[cat_low])
#X_val_cat_low   = ohe_low.transform(X_val[cat_low])
X_test_cat_low  = ohe_low.transform(X_test[cat_low])

In [83]:
# High-card categorical (Hashing)

from category_encoders.hashing import HashingEncoder

hash_high = HashingEncoder(n_components=64)

X_train_cat_high = hash_high.fit_transform(X_train[cat_high])
#X_val_cat_high   = hash_high.transform(X_val[cat_high])
X_test_cat_high  = hash_high.transform(X_test[cat_high])


In [84]:
X_train_num = X_train[num_cols].values
#X_val_num   = X_val[num_cols].values
X_test_num  = X_test[num_cols].values

#.values converts to NumPy array — required for hstack().


In [85]:
# Convert numeric numpy array into a sparse matrix (required for hstack with OHE/HashingEncoder)
from scipy.sparse import csr_matrix

X_train_num = csr_matrix(X_train_num)
#X_val_num   = csr_matrix(X_val_num)
X_test_num  = csr_matrix(X_test_num)


In [86]:
# We use hstack (horizontal stack) to combine encoded/scaled feature matrices.
# merge() is only for DataFrames with a key column (like SQL join).
# After encoding/scaling, features become sparse matrices → they have no index/key.
# hstack simply puts matrices side-by-side (column-wise) while keeping sparsity.
# Therefore:
#    merge() = join DataFrames by key
#    hstack() = join matrices by columns (used in ML feature engineering)

In [87]:
#Combine all parts
from scipy.sparse import hstack

X_train_final = hstack([X_train_num,X_train_num_scaled,X_train_cat_low, X_train_cat_high])

#X_val_final = hstack([X_val_num_scaled,X_val_cat_low,X_val_cat_high])

X_test_final = hstack([X_test_num ,X_test_num_scaled, X_test_cat_low, X_test_cat_high])

In [88]:
X_train_final.shape

(1389295, 143)

In [89]:
# X_val_final.shape

In [90]:
type(X_train_final)

scipy.sparse._coo.coo_matrix

In [91]:
from sklearn.linear_model import LogisticRegression

In [92]:

log_reg = LogisticRegression(class_weight='balanced',max_iter=100,n_jobs=-1)

log_reg.fit(X_train_final, y_train)

In [93]:
from sklearn.metrics import classification_report, roc_auc_score

# 1. Predict test labels
y_pred = log_reg.predict(X_test_final)

# 2. Predict probability for fraud class
y_prob = log_reg.predict_proba(X_test_final)[:, 1]

# 3. Evaluate correctly
print("ROC_AUC:", roc_auc_score(y_test, y_prob))
print(classification_report(y_test, y_pred))

# this model is not good at all because catching only0.02 % of fraud cases and for fraid cases every sccore is not good.

ROC_AUC: 0.9482744753886994
              precision    recall  f1-score   support

           0       1.00      0.88      0.94    460686
           1       0.04      0.84      0.07      2413

    accuracy                           0.88    463099
   macro avg       0.52      0.86      0.50    463099
weighted avg       0.99      0.88      0.93    463099



In [94]:
# here we are using smote for oversampling because our data is imbalance so we can do it here 1:1 ration because its not pratical at all so here we are using 
# 0.1 means 10 % for fraud cased and 90 % for no fraud cased
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy=0.1, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_final, y_train)

In [95]:
log_reg = LogisticRegression(class_weight='balanced',max_iter=200,n_jobs=-1)
log_reg.fit(X_train_res, y_train_res)

In [96]:
y_pred1 = log_reg.predict(X_test_final)


# 2. Predict probability for fraud class
y_prob1 = log_reg.predict_proba(X_test_final)[:, 1]

# 3. Evaluate correctly
print("ROC_AUC:", roc_auc_score(y_test, y_prob1))
print(classification_report(y_test, y_pred1))

ROC_AUC: 0.9474305988180198
              precision    recall  f1-score   support

           0       1.00      0.88      0.94    460686
           1       0.04      0.84      0.07      2413

    accuracy                           0.88    463099
   macro avg       0.52      0.86      0.50    463099
weighted avg       0.99      0.88      0.93    463099

