In [1]:
import pandas as pd
import numpy as np
import calendar
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# High-cardinality encoder (fast, memory-safe)
from category_encoders.hashing import HashingEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # (kept for template; we do not have nulls)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, classification_report
#from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [2]:
# data = pd.read_csv('Semi_time_scaled_data.csv')

In [3]:
# Load the EDA-stage dataset that every later cell works from.
eda_csv_path = 'eda_dataset.csv'
data = pd.read_csv(eda_csv_path)

In [4]:
# Preview the first rows to sanity-check the loaded columns and values.
data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,amt_clean,distance_cust_merchant_km
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,2019,1,1,0,0,31,adult,Tuesday,4.97,78.6
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,2019,1,1,0,0,41,adult,Tuesday,107.23,30.21
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,2019,1,1,0,0,57,senior,Tuesday,47.45,108.21
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,2019,1,1,0,1,52,senior,Tuesday,45.0,95.67
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,2019,1,1,0,3,33,adult,Tuesday,41.96,77.56


In [5]:
# (rows, columns) of the frame as loaded.
data.shape

(1852394, 31)

In [7]:

# Columns that are not used as model features further down.
# tr_minute is dropped for now.
# NOTE: `data` is reassigned here, so re-running this cell without re-loading the
# CSV raises KeyError because these columns are already gone.
cols_to_drop = [
    'merchant', 'distance_cust_merchant_km', 'cc_num', 'first', 'last', 'street',
    'lat', 'long', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long',
    'age_group', 'tr_day', 'tr_minute',
]
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
data = data.drop(columns=cols_to_drop)

In [8]:
# Confirm the column count after the drop.
data.shape

(1852394, 15)

In [9]:
# Columns that remain available for modelling.
data.columns

Index(['category', 'amt', 'gender', 'city', 'state', 'zip', 'city_pop', 'job',
       'is_fraud', 'tr_year', 'tr_month', 'tr_hour', 'age', 'tr_day_name',
       'amt_clean'],
      dtype='object')

Data sampling: down-sample the non-fraud rows to a 6:1 non-fraud-to-fraud ratio

In [10]:
# Step 1: split the frame by label so the two classes can be sampled separately.
label_col = data['is_fraud']
fraud = data[label_col == 1]
non_fraud = data[label_col == 0]

In [11]:
# Step 2: Down-sample the non-fraud rows to shrink the dataset and soften the class imbalance.
# The ratio kept is non-fraud : fraud = 6 : 1; change NON_FRAUD_PER_FRAUD (e.g. 5, 10, 20) to adjust.
NON_FRAUD_PER_FRAUD = 6
# Cap at the number of available non-fraud rows so .sample() never asks for more rows than exist.
n_non_fraud = min(len(non_fraud), len(fraud) * NON_FRAUD_PER_FRAUD)
# random_state=42 keeps the draw reproducible.
non_fraud_sampled = non_fraud.sample(n=n_non_fraud, random_state=42)

In [12]:
# Step 3: Stack the fraud rows and the down-sampled non-fraud rows into one frame.
frames_to_stack = [fraud, non_fraud_sampled]
reduced_data = pd.concat(frames_to_stack)


In [13]:
# Step 4: Shuffle every row (frac=1) so the two classes are interleaved,
# then renumber the index from 0.
shuffled_rows = reduced_data.sample(frac=1, random_state=42)
reduced_data = shuffled_rows.reset_index(drop=True)

In [14]:
# Sanity check: class proportions and overall size of the reduced dataset.
class_share = reduced_data['is_fraud'].value_counts(normalize=True)
print(class_share)
print(reduced_data.shape)

is_fraud
0    0.857143
1    0.142857
Name: proportion, dtype: float64
(67557, 15)


In [15]:
# (rows, columns) of the down-sampled frame.
reduced_data.shape

(67557, 15)

In [16]:
# Columns carried over into the down-sampled frame.
reduced_data.columns

Index(['category', 'amt', 'gender', 'city', 'state', 'zip', 'city_pop', 'job',
       'is_fraud', 'tr_year', 'tr_month', 'tr_hour', 'age', 'tr_day_name',
       'amt_clean'],
      dtype='object')

In [17]:
#reduced_data['merchant'] = reduced_data['merchant'].str.replace('fraud_', '', regex=False)

In [18]:
# Preview the shuffled, down-sampled rows.
reduced_data.head()

Unnamed: 0,category,amt,gender,city,state,zip,city_pop,job,is_fraud,tr_year,tr_month,tr_hour,age,tr_day_name,amt_clean
0,gas_transport,9.08,M,Houston,TX,77038,2906700,Careers adviser,1,2019,10,3,23,Sunday,9.08
1,misc_pos,20.7,M,Paulding,OH,45879,6284,Secondary school teacher,0,2019,4,10,83,Wednesday,20.7
2,shopping_pos,8.07,M,Milner,GA,30257,4138,Field seismologist,0,2019,5,1,50,Friday,8.07
3,misc_pos,38.55,F,Mulberry Grove,IL,62262,1810,Race relations officer,0,2020,1,18,46,Sunday,38.55
4,travel,1.51,M,Marion,CT,6444,370,Health service manager,1,2019,10,22,57,Thursday,1.51


In [19]:
# Re-check the size of the down-sampled frame before feature selection.
reduced_data.shape

(67557, 15)

In [20]:
# Distinct-value count per text (object) column; used to decide which encoder each column gets.
reduced_data_c = reduced_data.select_dtypes(include=['object'])
for col_name, n_unique in reduced_data_c.nunique().items():
    print(col_name, ':', n_unique)

category : 14
gender : 2
city : 906
state : 51
job : 497
tr_day_name : 7


In [21]:
# Distinct-value count per numeric column.
reduced_data_n = reduced_data.select_dtypes(include=['number'])
for col_name, n_unique in reduced_data_n.nunique().items():
    print(col_name, ':', n_unique)

amt : 23748
zip : 985
city_pop : 891
is_fraud : 2
tr_year : 2
tr_month : 12
tr_hour : 24
age : 83
amt_clean : 14505


In [22]:
# cat_low  = ["state", "category","gender", 'tr_year']      # OHE (<= ~100 uniques)
# cat_high = ["merchant", "city", "job","zip"]                             # Hashing (693/906/497 uniques)
# # 'gender' is low-card too; treat as binary OHE with drop='if_binary'

# scale_cols = ["city_pop","age", "amt", "distance_cust_merchant_km",'card_txn_count']

In [23]:
# Label column and the feature groups that the ColumnTransformers consume.
target = 'is_fraud'

# Continuous features.
numeric_base = ['amt', 'city_pop', 'age']

# Categorical features with few distinct values (time parts are treated as categories).
low_card_cat = [
    'category', 'gender', 'state',
    'tr_month', 'tr_day_name', 'tr_hour',
]

# Categorical features with hundreds of distinct values.
high_card_cat = ['city', 'job']

# Columns in the frame that are not listed above ('zip', 'tr_year', 'amt_clean')
# are not part of any feature group.
# Variants tried earlier and currently disabled: 'tr_year' as a numeric feature,
# 'merchant' / 'zip' as high-cardinality features, and sin/cos cyclic encodings of
# month, day, hour and minute.


In [24]:
# Separate the label from the features.
y = reduced_data[target]
X = reduced_data.drop(columns=[target])

# 80/20 split, stratified on the label so both parts keep the same fraud share.
# NOTE: the split is taken from the down-sampled frame, so the test set carries the
# sampled class ratio rather than the ratio of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [25]:
# Step 1 - Per-group transformers, each wrapped in a single-step Pipeline.
# Standardise numeric features. (The misspelled name is kept: later cells reference it.)
numeric_tranformer = Pipeline([('scaler', StandardScaler())])
# One-hot encode low-cardinality categoricals; categories not seen during fit
# do not raise at transform time (handle_unknown='ignore').
low_cat_transformer = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])
# Hash high-cardinality categoricals with n_components=64.
high_cat_transformer = Pipeline([('hash', HashingEncoder(n_components=64))])

In [26]:
# Preprocessing for the non-tree model: scale the numeric columns, one-hot the
# low-cardinality categoricals and hash the high-cardinality ones.
# remainder='drop' discards every column not named in a transformer;
# remainder='passthrough' would keep those columns unchanged instead.
non_tree_transformers = [
    ('num', numeric_tranformer, numeric_base),
    ('low_cat', low_cat_transformer, low_card_cat),
    ('high_cat', high_cat_transformer, high_card_cat),
]
preprocess_non_tree = ColumnTransformer(
    transformers=non_tree_transformers,
    remainder='drop',
)

In [27]:
from sklearn.dummy import DummyClassifier

# Baseline: always predict the most frequent training class, giving the real models a floor to beat.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)

y_pred_dummy = dummy.predict(X_test)
# Second column of predict_proba, as used for the other models' ROC AUC.
y_proba_dummy = dummy.predict_proba(X_test)[:, 1]

print("=== Dummy Model ===")
# A constant predictor leaves one class with no predicted samples, so its precision is 0/0.
# zero_division=0 reports that as 0.0 instead of emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred_dummy, zero_division=0))
print("ROC_AUC:", roc_auc_score(y_test, y_proba_dummy))

=== Dummy Model ===
              precision    recall  f1-score   support

           0       0.86      1.00      0.92     11582
           1       0.00      0.00      0.00      1930

    accuracy                           0.86     13512
   macro avg       0.43      0.50      0.46     13512
weighted avg       0.73      0.86      0.79     13512

ROC_AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
# Logistic regression with class_weight='balanced' to compensate for the minority fraud class.
# LogisticRegression is already imported in the first cell.
log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=500,
    n_jobs=-1,
)

# End-to-end non-tree pipeline: preprocessing followed by the classifier.
pipe_non_tree = Pipeline([
    ('preprocess', preprocess_non_tree),
    ('model', log_reg),
])


In [29]:
# Fit the logistic-regression pipeline on the training split and score it on the test split.
# classification_report and roc_auc_score are already imported in the first cell.
pipe_non_tree.fit(X_train, y_train)

y_pred = pipe_non_tree.predict(X_test)
# Second column of predict_proba: probability of class 1 (fraud).
y_proba = pipe_non_tree.predict_proba(X_test)[:, 1]

logreg_report = classification_report(y_test, y_pred)
logreg_auc = roc_auc_score(y_test, y_proba)
print(logreg_report)
print("ROC_AUC:", logreg_auc)

              precision    recall  f1-score   support

           0       0.98      0.88      0.93     11582
           1       0.55      0.91      0.69      1930

    accuracy                           0.88     13512
   macro avg       0.77      0.89      0.81     13512
weighted avg       0.92      0.88      0.89     13512

ROC_AUC: 0.960118837252374


In [30]:
# Preprocessing for tree-based models: numeric columns pass through unscaled, while
# categoricals get the same one-hot / hashing transformers as the non-tree pipeline.
tree_transformers = [
    ('num', 'passthrough', numeric_base),
    ('low_cat', low_cat_transformer, low_card_cat),
    ('high_cat', high_cat_transformer, high_card_cat),
]
preprocess_tree = ColumnTransformer(
    transformers=tree_transformers,
    remainder='drop',
)

In [31]:
# Number of training labels (80% of the down-sampled rows).
y_train.shape

(54045,)

In [32]:
# Feature columns handed to the pipelines (the target has been removed).
X_train.columns

Index(['category', 'amt', 'gender', 'city', 'state', 'zip', 'city_pop', 'job',
       'tr_year', 'tr_month', 'tr_hour', 'age', 'tr_day_name', 'amt_clean'],
      dtype='object')

In [33]:
# Preview a few training rows.
X_train.head()

Unnamed: 0,category,amt,gender,city,state,zip,city_pop,job,tr_year,tr_month,tr_hour,age,tr_day_name,amt_clean
4195,gas_transport,76.35,M,Fayetteville,NC,28314,238602,"Scientist, research (maths)",2020,2,1,27,Saturday,76.35
52640,food_dining,100.75,F,New Waverly,TX,77358,4993,"Scientist, biomedical",2019,10,20,71,Monday,100.75
46241,shopping_net,7.19,M,Lonsdale,MN,55046,5211,Chief Strategy Officer,2020,8,13,33,Saturday,7.19
44890,personal_care,44.14,F,Tampa,FL,33620,717255,Waste management officer,2020,7,12,54,Friday,44.14
54778,misc_pos,41.22,M,Moorhead,MS,38761,2870,Chartered public finance accountant,2020,12,16,21,Monday,41.22


In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Decision tree limited in depth and leaf size; class_weight='balanced'
# up-weights the minority fraud class.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
)

# Tree pipeline: unscaled preprocessing followed by the classifier.
dt_pipe = Pipeline([
    ('preprocess', preprocess_tree),
    ('model', dt),
])

In [35]:
# Fit the decision-tree pipeline and evaluate it on the test split.
dt_pipe.fit(X_train, y_train)

y_pred_dt = dt_pipe.predict(X_test)
# Second column of predict_proba: probability of class 1 (fraud).
y_proba_dt = dt_pipe.predict_proba(X_test)[:, 1]

dt_report = classification_report(y_test, y_pred_dt)
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_auc = roc_auc_score(y_test, y_proba_dt)

print("Decision Tree – classification report")
print(dt_report)

print("Confusion matrix")
print(dt_confusion)

print("ROC_AUC:", dt_auc)

Decision Tree – classification report
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     11582
           1       0.82      0.98      0.89      1930

    accuracy                           0.96     13512
   macro avg       0.91      0.97      0.93     13512
weighted avg       0.97      0.96      0.97     13512

Confusion matrix
[[11155   427]
 [   48  1882]]
ROC_AUC: 0.9878852122688144


In [36]:
from sklearn.base import clone

# Random forest of 30 unrestricted-depth trees; class_weight='balanced_subsample'
# recomputes the class weights on each tree's bootstrap sample.
rf = RandomForestClassifier(
    n_estimators=30,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42
)

# Give this pipeline its own copy of the tree preprocessor. Pipeline fits its steps in
# place, so passing the very same `preprocess_tree` object that `dt_pipe` holds would
# re-fit the transformer inside the already-trained decision-tree pipeline.
rf_pipe = Pipeline(steps=[
    ('preprocess', clone(preprocess_tree)),
    ('model', rf)
])

rf_pipe.fit(X_train, y_train)

y_pred_rf  = rf_pipe.predict(X_test)
# Second column of predict_proba: probability of class 1 (fraud).
y_proba_rf = rf_pipe.predict_proba(X_test)[:, 1]

print("=== Random Forest ===")
print(classification_report(y_test, y_pred_rf))
print("ROC_AUC:", roc_auc_score(y_test, y_proba_rf))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_rf))

=== Random Forest ===
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     11582
           1       0.98      0.88      0.92      1930

    accuracy                           0.98     13512
   macro avg       0.98      0.94      0.96     13512
weighted avg       0.98      0.98      0.98     13512

ROC_AUC: 0.9925141567717639
Confusion matrix:
 [[11540    42]
 [  236  1694]]


In [37]:
#!pip install xgboost

In [38]:
# from xgboost import XGBClassifier

# xgb = XGBClassifier(
#     n_estimators=500,
#     max_depth=5,
#     learning_rate=0.05,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     scale_pos_weight=10,
#     random_state=42,
#     n_jobs=-1
# )

# xgb_pipe = Pipeline(steps=[
#     ('preprocess', preprocess_tree),
#     ('model', xgb)
# ])

# xgb_pipe.fit(X_train, y_train)

# y_pred_xgb = xgb_pipe.predict(X_test)
# y_proba_xgb = xgb_pipe.predict_proba(X_test)[:, 1]

# print(classification_report(y_test, y_pred_xgb))
# print("ROC_AUC:", roc_auc_score(y_test, y_proba_xgb))