In [1]:
import pandas as pd
import numpy as np
import calendar
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# High-cardinality encoder (fast, memory-safe)
from category_encoders.hashing import HashingEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # (kept for template; we do not have nulls)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, classification_report
#from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [2]:
# data = pd.read_csv('Semi_time_scaled_data.csv')

In [3]:
data = pd.read_csv('eda_dataset.csv')

In [4]:
data.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,tr_year,tr_month,tr_day,tr_hour,tr_minute,age,age_group,tr_day_name,amt_clean,distance_cust_merchant_km
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,...,2019,1,1,0,0,31,adult,Tuesday,4.97,78.6
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,...,2019,1,1,0,0,41,adult,Tuesday,107.23,30.21
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,...,2019,1,1,0,0,57,senior,Tuesday,47.45,108.21
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,...,2019,1,1,0,1,52,senior,Tuesday,45.0,95.67
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,...,2019,1,1,0,3,33,adult,Tuesday,41.96,77.56


In [5]:
data.shape

(1852394, 31)

In [6]:

data = data.drop(columns = ['cc_num','first', 'last','street','lat', 'long','dob','trans_num','unix_time','merch_lat', 'merch_long',
                           'age_group','tr_day','tr_minute'],axis=1)
# foe now dropping  tr_minute

In [32]:
data.shape

(1852394, 17)

In [33]:
data.columns

Index(['merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip',
       'city_pop', 'job', 'is_fraud', 'tr_year', 'tr_month', 'tr_hour', 'age',
       'tr_day_name', 'amt_clean', 'distance_cust_merchant_km'],
      dtype='object')

Data Sampling by maintaning (nonfraud to fraud 10:1 ratio)

In [7]:
# step 1 to balance and reduce data 
fraud = data[data['is_fraud'] == 1]
non_fraud = data[data['is_fraud'] == 0]

In [8]:
# Step 2: Sample non-fraud rows to reduce dataset size and control class imbalance
# Here we're keeping a 10:1 ratio of non-fraud to fraud (can change to 5, 20, etc.)
# random_state=42  # for reproducibility
non_fraud_sampled = non_fraud.sample(n = min(len(non_fraud), len(fraud)*10), random_state = 42)

In [9]:
# Step 3: Combine fraud and sampled non-fraud into one dataset
reduced_data = pd.concat([fraud, non_fraud_sampled])


In [10]:
# Step 4: Shuffle the combined dataset so fraud and non-fraud are mixed
reduced_data = reduced_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [11]:
# Optional: Check class balance
print(reduced_data['is_fraud'].value_counts(normalize=True))  # See class distribution
print(reduced_data.shape)  # Check total rows

is_fraud
0    0.909091
1    0.090909
Name: proportion, dtype: float64
(106161, 17)


In [12]:
reduced_data.shape

(106161, 17)

In [13]:
reduced_data.columns

Index(['merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip',
       'city_pop', 'job', 'is_fraud', 'tr_year', 'tr_month', 'tr_hour', 'age',
       'tr_day_name', 'amt_clean', 'distance_cust_merchant_km'],
      dtype='object')

In [14]:
reduced_data['merchant'] = reduced_data['merchant'].str.replace('fraud_', '', regex=False)

In [15]:
reduced_data.head()

Unnamed: 0,merchant,category,amt,gender,city,state,zip,city_pop,job,is_fraud,tr_year,tr_month,tr_hour,age,tr_day_name,amt_clean,distance_cust_merchant_km
0,"Becker, Harris and Harvey",personal_care,43.45,M,Tallmansville,WV,26237,571,Accounting technician,0,2020,7,14,53,Thursday,43.45,22.77
1,Terry-Huel,shopping_net,8.27,F,Sun City,CA,92585,54287,"Designer, exhibition/display",0,2020,6,12,48,Tuesday,8.27,54.38
2,"Hettinger, McCullough and Fay",home,2.44,F,Houston,TX,77007,2906700,"Copywriter, advertising",0,2019,1,13,35,Sunday,2.44,82.79
3,"Nicolas, Hills and McGlynn",entertainment,35.46,M,Mallie,KY,41836,798,Facilities manager,0,2019,12,23,93,Monday,35.46,70.89
4,"Lang, Towne and Schuppe",kids_pets,19.77,F,Centerview,MO,64019,2368,Electronics engineer,1,2019,1,22,30,Wednesday,19.77,98.49


In [34]:
reduced_data.shape

(106161, 17)

In [16]:
# cat_low  = ["state", "category","gender", 'tr_year']      # OHE (<= ~100 uniques)
# cat_high = ["merchant", "city", "job","zip"]                             # Hashing (693/906/497 uniques)
# # 'gender' is low-card too; treat as binary OHE with drop='if_binary'

# scale_cols = ["city_pop","age", "amt", "distance_cust_merchant_km",'card_txn_count']

In [17]:
target = 'is_fraud'

numeric_base = ['amt','city_pop','age','tr_year']


low_card_cat = ['category', 'gender', 'state','tr_month','tr_day_name','tr_hour']
high_card_cat = ['merchant', 'city', 'job', 'zip']

# all_numeric = numeric_base + cyclic_cols
#cyclic_cols = ['tr_month_sin','tr_month_cos','tr_day_sin','tr_day_cos','tr_hour_sin','tr_hour_cos','tr_minute_sin','tr_minute_cos']


In [18]:
X = reduced_data.drop(columns=[target])
y = reduced_data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42,stratify=y)

In [19]:
#Step 1 – Transformers
numeric_tranformer = Pipeline(steps = [('scaler', StandardScaler())])
low_cat_transformer = Pipeline(steps = [('ohe', OneHotEncoder(handle_unknown ='ignore'))])
high_cat_transformer = Pipeline(steps = [('hash',HashingEncoder(n_components =64))])

In [20]:
preprocess_non_tree = ColumnTransformer(transformers = [
    ('num',numeric_tranformer,numeric_base),
    ('low_cat',low_cat_transformer,low_card_cat),
    ('high_cat',high_cat_transformer,high_card_cat)], remainder = 'drop')
# keep sin/cos, no scaling
#  remainder = 'drop' :Drop all columns that are not mentioned in the transformers list. and viceversa remainder='passthrough'

In [21]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=500,class_weight='balanced',n_jobs=-1)

pipe_non_tree = Pipeline(steps=[('preprocess', preprocess_non_tree),('model', log_reg)])


In [22]:
from sklearn.metrics import classification_report, roc_auc_score

pipe_non_tree.fit(X_train, y_train)

y_pred = pipe_non_tree.predict(X_test)
y_proba = pipe_non_tree.predict_proba(X_test)[:, 1]


print(classification_report(y_test, y_pred))
print("ROC_AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.99      0.87      0.92     19303
           1       0.41      0.91      0.56      1930

    accuracy                           0.87     21233
   macro avg       0.70      0.89      0.74     21233
weighted avg       0.94      0.87      0.89     21233

ROC_AUC: 0.9534908128592323


In [23]:
# for tree input pipline 
preprocess_tree = ColumnTransformer(
    transformers=[('num', 'passthrough',numeric_base),
        ('low_cat', low_cat_transformer,  low_card_cat),
        ('high_cat', high_cat_transformer, high_card_cat),],remainder='drop')

In [24]:
y_train.shape

(84928,)

In [25]:
X_train.columns

Index(['merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip',
       'city_pop', 'job', 'tr_year', 'tr_month', 'tr_hour', 'age',
       'tr_day_name', 'amt_clean', 'distance_cust_merchant_km'],
      dtype='object')

In [26]:
X_train.head()

Unnamed: 0,merchant,category,amt,gender,city,state,zip,city_pop,job,tr_year,tr_month,tr_hour,age,tr_day_name,amt_clean,distance_cust_merchant_km
48902,Cormier LLC,health_fitness,58.57,M,Marienville,PA,16239,4172,"Copywriter, advertising",2020,8,20,58,Friday,58.57,13.32
52011,"Swift, Bradtke and Marquardt",grocery_net,50.41,M,Moorhead,MS,38761,2870,Chartered public finance accountant,2020,7,11,21,Tuesday,50.41,81.11
67759,"Bernhard, Grant and Langworth",shopping_pos,1024.17,F,Thrall,TX,76578,1766,Press sub,2020,8,2,44,Tuesday,47.45,74.18
96261,"Haley, Batz and Auer",health_fitness,138.36,M,Moorhead,MS,38761,2870,Chartered public finance accountant,2020,9,22,21,Saturday,138.36,59.6
82735,Bernier and Sons,kids_pets,53.89,M,Easton,KS,66020,1442,Air broker,2020,12,12,38,Wednesday,53.89,1.56


In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (classification_report,roc_auc_score,confusion_matrix)
dt = DecisionTreeClassifier(max_depth=10,criterion='gini',min_samples_split=20,min_samples_leaf=5, class_weight='balanced', random_state=42)

dt_pipe = Pipeline(steps=[('preprocess', preprocess_tree),('model', dt)])

In [28]:
dt_pipe.fit(X_train, y_train)
y_pred_dt  = dt_pipe.predict(X_test)
y_proba_dt = dt_pipe.predict_proba(X_test)[:, 1] 

print("Decision Tree – classification report")
print(classification_report(y_test, y_pred_dt))

print("Confusion matrix")
print(confusion_matrix(y_test, y_pred_dt))

print("ROC_AUC:", roc_auc_score(y_test, y_proba_dt))

Decision Tree – classification report
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     19303
           1       0.75      0.96      0.84      1930

    accuracy                           0.97     21233
   macro avg       0.87      0.96      0.91     21233
weighted avg       0.97      0.97      0.97     21233

Confusion matrix
[[18668   635]
 [   74  1856]]
ROC_AUC: 0.9840024463968258


In [29]:
rf = RandomForestClassifier(
    n_estimators=30,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42
)

rf_pipe = Pipeline(steps=[
    ('preprocess', preprocess_tree),
    ('model', rf)
])

rf_pipe.fit(X_train, y_train)

y_pred_rf  = rf_pipe.predict(X_test)
y_proba_rf = rf_pipe.predict_proba(X_test)[:, 1]

print("=== Random Forest ===")
print(classification_report(y_test, y_pred_rf))
print("ROC_AUC:", roc_auc_score(y_test, y_proba_rf))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_rf))

=== Random Forest ===
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     19303
           1       0.98      0.82      0.89      1930

    accuracy                           0.98     21233
   macro avg       0.98      0.91      0.94     21233
weighted avg       0.98      0.98      0.98     21233

ROC_AUC: 0.9900684046266265
Confusion matrix:
 [[19266    37]
 [  357  1573]]


In [30]:
#!pip install xgboost

In [31]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=10,
    random_state=42,
    n_jobs=-1
)

xgb_pipe = Pipeline(steps=[
    ('preprocess', preprocess_tree),
    ('model', xgb)
])

xgb_pipe.fit(X_train, y_train)

y_pred_xgb = xgb_pipe.predict(X_test)
y_proba_xgb = xgb_pipe.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_xgb))
print("ROC_AUC:", roc_auc_score(y_test, y_proba_xgb))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     19303
           1       0.86      0.97      0.91      1930

    accuracy                           0.98     21233
   macro avg       0.93      0.98      0.95     21233
weighted avg       0.98      0.98      0.98     21233

ROC_AUC: 0.9978885265492033
