In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from category_encoders.hashing import HashingEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [2]:
data = pd.read_csv('Semi_time_scaled_data.csv')

In [3]:
data = data.drop(columns=['card_txn_count', 'distance_cust_merchant_km'])

In [4]:
# step 1 to balance and reduce data 
fraud = data[data['is_fraud'] == 1]
non_fraud = data[data['is_fraud'] == 0]

In [5]:
# Step 2: Sample non-fraud rows to reduce dataset size and control class imbalance
# Here we're keeping a 10:1 ratio of non-fraud to fraud (can change to 5, 20, etc.)
# random_state=42  # for reproducibility
non_fraud_sampled = non_fraud.sample(n = min(len(non_fraud), len(fraud)*5), random_state = 42)

In [6]:
# Step 3: Combine fraud and sampled non-fraud into one dataset
reduced_data = pd.concat([fraud, non_fraud_sampled])


In [7]:
# Step 4: Shuffle the combined dataset so fraud and non-fraud are mixed
reduced_data = reduced_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
# Optional: Check class balance
print(reduced_data['is_fraud'].value_counts(normalize=True))  # See class distribution
print(reduced_data.shape)  # Check total rows

is_fraud
0    0.833333
1    0.166667
Name: proportion, dtype: float64
(57906, 20)


In [9]:
# Optional: Check class balance
print(reduced_data['is_fraud'].value_counts(normalize=True))  # See class distribution
print(reduced_data.shape)  # Check total rows

is_fraud
0    0.833333
1    0.166667
Name: proportion, dtype: float64
(57906, 20)


In [10]:
target = 'is_fraud'

# Numeric base (no leakage, no IDs)
numeric_base = ['amt', 'city_pop', 'age', 'tr_year']

# Cyclic time features
cyclic_cols = [
    'tr_month_sin', 'tr_month_cos',
    'tr_day_sin', 'tr_day_cos',
    'tr_hour_sin', 'tr_hour_cos',
    'tr_minute_sin', 'tr_minute_cos'
]

# Low-cardinality categoricals → OneHot
low_card_cat = ['category', 'gender', 'state']

# High-cardinality categoricals → Hashing (merchant, city, job, zip)
high_card_cat = ['merchant', 'city', 'job', 'zip']

# Sanity check: all features accounted for
X_cols = numeric_base + cyclic_cols + low_card_cat + high_card_cat
print("Number of feature columns:", len(X_cols))
print("Any missing columns in data?",
      set(data.drop(columns=[target]).columns) - set(X_cols))

Number of feature columns: 19
Any missing columns in data? set()


In [11]:
X = reduced_data[X_cols].copy()
y = reduced_data[target].astype(int)   # ensure 0/1

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(46324, 19) (46324,)
(11582, 19) (11582,)


In [12]:
# For numeric features (only base numeric, NOT cyclic)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# One-hot for low-card categorical
low_cat_transformer = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# Hashing for high-card categorical
high_cat_transformer = Pipeline(steps=[
    ('hash', HashingEncoder(n_components=64))
])

In [13]:
preprocess_non_tree = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_base),
        ('cyclic', 'passthrough', cyclic_cols),
        ('low_cat', low_cat_transformer, low_card_cat),
        ('high_cat', high_cat_transformer, high_card_cat),
    ],
    remainder='drop'
)

In [14]:
log_reg = LogisticRegression(
    max_iter=100,
    class_weight='balanced',
    n_jobs=-1
)

log_pipe = Pipeline(steps=[
    ('preprocess', preprocess_non_tree),
    ('model', log_reg)
])

log_pipe.fit(X_train, y_train)

y_pred_lr  = log_pipe.predict(X_test)
y_proba_lr = log_pipe.predict_proba(X_test)[:, 1]

print("=== Logistic Regression ===")
print(classification_report(y_test, y_pred_lr))
print("ROC_AUC:", roc_auc_score(y_test, y_proba_lr))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_lr))

=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.96      0.89      0.92      9652
           1       0.58      0.80      0.68      1930

    accuracy                           0.87     11582
   macro avg       0.77      0.84      0.80     11582
weighted avg       0.89      0.87      0.88     11582

ROC_AUC: 0.9301963779957011
Confusion matrix:
 [[8552 1100]
 [ 385 1545]]


In [15]:
preprocess_tree = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_base + cyclic_cols),
        ('low_cat', low_cat_transformer, low_card_cat),
        ('high_cat', high_cat_transformer, high_card_cat),
    ],
    remainder='drop'
)

In [16]:
dt = DecisionTreeClassifier(
    max_depth=10,
    criterion='gini',
    min_samples_split=20,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42
)

dt_pipe = Pipeline(steps=[
    ('preprocess', preprocess_tree),
    ('model', dt)
])

dt_pipe.fit(X_train, y_train)

y_pred_dt  = dt_pipe.predict(X_test)
y_proba_dt = dt_pipe.predict_proba(X_test)[:, 1]

print("=== Decision Tree ===")
print(classification_report(y_test, y_pred_dt))
print("ROC_AUC:", roc_auc_score(y_test, y_proba_dt))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_dt))

=== Decision Tree ===
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      9652
           1       0.82      0.98      0.89      1930

    accuracy                           0.96     11582
   macro avg       0.91      0.97      0.93     11582
weighted avg       0.97      0.96      0.96     11582

ROC_AUC: 0.9888263647470845
Confusion matrix:
 [[9232  420]
 [  35 1895]]


In [17]:
rf = RandomForestClassifier(
    n_estimators=30,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42
)

rf_pipe = Pipeline(steps=[
    ('preprocess', preprocess_tree),
    ('model', rf)
])

rf_pipe.fit(X_train, y_train)

y_pred_rf  = rf_pipe.predict(X_test)
y_proba_rf = rf_pipe.predict_proba(X_test)[:, 1]

print("=== Random Forest ===")
print(classification_report(y_test, y_pred_rf))
print("ROC_AUC:", roc_auc_score(y_test, y_proba_rf))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_rf))

=== Random Forest ===
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      9652
           1       0.98      0.85      0.91      1930

    accuracy                           0.97     11582
   macro avg       0.98      0.92      0.95     11582
weighted avg       0.97      0.97      0.97     11582

ROC_AUC: 0.9897810649998176
Confusion matrix:
 [[9619   33]
 [ 287 1643]]


In [18]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, roc_auc_score

dummy = DummyClassifier(strategy='most_frequent') 
dummy.fit(X_train, y_train)

y_pred_dummy = dummy.predict(X_test)
y_proba_dummy = dummy.predict_proba(X_test)[:, 1]

print("=== Dummy Model ===")
print(classification_report(y_test, y_pred_dummy))
print("ROC_AUC:", roc_auc_score(y_test, y_proba_dummy))

=== Dummy Model ===
              precision    recall  f1-score   support

           0       0.83      1.00      0.91      9652
           1       0.00      0.00      0.00      1930

    accuracy                           0.83     11582
   macro avg       0.42      0.50      0.45     11582
weighted avg       0.69      0.83      0.76     11582

ROC_AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
