# Model 1 extension and modification

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import seaborn as sns
import os
import duckdb

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE


In [10]:
# Relative path to the cleaned fraud dataset stored as a Parquet file.
cleaned_parquet = "../data/cleaned_data/cleaned_fraud.parquet"

# Report the on-disk size of the file, converted from bytes by dividing by 1024**3.
print(f"ðŸ“Š Original size: {os.path.getsize(cleaned_parquet) / (1024**3):.2f} GB")

# Open a DuckDB connection with default arguments; `con` is used below to query the Parquet file.
con = duckdb.connect()

ðŸ“Š Original size: 0.14 GB


In [11]:
# Read every row of the cleaned Parquet file into a pandas DataFrame via DuckDB
full_table_query = f"SELECT * FROM '{cleaned_parquet}'"
df = con.execute(full_table_query).fetch_df()

# 1. EDA & Pattern Detection
# Share of each class in the target column (normalized counts)
print("Original Class Distribution:")
class_shares = df['is_fraud'].value_counts(normalize=True)
print(class_shares)

# Pattern detection: isolate rows whose time_since_last_transaction is below zero
is_negative_gap = df['time_since_last_transaction'] < 0
neg_time = df[is_negative_gap]
print(f"\nRows with negative time_since_last_transaction: {len(neg_time)}")
if len(neg_time):
    # No correction is applied here; the rows are only reported.
    print("Negative values might indicate data errors or specific flags. Treating as valid numeric for now.")

Original Class Distribution:
is_fraud
False    0.956244
True     0.043756
Name: proportion, dtype: float64

Rows with negative time_since_last_transaction: 2051331
Negative values might indicate data errors or specific flags. Treating as valid numeric for now.


In [12]:
# The data now lives in the pandas DataFrame `df`, so the DuckDB connection can be closed.
con.close()

In [None]:


# 2. Data Preparation for SMOTE
#    - drop high-cardinality identifiers
#    - one-hot encode the categorical variables
#    - standardise the numerical variables

categorical_cols = [
    'transaction_type',
    'merchant_category',
    'location',
    'device_used',
    'payment_channel',
]
numerical_cols = [
    'amount',
    'time_since_last_transaction',
    'spending_deviation_score',
    'velocity_score',
    'geo_anomaly_score',
    'hour',
    'day_of_week',
]
# High-cardinality identifiers do not generalize well and can hurt model training.
# The raw date parts are dropped as well because more meaningful features
# (hour, day of week) have already been extracted from them.
drop_cols = [
    'sender_account',
    'receiver_account',
    'ip_address',
    'device_hash',
    'year',
    'month',
    'day_of_month',
]

# Target and feature matrix
y = df['is_fraud']
X = df.drop(columns=['is_fraud'] + drop_cols, errors='ignore')

# Best practice: split BEFORE SMOTE so the test set is never resampled.
# Stratifying on y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Preprocessing: scale the numeric columns, one-hot encode the categorical ones
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])




In [None]:
# Next steps
# With SMOTE in place, we can train multiple models on the training data and evaluate them on the test data, which is never resampled with SMOTE.

In [None]:
# 4. Full pipeline: preprocess -> SMOTE -> model.
# Preprocessing and SMOTE are pipeline steps, so they are applied automatically
# whenever the pipeline is fitted, including inside cross-validation.
linear_model = LogisticRegression(n_jobs=-1, max_iter=1000)
smote_pipeline = ImbPipeline(steps=[
    ("preprocess", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", linear_model),
])


In [None]:
# Cross-validation: the training-and-evaluation loop for the logistic-regression pipeline.
# Five shuffled, stratified folds with a fixed seed; each fold is scored with F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(estimator=smote_pipeline, X=X_train, y=y_train,
                         scoring="f1", cv=cv, n_jobs=-1)

# Mean and standard deviation of the per-fold F1 scores
print(scores.mean(), scores.std())

Cross-validation → choose best model → final training → test evaluation.
Cross-validation only evaluates the pipeline configuration; it does not leave a fitted model behind.
`.fit()` trains the final model using that configuration.

In [None]:
# Train the final logistic-regression pipeline on the full training split.
smote_pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split.
# NOTE(review): imbalanced-learn pipelines are expected to apply samplers such as
# SMOTE during fit only, so X_test should be scored without resampling. Confirm.
y_pred = smote_pipeline.predict(X_test)

# The predictions were previously computed but never scored; report per-class
# precision, recall and F1 so the evaluation step actually yields a result.
print(classification_report(y_test, y_pred, digits=4))

# Using Other ML models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Random forest variant: same preprocess -> SMOTE front end, tree-ensemble classifier at the end.
forest = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    max_depth=None,
    n_estimators=300,
)

rf_pipeline = ImbPipeline(steps=[
    ("preprocess", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", forest),
])

In [None]:
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Hyperparameters for the gradient-boosted classifier, gathered in one place.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "n_jobs": -1,
    "random_state": 42,
}

# XGBoost variant: same preprocess -> SMOTE front end as the other pipelines.
xgb_pipeline = ImbPipeline(steps=[
    ("preprocess", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", XGBClassifier(**xgb_params)),
])

In [None]:

# Cross-validation for the tree-based pipelines, using the same
# 5-fold shuffled, stratified scheme and F1 scoring as before.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

candidates = [
    ("RandomForest", rf_pipeline),
    ("XGBoost", xgb_pipeline),
]
for name, model in candidates:
    scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring="f1",
        cv=cv,
        n_jobs=-1,
    )
    print(f"{name}: mean F1={scores.mean():.4f}, std={scores.std():.4f}")

In [None]:
# With cross-validation done, fit the XGBoost pipeline on the full training split.
xgb_pipeline.fit(X_train, y_train)
# Alternative for the random forest pipeline:
# rf_pipeline.fit(X_train, y_train)

In [None]:
# Fit the random forest pipeline on the full training split.
rf_pipeline.fit(X_train, y_train)


# Feature importance across Logistic Regression, RandomForest, and XGBoost

In [None]:
def get_feature_names(preprocessor):
    """Return the output feature names of a fitted preprocessor as one flat list.

    Looks up the 'num' and 'cat' entries of ``preprocessor.named_transformers_``,
    asks each for ``get_feature_names_out()`` and concatenates the results,
    numeric names first, categorical names second.
    """
    fitted_parts = preprocessor.named_transformers_
    names = []
    for key in ('num', 'cat'):
        names.extend(fitted_parts[key].get_feature_names_out())
    return names

In [None]:
# smote_pipeline, rf_pipeline and xgb_pipeline are all constructed with the same
# `preprocessor` object, so one lookup is enough. The two earlier assignments
# were dead stores that the last one overwrote immediately.
# NOTE(review): this relies on the pipelines sharing (not cloning) that step on fit.
# If each pipeline is ever given its own preprocessor, use one name list per pipeline.
feature_names = get_feature_names(xgb_pipeline.named_steps["preprocess"])

In [None]:
import pandas as pd
import numpy as np

# Logistic Regression: importance is taken from the fitted coefficients.
#  - A large positive coefficient increases the predicted fraud probability.
#  - A large negative coefficient decreases the predicted fraud probability.
#  - Logistic Regression captures linear relationships only.
# The table below ranks features by absolute coefficient, so the sign is not kept.
logreg = smote_pipeline.named_steps["model"]
abs_coefficients = np.abs(logreg.coef_[0])

logreg_importance = pd.DataFrame(
    {"feature": feature_names, "importance": abs_coefficients}
)
logreg_importance = logreg_importance.sort_values("importance", ascending=False)

In [None]:
# RandomForest: impurity-based (Gini) importance.
#  - Measures how much each feature reduces impurity.
#  - Captures nonlinear interactions.
#  - Tends to favor high-cardinality one-hot encoded features.

# Read the importances from the random forest pipeline. The model inside
# smote_pipeline is the LogisticRegression, which has no feature_importances_.
rf_model = rf_pipeline.named_steps["model"]

rf_importance = pd.DataFrame({
    "feature": feature_names,
    "importance": rf_model.feature_importances_
}).sort_values("importance", ascending=False)

In [None]:
# XGBoost: gain-based importance.
#  - Gain = how much a feature improves the splits it is used in.
#  - XGBoost captures complex nonlinear patterns.
#  - Often treated as the most reliable importance measure for fraud.

# Take the booster from the XGBoost pipeline. The model inside smote_pipeline
# is the LogisticRegression, which has no get_booster().
xgb_model = xgb_pipeline.named_steps["model"]

# get_score() returns a dict, not an array aligned with feature_names.
# NOTE(review): per the XGBoost docs, keys are the booster's feature names
# ("f0", "f1", ... when trained on an unnamed array) and features never used in a
# split are omitted. Both key styles are handled; missing features get 0.0.
gain_by_key = xgb_model.get_booster().get_score(importance_type="gain")
xgb_gain = [
    gain_by_key.get(f"f{position}", gain_by_key.get(feature, 0.0))
    for position, feature in enumerate(feature_names)
]

xgb_importance = pd.DataFrame({
    "feature": feature_names,
    "importance": xgb_gain
}).sort_values("importance", ascending=False)


In [None]:
# Combine the three importance tables into one comparison table:
# one row per feature, one column per model.
per_model_frames = [
    logreg_importance.rename(columns={"importance": "logreg"}),
    rf_importance.rename(columns={"importance": "rf"}),
    xgb_importance.rename(columns={"importance": "xgb"}),
]

# Outer-join on the feature name so a feature missing from one table is kept,
# then fill the resulting gaps with 0.
comparison = per_model_frames[0]
for model_frame in per_model_frames[1:]:
    comparison = comparison.merge(model_frame, on="feature", how="outer")
comparison = comparison.fillna(0)

# Show the 20 features ranked highest by the XGBoost column
comparison.sort_values("xgb", ascending=False).head(20)