In [None]:
%pip install xgboost imbalanced-learn --quiet

# Tree‑Based Models With SMOTE + Preprocessing (Full Notebook Cells)
In this notebook we’ll build:
1. RandomForest + SMOTE
2. XGBoost + SMOTE (optional but recommended)
3. Cross‑validation comparison
4. Test‑set evaluation
5. ROC + PR curves
6. Side‑by‑side comparison with your logistic regression model

In [None]:
# Cell 1b: imports

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_curve,
    auc,
    precision_recall_curve,
)

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
#####
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

###
# from sklearn.base import accuracy_score
# from sklearn.metrics import f1_score, precision_score, recall_score
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.pipeline import make_pipeline


In [None]:
# Read the cleaned fraud table and carve out a stratified hold-out set.
cleaned_parquet = "../data/cleaned_data/cleaned_fraud.parquet"
df = pd.read_parquet(cleaned_parquet)

# Target is the `is_fraud` column cast to int; features are every other column.
y = df["is_fraud"].astype(int)
X = df.drop(columns=["is_fraud"])

# 80/20 split with a fixed seed; stratifying on y preserves the class
# proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Build pipelines
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Candidate columns routed to the numeric branch of the preprocessor.
numeric_features = [
    "amount",
    "time_since_last_transaction",
    "spending_deviation_score",
    "velocity_score",
    "geo_anomaly_score",
    "year",
    "month",
    "day_of_month",
    "hour",
    "day_of_week",
]

# Candidate columns routed to the categorical (one-hot) branch.
categorical_features = [
    "sender_account",
    "receiver_account",
    "transaction_type",
    "merchant_category",
    "location",
    "device_used",
    "payment_channel",
    "ip_address",
    "device_hash",
]

# Keep only the candidates that actually exist in X, preserving list order.
available_columns = set(X.columns)
numeric_features = [col for col in numeric_features if col in available_columns]
categorical_features = [col for col in categorical_features if col in available_columns]

# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" avoids errors on categories that
# were not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch; output order is "num" then "cat".
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# RandomForest + SMOTE pipeline
# 300 trees with no depth limit, using all cores and a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1,
)

# Steps run in order: preprocess -> SMOTE oversampling -> classifier.
rf_pipeline = ImbPipeline(
    [
        ("preprocess", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", rf_model),
    ]
)

rf_pipeline

In [None]:
# XGBoost + SMOTE pipeline
# 300 boosting rounds of depth-6 trees; each tree sees 80% of the rows and
# 80% of the columns. Log-loss is the evaluation metric.
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

# Steps run in order: preprocess -> SMOTE oversampling -> classifier.
xgb_pipeline = ImbPipeline(
    [
        ("preprocess", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", xgb_model),
    ]
)

xgb_pipeline

In [None]:
# Logistic Regression + SMOTE baseline.
#
# rf_pipeline and xgb_pipeline are intentionally NOT rebuilt here: the two
# cells above already define them with the same steps and hyperparameters,
# and a second definition would only shadow those objects silently.
smote_pipeline = ImbPipeline(
    steps=[
        ("preprocess", preprocessor),
        ("smote", SMOTE(random_state=42)),
        # max_iter is raised to 1000 to give the solver more iterations.
        ("model", LogisticRegression(max_iter=1000, n_jobs=-1)),
    ]
)

In [None]:
# Compare models with cross-validation (RF vs XGB vs Logistic Regression)
from sklearn.metrics import make_scorer


# Five shuffled, stratified folds with a fixed seed, reused for every model.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Display label -> pipeline to cross-validate (insertion order is kept).
models = dict(
    [
        ("LogReg + SMOTE", smote_pipeline),
        ("RandomForest + SMOTE", rf_pipeline),
        ("XGBoost + SMOTE", xgb_pipeline),
    ]
)

def evaluate_model(name, pipeline):
    """Cross-validate ``pipeline`` on the training split and print a summary.

    All four metrics are computed in a single ``cross_validate`` run, so the
    pipeline is fitted once per fold rather than once per fold *per metric*.

    Parameters
    ----------
    name : str
        Label printed in the section header.
    pipeline : estimator
        Pipeline to evaluate on the notebook-level ``X_train`` / ``y_train``
        using the notebook-level ``cv`` splitter.

    Returns
    -------
    None
        The mean and standard deviation of each metric are printed.
    """
    # Local import: cross_validate is not among the notebook's top-level imports.
    from sklearn.model_selection import cross_validate

    print(f"\n=== {name} ===")

    scoring = {
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "accuracy": make_scorer(accuracy_score),
    }
    results = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )

    # With a dict of scorers, per-fold scores are returned under "test_<name>".
    for metric_name in scoring:
        scores = results[f"test_{metric_name}"]
        print(f"{metric_name}: mean={scores.mean():.4f}, std={scores.std():.4f}")

# Cross-validate every pipeline registered in `models`, in insertion order.
for name, model in models.items():
    evaluate_model(name, model)

In [None]:
# Fit the selected tree pipeline on the whole training split, then score the
# held-out test split.

# fit() returns the pipeline itself, so chosen_tree is xgb_pipeline, now fitted.
chosen_tree = xgb_pipeline.fit(X_train, y_train)   # or rf_pipeline

# Column 1 of predict_proba is the score kept for the curve plots
# (presumably the fraud class, assuming labels are 0/1).
y_proba_tree = chosen_tree.predict_proba(X_test)[:, 1]
y_pred_tree = chosen_tree.predict(X_test)

tree_report = classification_report(y_test, y_pred_tree, digits=4)
print(tree_report)

In [None]:
# Confusion matrix of the tree model's test-set predictions.
cm = confusion_matrix(y_test, y_pred_tree)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Non-fraud", "Fraud"],
)
disp.plot(cmap="Blues")
# Title the axes that the display just drew on.
disp.ax_.set_title("Confusion Matrix — Tree Model")
plt.show()

In [None]:
# ROC Curve

# False/true positive rates from the test-set scores, and the area under
# that curve.
fpr, tpr, _ = roc_curve(y_test, y_proba_tree)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr, tpr, color="darkorange", label=f"AUC = {roc_auc:.4f}")
ax.plot([0, 1], [0, 1], "k--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve — Tree Model")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Precision‑Recall Curve
from sklearn.metrics import average_precision_score


# Precision/recall pairs from the test-set scores, plus the average
# precision shown in the legend.
precision, recall, _ = precision_recall_curve(y_test, y_proba_tree)
avg_precision = average_precision_score(y_test, y_proba_tree)

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(recall, precision, color="purple", label=f"AP = {avg_precision:.4f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve — Tree Model")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Compare ROC/PR of Logistic Regression vs Tree Model

# Logistic regression probabilities.
# The pipeline has to be fitted here: the cross-validation step only fits
# clones of it, so smote_pipeline itself is still unfitted at this point and
# predict_proba would raise NotFittedError.
smote_pipeline.fit(X_train, y_train)
y_proba_lr = smote_pipeline.predict_proba(X_test)[:, 1]

# ROC curves for both models, drawn on one set of axes.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)
fpr_tree, tpr_tree, _ = roc_curve(y_test, y_proba_tree)

fig, ax = plt.subplots(figsize=(7, 5))
for curve_fpr, curve_tpr, curve_label in (
    (fpr_lr, tpr_lr, "LogReg + SMOTE"),
    (fpr_tree, tpr_tree, "Tree Model + SMOTE"),
):
    ax.plot(curve_fpr, curve_tpr, label=curve_label)
ax.plot([0, 1], [0, 1], "k--")  # diagonal reference line
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title("ROC Comparison")
ax.legend()
ax.grid(True)
plt.show()

# Precision-recall curves for both models, drawn on one set of axes.
precision_lr, recall_lr, _ = precision_recall_curve(y_test, y_proba_lr)
precision_tree, recall_tree, _ = precision_recall_curve(y_test, y_proba_tree)

fig, ax = plt.subplots(figsize=(7, 5))
for curve_recall, curve_precision, curve_label in (
    (recall_lr, precision_lr, "LogReg + SMOTE"),
    (recall_tree, precision_tree, "Tree Model + SMOTE"),
):
    ax.plot(curve_recall, curve_precision, label=curve_label)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("PR Curve Comparison")
ax.legend()
ax.grid(True)
plt.show()

# Feature importance plots
Tree models naturally learn nonlinear interactions, and feature importance plots help us understand which signals actually drive fraud predictions.

In [None]:
# Cell 1: Fit the RandomForest pipeline and recover post-preprocessing feature names.
rf_pipeline.fit(X_train, y_train)

# The fitted forest is the pipeline's final step.
rf_model = rf_pipeline.named_steps["model"]

# Walk down to the fitted one-hot encoder inside the "cat" branch.
fitted_preprocessor = rf_pipeline.named_steps["preprocess"]
categorical_branch = fitted_preprocessor.named_transformers_["cat"]
ohe = categorical_branch.named_steps["onehot"]
ohe_feature_names = ohe.get_feature_names_out(categorical_features)

# Same order as the ColumnTransformer output: numeric columns first, then the
# one-hot expanded categorical columns.
all_feature_names = np.concatenate([numeric_features, ohe_feature_names])

# Sanity check: the name count should match the number of importances.
len(all_feature_names), rf_model.feature_importances_.shape

In [None]:
# Cell 2: RandomForest feature importance plot

# Pair each post-preprocessing feature name with the forest's importance,
# largest first.
importances = rf_model.feature_importances_
feat_imp = pd.DataFrame({
    "feature": all_feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

fig, ax = plt.subplots(figsize=(10, 12))
# hue mirrors the y variable so the palette colours each bar; passing
# `palette` without `hue` is deprecated in seaborn 0.13.
sns.barplot(
    data=feat_imp.head(25),
    x="importance",
    y="feature",
    hue="feature",
    dodge=False,
    palette="viridis",
    ax=ax,
)
# Some seaborn versions draw a legend for hue; it would only repeat the
# y tick labels, so drop it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title("Top 25 Feature Importances — RandomForest")
fig.tight_layout()
plt.show()

feat_imp.head(25)

In [None]:
# Cell 3: Fit the XGBoost pipeline and pull out the trained booster.
# fit() returns the pipeline itself, so the final step can be read directly
# from the call's result.
xgb_model = xgb_pipeline.fit(X_train, y_train).named_steps["model"]

# Feature names are reused from the RandomForest cell:
# numeric_features followed by the OHE-expanded categorical features.

In [None]:
# Cell 4: XGBoost feature importance plot

# Pair each post-preprocessing feature name with the booster's importance,
# largest first.
xgb_importances = xgb_model.feature_importances_

xgb_feat_imp = pd.DataFrame({
    "feature": all_feature_names,
    "importance": xgb_importances
}).sort_values("importance", ascending=False)

fig, ax = plt.subplots(figsize=(10, 12))
# hue mirrors the y variable so the palette colours each bar; passing
# `palette` without `hue` is deprecated in seaborn 0.13.
sns.barplot(
    data=xgb_feat_imp.head(25),
    x="importance",
    y="feature",
    hue="feature",
    dodge=False,
    palette="magma",
    ax=ax,
)
# Some seaborn versions draw a legend for hue; it would only repeat the
# y tick labels, so drop it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title("Top 25 Feature Importances — XGBoost")
fig.tight_layout()
plt.show()

xgb_feat_imp.head(25)