# Focused on heavily imbalanced data
1. Use stratified train/test splits to preserve fraud ratio in both sets.
2. Apply SMOTE only to the training set — never to the test set.
3. Use F1, precision, recall as evaluation metrics, not just accuracy.
4. Try both SMOTE and class_weight='balanced' in logistic regression to compare which handles imbalance better.
5. Tune the decision threshold instead of relying on the default 0.5 cutoff, which may not be optimal for fraud detection.

In [1]:
# Cell 1b: imports (some you already have, but safe to re-run)

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_curve,
    auc,
    precision_recall_curve,
)

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
# Cell 2: read the cleaned fraud dataset and inspect the class balance

cleaned_parquet = "../data/cleaned_data/cleaned_fraud.parquet"
df = pd.read_parquet(cleaned_parquet)

# Frame dimensions first, then the target distribution as raw counts
# (normalize=False) followed by proportions (normalize=True).
print(df.shape)
for as_fraction in (False, True):
    print(df["is_fraud"].value_counts(normalize=as_fraction))

# Last expression of the cell, so the notebook renders the first rows.
df.head()

In [None]:
# Cell 3: separate the target (y) from the features (X) and list the columns
# that each preprocessing branch should receive

target_col = "is_fraud"
y = df[target_col].astype(int)   # cast the label to int (stored as True/False per the original note)
X = df.drop(columns=[target_col])

# Candidate numeric columns.
numeric_candidates = (
    "amount",
    "time_since_last_transaction",
    "spending_deviation_score",
    "velocity_score",
    "geo_anomaly_score",
    "year",
    "month",
    "day_of_month",
    "hour",
    "day_of_week",
)

# Candidate categorical columns.
# NOTE(review): sender_account, receiver_account, ip_address and device_hash
# look like identifier columns; one-hot encoding them may produce a very wide
# matrix -- confirm their cardinality before relying on this list.
categorical_candidates = (
    "sender_account",
    "receiver_account",
    "transaction_type",
    "merchant_category",
    "location",
    "device_used",
    "payment_channel",
    "ip_address",
    "device_hash",
)

# Retain only the candidates that are actually present in X, in listed order.
numeric_features = [col for col in numeric_candidates if col in X.columns]
categorical_features = [col for col in categorical_candidates if col in X.columns]

# Last expression of the cell: display both lists.
numeric_features, categorical_features

In [None]:
# Cell 4: hold out 20% of the rows as a test set, stratifying on the label so
# the class proportions are preserved in both splits

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train.shape, " Test size:", X_test.shape)

# Report the class proportions of each split to confirm the stratification.
for heading, labels in (
    ("Train fraud ratio:\n", y_train),
    ("Test fraud ratio:\n", y_test),
):
    print(heading, labels.value_counts(normalize=True))

In [None]:
# Cell 5: per-type preprocessing branches combined into one ColumnTransformer

# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list (defined in an earlier cell) to its branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [None]:
# Cell 6: candidate A -- preprocessing, SMOTE oversampling, logistic regression

smote = SMOTE(random_state=42)
log_reg = LogisticRegression(max_iter=1000, n_jobs=-1)

# The imblearn pipeline is used (rather than sklearn's) because it accepts a
# sampler step; the sampler runs while fitting, so held-out folds and the test
# set are scored without resampling.
smote_pipeline = ImbPipeline(
    [
        ("preprocess", preprocessor),
        ("smote", smote),
        ("model", log_reg),
    ]
)

# Last expression of the cell: display the pipeline.
smote_pipeline

In [None]:
# Cell 7: candidate B -- no resampling; the classifier reweights the classes
# itself through class_weight="balanced"

log_reg_balanced = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    n_jobs=-1,
)

balanced_pipeline = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", log_reg_balanced),
    ]
)

# Last expression of the cell: display the pipeline.
balanced_pipeline

In [None]:
# Cell 8: cross-validation comparison (SMOTE vs class_weight)

# Metric name -> metric function; the names are used as labels in the report.
scorers = dict(
    accuracy=accuracy_score,
    precision=precision_score,
    recall=recall_score,
    f1=f1_score,
)

# Five shuffled, stratified folds with a fixed seed so both candidates are
# compared on the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_cv(pipeline, X_train, y_train, cv, scorers, label):
    """Print the cross-validated mean and std of every metric for one pipeline.

    All metrics are computed in a single ``cross_validate`` run, so each fold is
    fitted once instead of once per metric.

    Parameters
    ----------
    pipeline : estimator
        Unfitted estimator/pipeline to evaluate.
    X_train, y_train
        Training features and labels handed to the cross-validation.
    cv
        Cross-validation splitter (or anything ``cross_validate`` accepts as ``cv``).
    scorers : dict
        Maps a metric name to a metric function ``func(y_true, y_pred)``.
    label : str
        Name printed in the report header.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    print(f"\n=== CV results for {label} ===")
    if not scorers:
        # Nothing to score; cross_validate rejects an empty scoring dict.
        return

    # Imported here so the function does not depend on the order in which the
    # notebook's import statements were executed.
    from sklearn.metrics import make_scorer
    from sklearn.model_selection import cross_validate

    scoring = {name: make_scorer(func) for name, func in scorers.items()}
    results = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )

    # cross_validate stores each metric's per-fold scores under "test_<name>".
    for name in scorers:
        scores = results[f"test_{name}"]
        print(f"{name}: mean={scores.mean():.3f}, std={scores.std():.3f}")

from sklearn.metrics import make_scorer

# Score both candidates with the same folds and the same metrics.
for candidate, candidate_label in (
    (smote_pipeline, "SMOTE + LR"),
    (balanced_pipeline, "class_weight='balanced' + LR"),
):
    evaluate_cv(candidate, X_train, y_train, cv, scorers, label=candidate_label)

# Pick whichever candidate gives the better recall/F1 on the fraud class; the
# following cells assume SMOTE was chosen, but the two are easy to swap.

In [None]:
# Cell 9: train the selected candidate on the full training split and score it
# on the held-out test split at the default decision threshold

chosen_pipeline = smote_pipeline   # swap in balanced_pipeline if it scored better in CV
chosen_pipeline.fit(X_train, y_train)

# Hard labels for the report, plus the positive-class probability (column 1)
# that the later threshold and curve cells reuse.
y_pred = chosen_pipeline.predict(X_test)
y_proba = chosen_pipeline.predict_proba(X_test)[:, 1]

print("=== Classification report (threshold = 0.5) ===\n")
print(classification_report(y_test, y_pred, digits=4))

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=["Non-fraud", "Fraud"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - threshold 0.5")
plt.show()

In [None]:
# Cell 10: threshold tuning -- score the positive-class probabilities at a grid
# of cutoffs and collect the metrics in thr_df
#
# NOTE(review): the sweep is scored on the test split (y_test / y_proba), so a
# threshold picked from it has seen the test labels; consider tuning on a
# validation split or out-of-fold predictions instead -- confirm intent.

thresholds = np.linspace(0.1, 0.9, 17)  # 0.1, 0.15, ..., 0.9
records = []

for thr in thresholds:
    y_pred_thr = (y_proba >= thr).astype(int)
    # zero_division=0: when a cutoff leaves no predicted (or no actual)
    # positives the metric is undefined; record 0.0 explicitly instead of
    # triggering an UndefinedMetricWarning on every such threshold.
    prec = precision_score(y_test, y_pred_thr, zero_division=0)
    rec = recall_score(y_test, y_pred_thr, zero_division=0)
    f1 = f1_score(y_test, y_pred_thr, zero_division=0)
    acc = accuracy_score(y_test, y_pred_thr)
    records.append((thr, prec, rec, f1, acc))

# One row per threshold; displayed as the cell's last expression.
thr_df = pd.DataFrame(records, columns=["threshold", "precision", "recall", "f1", "accuracy"])
thr_df

In [None]:
# Cell 11: draw precision, recall and F1 from thr_df against the threshold

plt.figure(figsize=(8, 5))

# One line per metric column, in the same order as the legend entries.
for column, legend_name in (
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("f1", "F1"),
):
    plt.plot(thr_df["threshold"], thr_df[column], label=legend_name)

plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Precision / Recall / F1 vs Threshold")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Cell 12: re-score the test predictions at the manually selected threshold

best_thr = 0.30  # chosen by hand from thr_df / the plot above; edit as needed

# Probabilities at or above the cutoff are labelled fraud (1), the rest 0.
y_pred_best = (y_proba >= best_thr).astype(int)

print(f"=== Classification report (threshold = {best_thr}) ===\n")
print(classification_report(y_test, y_pred_best, digits=4))

cm_best = confusion_matrix(y_test, y_pred_best)
disp_best = ConfusionMatrixDisplay(cm_best, display_labels=["Non-fraud", "Fraud"])
disp_best.plot(cmap="Blues")
plt.title(f"Confusion Matrix - threshold {best_thr}")
plt.show()

# ROC & PR (Precision-Recall) Curves

In [None]:
# Cell A: ROC curve and its area for the chosen model, from the test-set
# positive-class probabilities

from sklearn.metrics import roc_curve, auc

# False/true positive rates at every score cutoff, then the area under them.
# Note: this rebinds the notebook-level name `thresholds`.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"AUC = {roc_auc:.4f}")
# Dashed diagonal drawn as a visual reference line.
plt.plot([0, 1], [0, 1], color="navy", lw=1, linestyle="--")

plt.title("ROC Curve — Chosen Model")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Add interpretation for ROC/PR curves 

In [None]:
# Cell B: precision-recall curve and average precision for the chosen model,
# from the test-set positive-class probabilities

from sklearn.metrics import precision_recall_curve, average_precision_score

# Summary score first, then the curve points.
# Note: this rebinds the notebook-level name `thresholds`.
avg_precision = average_precision_score(y_test, y_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

plt.figure(figsize=(7, 5))
plt.plot(recall, precision, lw=2, color="purple", label=f"AP = {avg_precision:.4f}")

plt.title("Precision-Recall Curve — Chosen Model")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="upper right")
plt.grid(True)
plt.show()