In [None]:
# first attempt - Co
# Cell 2: imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
)

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # important!

In [2]:
# Cell 3: load cleaned fraud dataset
cleaned_parquet = "../data/cleaned_data/cleaned_fraud.parquet"

df = pd.read_parquet(cleaned_parquet)
print(df.shape)
df.head()

(4103487, 20)


Unnamed: 0,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash,year,month,day_of_month,hour,day_of_week
0,ACC420214,ACC222629,318.12,withdrawal,restaurant,Tokyo,pos,False,-4797.552868,-0.94,16,0.64,UPI,88.85.250.147,D3353785,2023,4,25,14,2
1,ACC759858,ACC433871,25.03,transfer,online,Dubai,pos,False,3705.738348,-0.56,1,0.48,ACH,89.235.76.67,D4950912,2023,8,17,1,4
2,ACC702235,ACC658588,5.33,transfer,online,Toronto,pos,False,2158.906433,0.77,7,0.18,ACH,132.247.155.53,D9285320,2023,12,28,23,4
3,ACC818001,ACC846452,261.11,payment,entertainment,Tokyo,atm,False,-71.393848,0.43,12,0.41,wire_transfer,186.251.230.65,D4842173,2023,8,18,9,5
4,ACC293626,ACC440136,28.61,transfer,retail,London,pos,False,1400.413482,-1.48,18,0.53,UPI,233.115.221.14,D7106200,2023,10,30,9,1


In [3]:
# Cell 4: separate features and target
target_col = "is_fraud"

X = df.drop(columns=[target_col])
y = df[target_col]

# Ensure y is 0/1 (if it's boolean or strings)
y = y.astype(int)

y.value_counts(normalize=True)

is_fraud
0    0.956244
1    0.043756
Name: proportion, dtype: float64

In [4]:
# Cell 5: define numeric and categorical feature lists
numeric_features = [
    "amount",
    "time_since_last_transaction",
    "spending_deviation_score",
    "velocity_score",
    "geo_anomaly_score",
    "year",
    "month",
    "day_of_month",
    "hour",
    "day_of_week"
]

categorical_features = [
    "sender_account",
    "receiver_account",
    "transaction_type",
    "merchant_category",
    "location",
    "device_used",
    "payment_channel",
    "ip_address",
    "device_hash",
]

# Keep only columns that actually exist
numeric_features = [c for c in numeric_features if c in X.columns]
categorical_features = [c for c in categorical_features if c in X.columns]

numeric_features, categorical_features

(['amount',
  'time_since_last_transaction',
  'spending_deviation_score',
  'velocity_score',
  'geo_anomaly_score',
  'year',
  'month',
  'day_of_month',
  'hour',
  'day_of_week'],
 ['sender_account',
  'receiver_account',
  'transaction_type',
  'merchant_category',
  'location',
  'device_used',
  'payment_channel',
  'ip_address',
  'device_hash'])

In [5]:
# Cell 6: stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

(is_fraud
 0    0.956244
 1    0.043756
 Name: proportion, dtype: float64,
 is_fraud
 0    0.956243
 1    0.043757
 Name: proportion, dtype: float64)

In [6]:
# Cell 7: preprocessing pipelines

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [9]:
# Cell 8: SMOTE + logistic regression pipeline

smote = SMOTE(random_state=42)

clf = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",  # we can later try 'balanced' vs None
    n_jobs=-1,
)

smote_pipeline = ImbPipeline(
    steps=[
        ("preprocess", preprocessor),
        ("smote", smote),
        ("model", clf),
    ]
)

smote_pipeline


In [8]:
# Cell 9: cross-validation with F1 (more meaningful than accuracy for fraud)

from sklearn.metrics import make_scorer


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "f1": make_scorer(f1_score),
    "accuracy": make_scorer(accuracy_score),
}

cv_results = {
    name: cross_val_score(smote_pipeline, X_train, y_train, cv=cv, scoring=scorer, n_jobs=-1)
    for name, scorer in scoring.items()
}

for metric, scores in cv_results.items():
    print(f"{metric}: mean={scores.mean():.3f}, std={scores.std():.3f}")



ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/imblearn/pipeline.py", line 333, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/opt/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'class_weight' parameter of LogisticRegression must be an instance of 'dict', a str among {'balanced'} or None. Got <class 'imblearn.over_sampling._smote.base.SMOTE'> instead.


In [None]:
# Cell 10: fit on full training set
smote_pipeline.fit(X_train, y_train)

# Predict on test set
y_pred = smote_pipeline.predict(X_test)
y_proba = smote_pipeline.predict_proba(X_test)[:, 1]

print("Classification report:\n")
print(classification_report(y_test, y_pred, digits=4))

In [None]:
# Cell 11: confusion matrix
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Non-fraud", "Fraud"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - SMOTE + Logistic Regression")
plt.show()

In [None]:
# Cell 12: ROC and PR curves
from sklearn.metrics import roc_curve, auc, precision_recall_curve

fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_proba)

plt.figure(figsize=(12,4))

plt.subplot(1,2,1)
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0,1],[0,1],"k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()

plt.subplot(1,2,2)
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")

plt.tight_layout()
plt.show()

In [None]:
# Cell 13: baseline pipeline without SMOTE
baseline_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", LogisticRegression(max_iter=1000, n_jobs=-1)),
    ]
)

baseline_pipeline.fit(X_train, y_train)
y_pred_base = baseline_pipeline.predict(X_test)

print("Baseline (no SMOTE) classification report:\n")
print(classification_report(y_test, y_pred_base, digits=4))