In [1]:
import pandas as pd

fraud = pd.read_csv("../data/processed/fraud_processed.csv")


# Define target and features

In [2]:
X = fraud.drop(columns=['class'])
y = fraud['class']


In [3]:
X = X.drop(columns=['user_id', 'device_id', 'ip_address'], errors='ignore')


# Encode categorical features

In [4]:
X = pd.get_dummies(X, drop_first=True)


MemoryError: Unable to allocate 21.3 GiB for an array with shape (151112, 151112) and data type bool

# Train-test split (MANDATORY: stratified)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


# Apply SMOTE

In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


ModuleNotFoundError: No module named 'imblearn'

# Train model

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_resampled, y_train_resampled)


# Evaluate baseline model

In [None]:
from sklearn.metrics import (
    precision_recall_curve,
    f1_score,
    confusion_matrix,
    average_precision_score
)

y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]

f1 = f1_score(y_test, y_pred)
auc_pr = average_precision_score(y_test, y_prob)

f1, auc_pr


In [None]:
confusion_matrix(y_test, y_pred)


# Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_resampled, y_train_resampled)


# Evaluate ensemble model

In [None]:
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

f1_rf = f1_score(y_test, y_pred_rf)
auc_pr_rf = average_precision_score(y_test, y_prob_rf)

f1_rf, auc_pr_rf


In [None]:
confusion_matrix(y_test, y_pred_rf)


# Cross-validation

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    rf,
    X_train_resampled,
    y_train_resampled,
    scoring='f1',
    cv=cv
)

cv_scores.mean(), cv_scores.std()


# Model comparison & selection

In [None]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "F1 Score": [f1, f1_rf],
    "AUC-PR": [auc_pr, auc_pr_rf]
})

results
