In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, average_precision_score, f1_score
)
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Load cleaned fraud dataset
fraud_df = pd.read_csv('../data/cleaned_fraud_df.csv')

In [17]:
y = fraud_df["class"]  # or df["Class"]
X = fraud_df.drop(columns=["class"])  # or df.drop(columns=["Class"])

In [18]:
X = X.select_dtypes(include=["number"])  # keeps only numeric columns

In [19]:
X = pd.get_dummies(X)  # One-hot encoding of categorical columns

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Logistic Regression
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
logreg.fit(X_train, y_train)
y_pred_log = logreg.predict(X_test)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


In [23]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, average_precision_score

def evaluate_model(y_true, y_pred, model_name):
    print(f"=== {model_name} ===")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")
    print(f"AUC-PR: {average_precision_score(y_true, y_pred):.4f}")
    print()

# Evaluate both models
evaluate_model(y_test, y_pred_log, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")

=== Logistic Regression ===
Confusion Matrix:
[[25856  1537]
 [  908  1922]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95     27393
           1       0.56      0.68      0.61      2830

    accuracy                           0.92     30223
   macro avg       0.76      0.81      0.78     30223
weighted avg       0.93      0.92      0.92     30223

F1 Score: 0.6112
AUC-PR: 0.4074

=== Random Forest ===
Confusion Matrix:
[[27393     0]
 [ 1337  1493]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       1.00      0.53      0.69      2830

    accuracy                           0.96     30223
   macro avg       0.98      0.76      0.83     30223
weighted avg       0.96      0.96      0.95     30223

F1 Score: 0.6907
AUC-PR: 0.5718

