In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, auc, precision_score, recall_score, f1_score
)
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load dataset
# Resolve the CSV location without tying the notebook to one machine. Candidates
# are tried in order:
#   1. the path in the CREDITCARD_CSV environment variable, if set;
#   2. "creditcard.csv" relative to the current working directory;
#   3. the original author's absolute path (kept only for backward compatibility).
csv_candidates = [
    Path("creditcard.csv"),
    Path("C:/Users/maxhi/Documents/GitHub/Credit_Card_Fraud_Detection/creditcard.csv"),
]
if os.environ.get("CREDITCARD_CSV"):
    csv_candidates.insert(0, Path(os.environ["CREDITCARD_CSV"]))

data_path = next((path for path in csv_candidates if path.is_file()), None)
if data_path is None:
    # Fail with an actionable message instead of a bare pandas error.
    tried = ", ".join(str(path) for path in csv_candidates)
    raise FileNotFoundError(
        "creditcard.csv not found. Set the CREDITCARD_CSV environment variable "
        f"or place the file next to the notebook. Tried: {tried}"
    )

df = pd.read_csv(data_path)

In [None]:
# Basic data check
# Print, in order: the frame's dimensions, the share of each 'Class' value
# (shows the class imbalance - fraud rate of ~0.17%), and the null count per column.
class_share = df['Class'].value_counts(normalize=True)
nulls_per_column = df.isnull().sum()
for summary in (df.shape, class_share, nulls_per_column):
    print(summary)

In [None]:
# Scale 'Amount' and 'Time' only, as v1-v28 are already standardised (via PCA)
scaler = StandardScaler()
df[['Scaled_Amount', 'Scaled_Time']] = scaler.fit_transform(df[['Amount', 'Time']])
df.drop(['Amount', 'Time'], axis=1, inplace=True)

In [None]:
# Define X and y
X = df.drop('Class', axis=1)
y = df['Class']

In [None]:
# Train-test split (stratified), 30% test data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=10
)

In [None]:
# Handle imbalance with SMOTE -- chosen because the downstream model is a
# logistic regression. Only the training split is resampled; X_test / y_test
# are left untouched. Class counts are printed before and after for comparison.
smote = SMOTE(random_state=10)

class_counts_before = y_train.value_counts()
print("Before SMOTE:", class_counts_before)

resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

print("After SMOTE:", y_train_resampled.value_counts())

In [None]:
# Train logistic regression as a baseline on the SMOTE-resampled training data.
# Rationale from the author: it is a strong, simple baseline, and the
# PCA-transformed features may be close to linearly separable.
# NOTE(review): if SMOTE has already equalised the class counts (check the
# "After SMOTE" output), class_weight='balanced' presumably gives both classes
# the same weight and has no additional effect - confirm.
logreg_params = dict(solver='liblinear', class_weight='balanced', random_state=10)
model = LogisticRegression(**logreg_params)
model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Evaluate
# Hard labels come from model.predict; the score used for the threshold-free
# metrics is column index 1 of predict_proba on the untouched test split.
y_pred = model.predict(X_test)
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# ROC AUC from the scores, and PR AUC as the area under the precision-recall
# curve (recall on the x-axis, precision on the y-axis).
roc_auc = roc_auc_score(y_test, y_proba)
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)

for metric_line in (f"\nROC AUC: {roc_auc:.4f}", f"PR AUC: {pr_auc:.4f}"):
    print(metric_line)

# Confusion matrix
# Rows are true labels, columns are predicted labels. Each cell is annotated
# with its raw count and its percentage of the row total; cell colour follows
# the row percentage.
cm = confusion_matrix(y_test, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = cm.astype('float') / row_totals * 100

n_rows, n_cols = cm.shape
labels = np.array([
    [f"{cm[i, j]}\n{cm_percent[i, j]:.1f}%" for j in range(n_cols)]
    for i in range(n_rows)
])

cm_fig, cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_percent,
    annot=labels,
    fmt='',  # annotations are pre-formatted strings, so no number format
    cmap='Blues',
    cbar=False,
    xticklabels=['Pred 0', 'Pred 1'],
    yticklabels=['Actual 0', 'Actual 1'],
    ax=cm_ax,
)
cm_ax.set_title("Confusion Matrix (Counts + %)")
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_fig.tight_layout()
plt.show()

# Classification report
# Per-class precision / recall / F1 for the default-threshold predictions,
# printed to four decimal places under a heading.
report_text = classification_report(y_test, y_pred, digits=4)
print("\nClassification Report:", report_text, sep="\n")

As we can see, this is quite a recall-biased model – well suited to situations where failing to catch fraud is costly and false positives are cheap to follow up. Let's see whether tuning the decision threshold can improve precision without sacrificing too much recall.

In [None]:
# Threshold tuning
# Sweep the decision threshold over 0.00, 0.01, ..., 1.00 and record precision,
# recall and F1 on the test split for each value. np.linspace fixes the number
# of points (101) and both endpoints, which a float-step np.arange does not
# guarantee.
thresholds = np.linspace(0.0, 1.0, 101)
precision_scores = []
recall_scores = []
f1_scores = []

for t in thresholds:
    # A sample is flagged positive only when its score is strictly above t.
    y_pred_thresh = (y_proba > t).astype(int)
    # zero_division=0 on every metric: at high thresholds nothing may be
    # predicted positive, and the undefined scores should be recorded as 0
    # without emitting UndefinedMetricWarning.
    precision_scores.append(precision_score(y_test, y_pred_thresh, zero_division=0))
    recall_scores.append(recall_score(y_test, y_pred_thresh, zero_division=0))
    f1_scores.append(f1_score(y_test, y_pred_thresh, zero_division=0))

# Plot precision-recall-f1 vs. threshold
# One line per metric on a shared axis, so the trade-off between precision and
# recall across thresholds can be read at a glance.
tune_fig, tune_ax = plt.subplots(figsize=(8, 5))
metric_curves = (
    (precision_scores, "Precision"),
    (recall_scores, "Recall"),
    (f1_scores, "F1 Score"),
)
for curve_values, curve_label in metric_curves:
    tune_ax.plot(thresholds, curve_values, label=curve_label)
tune_ax.set_xlabel("Threshold")
tune_ax.set_ylabel("Score")
tune_ax.set_title("Threshold Tuning")
tune_ax.legend()
tune_ax.grid(True)
tune_fig.tight_layout()
plt.show()

# Select threshold that maximizes F1 score
# argmax returns the first index of the maximum, so ties resolve to the lowest
# threshold.
# NOTE(review): the F1 values were computed against y_test, so this threshold is
# chosen on the test set; metrics reported at it will be optimistic. Consider
# selecting it on a validation split instead.
f1_by_threshold = np.asarray(f1_scores)
best_idx = f1_by_threshold.argmax()
best_threshold = thresholds[best_idx]
print(f"\nBest Threshold (F1): {best_threshold:.2f}")