In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, auc, precision_score, recall_score, f1_score,
    make_scorer,
)
from sklearn.model_selection import (
    StratifiedKFold, cross_val_predict, cross_validate, train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load dataset
# The CSV location can be overridden with the CREDITCARD_CSV environment
# variable so the notebook can run on machines other than the original
# author's. When the variable is unset, the original hard-coded path is used,
# so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "CREDITCARD_CSV",
    "C:/Users/maxhi/Documents/GitHub/Credit_Card_Fraud_Detection/creditcard.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Basic data check
# Collect the three diagnostics first, then print them one per line:
#   1. (rows, columns) of the frame
#   2. relative frequency of each 'Class' value -- exposes the class imbalance
#      (the original analysis notes a fraud rate of ~0.17%)
#   3. number of missing values in every column
frame_dims = df.shape
class_shares = df['Class'].value_counts(normalize=True)
missing_per_column = df.isna().sum()
print(frame_dims, class_shares, missing_per_column, sep="\n")

In [None]:
# Scale 'Amount' and 'Time' only; V1-V28 are left untouched (the original
# analysis treats them as already standardised PCA outputs).
#
# The raw columns are replaced by 'Scaled_Amount' / 'Scaled_Time', appended at
# the end of the frame. The whole step is guarded so the cell can be re-run
# safely: once the raw columns are gone, a second execution is a no-op instead
# of raising KeyError, and the already-fitted `scaler` is left in place.
#
# NOTE(review): the scaler is fit on every row, before the train/test split
# further down, so test-set statistics influence the scaling. Consider fitting
# on the training rows only (e.g. inside a Pipeline) -- TODO confirm impact.
raw_cols = ['Amount', 'Time']
scaled_cols = ['Scaled_Amount', 'Scaled_Time']
if set(raw_cols).issubset(df.columns):
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[raw_cols])
    # Build a new frame rather than mutating in place; each column of the
    # scaled array becomes one of the new 'Scaled_*' columns (positional).
    df = df.drop(columns=raw_cols).assign(
        **dict(zip(scaled_cols, scaled_values.T))
    )

In [None]:
# Define X and y
# 'Class' is the label column: y holds it, X holds every other column of df.
target_col = 'Class'
y = df[target_col]
X = df.drop(columns=target_col)

In [None]:
# Train-test split (stratified), 30% test data
# Stratifying on y keeps the class proportions the same in both partitions;
# the fixed random_state makes the split reproducible across runs.
split_options = dict(test_size=0.3, stratify=y, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Handle imbalance with SMOTE.
# Rationale from the original analysis: logistic regression is the planned
# model, and oversampling the minority class is intended to help with
# underfitting. Only the training partition is resampled; the test
# partition is not touched here.
counts_before = y_train.value_counts()
print("Before SMOTE:", counts_before)

smote = SMOTE(random_state=10)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

counts_after = y_train_resampled.value_counts()
print("After SMOTE:", counts_after)