In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, precision_recall_curve
)
import matplotlib.pyplot as plt

# ===============================
# Load & preprocess
# ===============================
df = pd.read_excel("Credit Card Fraud Risk Analysis.xlsx")

# Drop irrelevant columns
df = df.drop(["Transaction ID", "Customer Name", "Merchant Name"], axis=1)

# Date features
df['Transaction_Month'] = df['Transaction Date'].dt.month
df['Transaction_Weekday'] = df['Transaction Date'].dt.weekday
df = df.drop("Transaction Date", axis=1)

# Define X and y
X = df.drop("IsFraud", axis=1)
y = df["IsFraud"]

# Categorical & numeric features
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# ===============================
# Logistic Regression with GridSearchCV
# ===============================
log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight="balanced"))
])

param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],   # Regularization strength
    'classifier__penalty': ['l2'],         # L1/L2 regularization
    'classifier__solver': ['lbfgs', 'liblinear']
}

grid_lr = GridSearchCV(log_reg, param_grid_lr, scoring='roc_auc', cv=3, n_jobs=-1)
grid_lr.fit(X, y)

print("Best Logistic Regression Params:", grid_lr.best_params_)

# ===============================
# Decision Tree with GridSearchCV
# ===============================
dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(class_weight="balanced", random_state=42))
])

param_grid_dt = {
    'classifier__max_depth': [3, 5, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 5]
}

grid_dt = GridSearchCV(dt, param_grid_dt, scoring='roc_auc', cv=3, n_jobs=-1)
grid_dt.fit(X, y)

print("Best Decision Tree Params:", grid_dt.best_params_)

# ===============================
# Train-test split (final eval)
# ===============================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Use best estimator from GridSearch
best_lr = grid_lr.best_estimator_
best_dt = grid_dt.best_estimator_

# ===============================
# Threshold tuning function
# ===============================
def evaluate_with_threshold(model, X_test, y_test, name):
    y_proba = model.predict_proba(X_test)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

    # Choose best threshold by maximizing F1
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx]

    print(f"Best threshold for {name}: {best_threshold:.2f}")

    # Apply threshold
    y_pred = (y_proba >= best_threshold).astype(int)

    print(f"\n=== {name} Report ===")
    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_proba))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Fraud Recall:", recall_score(y_test, y_pred, pos_label=1))
    print("Non-Fraud Recall:", recall_score(y_test, y_pred, pos_label=0))
    print("ACC:", accuracy_score(y_test, y_pred))

# ===============================
# Evaluate models with best threshold
# ===============================
evaluate_with_threshold(best_lr, X_test, y_test, "Logistic Regression")
evaluate_with_threshold(best_dt, X_test, y_test, "Decision Tree")


Best Logistic Regression Params: {'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best Decision Tree Params: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10}
Best threshold for Logistic Regression: 0.51

=== Logistic Regression Report ===
              precision    recall  f1-score   support

           0       0.84      0.76      0.80       143
           1       0.51      0.65      0.57        57

    accuracy                           0.73       200
   macro avg       0.68      0.70      0.69       200
weighted avg       0.75      0.72      0.73       200

ROC-AUC: 0.7352472089314196
Confusion Matrix:
 [[108  35]
 [ 20  37]]
Fraud Recall: 0.6491228070175439
Non-Fraud Recall: 0.7552447552447552
ACC: 0.725
Best threshold for Decision Tree: 0.60

=== Decision Tree Report ===
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       143
           