# Revised Capstone Modeling - Fraud Detection

In [None]:
# Import libraries
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier

# Read the four pre-split tables (features and labels for train and test)
# from CSV files in the working directory; the provided splits are used
# for consistency.
X_train, X_test, y_train, y_test = map(
    pd.read_csv,
    ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv'),
)

In [None]:
# Preprocessing: build the modelling frame, transform the features, split by
# time, scale, and oversample the training portion.

# Non-leaky features
features = ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
log_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

# Every step below works on `df`, which nothing in this notebook defined, so
# rebuild one labelled frame from the splits loaded from CSV.
# NOTE(review): assumes the X_*.csv files hold the raw (untransformed) columns
# listed in `features` and the y_*.csv files hold a single label column -
# confirm against how those files were written.
df = pd.concat([X_train, X_test], ignore_index=True)
# np.ravel accepts both a one-column DataFrame and a Series.
df['isFraud'] = np.concatenate([np.ravel(y_train), np.ravel(y_test)])

# Log transform: log1p(x) = log(1 + x), so zero values map to 0.
df[log_cols] = np.log1p(df[log_cols])

# Encode 'type' as integer codes.
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# Time-based split: order by 'step', train on the first 80% of rows and
# test on the remaining 20%.
df = df.sort_values('step')
split_idx = int(0.8 * len(df))
X_train = df[features].iloc[:split_idx]
X_test = df[features].iloc[split_idx:]
y_train = df['isFraud'].iloc[:split_idx]
y_test = df['isFraud'].iloc[split_idx:]

# Scale: statistics are fit on the training rows only and reused for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE: oversample the training rows only; the test rows are left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic Regression
# SMOTE is a pipeline step rather than a one-off resample before the search:
# inside cross-validation it is then applied to each training fold only, so
# validation folds contain real rows only. Searching on pre-resampled data
# puts synthetic points (interpolated from training-fold rows) into the
# validation folds and inflates the CV recall.
logreg = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
pipe_log = Pipeline([('smote', SMOTE(random_state=42)), ('clf', logreg)])
param_grid_log = {'clf__C': [0.1, 1, 10], 'clf__solver': ['liblinear']}
grid_log = GridSearchCV(pipe_log, param_grid_log, cv=cv, scoring='recall')
grid_log.fit(X_train_scaled, y_train)

# The refit best estimator is the whole pipeline; the sampler is skipped at
# predict time, so it can be applied to the scaled test rows directly.
best_log = grid_log.best_estimator_
y_pred_log = best_log.predict(X_test_scaled)
print("\nLogistic Regression:\n", classification_report(y_test, y_pred_log))

# Per-fold recall of the winning configuration, read from the search results
# instead of refitting the same model on the same folds a second time.
cv_scores_log = np.array([
    grid_log.cv_results_[f'split{fold}_test_score'][grid_log.best_index_]
    for fold in range(cv.get_n_splits())
])
print("Mean CV Recall (LogReg):", np.mean(cv_scores_log))

In [None]:
# Random Forest
# SMOTE is a pipeline step so that, inside cross-validation, only the
# training folds are oversampled and recall is scored on real rows only.
# Searching on data that was resampled up front leaks synthetic neighbours of
# training rows into the validation folds and inflates the CV recall.
rf = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline([('smote', SMOTE(random_state=42)), ('clf', rf)])
param_grid_rf = {
    'clf__n_estimators': [50, 100],
    'clf__max_depth': [10, 20],
    'clf__class_weight': ['balanced'],
}
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=cv, scoring='recall')
grid_rf.fit(X_train_scaled, y_train)

# The refit best estimator is the whole pipeline; the sampler is skipped at
# predict time.
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_scaled)
print("\nRandom Forest:\n", classification_report(y_test, y_pred_rf))

# Per-fold recall of the winning configuration, taken from the search results
# rather than refitting the forest on every fold again.
cv_scores_rf = np.array([
    grid_rf.cv_results_[f'split{fold}_test_score'][grid_rf.best_index_]
    for fold in range(cv.get_n_splits())
])
print("Mean CV Recall (RF):", np.mean(cv_scores_rf))

In [None]:
# XGBoost
# SMOTE is a pipeline step so that, inside cross-validation, only the
# training folds are oversampled and recall is scored on real rows only.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
pipe_xgb = Pipeline([('smote', SMOTE(random_state=42)), ('clf', xgb)])
param_grid_xgb = {
    'clf__n_estimators': [50, 100],
    'clf__max_depth': [3, 5],
    'clf__learning_rate': [0.01, 0.1],
    # The classifier is fit on SMOTE output, where both classes have the same
    # count, so the negative/positive ratio it should see is 1. Using the
    # ratio of the original labels here would correct the imbalance twice
    # (and divides by zero when the training labels contain no positives).
    'clf__scale_pos_weight': [1],
}
grid_xgb = GridSearchCV(pipe_xgb, param_grid_xgb, cv=cv, scoring='recall')
grid_xgb.fit(X_train_scaled, y_train)

# Keep `best_xgb` as the bare fitted classifier: later cells read
# `feature_importances_` from it and persist it with joblib.
best_pipe_xgb = grid_xgb.best_estimator_
best_xgb = best_pipe_xgb.named_steps['clf']
y_pred_xgb = best_xgb.predict(X_test_scaled)
print("\nXGBoost:\n", classification_report(y_test, y_pred_xgb))

# Per-fold recall of the winning configuration, taken from the search results
# rather than refitting the model on every fold again.
cv_scores_xgb = np.array([
    grid_xgb.cv_results_[f'split{fold}_test_score'][grid_xgb.best_index_]
    for fold in range(cv.get_n_splits())
])
print("Mean CV Recall (XGBoost):", np.mean(cv_scores_xgb))

In [None]:
# Threshold tuning if recall < 0.85
y_prob_xgb = best_xgb.predict_proba(X_test_scaled)[:, 1]
# NOTE(review): the decision to tune is still triggered by test-set recall.
if recall_score(y_test, y_pred_xgb) < 0.85:
    target_recall = 0.85
    # Candidates 0.20, 0.25, ..., 0.75. linspace fixes the number of points;
    # np.arange with a float step can include or drop the end value.
    thresholds = np.linspace(0.20, 0.75, 12)

    # Choose the threshold from out-of-fold probabilities on the training
    # rows, never from the test rows, so the test metrics printed below stay
    # an honest estimate. cross_val_predict clones the pipeline, so the
    # already-fitted `best_xgb` is not refit or modified.
    tuning_pipe = Pipeline([('smote', SMOTE(random_state=42)), ('clf', best_xgb)])
    oof_prob = cross_val_predict(
        tuning_pipe, X_train_scaled, y_train, cv=cv, method='predict_proba'
    )[:, 1]
    oof_recalls = np.array([
        recall_score(y_train, (oof_prob >= thresh).astype(int))
        for thresh in thresholds
    ])

    # Recall can only fall as the threshold rises, so "maximise recall" would
    # always return the lowest candidate. Take the highest threshold that
    # still reaches the target instead; if none does, fall back to the
    # candidate with the best out-of-fold recall.
    meets_target = oof_recalls >= target_recall
    if meets_target.any():
        best_threshold = float(thresholds[meets_target].max())
    else:
        best_threshold = float(thresholds[np.argmax(oof_recalls)])

    # Apply the chosen threshold to the test probabilities and report the
    # recall it achieves there.
    y_pred_xgb = (y_prob_xgb >= best_threshold).astype(int)
    best_recall = recall_score(y_test, y_pred_xgb)
    print(f"\nOptimized Threshold: {best_threshold}, Recall: {best_recall}")
    print(classification_report(y_test, y_pred_xgb))

In [None]:
# Holdout validation
# Report metrics on the last 10% of the test rows.
# NOTE(review): this slice is part of the test set that was already evaluated
# above, so it is not an independent holdout; a separate split would have to
# be carved out before model selection.
holdout_size = int(0.1 * len(X_test))
# Slice from a positive offset: with holdout_size == 0, `.iloc[-0:]` would
# select every row instead of none.
holdout_start = len(X_test) - holdout_size
X_holdout = X_test.iloc[holdout_start:]
y_holdout = y_test.iloc[holdout_start:]
X_holdout_scaled = X_test_scaled[holdout_start:]
# Reuse the test-set predictions so the holdout report follows the same
# decision rule (including any tuned threshold) as the other XGBoost metrics.
y_pred_holdout = np.asarray(y_pred_xgb)[holdout_start:]
if holdout_size:
    print("\nHoldout Evaluation (XGBoost):\n", classification_report(y_holdout, y_pred_holdout))
else:
    print("\nHoldout Evaluation (XGBoost): skipped, the test set has fewer than 10 rows")

In [None]:
# Feature Importances
# Bar chart of the fitted XGBoost model's importances, largest first, with
# the bars labelled by the matching entries of `features`.
importances = best_xgb.feature_importances_
indices = np.flip(np.argsort(importances))
bar_positions = range(len(importances))
bar_heights = importances[indices]
bar_labels = np.array(features)[indices]

plt.bar(bar_positions, bar_heights)
plt.xticks(bar_positions, bar_labels, rotation=90)
plt.title("XGBoost Feature Importances")
plt.show()

In [None]:
# Model Comparison
# ROC-AUC is computed from positive-class probabilities, which were never
# stored for the logistic-regression and random-forest models, so derive them
# here. `y_prob_xgb` comes from the threshold-tuning cell.
y_prob_log = best_log.predict_proba(X_test_scaled)[:, 1]
y_prob_rf = best_rf.predict_proba(X_test_scaled)[:, 1]

# (hard predictions, positive-class probabilities) per model.
model_outputs = {
    "Logistic": (y_pred_log, y_prob_log),
    "RF": (y_pred_rf, y_prob_rf),
    "XGBoost": (y_pred_xgb, y_prob_xgb),
}
metrics = {
    model_name: {
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_prob),
    }
    for model_name, (y_pred, y_prob) in model_outputs.items()
}
# One row per model, best recall first.
print(pd.DataFrame(metrics).T.sort_values(by='Recall', ascending=False))

# Save best model
# NOTE(review): only the classifier is persisted. It was fed scaled features
# (`X_test_scaled`), so scoring new data also needs the fitted `scaler`, the
# 'type' encoder and any tuned threshold, none of which are saved here.
joblib.dump(best_xgb, 'fraud_model_revised.pkl')