# Credit Card Fraud Detection - Modeling Experiments

This notebook contains modeling experiments including baseline models, SMOTE experiments, XGBoost tuning, threshold optimization, and model explanations.

In [None]:
import sys
import os

# Project modules are imported as the ``src`` package (``from src.x import ...``),
# so the directory that CONTAINS ``src`` must be importable, not just ``src``
# itself. '../src' is kept for backward compatibility with flat imports.
# Paths are made absolute (robust to later cwd changes) and only added once so
# re-running this cell does not keep growing sys.path.
# NOTE(review): assumes the notebook runs from a directory one level below the
# project root (the original '../src' entry made the same assumption).
for _extra_path in ('..', '../src'):
    _extra_path = os.path.abspath(_extra_path)
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from src.data_utils import load_data
from src.preprocess import fit_transform_pipeline
from src.train import train_baseline_logistic, train_xgboost_with_smote
from src.evaluate import (
    evaluate_model, plot_roc_curve, plot_precision_recall_curve,
    threshold_tuning, plot_threshold_analysis
)
from src.explain import explain_shap, plot_feature_importance
from src.config import RANDOM_STATE, TEST_SIZE, FEATURE_COLUMNS
from src.utils import set_seed

# Set style and seed
plt.style.use('default')
sns.set_palette("husl")
set_seed(RANDOM_STATE)

print("Imports completed successfully!")

## 1. Data Loading and Preprocessing

In [None]:
# Load the raw data, run the preprocessing pipeline, and create a stratified
# train/test split.
#
# Only the load itself is inside the ``try``: a FileNotFoundError raised later
# (during preprocessing or splitting) is a different problem and must not be
# reported as "Dataset not found".
try:
    df = load_data()
except FileNotFoundError:
    print("Dataset not found. Please download creditcard.csv and place it in data/creditcard.csv")
    print("You can run the training script instead: python -m src.train")
else:
    print(f"Data loaded: {df.shape}")

    # Fit preprocessing pipeline
    pipeline, X, y = fit_transform_pipeline(df)
    print(f"Features processed: {X.shape}")

    # Split data; stratify on the target so both splits keep the same class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
    print(f"Train fraud rate: {y_train.mean():.4f}")
    print(f"Test fraud rate: {y_test.mean():.4f}")

## 2. Baseline Logistic Regression

In [None]:
# Baseline model: only runs when the data-loading cell produced a split.
if 'X_train' not in locals():
    print("Data not available. Please load the dataset first.")
else:
    print("Training baseline Logistic Regression...")

    # The trainer hands back the model together with a mapping of metrics.
    lr_model, lr_metrics = train_baseline_logistic(X_train, y_train, X_test, y_test)

    print("\n📊 Logistic Regression Results:")
    # (display label, key in lr_metrics), printed in this order
    for label, key in (
        ("ROC-AUC", "roc_auc"),
        ("PR-AUC", "pr_auc"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-Score", "f1_score"),
    ):
        print(f"{label}: {lr_metrics[key]:.4f}")

    print("\nConfusion Matrix:")
    print(lr_metrics['confusion_matrix'])

## 3. XGBoost with SMOTE

In [None]:
# XGBoost + SMOTE: only runs when the data-loading cell produced a split.
if 'X_train' not in locals():
    print("Data not available. Please load the dataset first.")
else:
    print("Training XGBoost with SMOTE...")

    # The trainer hands back the model together with a mapping of metrics,
    # which also carries the search results ('best_cv_score', 'best_params').
    xgb_model, xgb_metrics = train_xgboost_with_smote(X_train, y_train, X_test, y_test)

    print("\n🚀 XGBoost Results:")
    # (display label, key in xgb_metrics), printed in this order
    for label, key in (
        ("ROC-AUC", "roc_auc"),
        ("PR-AUC", "pr_auc"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-Score", "f1_score"),
        ("Best CV Score", "best_cv_score"),
    ):
        print(f"{label}: {xgb_metrics[key]:.4f}")

    print("\nBest Parameters:")
    for param, value in xgb_metrics['best_params'].items():
        print(f"  {param}: {value}")

    print("\nConfusion Matrix:")
    print(xgb_metrics['confusion_matrix'])

## 4. Model Comparison

In [None]:
# Side-by-side table and grouped bar chart of the two models' test metrics.
if not ('lr_metrics' in locals() and 'xgb_metrics' in locals()):
    print("Models not trained yet. Please run the previous cells.")
else:
    # Display label -> key used in both metrics mappings (insertion order = row order)
    metric_keys = {
        'ROC-AUC': 'roc_auc',
        'PR-AUC': 'pr_auc',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-Score': 'f1_score',
    }
    comparison_data = {
        'Metric': list(metric_keys),
        'Logistic Regression': [lr_metrics[key] for key in metric_keys.values()],
        'XGBoost + SMOTE': [xgb_metrics[key] for key in metric_keys.values()],
    }
    comparison_df = pd.DataFrame(comparison_data).round(4)

    print("📊 Model Comparison:")
    display(comparison_df)

    # Grouped bars: one group per metric, one bar per model
    fig, ax = plt.subplots(figsize=(10, 6))
    positions = np.arange(len(comparison_df['Metric']))
    bar_width = 0.35

    for offset, model_name in (
        (-bar_width / 2, 'Logistic Regression'),
        (bar_width / 2, 'XGBoost + SMOTE'),
    ):
        ax.bar(positions + offset, comparison_df[model_name], bar_width,
               label=model_name, alpha=0.7)

    ax.set(xlabel='Metrics', ylabel='Score', title='Model Performance Comparison')
    ax.set_xticks(positions)
    ax.set_xticklabels(comparison_df['Metric'])
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 5. ROC and Precision-Recall Curves

In [None]:
# Diagnostic curves for the XGBoost model on the test split.
if 'xgb_model' not in locals():
    print("XGBoost model not available. Please train the model first.")
else:
    # ROC curve first ...
    fig1 = plot_roc_curve(xgb_model, X_test, y_test)
    plt.show()

    # ... then the precision-recall curve.
    fig2 = plot_precision_recall_curve(xgb_model, X_test, y_test)
    plt.show()

## 6. Threshold Tuning

In [None]:
# Cost-based decision-threshold search for the XGBoost model.
# NOTE(review): the threshold is tuned on X_test/y_test and then evaluated on
# the same split, so the numbers below are likely optimistic — consider tuning
# on a separate validation split.
if 'xgb_model' not in locals():
    print("XGBoost model not available. Please train the model first.")
else:
    print("Performing threshold tuning...")

    # Column 1 of predict_proba is used as the score for the positive class
    y_proba = xgb_model.predict_proba(X_test)[:, 1]

    # A false negative is weighted five times as heavily as a false positive
    optimal_threshold, threshold_results = threshold_tuning(
        y_proba, y_test, cost_fp=1, cost_fn=5
    )

    print(f"\n🎯 Optimal Threshold: {optimal_threshold:.3f}")
    print(f"Optimal Cost: {threshold_results['optimal_cost']:.2f}")

    fig = plot_threshold_analysis(threshold_results)
    plt.show()

    # Hard predictions at the tuned threshold
    class_names = ['Normal', 'Fraud']
    y_pred_optimal = (y_proba >= optimal_threshold).astype(int)

    print("\n📊 Performance at Optimal Threshold:")
    print(classification_report(y_test, y_pred_optimal, target_names=class_names))

    cm_optimal = confusion_matrix(y_test, y_pred_optimal)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm_optimal,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set(
        xlabel='Predicted',
        ylabel='Actual',
        title=f'Confusion Matrix (Threshold = {optimal_threshold:.3f})',
    )
    plt.show()

## 7. Feature Importance and SHAP Analysis

In [None]:
# SHAP explanations on a random sample of the test split, with a fallback to
# the model's built-in feature importances when explain_shap returns nothing.
if 'xgb_model' in locals():
    print("Generating SHAP explanations...")

    # Get sample for SHAP analysis.
    # - Cap the sample at the number of available rows: sampling without
    #   replacement raises ValueError if more rows are requested than exist.
    # - Use .shape[0] instead of len() so sparse matrices are supported too.
    # - Use a dedicated seeded generator so re-running this cell yields the
    #   same sample regardless of how much global RNG state was consumed.
    n_test_rows = X_test.shape[0]
    sample_size = min(1000, n_test_rows)
    rng = np.random.default_rng(RANDOM_STATE)
    sample_indices = rng.choice(n_test_rows, sample_size, replace=False)
    # DataFrames need positional row selection; plain [] would select columns.
    if hasattr(X_test, 'iloc'):
        X_sample = X_test.iloc[sample_indices]
    else:
        X_sample = X_test[sample_indices]

    # Generate SHAP explanations
    shap_results = explain_shap(xgb_model, X_sample, FEATURE_COLUMNS)

    if shap_results:
        print("✅ SHAP analysis completed!")

        has_importance = 'feature_importance' in shap_results

        # Plot feature importance from SHAP
        if has_importance:
            fig = plot_feature_importance(
                shap_results['feature_importance'],
                title="SHAP Feature Importance",
                top_n=15
            )
            plt.show()

        print("\n🔍 Top 10 Most Important Features (SHAP):")
        if has_importance:
            # Rank by magnitude: the values are sorted on their absolute value
            sorted_features = sorted(
                shap_results['feature_importance'].items(),
                key=lambda x: abs(x[1]),
                reverse=True
            )
            for i, (feature, importance) in enumerate(sorted_features[:10], 1):
                print(f"  {i:2d}. {feature}: {importance:.4f}")
    else:
        print("⚠️ SHAP analysis failed. This might be due to missing SHAP library.")
        print("Install with: pip install shap")

        # Fallback to XGBoost feature importance
        print("\nUsing XGBoost built-in feature importance instead:")
        feature_importance = dict(zip(FEATURE_COLUMNS, xgb_model.feature_importances_))
        sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

        fig = plot_feature_importance(feature_importance, title="XGBoost Feature Importance")
        plt.show()

        print("\n🔍 Top 10 Most Important Features (XGBoost):")
        for i, (feature, importance) in enumerate(sorted_features[:10], 1):
            print(f"  {i:2d}. {feature}: {importance:.4f}")
else:
    print("XGBoost model not available. Please train the model first.")

## 8. Business Impact Analysis

In [None]:
def calculate_costs(cm, cost_fp=1, cost_fn=5):
    """Unpack a 2x2 confusion matrix and compute the total misclassification cost.

    Args:
        cm: 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]]; it must
            expose ``ravel()`` yielding exactly four values.
        cost_fp: Cost charged per false positive.
        cost_fn: Cost charged per false negative.

    Returns:
        Tuple ``(tn, fp, fn, tp, total_cost)`` where
        ``total_cost = fp * cost_fp + fn * cost_fn``.
    """
    tn, fp, fn, tp = cm.ravel()
    total_cost = fp * cost_fp + fn * cost_fn
    return tn, fp, fn, tp, total_cost


def format_rate(numerator, denominator):
    """Format ``numerator / denominator`` as a percentage with one decimal.

    Returns the string "n/a" when the denominator is zero instead of dividing
    by zero (which would otherwise print nan/inf or raise).
    """
    if denominator == 0:
        return "n/a"
    return f"{numerator / denominator:.1%}"


if 'xgb_model' in locals() and 'optimal_threshold' in locals():
    print("💼 Business Impact Analysis")
    print("=" * 40)

    # Recompute the scores here so this cell does not silently depend on a
    # y_proba variable left over from the threshold-tuning cell.
    y_proba = xgb_model.predict_proba(X_test)[:, 1]

    # Calculate business metrics
    y_pred_default = xgb_model.predict(X_test)  # Default threshold (0.5)
    y_pred_optimal = (y_proba >= optimal_threshold).astype(int)

    # Confusion matrices; labels are pinned so the result is always 2x2 and the
    # four-way unpack in calculate_costs cannot fail on a degenerate split.
    cm_default = confusion_matrix(y_test, y_pred_default, labels=[0, 1])
    cm_optimal = confusion_matrix(y_test, y_pred_optimal, labels=[0, 1])

    # Costs use the helper's defaults (cost_fp=1, cost_fn=5), the same weights
    # that were passed to threshold_tuning in the threshold-tuning cell.
    # Default threshold analysis
    tn_def, fp_def, fn_def, tp_def, cost_def = calculate_costs(cm_default)

    # Optimal threshold analysis
    tn_opt, fp_opt, fn_opt, tp_opt, cost_opt = calculate_costs(cm_optimal)

    print(f"📊 Threshold Comparison:")
    print(f"\nDefault Threshold (0.5):")
    print(f"  • True Positives (Caught Fraud): {tp_def}")
    print(f"  • False Positives (False Alarms): {fp_def}")
    print(f"  • False Negatives (Missed Fraud): {fn_def}")
    print(f"  • Total Cost: {cost_def}")
    print(f"  • Fraud Detection Rate: {format_rate(tp_def, tp_def + fn_def)}")

    print(f"\nOptimal Threshold ({optimal_threshold:.3f}):")
    print(f"  • True Positives (Caught Fraud): {tp_opt}")
    print(f"  • False Positives (False Alarms): {fp_opt}")
    print(f"  • False Negatives (Missed Fraud): {fn_opt}")
    print(f"  • Total Cost: {cost_opt}")
    print(f"  • Fraud Detection Rate: {format_rate(tp_opt, tp_opt + fn_opt)}")

    print(f"\n💰 Cost Savings: {cost_def - cost_opt} units ({format_rate(cost_def - cost_opt, cost_def)} reduction)")

    # Additional insights. `df` is the full dataset returned by load_data()
    # (before the train/test split), so the heading says so.
    fraud_amounts = df.loc[df['Class'] == 1, 'Amount']
    fraud_amount = fraud_amounts.sum()
    avg_fraud_amount = fraud_amounts.mean()

    print(f"\n💳 Financial Impact (from full dataset):")
    print(f"  • Total fraud amount: ${fraud_amount:,.2f}")
    print(f"  • Average fraud amount: ${avg_fraud_amount:.2f}")
    # Rough estimate: test-set counts multiplied by the dataset-wide average amount
    print(f"  • Potential fraud caught (optimal): ${tp_opt * avg_fraud_amount:,.2f}")
    print(f"  • Potential fraud missed (optimal): ${fn_opt * avg_fraud_amount:,.2f}")
else:
    print("Models and threshold analysis not available. Please run previous cells.")

## 9. Summary and Recommendations

In [None]:
# Final textual summary; requires the XGBoost metrics from the training cell.
if 'xgb_metrics' not in locals():
    print("Please run the modeling experiments first to see the summary.")
else:
    divider = "=" * 50

    print("🎯 MODELING SUMMARY & RECOMMENDATIONS")
    print(divider)

    print("\n📈 Best Model Performance:")
    print("  • Model: XGBoost with SMOTE")
    print(f"  • ROC-AUC: {xgb_metrics['roc_auc']:.4f}")
    print(f"  • PR-AUC: {xgb_metrics['pr_auc']:.4f}")
    print(f"  • Recall: {xgb_metrics['recall']:.4f} (fraud detection rate)")

    # The threshold line only appears when the tuning cell has been run.
    if 'optimal_threshold' in locals():
        print(f"  • Optimal Threshold: {optimal_threshold:.3f}")

    # Static sections: (heading, bullet points), printed in this order.
    sections = (
        ("🔧 Key Modeling Decisions", (
            "SMOTE oversampling to handle class imbalance",
            "XGBoost for complex pattern detection",
            "RandomizedSearchCV for hyperparameter tuning",
            "Cost-based threshold optimization",
            "Focus on recall to minimize missed fraud",
        )),
        ("💡 Business Recommendations", (
            "Deploy XGBoost model with optimal threshold",
            "Monitor model performance regularly",
            "Implement real-time scoring for transactions",
            "Use SHAP explanations for model interpretability",
            "Consider ensemble methods for further improvement",
        )),
        ("🚀 Next Steps", (
            "A/B testing with current fraud detection system",
            "Continuous model retraining with new data",
            "Feature engineering based on domain expertise",
            "Integration with transaction processing system",
            "Regular model validation and monitoring",
        )),
    )
    for heading, bullets in sections:
        print(f"\n{heading}:")
        for bullet in bullets:
            print(f"  • {bullet}")

    print("\n" + divider)
    print("✅ Modeling experiments completed successfully!")