# Fraud Detection Analysis Notebook

This notebook provides an interactive environment for exploring the fraud detection dataset and analyzing model results.

## Table of Contents
1. [Setup and Data Loading](#setup)
2. [Exploratory Data Analysis](#eda)
3. [Feature Analysis](#features)
4. [Model Performance Analysis](#models)
5. [Business Impact Analysis](#business)
6. [Advanced Analysis](#advanced)

## 1. Setup and Data Loading <a id="setup"></a>

In [None]:
# Import required libraries
import sys
import os
# Extend the import search path so project code can be loaded from the notebook.
# NOTE(review): the custom imports below use the `src.` package prefix, which
# normally requires the *parent* of `src` on sys.path — confirm that '../src'
# is the intended entry (the result depends on the kernel's working directory).
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import custom modules
# NOTE(review): none of these four classes is referenced in the cells visible
# in this notebook — verify they are still needed before keeping the imports.
from src.data_generation.synthetic_data_generator import FraudDataGenerator
from src.preprocessing.data_processor import FraudDataProcessor
from src.modeling.model_trainer import FraudModelTrainer
from src.evaluation.model_evaluator import FraudModelEvaluator

import warnings
# Silences every warning category for the rest of the session, including
# deprecation and numerical warnings raised by later cells.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Configuration: directories (relative to the notebook) used by later cells.
DATA_PATH = '../data/raw/'
PROCESSED_PATH = '../data/processed/'
MODELS_PATH = '../data/models/'

# Find the most recent dataset
import glob

# os.path.join tolerates DATA_PATH with or without a trailing separator.
csv_files = glob.glob(os.path.join(DATA_PATH, '*.csv'))
if csv_files:
    # Rank by modification time. getctime is platform-dependent (metadata-change
    # time on Unix, creation time on Windows), so it does not reliably identify
    # the most recently written dataset.
    latest_file = max(csv_files, key=os.path.getmtime)
    print(f"Loading dataset: {latest_file}")
    df = pd.read_csv(latest_file)
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
else:
    # Later cells check `df is not None` before doing any work.
    print("No dataset found. Please run the data generation pipeline first.")
    df = None

## 2. Exploratory Data Analysis <a id="eda"></a>

In [None]:
if df is not None:
    # Basic statistics: size, time span and cardinality of the key columns.
    print("Dataset Overview")
    print("="*50)
    print(f"Total transactions: {len(df):,}")
    print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"Unique users: {df['user_id'].nunique():,}")
    print(f"Unique merchants: {df['merchant_category'].nunique()}")
    print(f"Countries: {df['country'].nunique()}")

    print("\nFraud Distribution:")
    # Count each class by its value instead of relying on value_counts() order:
    # value_counts() sorts by frequency, so zipping it with fixed labels swaps
    # the labels whenever fraud is the majority class and drops a row when one
    # class is absent.
    fraud_counts = pd.Series({
        'Legitimate': int((df['is_fraud'] == 0).sum()),
        'Fraudulent': int((df['is_fraud'] == 1).sum()),
    })
    labelled_total = int(fraud_counts.sum())
    # Percentages are relative to labelled rows only, as normalize=True was.
    if labelled_total > 0:
        fraud_pct = fraud_counts / labelled_total * 100
    else:
        fraud_pct = fraud_counts.astype(float)

    for label in fraud_counts.index:
        print(f"  {label}: {fraud_counts[label]:,} ({fraud_pct[label]:.2f}%)")

In [None]:
if df is not None:
    # 2x2 overview: amount distributions (raw and log), hourly fraud rate,
    # and fraud rate per merchant category.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Split amounts once; legitimate is drawn first so colours match the legend order.
    amounts_by_class = [
        ('Legitimate', df.loc[df['is_fraud'] == 0, 'amount']),
        ('Fraudulent', df.loc[df['is_fraud'] == 1, 'amount']),
    ]

    # Amount distribution
    for label, amounts in amounts_by_class:
        axes[0,0].hist(amounts, bins=50, alpha=0.7, label=label, density=True)
    axes[0,0].set_xlabel('Transaction Amount')
    axes[0,0].set_ylabel('Density')
    axes[0,0].set_title('Transaction Amount Distribution')
    axes[0,0].legend()
    # Clip the x-axis at the 95th percentile so the long tail does not flatten the plot.
    axes[0,0].set_xlim(0, df['amount'].quantile(0.95))

    # Log amount distribution
    for label, amounts in amounts_by_class:
        axes[0,1].hist(np.log1p(amounts), bins=50, alpha=0.7, label=label, density=True)
    axes[0,1].set_xlabel('Log(Transaction Amount + 1)')
    axes[0,1].set_ylabel('Density')
    axes[0,1].set_title('Log Transaction Amount Distribution')
    axes[0,1].legend()

    # Transaction timing
    # The 'hour' column is still written to df, as before, in case other cells read it.
    df['hour'] = pd.to_datetime(df['timestamp']).dt.hour
    # The mean of the 0/1 label is the fraud share per hour. Unlike selecting
    # column 1 of an unstacked count table, this cannot raise KeyError when the
    # dataset contains no fraud rows.
    hourly_fraud_rate = df.groupby('hour')['is_fraud'].mean()

    axes[1,0].plot(hourly_fraud_rate.index, hourly_fraud_rate.values, marker='o', color='red', linewidth=2)
    axes[1,0].set_xlabel('Hour of Day')
    axes[1,0].set_ylabel('Fraud Rate')
    axes[1,0].set_title('Fraud Rate by Hour of Day')
    axes[1,0].grid(True, alpha=0.3)

    # Merchant category fraud rate
    merchant_fraud = df.groupby('merchant_category')['is_fraud'].agg(['count', 'sum'])
    merchant_fraud['fraud_rate'] = merchant_fraud['sum'] / merchant_fraud['count']
    merchant_fraud = merchant_fraud.sort_values('fraud_rate', ascending=True)

    bar_positions = range(len(merchant_fraud))
    axes[1,1].barh(bar_positions, merchant_fraud['fraud_rate'], color='coral')
    axes[1,1].set_yticks(bar_positions)
    axes[1,1].set_yticklabels(merchant_fraud.index, fontsize=8)
    axes[1,1].set_xlabel('Fraud Rate')
    axes[1,1].set_title('Fraud Rate by Merchant Category')
    axes[1,1].grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

## 3. Feature Analysis <a id="features"></a>

In [None]:
if df is not None:
    # Create interactive geographic visualization
    if 'latitude' in df.columns and 'longitude' in df.columns:
        # Sample data for better performance (fixed seed keeps the map reproducible)
        sample_size = min(5000, len(df))
        df_sample = df.sample(n=sample_size, random_state=42)

        # Plotly Express treats a numeric colour column as continuous and then
        # ignores color_discrete_map. Mapping the 0/1 label to strings makes the
        # colouring categorical so the blue/red mapping is actually applied.
        df_sample = df_sample.assign(
            transaction_type=df_sample['is_fraud'].map({0: 'Legitimate', 1: 'Fraudulent'})
        )

        fig = px.scatter_mapbox(
            df_sample,
            lat='latitude',
            lon='longitude',
            color='transaction_type',
            color_discrete_map={'Legitimate': 'blue', 'Fraudulent': 'red'},
            hover_data=['amount', 'merchant_category', 'country'],
            mapbox_style='open-street-map',
            title='Geographic Distribution of Transactions',
            height=600
        )

        # Initial viewport: fixed centre point at a low zoom level.
        fig.update_layout(
            mapbox=dict(
                center=dict(lat=40, lon=-95),
                zoom=2
            )
        )

        fig.show()

In [None]:
if df is not None:
    # Per-user profile: activity volume, spend statistics, fraud count and
    # number of distinct merchant categories.
    user_stats = (
        df.groupby('user_id')
        .agg(
            txn_count=('amount', 'count'),
            total_amount=('amount', 'sum'),
            avg_amount=('amount', 'mean'),
            std_amount=('amount', 'std'),
            fraud_count=('is_fraud', 'sum'),
            unique_merchants=('merchant_category', 'nunique'),
        )
        .reset_index()
    )
    user_stats['fraud_rate'] = user_stats['fraud_count'] / user_stats['txn_count']

    # One figure, four panels describing user behaviour.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    ax_counts, ax_amount = axes[0]
    ax_merchants, ax_box = axes[1]

    # Panel 1: how many transactions each user makes, with the mean marked.
    mean_txn_count = user_stats['txn_count'].mean()
    ax_counts.hist(user_stats['txn_count'], bins=50, edgecolor='black', alpha=0.7)
    ax_counts.axvline(mean_txn_count, color='red', linestyle='--', label=f'Mean: {mean_txn_count:.1f}')
    ax_counts.set(
        xlabel='Transactions per User',
        ylabel='Number of Users',
        title='Distribution of Transactions per User',
    )
    ax_counts.legend()

    # Panel 2: average spend against fraud rate, coloured by activity level.
    scatter = ax_amount.scatter(
        user_stats['avg_amount'],
        user_stats['fraud_rate'],
        alpha=0.6,
        c=user_stats['txn_count'],
        cmap='viridis',
    )
    ax_amount.set(
        xlabel='Average Transaction Amount',
        ylabel='User Fraud Rate',
        title='Average Amount vs Fraud Rate by User',
    )
    plt.colorbar(scatter, ax=ax_amount, label='Transaction Count')

    # Panel 3: merchant-category breadth against fraud rate.
    ax_merchants.scatter(user_stats['unique_merchants'], user_stats['fraud_rate'], alpha=0.6)
    ax_merchants.set(
        xlabel='Number of Unique Merchants',
        ylabel='User Fraud Rate',
        title='Merchant Diversity vs Fraud Rate',
    )

    # Panel 4: average spend of users with at least one fraud vs users with none.
    has_fraud = user_stats['fraud_count'] > 0
    fraud_users = user_stats[has_fraud]
    normal_users = user_stats[user_stats['fraud_count'] == 0]

    ax_box.boxplot(
        [normal_users['avg_amount'], fraud_users['avg_amount']],
        labels=['Normal Users', 'Users with Fraud'],
    )
    ax_box.set(
        ylabel='Average Transaction Amount',
        title='Average Transaction Amount: Normal vs Fraud Users',
    )

    plt.tight_layout()
    plt.show()

    # Headline numbers for the user population.
    fraud_user_share = len(fraud_users) / len(user_stats) * 100
    print(f"Users with fraud: {len(fraud_users):,} ({fraud_user_share:.1f}%)")
    print(f"Average transactions per user: {mean_txn_count:.1f}")
    print(f"Average amount per user: ${user_stats['avg_amount'].mean():.2f}")

## 4. Model Performance Analysis <a id="models"></a>

In [None]:
# Load model evaluation results if available.
# Every early exit prints a reason, so a missing or incomplete training run is
# visible instead of the cell finishing with no output.
import json

model_dirs = glob.glob(f"{MODELS_PATH}trained_models_*")
if not model_dirs:
    print("No trained models found. Please run the model training pipeline first.")
else:
    # Modification time is used because getctime means different things on
    # Unix (metadata change) and Windows (creation).
    latest_model_dir = max(model_dirs, key=os.path.getmtime)
    print(f"Loading models from: {latest_model_dir}")

    # Check for metadata file
    metadata_file = os.path.join(latest_model_dir, 'model_metadata.json')
    if not os.path.exists(metadata_file):
        print(f"No model_metadata.json found in {latest_model_dir}; skipping performance analysis.")
    else:
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        # .get avoids a KeyError when the metadata file has no 'models' entry.
        print(f"Available models: {metadata.get('models', 'not listed')}")

        metrics = ['roc_auc', 'f1_score', 'precision', 'recall', 'fraud_detection_rate']
        colors = ['blue', 'green', 'red', 'orange', 'purple']

        # Extract evaluation results if available
        eval_results = metadata.get('evaluation_results')
        if not eval_results:
            print("No evaluation results recorded in the model metadata.")
        else:
            # Keep only results without an 'error' entry; each row holds the
            # model name followed by the metrics in the order listed above.
            performance_data = [
                {'model': result['model_name'], **{metric: result[metric] for metric in metrics}}
                for result in eval_results
                if 'error' not in result
            ]

            if not performance_data:
                print("All evaluation results contain errors; nothing to compare.")
            else:
                perf_df = pd.DataFrame(performance_data)

                # Interactive performance comparison: one bar trace per metric,
                # only the first visible until a button selects another.
                fig = go.Figure()
                for i, (metric, color) in enumerate(zip(metrics, colors)):
                    fig.add_trace(go.Bar(
                        name=metric.replace('_', ' ').title(),
                        x=perf_df['model'],
                        y=perf_df[metric],
                        marker_color=color,
                        visible=(i == 0)
                    ))

                # Add buttons for metric selection: each shows exactly one trace.
                buttons = [
                    dict(
                        label=metric.replace('_', ' ').title(),
                        method='update',
                        args=[{'visible': [j == i for j in range(len(metrics))]}]
                    )
                    for i, metric in enumerate(metrics)
                ]

                fig.update_layout(
                    title='Model Performance Comparison',
                    xaxis_title='Model',
                    yaxis_title='Score',
                    updatemenus=[dict(
                        type='buttons',
                        direction='left',
                        x=0.7,
                        y=1.02,
                        showactive=True,
                        buttons=buttons
                    )]
                )

                fig.show()

                # Display performance table
                print("\nModel Performance Summary:")
                display(perf_df.round(3))

                # Best model recommendation
                best_auc = perf_df.loc[perf_df['roc_auc'].idxmax()]
                best_f1 = perf_df.loc[perf_df['f1_score'].idxmax()]

                print(f"\nBest ROC AUC: {best_auc['model']} ({best_auc['roc_auc']:.3f})")
                print(f"Best F1 Score: {best_f1['model']} ({best_f1['f1_score']:.3f})")

## 5. Business Impact Analysis <a id="business"></a>

In [None]:
# Business impact simulation
def calculate_business_impact(df, fraud_detection_rate, false_positive_rate, 
                            avg_fraud_amount=500, investigation_cost=50):
    """
    Calculate business impact of fraud detection system.

    Args:
        df: DataFrame with an 'is_fraud' column whose sum is the number of
            fraudulent transactions.
        fraud_detection_rate: Share of fraudulent transactions the model flags.
        false_positive_rate: Share of legitimate transactions the model flags.
        avg_fraud_amount: Loss attributed to each fraudulent transaction.
        investigation_cost: Cost of reviewing one flagged transaction.

    Returns:
        dict with the transaction counts, the detected / missed / false-positive
        counts, prevented and missed loss, investigation costs, net benefit
        (prevented loss minus investigation costs) and savings rate (net benefit
        divided by total potential fraud loss, 0 when there is no fraud).
    """
    total_transactions = len(df)
    total_fraud = int(df['is_fraud'].sum())
    total_legit = total_transactions - total_fraud

    # Model performance.
    # Round instead of truncating: float products such as 100 * 0.57 evaluate to
    # 56.99999999999999, so int() alone silently loses one detection.
    fraud_detected = int(round(float(total_fraud * fraud_detection_rate)))
    fraud_missed = total_fraud - fraud_detected
    false_positives = int(round(float(total_legit * false_positive_rate)))

    # Financial impact: every flagged transaction (true or false alarm) is investigated.
    prevented_loss = fraud_detected * avg_fraud_amount
    missed_loss = fraud_missed * avg_fraud_amount
    investigation_costs = (fraud_detected + false_positives) * investigation_cost

    net_benefit = prevented_loss - investigation_costs
    total_potential_loss = total_fraud * avg_fraud_amount
    savings_rate = net_benefit / total_potential_loss if total_potential_loss > 0 else 0

    return {
        'total_transactions': total_transactions,
        'total_fraud': total_fraud,
        'fraud_detected': fraud_detected,
        'fraud_missed': fraud_missed,
        'false_positives': false_positives,
        'prevented_loss': prevented_loss,
        'missed_loss': missed_loss,
        'investigation_costs': investigation_costs,
        'net_benefit': net_benefit,
        'savings_rate': savings_rate
    }

# Run only when the model comparison table exists. The table (perf_df) is what
# the loop below reads, so test for it directly: the intermediate
# performance_data list can exist while perf_df was never created.
if df is not None and globals().get('perf_df') is not None:
    print("Business Impact Analysis")
    print("="*50)

    # Class sizes of the dataset the impact is projected onto.
    total_fraud = int(df['is_fraud'].sum())
    total_legit = len(df) - total_fraud

    # Calculate business impact for each model
    business_results = []

    for _, model in perf_df.iterrows():
        precision = model['precision']
        if precision > 0 and total_legit > 0:
            # Derive the false positive rate from precision and recall:
            #   TP = fraud * detection_rate,  FP = TP * (1 - precision) / precision,
            #   FPR = FP / legitimate.
            # (1 - precision) alone is the false *discovery* rate; applying it to
            # every legitimate transaction overstates false positives massively
            # on imbalanced data.
            # NOTE(review): assumes the reported precision was measured at a
            # class balance similar to this dataset's — confirm.
            expected_tp = total_fraud * model['fraud_detection_rate']
            expected_fp = expected_tp * (1 - precision) / precision
            false_positive_rate = min(expected_fp / total_legit, 1.0)
        else:
            # Precision of zero gives no usable estimate; keep the 10% fallback.
            false_positive_rate = 0.1

        impact = calculate_business_impact(
            df,
            model['fraud_detection_rate'],
            false_positive_rate
        )
        impact['model'] = model['model']
        business_results.append(impact)

    # Create business impact DataFrame
    business_df = pd.DataFrame(business_results)

    # Visualize business impact
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Net benefit comparison
    axes[0,0].bar(business_df['model'], business_df['net_benefit'], color='green', alpha=0.7)
    axes[0,0].set_xlabel('Model')
    axes[0,0].set_ylabel('Net Benefit ($)')
    axes[0,0].set_title('Net Financial Benefit by Model')
    axes[0,0].tick_params(axis='x', rotation=45)

    # Prevented vs missed loss (grouped bars, one pair per model)
    x = np.arange(len(business_df))
    width = 0.35

    axes[0,1].bar(x - width/2, business_df['prevented_loss'], width, label='Prevented Loss', color='green', alpha=0.7)
    axes[0,1].bar(x + width/2, business_df['missed_loss'], width, label='Missed Loss', color='red', alpha=0.7)
    axes[0,1].set_xlabel('Model')
    axes[0,1].set_ylabel('Loss ($)')
    axes[0,1].set_title('Prevented vs Missed Fraud Loss')
    axes[0,1].set_xticks(x)
    axes[0,1].set_xticklabels(business_df['model'], rotation=45)
    axes[0,1].legend()

    # Fraud detection vs false positives, coloured by net benefit
    axes[1,0].scatter(business_df['false_positives'], business_df['fraud_detected'], 
                     s=100, alpha=0.7, c=business_df['net_benefit'], cmap='RdYlGn')
    axes[1,0].set_xlabel('False Positives')
    axes[1,0].set_ylabel('Fraud Detected')
    axes[1,0].set_title('Fraud Detection vs False Positives')

    for i, model in enumerate(business_df['model']):
        axes[1,0].annotate(model, (business_df['false_positives'].iloc[i], 
                                  business_df['fraud_detected'].iloc[i]),
                          xytext=(5, 5), textcoords='offset points', fontsize=8)

    # Savings rate, with a zero line separating net-positive from net-negative models
    axes[1,1].bar(business_df['model'], business_df['savings_rate'], color='blue', alpha=0.7)
    axes[1,1].set_xlabel('Model')
    axes[1,1].set_ylabel('Savings Rate')
    axes[1,1].set_title('Savings Rate by Model')
    axes[1,1].tick_params(axis='x', rotation=45)
    axes[1,1].axhline(y=0, color='red', linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()

    # Display business impact summary
    print("\nBusiness Impact Summary:")
    display_cols = ['model', 'fraud_detected', 'false_positives', 'prevented_loss', 'missed_loss', 'net_benefit', 'savings_rate']
    display_business = business_df[display_cols].copy()

    # Format currency columns (display copy only; business_df stays numeric)
    for col in ['prevented_loss', 'missed_loss', 'net_benefit']:
        display_business[col] = display_business[col].apply(lambda x: f"${x:,.0f}")

    display_business['savings_rate'] = display_business['savings_rate'].apply(lambda x: f"{x:.1%}")

    display(display_business)

    # Best model for business
    best_business_model = business_df.loc[business_df['net_benefit'].idxmax()]
    print(f"\nBest model for business: {best_business_model['model']}")
    print(f"Net benefit: ${best_business_model['net_benefit']:,.0f}")
    print(f"Savings rate: {best_business_model['savings_rate']:.1%}")
else:
    print("Business impact analysis requires model performance data.")
    print("Please run the model training and evaluation pipeline first.")

## 6. Advanced Analysis <a id="advanced"></a>

In [None]:
# Threshold analysis
def threshold_analysis(y_true, y_scores, thresholds=None):
    """
    Analyze model performance across different thresholds.

    Args:
        y_true: Binary ground-truth labels (1 = fraud, 0 = legitimate); any
            array-like, including plain lists.
        y_scores: Model scores, same length as y_true; a transaction is flagged
            when its score is >= the threshold.
        thresholds: Iterable of cut-offs to evaluate. Defaults to
            np.arange(0.01, 1.0, 0.01).

    Returns:
        DataFrame with one row per threshold and the columns 'threshold',
        'precision', 'recall', 'f1_score', 'fraud_detection_rate' and
        'false_positive_rate'. A metric with a zero denominator is reported
        as 0, as the previous zero_division=0 setting did.
    """
    if thresholds is None:
        thresholds = np.arange(0.01, 1.0, 0.01)

    # Coerce to arrays: with plain lists, `y_true == 1` is a single False and
    # `y_scores >= t` raises, so list inputs used to give wrong results or errors.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    # Class masks do not depend on the threshold, so build them once.
    is_fraud = y_true == 1
    is_legit = y_true == 0

    results = []

    for threshold in thresholds:
        flagged = y_scores >= threshold

        # Confusion counts
        tp = int(np.sum(flagged & is_fraud))
        fp = int(np.sum(flagged & is_legit))
        fn = int(np.sum(~flagged & is_fraud))
        tn = int(np.sum(~flagged & is_legit))

        # All metrics come from the same counts, so they stay mutually consistent.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        # F1 as 2TP / (2TP + FP + FN), the harmonic mean of precision and recall.
        f1_denominator = 2 * tp + fp + fn
        f1 = 2 * tp / f1_denominator if f1_denominator > 0 else 0.0

        # Business metrics (fraud detection rate is recall under another name)
        fraud_detection_rate = recall
        false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0

        results.append({
            'threshold': threshold,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'fraud_detection_rate': fraud_detection_rate,
            'false_positive_rate': false_positive_rate
        })

    return pd.DataFrame(results)

# Placeholder output: describes what a threshold study would cover once real
# model predictions are available.
threshold_framework_lines = [
    "Threshold Analysis Framework",
    "="*50,
    "This section would analyze optimal thresholds for each model based on:",
    "- Business objectives (maximize savings vs minimize false positives)",
    "- Operational constraints (investigation capacity)",
    "- Risk tolerance (acceptable false positive rate)",
    "\nTo run threshold analysis, load model predictions and use the threshold_analysis() function.",
]
print("\n".join(threshold_framework_lines))

In [None]:
# Static placeholder: lists candidate features with a short description each.
# Nothing here is computed from a trained model.
print("Feature Importance Analysis", "="*50, sep="\n")
print("Key features for fraud detection (based on domain knowledge):")

important_features = [
    ("amount_deviation_from_user_avg", "Deviation from user's normal spending pattern"),
    ("travel_velocity_kmh", "Speed between consecutive transactions"),
    ("user_amount_sum_1h", "Total amount spent in last hour"),
    ("is_night", "Transaction during night hours (11PM-6AM)"),
    ("amount_vs_user_max", "Current amount vs user's historical maximum"),
    ("user_txn_count_1h", "Number of transactions in last hour"),
    ("distance_from_prev_km", "Distance from previous transaction"),
    ("is_unusual_category", "Merchant category unusual for user"),
]

# Numbered list, starting at 1.
rank = 0
for feature_name, feature_meaning in important_features:
    rank += 1
    print(f"{rank}. {feature_name}: {feature_meaning}")

print("\nTo see actual feature importance, run the complete pipeline with model training.")

In [None]:
# Closing cell: banner, dataset headline figures (when a dataset is loaded),
# then fixed insight / recommendation / next-step text.
summary_rule = "="*60
print("\n" + summary_rule)
print("FRAUD DETECTION ANALYSIS SUMMARY")
print(summary_rule)

if df is not None:
    txn_total = len(df)
    user_total = df['user_id'].nunique()
    fraud_share_pct = df['is_fraud'].mean()*100
    mean_amount = df['amount'].mean()
    first_seen, last_seen = df['timestamp'].min(), df['timestamp'].max()

    print(f"Dataset: {txn_total:,} transactions from {user_total:,} users")
    print(f"Fraud rate: {fraud_share_pct:.2f}%")
    print(f"Average transaction: ${mean_amount:.2f}")
    print(f"Date range: {first_seen} to {last_seen}")

# Static narrative, emitted line by line in the original order.
closing_lines = [
    "\nKey Insights:",
    "1. Fraud patterns are clearly distinguishable from legitimate transactions",
    "2. Time-based features (hour, velocity) are critical for detection",
    "3. User behavioral deviation is the strongest fraud indicator",
    "4. Geographic analysis helps catch impossible travel scenarios",
    "5. Ensemble methods typically provide best overall performance",
    "\nRecommendations:",
    "1. Use XGBoost or ensemble model for best performance",
    "2. Optimize threshold based on business objectives",
    "3. Monitor model performance and retrain regularly",
    "4. Implement real-time feature computation for production",
    "5. Consider model interpretability for regulatory compliance",
    "\nNext Steps:",
    "1. Deploy selected model to production environment",
    "2. Set up monitoring and alerting systems",
    "3. Implement feedback loop for continuous learning",
    "4. Prepare model documentation for stakeholders",
    "5. Plan for regular model updates and maintenance",
]
for closing_line in closing_lines:
    print(closing_line)