# In [1]:
# =============================================================================
# FRAUD DETECTION ANALYSIS - STREAMLINED VERSION
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Optional: Plotly for interactive visualizations
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    PLOTLY_AVAILABLE = True
except ImportError:
    PLOTLY_AVAILABLE = False

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, RocCurveDisplay, PrecisionRecallDisplay)
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Set style: apply the matplotlib "seaborn-v0_8-darkgrid" style sheet and the
# seaborn "husl" colour palette to every figure created below, and lift pandas'
# limit on displayed columns so wide DataFrames print in full.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)

# =============================================================================
# 1. DATA LOADING & INITIAL ASSESSMENT
# =============================================================================

def load_and_assess_data(train_path, test_path):
    """Read the train and test CSV files and print a short quality report.

    The report covers both frames' shapes, the mean of the train
    ``FraudResult`` column (printed as a percentage), any train columns
    containing missing values, and a count of train columns per dtype.

    Args:
        train_path: Path (or anything ``pd.read_csv`` accepts) of the
            training CSV; it must contain a ``FraudResult`` column.
        test_path: Path of the test CSV.

    Returns:
        Tuple ``(train, test)`` of the two loaded DataFrames.
    """
    banner = "=" * 60
    print(banner)
    print("DATA LOADING & QUALITY ASSESSMENT")
    print(banner)

    # Train is read first, then test.
    train, test = (pd.read_csv(path) for path in (train_path, test_path))

    # Headline numbers
    print(f"\nTrain shape: {train.shape}, Test shape: {test.shape}")
    fraud_rate = train['FraudResult'].mean()
    print(f"Fraud rate: {fraud_rate:.2%}")

    # Only mention missing values when at least one column has some.
    null_counts = train.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    if not columns_with_nulls.empty:
        print(f"\nMissing values in train:\n{columns_with_nulls}")

    # Number of columns per dtype
    dtype_summary = train.dtypes.value_counts().to_dict()
    print(f"\nData types: {dtype_summary}")

    return train, test

# =============================================================================
# 2. COMPREHENSIVE EDA (CONSOLIDATED)
# =============================================================================

def perform_eda(train, create_interactive=False):
    """Plot a seven-panel EDA dashboard and print headline fraud statistics.

    Panels: class distribution (donut), fraud rate per hour, fraud rate per
    day, Amount histograms per class, Amount-vs-Value scatter, correlation
    heatmap of the numeric columns, and each numeric column's correlation
    with ``FraudResult``.

    Args:
        train: DataFrame with at least ``TransactionStartTime``,
            ``FraudResult`` (0 = legitimate, 1 = fraud), ``Amount`` and
            ``Value`` columns. The caller's frame is not modified.
        create_interactive: When True and Plotly was imported successfully,
            additionally render the interactive Plotly dashboard.

    Returns:
        A copy of ``train`` with ``TransactionStartTime`` parsed to datetime
        and ``Hour``, ``DayOfWeek`` (day name) and ``Date`` columns added.
    """
    print("\n" + "="*60)
    print("EXPLORATORY DATA ANALYSIS")
    print("="*60)

    # Work on a copy so the helper columns do not leak into the caller's frame
    # (same convention as engineer_features).
    train = train.copy()

    # Parse datetime
    train['TransactionStartTime'] = pd.to_datetime(train['TransactionStartTime'])
    train['Hour'] = train['TransactionStartTime'].dt.hour
    train['DayOfWeek'] = train['TransactionStartTime'].dt.day_name()
    train['Date'] = train['TransactionStartTime'].dt.date

    # Create comprehensive dashboard
    fig = plt.figure(figsize=(20, 12))
    gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)

    # 1. Fraud distribution
    # value_counts() orders by frequency and omits absent classes, so pin the
    # order to [0, 1]; otherwise the fixed labels/colours could be attached to
    # the wrong wedge (or mismatch in length when a class is missing).
    ax1 = fig.add_subplot(gs[0, 0])
    fraud_counts = train['FraudResult'].value_counts().reindex([0, 1], fill_value=0)
    ax1.pie(fraud_counts.values, labels=['Legitimate', 'Fraud'],
            autopct='%1.1f%%', colors=['#2E86C1', '#E74C3C'],
            wedgeprops={'width': 0.4})
    ax1.set_title('Fraud Distribution', fontsize=12, fontweight='bold')

    # 2. Hourly fraud rate
    ax2 = fig.add_subplot(gs[0, 1])
    hourly_fraud = train.groupby('Hour')['FraudResult'].agg(['sum', 'count', 'mean'])
    ax2.bar(hourly_fraud.index, hourly_fraud['mean'], color='orange', alpha=0.7)
    ax2.set_xlabel('Hour')
    ax2.set_ylabel('Fraud Rate')
    ax2.set_title('Hourly Fraud Pattern')
    ax2.grid(alpha=0.3)

    # 3. Daily fraud trend
    ax3 = fig.add_subplot(gs[0, 2:4])
    daily_fraud = train.groupby('Date')['FraudResult'].mean()
    ax3.plot(daily_fraud.index, daily_fraud.values, color='red', linewidth=2)
    ax3.set_xlabel('Date')
    ax3.set_ylabel('Fraud Rate')
    ax3.set_title('Daily Fraud Trend')
    ax3.tick_params(axis='x', rotation=45)
    ax3.grid(alpha=0.3)

    # 4. Amount distribution
    ax4 = fig.add_subplot(gs[1, 0:2])
    is_fraud = train['FraudResult'] == 1
    legitimate = train.loc[train['FraudResult'] == 0, 'Amount']
    fraud = train.loc[is_fraud, 'Amount']
    ax4.hist(legitimate, bins=50, alpha=0.6, label='Legitimate', density=True)
    ax4.hist(fraud, bins=50, alpha=0.6, label='Fraud', density=True)
    ax4.set_xlabel('Amount')
    ax4.set_ylabel('Density')
    ax4.set_title('Amount Distribution by Fraud Status')
    # Zoom to the bulk of the data; the top 5% of amounts are cut off.
    ax4.set_xlim(0, train['Amount'].quantile(0.95))
    ax4.legend()

    # 5. Amount vs Value scatter
    ax5 = fig.add_subplot(gs[1, 2:4])
    # Fixed seed so the same points are drawn on every run.
    sample = train.sample(min(5000, len(train)), random_state=42)
    scatter = ax5.scatter(sample['Amount'], sample['Value'],
                          c=sample['FraudResult'], cmap='RdYlBu_r', alpha=0.5, s=10)
    ax5.set_xlabel('Amount')
    ax5.set_ylabel('Value')
    ax5.set_title('Amount vs Value (Fraud Colored)')
    plt.colorbar(scatter, ax=ax5)

    # 6. Correlation heatmap (upper triangle, incl. diagonal, is masked out)
    ax6 = fig.add_subplot(gs[2, 0:2])
    numeric_cols = train.select_dtypes(include=[np.number]).columns
    corr_matrix = train[numeric_cols].corr()
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
                cmap='RdYlBu_r', center=0, ax=ax6, cbar_kws={'shrink': 0.8})
    ax6.set_title('Feature Correlations')

    # 7. Feature correlation with target
    # Constant columns have an undefined (NaN) correlation; NaNs sort last and
    # would otherwise be reported as the "top" correlations, so drop them.
    ax7 = fig.add_subplot(gs[2, 2:4])
    target_corr = (corr_matrix['FraudResult']
                   .drop('FraudResult')
                   .dropna()
                   .sort_values(key=abs))
    colors = ['red' if x > 0 else 'blue' for x in target_corr.values]
    ax7.barh(range(len(target_corr)), target_corr.values, color=colors, alpha=0.7)
    ax7.set_yticks(range(len(target_corr)))
    ax7.set_yticklabels(target_corr.index, fontsize=9)
    ax7.set_xlabel('Correlation with Fraud')
    ax7.set_title('Feature Importance (Correlation)')
    ax7.grid(alpha=0.3)
    ax7.axvline(x=0, color='black', linestyle='--', linewidth=0.8)

    plt.suptitle('Fraud Detection - Comprehensive EDA Dashboard',
                 fontsize=16, fontweight='bold', y=0.995)
    plt.show()

    # Print key insights (target_corr is sorted by |r| ascending, so the
    # strongest three correlations are at the tail).
    print("\n📊 KEY INSIGHTS:")
    print(f"• Peak fraud hour: {hourly_fraud['mean'].idxmax()}:00 ({hourly_fraud['mean'].max():.1%})")
    print(f"• Average daily fraud: {daily_fraud.mean():.2%}")
    print(f"• Amount: Legit avg={legitimate.mean():.2f}, Fraud avg={fraud.mean():.2f}")
    print(f"• Top correlations: {target_corr.tail(3).to_dict()}")

    # Optional: Interactive dashboard
    if create_interactive and PLOTLY_AVAILABLE:
        create_interactive_dashboard(train, hourly_fraud, daily_fraud)

    return train

def create_interactive_dashboard(train, hourly_fraud, daily_fraud):
    """Render a 2x2 interactive Plotly dashboard of the fraud data.

    Panels: class distribution (donut), daily fraud rate, hourly fraud rate,
    and Amount box plots per class.

    Args:
        train: DataFrame with ``FraudResult`` (0 = legitimate, 1 = fraud) and
            ``Amount`` columns.
        hourly_fraud: DataFrame indexed by hour with a ``mean`` column holding
            the fraud rate for that hour.
        daily_fraud: Series of fraud rate indexed by date.

    Returns:
        None. The figure is displayed with ``fig.show()``. If Plotly could not
        be imported, a notice is printed and nothing is drawn.
    """
    # Plotly is an optional dependency of this file; without it the names
    # make_subplots / go are undefined, so bail out with a clear message.
    if not PLOTLY_AVAILABLE:
        print("Plotly is not installed; skipping interactive dashboard.")
        return

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Fraud Distribution', 'Daily Trend',
                        'Hourly Pattern', 'Amount by Status'),
        specs=[[{"type": "pie"}, {"type": "xy"}],
               [{"type": "xy"}, {"type": "box"}]]
    )

    # Fraud pie
    # reindex guarantees both classes are present (count 0 when absent), so
    # the label lookup cannot raise KeyError on single-class data.
    fraud_counts = train['FraudResult'].value_counts().reindex([0, 1], fill_value=0)
    fig.add_trace(go.Pie(labels=['Legitimate', 'Fraud'],
                         values=[fraud_counts[0], fraud_counts[1]],
                         hole=0.4), row=1, col=1)

    # Daily trend
    fig.add_trace(go.Scatter(x=daily_fraud.index, y=daily_fraud.values,
                             mode='lines+markers', name='Daily Rate'),
                  row=1, col=2)

    # Hourly pattern
    fig.add_trace(go.Bar(x=hourly_fraud.index, y=hourly_fraud['mean'],
                         name='Hourly Rate'), row=2, col=1)

    # Amount boxes
    fig.add_trace(go.Box(y=train.loc[train['FraudResult'] == 0, 'Amount'],
                         name='Legitimate'), row=2, col=2)
    fig.add_trace(go.Box(y=train.loc[train['FraudResult'] == 1, 'Amount'],
                         name='Fraud'), row=2, col=2)

    fig.update_layout(height=800, showlegend=True,
                      title_text="Interactive Fraud Dashboard")
    fig.show()

# =============================================================================
# 3. FEATURE ENGINEERING (CONSOLIDATED)
# =============================================================================

def engineer_features(train):
    """Return a copy of ``train`` with time, amount and risk features added.

    Added / overwritten columns:
        Hour, DayOfWeek (0 = Monday), IsWeekend, IsBusinessHour (09-17
        inclusive), IsLateNight (>= 22 or <= 6), Amount_Value_Ratio,
        Amount_Value_Diff, Amount_Value_Interaction, LogAmount, LogValue,
        RiskScore.

    Args:
        train: DataFrame with ``TransactionStartTime`` (datetime, or anything
            ``pd.to_datetime`` can parse), ``Amount`` and ``Value`` columns.
            It is not modified.

    Returns:
        The augmented DataFrame copy.
    """
    print("\n" + "="*60)
    print("FEATURE ENGINEERING")
    print("="*60)

    df = train.copy()

    # Ensure datetime is parsed. Checking "is it already datetime" (rather
    # than "is it object dtype") also covers pandas string dtypes, for which
    # the .dt accessor below would otherwise raise AttributeError.
    if not pd.api.types.is_datetime64_any_dtype(df['TransactionStartTime']):
        df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

    # Time features
    df['Hour'] = df['TransactionStartTime'].dt.hour
    df['DayOfWeek'] = df['TransactionStartTime'].dt.dayofweek
    df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)
    df['IsBusinessHour'] = ((df['Hour'] >= 9) & (df['Hour'] <= 17)).astype(int)
    df['IsLateNight'] = ((df['Hour'] >= 22) | (df['Hour'] <= 6)).astype(int)

    # Amount/Value features
    # The small epsilon avoids division by zero when Value is exactly 0.
    df['Amount_Value_Ratio'] = df['Amount'] / (df['Value'] + 1e-6)
    df['Amount_Value_Diff'] = df['Amount'] - df['Value']
    df['Amount_Value_Interaction'] = df['Amount'] * df['Value']
    # NOTE: log1p is -inf at -1 and NaN below -1, so negative inputs can
    # produce non-finite values here.
    df['LogAmount'] = np.log1p(df['Amount'])
    df['LogValue'] = np.log1p(df['Value'])

    # Risk score (composite feature): weighted sum of percentile ranks and
    # the binary time flags; the weights sum to 1.0.
    df['RiskScore'] = (
        df['Amount'].rank(pct=True) * 0.3 +
        df['LogAmount'].rank(pct=True) * 0.3 +
        df['IsLateNight'] * 0.2 +
        df['IsWeekend'] * 0.1 +
        df['Amount_Value_Ratio'].rank(pct=True) * 0.1
    )

    # Columns that already existed in the input (e.g. Hour) are overwritten
    # above and therefore not counted as new.
    new_columns = [c for c in df.columns if c not in train.columns]
    print(f"✅ Created {len(df.columns) - len(train.columns)} new features")
    print(f"Feature list: {new_columns}")

    return df

# =============================================================================
# 4. MODEL TRAINING & EVALUATION (CONSOLIDATED)
# =============================================================================

def train_and_evaluate_models(train):
    """Train LightGBM, XGBoost and a linear SVM and compare them on a hold-out.

    All numeric columns except the target and the date/time helper columns
    are used as features (NaN and +/-inf are replaced by 0). The data is split
    80/20, stratified on the target, with a fixed seed. For every model the
    classification report is printed and ROC / precision-recall curves are
    plotted; finally a ROC-AUC comparison table is printed.

    Args:
        train: DataFrame containing a binary ``FraudResult`` column (0/1) and
            at least one other numeric column.

    Returns:
        Tuple ``(results, X_test, y_test)`` where ``results`` maps model name
        to a dict with keys ``roc_auc``, ``report``, ``confusion_matrix`` and
        ``model`` (the fitted estimator).

    Raises:
        ValueError: If there are no numeric feature columns, or the training
            split does not contain both classes.
    """
    print("\n" + "="*60)
    print("MODEL TRAINING & EVALUATION")
    print("="*60)

    # Prepare features
    # select_dtypes(np.number) accepts every numeric width (int32, float32,
    # ...), not just int64/float64, so integer features are not silently
    # dropped on platforms where the default int is 32-bit.
    excluded = {'FraudResult', 'TransactionStartTime', 'Date', 'DayOfWeek'}
    numeric_cols = train.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col not in excluded]
    if not feature_cols:
        raise ValueError("No numeric feature columns available for training")

    X = train[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)
    y = train['FraudResult']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Class balance of the training split, used to weight XGBoost's positives.
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    if n_positive == 0 or n_negative == 0:
        raise ValueError(
            "Training split must contain both classes "
            f"(found {n_negative} negative and {n_positive} positive samples)"
        )

    # Define models
    models = {
        "LightGBM": LGBMClassifier(n_estimators=300, learning_rate=0.05,
                                   class_weight="balanced", random_state=42, verbose=-1),
        "XGBoost": XGBClassifier(n_estimators=300, learning_rate=0.05,
                                 scale_pos_weight=n_negative / n_positive,
                                 use_label_encoder=False, eval_metric="logloss",
                                 random_state=42),
        "LinearSVM": make_pipeline(StandardScaler(),
                                   LinearSVC(class_weight='balanced',
                                             dual=False, max_iter=5000, random_state=42))
    }

    results = {}

    # Train and evaluate
    # squeeze=False keeps `axes` two-dimensional even for a single model.
    fig, axes = plt.subplots(len(models), 2, figsize=(14, 5*len(models)),
                             squeeze=False)

    for idx, (name, model) in enumerate(models.items()):
        print(f"\n🚀 Training {name}...")
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Continuous score for ranking metrics: positive-class probability
        # when available, otherwise the signed margin (LinearSVC).
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test)[:, 1]
        else:
            y_proba = model.decision_function(X_test)

        # Metrics
        roc_auc = roc_auc_score(y_test, y_proba)
        report = classification_report(y_test, y_pred, zero_division=0)
        cm = confusion_matrix(y_test, y_pred)

        results[name] = {
            'roc_auc': roc_auc,
            'report': report,
            'confusion_matrix': cm,
            'model': model
        }

        # Plot ROC and PR curves from the scores computed above instead of
        # asking the estimator to predict on X_test again for each plot.
        RocCurveDisplay.from_predictions(y_test, y_proba, name=name,
                                         ax=axes[idx, 0])
        axes[idx, 0].set_title(f"{name} - ROC Curve (AUC={roc_auc:.3f})")

        PrecisionRecallDisplay.from_predictions(y_test, y_proba, name=name,
                                                ax=axes[idx, 1])
        axes[idx, 1].set_title(f"{name} - Precision-Recall Curve")

        print(f"\n{name} Results:")
        print(report)

    plt.tight_layout()
    plt.show()

    # Model comparison
    print("\n" + "="*60)
    print("MODEL COMPARISON SUMMARY")
    print("="*60)
    comparison_df = pd.DataFrame({
        name: {'ROC-AUC': res['roc_auc']}
        for name, res in results.items()
    }).T.sort_values('ROC-AUC', ascending=False)
    print(comparison_df.to_string())

    return results, X_test, y_test

# =============================================================================
# 5. MAIN EXECUTION
# =============================================================================

def main():
    """Run the whole pipeline: load, explore, engineer features, train models.

    Reads ``training.csv`` and ``test.csv`` from the current working
    directory. The test frame is loaded (and summarised by the loader) but is
    not used by the later steps.

    Returns:
        Tuple ``(train, results)``: the feature-engineered training DataFrame
        and the per-model results dict from ``train_and_evaluate_models``.
    """

    # 1. Load data
    train, test = load_and_assess_data(
        'training.csv',
        'test.csv'
    )

    # 2. EDA
    train = perform_eda(train, create_interactive=False)

    # 3. Feature engineering
    train = engineer_features(train)

    # 4. Model training
    results, X_test, y_test = train_and_evaluate_models(train)

    # 5. Save processed data (optional)
    # train.to_csv('processed_fraud_data.csv', index=False)

    # Best model = highest hold-out ROC-AUC (first one wins on ties).
    best_model_name = max(results, key=lambda name: results[name]['roc_auc'])
    print("\n✅ ANALYSIS COMPLETE!")
    print(f"Best model: {best_model_name}")

    return train, results

# Run the pipeline only when this file is executed as a script (not when it is
# imported); the outputs are bound to module-level names so they remain
# available for inspection afterwards.
if __name__ == "__main__":
    train, results = main()

# Cell output: ModuleNotFoundError: No module named 'xgboost'
# (the xgboost package must be installed before this pipeline can run)