# Model Evaluation Notebook

This notebook demonstrates model evaluation and analysis after training.

Changes to this notebook will trigger devloop to restart Jupyter Lab automatically.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import yaml
import sys
import os
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split

# Add src to path
# The relative entry '../src' is resolved against the current working
# directory, so the notebook must be run from its own folder for the later
# `from train import ...` cell to find the module.
sys.path.insert(0, os.path.join('..', 'src'))

# Set up plotting
# Both calls change global plotting defaults for every figure drawn afterwards.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic, not plain Python: this line only parses under IPython/Jupyter.
# Presumably it selects the inline figure backend — confirm if exporting to a script.
%matplotlib inline

## Load Trained Model and Data

In [None]:
# Restore the fitted estimator from disk. When the artifact is missing, `model`
# is set to None so that later cells can detect the absence and skip their work.
try:
    model = joblib.load('../models/model.pkl')
except FileNotFoundError:
    model = None
    print("❌ Model not found. Please run the training script first.")
    print("Run: python src/train.py --config configs/model.yaml")
else:
    print("✅ Model loaded successfully!")

In [None]:
# Load training metrics
# After this cell `metrics` is either a dict loaded from ../models/metrics.yaml
# or None (file missing, empty, or not a key/value mapping).
try:
    with open('../models/metrics.yaml', 'r') as f:
        metrics = yaml.safe_load(f)
except FileNotFoundError:
    print("No metrics file found.")
    metrics = None
else:
    if isinstance(metrics, dict):
        print("Training Metrics:")
        for key, value in metrics.items():
            # 'model_params' is left out of the listing; every other entry is shown.
            if key != 'model_params':
                print(f"  {key}: {value}")
    else:
        # safe_load yields None for an empty document and a scalar/list for
        # non-mapping YAML; calling .items() on either raises AttributeError.
        print("Metrics file is empty or does not contain a key/value mapping.")
        metrics = None

In [None]:
# Generate test data for evaluation (in real scenario, this would be your test set)
# `train` is a project module; it is importable because '../src' was put on sys.path.
from train import generate_synthetic_data, preprocess_data

# Generate evaluation dataset
# NOTE(review): if generate_synthetic_data() is seeded the same way as the
# training run, this reproduces the training data and the scores reported
# below would be optimistic — confirm against src/train.py.
eval_df = generate_synthetic_data()
# An empty config dict is passed; presumably preprocess_data then falls back to
# its defaults — TODO confirm. The call is unpacked into features and target.
X_eval, y_eval = preprocess_data(eval_df, {})

print(f"Evaluation data shape: X={X_eval.shape}, y={y_eval.shape}")

## Model Performance Analysis

In [None]:
if model is None:
    print("Cannot evaluate model - model not loaded.")
else:
    # Hard class labels, plus the score of the positive class (column 1 of
    # predict_proba); both are reused by the cells further down.
    y_pred = model.predict(X_eval)
    y_pred_proba = model.predict_proba(X_eval)[:, 1]

    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # Apply the four scorers in a fixed order to the same (truth, prediction) pair.
    accuracy, precision, recall, f1 = (
        scorer(y_eval, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print("Model Performance on Evaluation Set:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")

## Confusion Matrix

In [None]:
if model is not None:
    # Plot confusion matrix
    # labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns =
    # predicted class). Without it, an evaluation set in which only one class
    # appears yields a 1x1 matrix and the [0,1] / [1,0] / [1,1] lookups below
    # raise IndexError.
    cm = confusion_matrix(y_eval, y_pred, labels=[0, 1])

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

    # Calculate confusion matrix percentages
    # Each row is divided by the number of true samples of that class, so the
    # figures are per-true-class rates (every row sums to 100%), not shares of
    # the whole evaluation set. A class with no samples is reported as 0%
    # instead of triggering a divide-by-zero warning and printing nan.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percent = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    ) * 100
    print("\nConfusion Matrix (Percentages):")
    print(f"True Negatives:  {cm_percent[0,0]:.1f}%")
    print(f"False Positives: {cm_percent[0,1]:.1f}%")
    print(f"False Negatives: {cm_percent[1,0]:.1f}%")
    print(f"True Positives:  {cm_percent[1,1]:.1f}%")

## ROC Curve Analysis

In [None]:
if model is not None:
    # ROC operating points and the area under them, computed from the
    # positive-class scores stored in y_pred_proba.
    fpr, tpr, thresholds = roc_curve(y_eval, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Model curve drawn against the diagonal of a random classifier.
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.3f})',
             color='darkorange', lw=2)
    plt.plot([0, 1], [0, 1], label='Random classifier',
             color='navy', lw=2, linestyle='--')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.show()

    print(f"Area Under the Curve (AUC): {roc_auc:.4f}")

    # Grade the AUC: the first floor that roc_auc strictly exceeds wins; when
    # none matches (including a NaN AUC) the grade falls through to the default.
    auc_grades = ((0.9, "Excellent"), (0.8, "Good"), (0.7, "Fair"), (0.6, "Poor"))
    interpretation = next(
        (grade for floor, grade in auc_grades if roc_auc > floor),
        "Very Poor",
    )

    print(f"Model Performance: {interpretation}")

## Feature Importance Analysis

In [None]:
if model is None:
    # Report the real reason: the model was never loaded. The original single
    # else-branch blamed the model type in this case as well.
    print("Cannot evaluate model - model not loaded.")
elif hasattr(model, 'feature_importances_'):
    # Get feature importances
    importances = model.feature_importances_
    # Column labels are taken from X_eval, so X_eval must expose `.columns`.
    feature_names = X_eval.columns

    # Create DataFrame for easier handling, most important feature first
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print("Top 10 Most Important Features:")
    print(feature_importance_df.head(10))

    # Plot feature importances
    plt.figure(figsize=(10, 8))
    top_features = feature_importance_df.head(10)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 10 Feature Importances')
    # barh draws the first row at the bottom; flip so the top feature is on top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
else:
    print("Feature importance not available for this model type.")

## Prediction Distribution Analysis

In [None]:
if model is not None:
    from sklearn.calibration import calibration_curve

    # One figure, two panels: score histograms on the left, reliability
    # diagram on the right.
    plt.figure(figsize=(12, 5))

    # Left panel: positive-class score distribution, split by the true label.
    plt.subplot(1, 2, 1)
    for class_value, class_label in ((0, 'Class 0 (Negative)'), (1, 'Class 1 (Positive)')):
        plt.hist(y_pred_proba[y_eval == class_value], bins=30, alpha=0.7,
                 label=class_label, density=True)
    plt.title('Prediction Probability Distribution by True Class')
    plt.xlabel('Prediction Probability')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Right panel: observed positive fraction per score bin against the
    # diagonal of a perfectly calibrated model.
    plt.subplot(1, 2, 2)
    prob_true, prob_pred = calibration_curve(y_eval, y_pred_proba, n_bins=10)
    plt.plot(prob_pred, prob_true, marker='o', linewidth=2, label='Model')
    plt.plot([0, 1], [0, 1], linestyle='--', label='Perfect calibration')
    plt.title('Calibration Plot')
    plt.xlabel('Mean Predicted Probability')
    plt.ylabel('Fraction of Positives')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Model Comparison with Baseline

In [None]:
if model is not None:
    # Compare with simple baselines
    from sklearn.dummy import DummyClassifier

    # Both baselines are fit and scored on the evaluation set itself, so they
    # reflect that set's own class balance.

    # Majority class baseline
    dummy_majority = DummyClassifier(strategy='most_frequent')
    dummy_majority.fit(X_eval, y_eval)
    dummy_pred_majority = dummy_majority.predict(X_eval)

    # Random baseline (seeded so the reported number is reproducible)
    dummy_random = DummyClassifier(strategy='uniform', random_state=42)
    dummy_random.fit(X_eval, y_eval)
    dummy_pred_random = dummy_random.predict(X_eval)

    # Compare accuracies
    model_accuracy = accuracy_score(y_eval, y_pred)
    majority_accuracy = accuracy_score(y_eval, dummy_pred_majority)
    random_accuracy = accuracy_score(y_eval, dummy_pred_random)

    print("Model Comparison:")
    print(f"  Our Model:      {model_accuracy:.4f}")
    print(f"  Majority Class: {majority_accuracy:.4f}")
    print(f"  Random:         {random_accuracy:.4f}")

    def _relative_improvement(score, baseline):
        """Return the percentage change of score relative to baseline.

        Yields nan when the baseline accuracy is 0, because the relative
        change is undefined there and the division would otherwise fail.
        """
        if baseline > 0:
            return (score - baseline) / baseline * 100
        return float('nan')

    improvement_over_majority = _relative_improvement(model_accuracy, majority_accuracy)
    improvement_over_random = _relative_improvement(model_accuracy, random_accuracy)

    # The '+' format flag prints the real sign; the previous hard-coded '+'
    # rendered a regression as e.g. "+-3.2%".
    print("\nImprovement:")
    print(f"  vs Majority: {improvement_over_majority:+.1f}%")
    print(f"  vs Random:   {improvement_over_random:+.1f}%")

    # Visualization
    plt.figure(figsize=(8, 6))
    models = ['Random', 'Majority Class', 'Our Model']
    accuracies = [random_accuracy, majority_accuracy, model_accuracy]
    colors = ['lightcoral', 'lightblue', 'lightgreen']

    bars = plt.bar(models, accuracies, color=colors, edgecolor='black')
    plt.ylabel('Accuracy')
    plt.title('Model Performance Comparison')
    plt.ylim(0, 1)

    # Add value labels on bars, centred just above each bar's top edge
    for bar, acc in zip(bars, accuracies):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                 f'{acc:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

## Error Analysis

In [None]:
if model is not None:
    # Boolean mask over the evaluation set: True where the predicted label is wrong.
    misclassified = (y_pred != y_eval)
    n_errors = misclassified.sum()

    print(f"Total misclassified examples: {misclassified.sum()} out of {len(y_eval)} ({misclassified.mean()*100:.1f}%)")

    if n_errors == 0:
        print("Perfect predictions - no errors to analyze!")
    else:
        # Split the errors by direction.
        false_positives = (y_pred == 1) & (y_eval == 0)
        false_negatives = (y_pred == 0) & (y_eval == 1)
        has_fp = false_positives.sum() > 0
        has_fn = false_negatives.sum() > 0

        print(f"False Positives: {false_positives.sum()}")
        print(f"False Negatives: {false_negatives.sum()}")

        # How sure the model was when it got it wrong. For false negatives the
        # confidence is that of the negative call, i.e. 1 - P(positive).
        if has_fp:
            fp_confidence = y_pred_proba[false_positives]
            print(f"False Positive confidence - Mean: {fp_confidence.mean():.3f}, Std: {fp_confidence.std():.3f}")

        if has_fn:
            fn_confidence = 1 - y_pred_proba[false_negatives]  # Confidence in negative prediction
            print(f"False Negative confidence - Mean: {fn_confidence.mean():.3f}, Std: {fn_confidence.std():.3f}")

        plt.figure(figsize=(10, 6))

        # Left panel: raw positive-class scores of the misclassified samples.
        plt.subplot(1, 2, 1)
        if has_fp:
            plt.hist(y_pred_proba[false_positives], bins=20, alpha=0.7, label='False Positives')
        if has_fn:
            plt.hist(y_pred_proba[false_negatives], bins=20, alpha=0.7, label='False Negatives')
        plt.title('Confidence Distribution of Errors')
        plt.xlabel('Prediction Probability')
        plt.ylabel('Count')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Right panel: precision and recall swept over 100 evenly spaced cut-offs.
        # NOTE: this rebinds `thresholds`, replacing the array from the ROC cell.
        plt.subplot(1, 2, 2)
        thresholds = np.linspace(0, 1, 100)
        precisions, recalls = [], []

        for thresh in thresholds:
            pred_thresh = (y_pred_proba >= thresh).astype(int)
            if pred_thresh.sum() == 0:
                # Nothing is predicted positive at this cut-off: record zeros directly.
                precisions.append(0)
                recalls.append(0)
                continue
            precisions.append(precision_score(y_eval, pred_thresh, zero_division=0))
            recalls.append(recall_score(y_eval, pred_thresh, zero_division=0))

        plt.plot(thresholds, precisions, label='Precision', linewidth=2)
        plt.plot(thresholds, recalls, label='Recall', linewidth=2)
        plt.axvline(x=0.5, color='red', linestyle='--', alpha=0.7, label='Default threshold')
        plt.title('Precision-Recall vs Threshold')
        plt.xlabel('Threshold')
        plt.ylabel('Score')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

## Summary and Recommendations

In [None]:
if model is None:
    print("No model evaluation performed. Please train a model first.")
else:
    # Recap of the figures computed in the earlier cells (accuracy, precision,
    # recall, f1, roc_auc), followed by rule-of-thumb advice.
    banner = "=" * 50
    print(banner)
    print("MODEL EVALUATION SUMMARY")
    print(banner)

    print(f"Model Type: {type(model).__name__}")
    print(f"Evaluation Dataset Size: {len(y_eval)} samples")
    print(f"\nPerformance Metrics:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  ROC-AUC:   {roc_auc:.4f}")

    print(f"\nRecommendations:")

    # AUC bands: below 0.7 -> poor, below 0.8 -> fair, anything else -> good.
    # The comparison order is kept as-is so every value lands in the same band.
    if roc_auc < 0.7:
        print("  🔴 Model performance is poor. Consider:")
        for suggestion in (
            "     - Feature engineering",
            "     - Different algorithms",
            "     - More data collection",
        ):
            print(suggestion)
    elif roc_auc < 0.8:
        print("  🟡 Model performance is fair. Consider:")
        for suggestion in (
            "     - Hyperparameter tuning",
            "     - Feature selection",
            "     - Ensemble methods",
        ):
            print(suggestion)
    else:
        print("  🟢 Model performance is good!")

    # Threshold hints: each one fires independently of the AUC band above.
    if precision < 0.7:
        print("  📈 Low precision - consider increasing prediction threshold")

    if recall < 0.7:
        print("  📉 Low recall - consider decreasing prediction threshold")

    print(f"\nNext Steps:")
    for step in (
        "  1. Validate on additional test data",
        "  2. Monitor model performance in production",
        "  3. Set up model retraining pipeline",
        "  4. Implement A/B testing for model updates",
    ):
        print(step)