In [None]:
import sys
from pathlib import Path

# Add project root to path.
# The notebook is assumed to be launched from a direct sub-folder of the
# project (the root is taken as the parent of the current working directory).
project_root = Path.cwd().parent
# Guard the insertion so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from tqdm.notebook import tqdm

# Set style.
# Matplotlib 3.6 renamed the bundled seaborn styles from 'seaborn-*' to
# 'seaborn-v0_8-*'. Pick whichever spelling this installation provides so the
# cell does not raise on older releases; if neither exists, keep the default.
_paper_style = next(
    (name for name in ('seaborn-v0_8-paper', 'seaborn-paper')
     if name in plt.style.available),
    None,
)
if _paper_style is not None:
    plt.style.use(_paper_style)
sns.set_palette('husl')

# Environment report: confirms the imports worked and shows what hardware /
# library versions the rest of the notebook will run against.
print("✓ Imports successful")
print(f"Project root: {project_root}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. Configuration

In [None]:
# Filesystem layout -- every location hangs off the project root
DATA_PATH = project_root / 'data' / 'artemis'
CHECKPOINT_DIR = project_root / 'checkpoints'
OUTPUT_DIR = project_root / 'outputs'

# Make sure each output sub-folder exists before any cell writes to it
for _subdir in ('figures', 'metrics', 'tables'):
    (OUTPUT_DIR / _subdir).mkdir(parents=True, exist_ok=True)

# Emotion labels (ArtEmis); list position is used as the class index
EMOTION_LABELS = [
    'amusement',
    'awe',
    'contentment',
    'excitement',
    'anger',
    'disgust',
    'fear',
    'sadness',
    'something else',
]

# Model configurations, written as compact rows:
#   (key, display name, checkpoint sub-folder, expected accuracy)
_MODEL_SPECS = [
    ('V1', 'V1 Baseline', 'v1_baseline', 0.65),
    ('V2', 'V2 Improved', 'v2_improved', 0.68),
    ('V3', 'V3 Fuzzy Features', 'v3_fuzzy_features', 0.7063),
    ('V4', 'V4 Fuzzy Gating', 'v4_fuzzy_gating', 0.7037),
    ('V4.1', 'V4.1 Integrated', 'v4_1_integrated', 0.7040),
]

# Expand the rows into the dict-of-dicts the rest of the notebook indexes into
MODELS = {
    key: {
        'name': display_name,
        'checkpoint': CHECKPOINT_DIR / folder / 'checkpoint_best.pt',
        'expected_acc': expected_acc,
    }
    for key, display_name, folder, expected_acc in _MODEL_SPECS
}

print("✓ Configuration loaded")
print(f"Data path: {DATA_PATH}")
print(f"Checkpoints: {CHECKPOINT_DIR}")
print(f"Models to evaluate: {len(MODELS)}")

## 2. Load Dataset

Load the ArtEmis test set for evaluation.

In [None]:
# TODO: Implement dataset loading
# from cerebrum_artis.data import ArtEmisDataset
# from torch.utils.data import DataLoader

# test_dataset = ArtEmisDataset(
#     data_path=DATA_PATH,
#     split='test',
#     image_size=224
# )

# test_loader = DataLoader(
#     test_dataset,
#     batch_size=32,
#     shuffle=False,
#     num_workers=4
# )

# print(f"✓ Test set loaded: {len(test_dataset)} samples")

## 3. Evaluate Each Model

Run inference on all models and collect predictions.

In [None]:
def evaluate_model(model, test_loader, device='cuda'):
    """
    Evaluate a single model on test set.

    Args:
        model: torch module called as ``model(images)``; its output is
            treated as class scores along dim 1.
        test_loader: iterable of batches, each a mapping with an ``'image'``
            tensor and an ``'emotion'`` tensor of labels.
        device: device to run inference on. A CUDA device is replaced by
            ``'cpu'`` (with a warning) when CUDA is not available, so the
            default works on CPU-only machines.

    Returns:
        predictions: numpy array of predicted classes
        probabilities: numpy array of class probabilities
        ground_truth: numpy array of true labels

        All three are empty arrays when ``test_loader`` yields no batches.
    """
    import warnings

    # Fall back to CPU instead of crashing when CUDA was requested but the
    # machine has none.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        warnings.warn(
            f"Device '{device}' requested but CUDA is not available; "
            "falling back to CPU."
        )
        device = 'cpu'

    model.eval()
    model = model.to(device)

    pred_batches = []
    prob_batches = []
    label_batches = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Evaluating'):
            images = batch['image'].to(device)
            labels = batch['emotion']

            # Forward pass
            outputs = model(images)
            probs = torch.softmax(outputs, dim=1)
            preds = torch.argmax(probs, dim=1)

            # Collect one array per batch; concatenating once at the end is
            # cheaper than extending a Python list sample by sample.
            pred_batches.append(preds.cpu().numpy())
            prob_batches.append(probs.cpu().numpy())
            # .cpu() is a no-op for CPU labels and makes the conversion safe
            # if the loader ever yields labels already placed on a device.
            label_batches.append(labels.cpu().numpy())

    if not pred_batches:
        # Nothing was evaluated; np.concatenate would raise on empty lists.
        return np.array([]), np.array([]), np.array([])

    return (
        np.concatenate(pred_batches),
        np.concatenate(prob_batches),
        np.concatenate(label_batches)
    )

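# TODO: Uncomment when models are ready (needs `test_loader` from Section 2 and a `load_model` helper, which is not defined in the cells shown here)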
# results = {}
# 
# for model_key, config in MODELS.items():
#     print(f"\nEvaluating {config['name']}...")
#     
#     # Load model
#     model = load_model(model_key, config['checkpoint'])
#     
#     # Evaluate
#     preds, probs, labels = evaluate_model(model, test_loader)
#     
#     # Calculate metrics
#     accuracy = (preds == labels).mean()
#     
#     results[model_key] = {
#         'predictions': preds,
#         'probabilities': probs,
#         'labels': labels,
#         'accuracy': accuracy
#     }
#     
#     print(f"Accuracy: {accuracy:.2%}")

## 4. Calculate Metrics

Compute comprehensive metrics for each model.

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

def calculate_metrics(predictions, ground_truth, emotion_labels):
    """
    Calculate comprehensive metrics.

    Args:
        predictions: predicted class indices.
        ground_truth: true class indices.
        emotion_labels: class names; position ``i`` names class index ``i``.

    Returns:
        dict with keys:
            'accuracy': overall accuracy.
            'macro_precision', 'macro_recall', 'macro_f1': unweighted means
                of the per-class scores over all ``len(emotion_labels)``
                classes.
            'per_class': DataFrame with one row per emotion (Precision,
                Recall, F1-Score, Support).
            'confusion_matrix': square matrix with one row/column per entry
                of ``emotion_labels`` (rows = true, columns = predicted).
    """
    # One explicit class-index list shared by every metric, so the per-class
    # table and the confusion matrix always describe the same set of classes.
    class_ids = list(range(len(emotion_labels)))

    accuracy = accuracy_score(ground_truth, predictions)

    # zero_division=0 yields the same scores as the default behaviour for
    # classes that are never predicted / never present, without emitting an
    # UndefinedMetricWarning for each of them.
    precision, recall, f1, support = precision_recall_fscore_support(
        ground_truth, predictions, average=None, labels=class_ids,
        zero_division=0
    )

    # Without labels=, the matrix only covers classes that actually occur in
    # the data, so its shape could disagree with emotion_labels (and with the
    # tick labels used when it is plotted).
    cm = confusion_matrix(ground_truth, predictions, labels=class_ids)

    # Per-class metrics
    per_class = pd.DataFrame({
        'Emotion': emotion_labels,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Support': support
    })

    # Macro averages
    macro_precision = precision.mean()
    macro_recall = recall.mean()
    macro_f1 = f1.mean()

    return {
        'accuracy': accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1,
        'per_class': per_class,
        'confusion_matrix': cm
    }

# TODO: Calculate metrics for all models
# metrics = {}
# for model_key, res in results.items():
#     metrics[model_key] = calculate_metrics(
#         res['predictions'],
#         res['labels'],
#         EMOTION_LABELS
#     )

## 5. Results Summary

Create a summary table comparing all models.

In [None]:
# TODO: Create summary table
# Planned cell: builds one row per model from `metrics` (filled in by the
# Section 4 TODO cell), shows it with percentage formatting, and exports CSV
# and LaTeX copies under OUTPUT_DIR / 'tables'.
# summary = pd.DataFrame([
#     {
#         'Model': MODELS[k]['name'],
#         'Accuracy': metrics[k]['accuracy'],
#         'Macro Precision': metrics[k]['macro_precision'],
#         'Macro Recall': metrics[k]['macro_recall'],
#         'Macro F1': metrics[k]['macro_f1']
#     }
#     for k in MODELS.keys()
# ])
# 
# # Style table
# summary_styled = summary.style.format({
#     'Accuracy': '{:.2%}',
#     'Macro Precision': '{:.2%}',
#     'Macro Recall': '{:.2%}',
#     'Macro F1': '{:.2%}'
# }).background_gradient(subset=['Accuracy'], cmap='RdYlGn', vmin=0.6, vmax=0.75)
# 
# display(summary_styled)
# 
# # Save to CSV and LaTeX
# # NOTE(review): the metric columns hold fractions (they are formatted with
# # '{:.2%}' above), so a '%.2f' float format would collapse values such as
# # 0.7063 / 0.7037 / 0.7040 to two decimals in the exported table. Four
# # decimals keep the models distinguishable.
# summary.to_csv(OUTPUT_DIR / 'tables' / 'model_comparison.csv', index=False)
# summary.to_latex(OUTPUT_DIR / 'tables' / 'model_comparison.tex', index=False, float_format='%.4f')

# Placeholder output until the TODO block above is enabled.
print("Summary table will be displayed here")

## 6. Visualization: Model Comparison

Bar chart comparing model accuracies.

In [None]:
# TODO: Create visualization
# Planned cell: one bar per model showing `metrics[k]['accuracy']`, annotated
# with the percentage, saved to OUTPUT_DIR / 'figures' / 'model_comparison.png'.
# fig, ax = plt.subplots(figsize=(10, 6))
# 
# models = [MODELS[k]['name'] for k in MODELS.keys()]
# accuracies = [metrics[k]['accuracy'] for k in MODELS.keys()]
# 
# bars = ax.bar(models, accuracies, color=sns.color_palette('husl', len(models)))
# 
# # Add value labels on bars
# for bar in bars:
#     height = bar.get_height()
#     ax.text(bar.get_x() + bar.get_width()/2., height,
#             f'{height:.2%}',
#             ha='center', va='bottom', fontsize=10, fontweight='bold')
# 
# ax.set_ylabel('Accuracy', fontsize=12)
# ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
# # NOTE(review): the y-range is hard-coded to 0.6-0.75; a model scoring
# # outside it would have its bar cut off or hidden, and a truncated axis
# # exaggerates small differences -- confirm this is intended for the figure.
# ax.set_ylim(0.6, 0.75)
# ax.grid(axis='y', alpha=0.3)
# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()
# 
# plt.savefig(OUTPUT_DIR / 'figures' / 'model_comparison.png', dpi=300, bbox_inches='tight')
# plt.show()

# Placeholder output until the TODO block above is enabled.
print("Visualization will be displayed here")

## 7. Confusion Matrices

Visualize the confusion matrix for the best model (V3).

In [None]:
# TODO: Plot confusion matrix
# Planned cell: draws a row-normalised confusion-matrix heatmap and saves it
# to `save_path`.
# def plot_confusion_matrix(cm, labels, title, save_path):
#     fig, ax = plt.subplots(figsize=(10, 8))
#     
#     # Normalize each row to proportions.
#     # NOTE(review): a class with no true samples has a row sum of 0, and
#     # dividing by it would fill that row with NaN; clamp the divisor so such
#     # rows stay all-zero instead.
#     row_sums = cm.sum(axis=1)[:, np.newaxis]
#     cm_norm = cm.astype('float') / np.maximum(row_sums, 1)
#     
#     sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues',
#                 xticklabels=labels, yticklabels=labels,
#                 square=True, cbar_kws={'label': 'Proportion'})
#     
#     plt.ylabel('True Emotion', fontsize=12)
#     plt.xlabel('Predicted Emotion', fontsize=12)
#     plt.title(title, fontsize=14, fontweight='bold')
#     plt.tight_layout()
#     
#     plt.savefig(save_path, dpi=300, bbox_inches='tight')
#     plt.show()
# 
# # Plot for V3 (best model)
# plot_confusion_matrix(
#     metrics['V3']['confusion_matrix'],
#     EMOTION_LABELS,
#     'Confusion Matrix - V3 Fuzzy Features',
#     OUTPUT_DIR / 'figures' / 'confusion_matrix_v3.png'
# )

# Placeholder output until the TODO block above is enabled.
print("Confusion matrix will be displayed here")

## 8. Save Results

Save all metrics to JSON for later use.

In [None]:
import json

# TODO: Save metrics
# Planned cell: converts each model's metrics into plain Python / list values
# and writes them to OUTPUT_DIR / 'metrics' / 'model_evaluation.json'.
# metrics_to_save = {}
# for model_key, m in metrics.items():
#     metrics_to_save[model_key] = {
#         'accuracy': float(m['accuracy']),
#         'macro_precision': float(m['macro_precision']),
#         'macro_recall': float(m['macro_recall']),
#         'macro_f1': float(m['macro_f1']),
#         # NOTE(review): json.dump cannot serialise NumPy integer scalars;
#         # verify that to_dict('records') yields native Python ints for the
#         # 'Support' column on the installed pandas version -- TODO confirm.
#         'per_class': m['per_class'].to_dict('records'),
#         'confusion_matrix': m['confusion_matrix'].tolist()
#     }
# 
# with open(OUTPUT_DIR / 'metrics' / 'model_evaluation.json', 'w') as f:
#     json.dump(metrics_to_save, f, indent=2)
# 
# print("✓ Metrics saved to outputs/metrics/model_evaluation.json")

# Placeholder output until the TODO block above is enabled.
print("Metrics will be saved here")

## Summary

Once the TODO cells above are implemented and uncommented, this notebook evaluates all model versions and generates:
- ✓ Accuracy metrics for each model
- ✓ Confusion matrices
- ✓ Per-class performance metrics
- ✓ Comparison visualizations
- ✓ LaTeX tables for paper

**Next steps:**
1. Run `02_ensemble_analysis.ipynb` for ensemble evaluation
2. Run `03_fuzzy_features_analysis.ipynb` for feature analysis