# 📊 Results Analysis & Comparison

This notebook compares the performance of the **Baseline Flat Classifier** (Notebook 02) against the **Hierarchical Classifier** (Notebook 03b End-to-End).

Metrics are loaded directly from the standardized JSON outputs generated during training/evaluation.

In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Root directory (relative to this notebook) holding every model artifact
# read by the cells below.
MODELS_DIR = '../models'


def load_json(path):
    """Read the file at *path* and return its parsed JSON content.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, so metric files containing non-ASCII text load the same
    way on every operating system.

    Raises whatever ``open`` or ``json.load`` raise (e.g. ``OSError`` for an
    unreadable file, ``json.JSONDecodeError`` for malformed content).
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [None]:
# 1. Load Flat Models
# Every entry under <MODELS_DIR>/flat-classifiers that contains a
# metrics.json contributes one row to the flat-classifier table.
flat_dir = os.path.join(MODELS_DIR, 'flat-classifiers')
flat_records = []

# Table column label -> key inside the "metrics" section of metrics.json.
flat_metric_keys = {
    'Accuracy': 'accuracy',
    'F1-Weighted': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
}

flat_entries = os.listdir(flat_dir) if os.path.exists(flat_dir) else []
for entry in flat_entries:
    metrics_file = os.path.join(flat_dir, entry, 'metrics.json')
    if not os.path.exists(metrics_file):
        # Entry without a metrics file: nothing to report for it.
        continue

    payload = load_json(metrics_file)
    scores = payload.get('metrics', {})

    # Fall back to the directory name when the file does not name its
    # architecture; missing metrics default to 0.
    record = {'Architecture': payload.get('architecture', entry)}
    for label, key in flat_metric_keys.items():
        record[label] = scores.get(key, 0)
    flat_records.append(record)

print(f"Found {len(flat_records)} flat models.")
if flat_records:
    # Best model first, so later cells can take row 0 as the winner.
    df_flat = pd.DataFrame(flat_records).sort_values('Accuracy', ascending=False)
    print("\n=== TABLE 1: FLAT CLASSIFIERS ===")
    display(df_flat.style.format('{:.4f}', subset=list(flat_metric_keys)))


In [None]:
# 2. Load Hierarchical Models (Per-Stage Analysis)
# A hierarchical model is reported only when both of its files are present:
#   stage1_info_*.json                     -> stage-1 test accuracy
#   hierarchical_model_info_<arch>.json    -> end-to-end metrics
hier_records = []

# Guard against a missing models directory (the flat-classifier cell does the
# same) instead of letting os.listdir raise FileNotFoundError. Sorting makes
# the scan order reproducible across file systems.
model_files = sorted(os.listdir(MODELS_DIR)) if os.path.isdir(MODELS_DIR) else []

for filename in model_files:
    if not (filename.startswith('stage1_info_') and filename.endswith('.json')):
        continue

    s1_data = load_json(os.path.join(MODELS_DIR, filename))
    arch = s1_data.get('architecture', 'unknown')

    # Match with Stage 2: the end-to-end file is keyed by the architecture
    # name recorded inside the stage-1 file.
    stage2_path = os.path.join(MODELS_DIR, f"hierarchical_model_info_{arch}.json")
    if not os.path.exists(stage2_path):
        continue

    s2_data = load_json(stage2_path)
    m = s2_data.get('metrics', {})
    h_metrics = s2_data.get('hierarchical_metrics', {})

    # Missing values default to 0 so the table can still be rendered.
    hier_records.append({
        'Architecture': arch,
        'Stage 1 Acc': s1_data.get('test_accuracy', 0),
        'Coarse Acc (E2E)': h_metrics.get('coarse_accuracy', 0),
        'Consistency': h_metrics.get('consistency_both_correct', 0),
        'End-to-End Acc': m.get('accuracy', 0),
        'End-to-End F1': m.get('f1_weighted', 0),
    })

print(f"Found {len(hier_records)} hierarchical models.")
if hier_records:
    # Best model first, so later cells can take row 0 as the winner.
    df_hier = pd.DataFrame(hier_records).sort_values('End-to-End Acc', ascending=False)
    print("\n=== TABLE 2: HIERARCHICAL STAGES BREAKDOWN ===")
    # Show internal details ('End-to-End F1' is kept in df_hier for the final
    # comparison but left out of this per-stage view).
    stage_columns = ['Stage 1 Acc', 'Coarse Acc (E2E)', 'Consistency', 'End-to-End Acc']
    display(df_hier[['Architecture'] + stage_columns].style.format('{:.2%}', subset=stage_columns))


In [None]:
# 3. Final Comparison (Best Flat vs Best Hierarchical)
# df_flat / df_hier are sorted by accuracy in descending order, so row 0 of
# each is the best model of its family. Either family may be absent.
best_rows = []

if flat_records:
    # Get best flat by accuracy
    best_flat = df_flat.iloc[0]
    best_rows.append({
        'Model Type': 'Flat',
        'Architecture': best_flat['Architecture'],
        'Accuracy': best_flat['Accuracy'],
        'F1-Weighted': best_flat['F1-Weighted'],
    })

if hier_records:
    # Get best hier by accuracy
    best_hier = df_hier.iloc[0]
    best_rows.append({
        'Model Type': 'Hierarchical',
        'Architecture': best_hier['Architecture'],
        'Accuracy': best_hier['End-to-End Acc'],
        'F1-Weighted': best_hier['End-to-End F1'],
    })

if best_rows:
    df_comp = pd.DataFrame(best_rows).set_index('Model Type')
    print("\n=== TABLE 3: FINAL CHAMPIONSHIP (Best vs Best) ===")
    # Limit the float format to the numeric columns: 'Architecture' holds
    # strings, and applying '{:.4f}' to it raises ValueError on render.
    display(df_comp.style.format('{:.4f}', subset=['Accuracy', 'F1-Weighted']))

    # Bar plot
    plt.figure(figsize=(8, 5))
    # NOTE(review): recent seaborn releases appear to warn when `palette` is
    # passed without `hue` - confirm against the installed version.
    sns.barplot(data=df_comp.reset_index(), x='Model Type', y='Accuracy', palette=['skyblue', 'lightgreen'])
    plt.title('Best Flat vs Best Hierarchical Accuracy')
    plt.ylim(0, 1.05)
    plt.grid(axis='y', alpha=0.3)
    plt.show()
else:
    print("Not enough data for comparison.")


In [None]:
# 4. Visual Comparison (Confusion Matrices)
from IPython.display import Image, display
import matplotlib.pyplot as plt
import matplotlib.image as mpimg


def _draw_confusion_matrix(axis, title, image_path):
    """Draw one confusion-matrix panel on *axis*.

    axis:       matplotlib Axes to draw on.
    title:      panel title.
    image_path: path of the PNG to show, or None when no model of this
                family was found.

    When the image cannot be shown, a placeholder text is drawn instead; a
    warning is printed if a path was given but the file does not exist.
    """
    axis.set_title(title)
    if image_path is None:
        axis.text(0.5, 0.5, 'No model available', ha='center')
        return
    if os.path.exists(image_path):
        axis.imshow(mpimg.imread(image_path))
        axis.axis('off')
        return
    axis.text(0.5, 0.5, 'Image not found', ha='center')
    print(f"Warning: Could not find {image_path}")


if best_rows:
    print("\n=== VISUAL COMPARISON: CONFUSION MATRICES ===")

    # best_rows is non-empty as soon as ONE family has a model, but
    # best_flat / best_hier are only defined when their own record list is
    # non-empty. Resolve each side independently to avoid a NameError.

    # Flat
    if flat_records:
        flat_arch = best_flat['Architecture']
        flat_title = f'Flat Classifier ({flat_arch})'
        flat_cm_path = os.path.join(MODELS_DIR, 'flat-classifiers', flat_arch, 'confusion_matrix.png')
    else:
        flat_title = 'Flat Classifier'
        flat_cm_path = None

    # Hierarchical
    if hier_records:
        hier_arch = best_hier['Architecture']
        hier_title = f'Hierarchical Classifier ({hier_arch})'
        hier_cm_path = os.path.join(MODELS_DIR, f'hierarchical_confusion_matrix_{hier_arch}.png')
    else:
        hier_title = 'Hierarchical Classifier'
        hier_cm_path = None

    # Display side-by-side
    fig, ax = plt.subplots(1, 2, figsize=(20, 10))
    _draw_confusion_matrix(ax[0], flat_title, flat_cm_path)
    _draw_confusion_matrix(ax[1], hier_title, hier_cm_path)

    plt.tight_layout()
    plt.show()
