# Results Analysis & Comparison

This notebook compares the performance of the **Baseline Flat Classifier** (Notebook 02) against the **Hierarchical Classifier** (Notebook 03b End-to-End).

Metrics are loaded directly from the standardized JSON outputs generated during training/evaluation.

In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Root directory that holds the saved model artifacts. It is a relative path,
# so it is resolved against the working directory the notebook runs in.
MODELS_DIR = '../models'


def load_json(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [2]:
# 1. Load Flat Models
# Every sub-directory of <MODELS_DIR>/flat-classifiers that contains a
# metrics.json contributes one row to df_flat.
flat_records = []
flat_dir = os.path.join(MODELS_DIR, 'flat-classifiers')
numeric_cols = ['Accuracy', 'F1-Weighted', 'Precision', 'Recall', 'AUC']


def safe_float(val):
    """Return ``val`` converted to float, or 0.0 when it cannot be converted."""
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; anything else (for example
        # KeyboardInterrupt) propagates instead of being hidden.
        return 0.0


# Optional lookup {architecture name -> parameter count} read from
# architecture_comparison.csv; stays empty when the file is absent or unusable.
param_map = {}
if os.path.isdir(flat_dir):
    csv_path = os.path.join(flat_dir, 'architecture_comparison.csv')
    if os.path.exists(csv_path):
        try:
            df_params = pd.read_csv(csv_path)
            # Normalize column names just in case
            df_params.columns = df_params.columns.str.strip()
            if {'Architecture', 'Parameters'} <= set(df_params.columns):
                param_map = dict(zip(df_params['Architecture'], df_params['Parameters']))
        except Exception as e:
            # Best effort: the parameter counts are decoration, not a metric.
            print(f"Warning: Could not load architecture_comparison.csv: {e}")

    # sorted() makes the processing order (and therefore tie ordering in the
    # table) independent of the filesystem's directory listing order.
    for arch in sorted(os.listdir(flat_dir)):
        metrics_path = os.path.join(flat_dir, arch, 'metrics.json')
        if not os.path.exists(metrics_path):
            continue
        try:
            data = load_json(metrics_path)
            m = data.get('metrics', {})
            model_arch = data.get('architecture', arch)

            flat_records.append({
                'Architecture': model_arch,
                'Accuracy': safe_float(m.get('accuracy', 0)),
                'F1-Weighted': safe_float(m.get('f1_weighted', 0)),
                'Precision': safe_float(m.get('precision_weighted', 0)),
                'Recall': safe_float(m.get('recall_weighted', 0)),
                'AUC': safe_float(m.get('auc_weighted', m.get('auc', 0))),
                # Unknown parameter counts are reported as such rather than
                # filled in with a hard-coded number that would be attributed
                # to every architecture missing from the CSV.
                'Parameters': str(param_map.get(model_arch, 'N/A'))
            })
        except Exception as e:
            print(f"Error processing {metrics_path}: {e}")

print(f"Found {len(flat_records)} flat models.")
if flat_records:
    df_flat = pd.DataFrame(flat_records)
    for col in numeric_cols:
        df_flat[col] = pd.to_numeric(df_flat[col], errors='coerce').fillna(0.0)

    # Best model first; a stable sort keeps equal-accuracy rows in name order.
    df_flat = df_flat.sort_values('Accuracy', ascending=False, kind='mergesort')
    print("\n=== TABLE 1: FLAT CLASSIFIERS (Baseline) ===")
    print("This table ranks standard models trained to predict the 20 fine-grained labels directly.")
    # Only columns that exist in this table get a formatter.
    display(df_flat.style.format(
        {col: '{:.4f}' for col in numeric_cols}
    ).hide(axis="index"))
else:
    print("No flat models found.")
    df_flat = pd.DataFrame(columns=['Architecture', 'Accuracy', 'F1-Weighted', 'Precision', 'Recall', 'AUC', 'Parameters']) # Empty DF for safety

# 2. Load Hierarchical Models (Per-Stage Analysis)
hier_records = []

# Strategy: every directory hierarchical-classifiers/{arch}/ that contains a
# stage2_e2e_metrics.json yields one row. The Stage 1 accuracy is looked up in
# MODELS_DIR/stage1_info_{arch}.json ('test_accuracy') first and, failing
# that, in {arch}/stage1_metrics.json ('metrics' -> 'accuracy').
hier_dir = os.path.join(MODELS_DIR, 'hierarchical-classifiers')
if os.path.isdir(hier_dir):
    # sorted() keeps the row order independent of the directory listing order.
    for arch in sorted(os.listdir(hier_dir)):
        arch_path = os.path.join(hier_dir, arch)
        e2e_metrics_path = os.path.join(arch_path, 'stage2_e2e_metrics.json')
        if not (os.path.isdir(arch_path) and os.path.exists(e2e_metrics_path)):
            continue

        try:
            # Load E2E metrics
            e2e_data = load_json(e2e_metrics_path)
            metrics = e2e_data.get('end_to_end_metrics', {})

            # Try to find Stage 1 info (prefer test split)
            s1_acc = None
            s1_info_path = os.path.join(MODELS_DIR, f"stage1_info_{arch}.json")
            if os.path.exists(s1_info_path):
                s1_acc = load_json(s1_info_path).get('test_accuracy')

            if s1_acc is None:
                # Fallback: metrics file stored inside the architecture dir
                s1_metrics_inner = os.path.join(arch_path, 'stage1_metrics.json')
                if os.path.exists(s1_metrics_inner):
                    s1_acc = load_json(s1_metrics_inner).get('metrics', {}).get('accuracy')

            if s1_acc is None:
                # Unknown accuracy is recorded as NaN: it keeps the column
                # numeric (so '{:.4f}' formatting cannot fail on None) and
                # does not masquerade as a genuine 0.0 score.
                s1_acc = float('nan')

            hier_records.append({
                'Architecture': arch,
                'Stage 1 Acc (Test)': s1_acc,
                'End-to-End Acc (Full System)': metrics.get('accuracy', 0),
                'E2E F1': metrics.get('f1_weighted', metrics.get('f1_score', 0)),
                'E2E Precision': metrics.get('precision_weighted', metrics.get('precision', 0)),
                'E2E Recall': metrics.get('recall_weighted', metrics.get('recall', 0)),
                'E2E AUC': metrics.get('auc_weighted', metrics.get('auc', 0))
            })
        except Exception as e:
            print(f"Error processing hierarchical {arch}: {e}")

print("\n=== TABLE 2: HIERARCHICAL PERFORMANCE BREAKDOWN ===")
if hier_records:
    # Best end-to-end model first; stable sort keeps ties in name order.
    df_hier = pd.DataFrame(hier_records).sort_values(
        'End-to-End Acc (Full System)', ascending=False, kind='mergesort'
    )
    print("Source: Metrics JSONs in ../models/")
    display(df_hier.style.format({
        'Stage 1 Acc (Test)': '{:.4f}',
        'End-to-End Acc (Full System)': '{:.4f}',
        'E2E F1': '{:.4f}',
        'E2E Precision': '{:.4f}',
        'E2E Recall': '{:.4f}',
        'E2E AUC': '{:.4f}'
    }, na_rep='N/A').hide(axis="index"))
else:
    print("No hierarchical metrics found.")
    df_hier = pd.DataFrame(columns=['Architecture', 'Stage 1 Acc (Test)', 'End-to-End Acc (Full System)', 'E2E F1', 'E2E Precision', 'E2E Recall', 'E2E AUC'])

# 3. Final Championship (Best vs Best)
best_rows = []

# df_flat and df_hier are sorted by accuracy in descending order, so row 0 is
# the best model of each family. Both names are always bound (None when the
# family has no models) so later cells can test them safely.
best_flat = df_flat.iloc[0] if not df_flat.empty else None
best_hier = df_hier.iloc[0] if not df_hier.empty else None

if best_flat is not None:
    best_rows.append({
        'Model Type': 'Flat',
        'Architecture': best_flat['Architecture'],
        'Best Accuracy': best_flat['Accuracy'],
        'F1-Score': best_flat['F1-Weighted'],
        'Precision': best_flat['Precision'],
        'Recall': best_flat['Recall'],
        'AUC': best_flat['AUC']
    })

if best_hier is not None:
    best_rows.append({
        'Model Type': 'Hierarchical',
        'Architecture': best_hier['Architecture'],
        'Best Accuracy': best_hier['End-to-End Acc (Full System)'],
        'F1-Score': best_hier['E2E F1'],
        'Precision': best_hier['E2E Precision'],
        'Recall': best_hier['E2E Recall'],
        'AUC': best_hier['E2E AUC']
    })

print("\n=== TABLE 3: THE CHAMPIONSHIP (Head-to-Head) ===")
if best_rows:
    df_comp = pd.DataFrame(best_rows)

    # Accuracy Diff (%) is the difference to the flat baseline expressed in
    # percentage points. It is a plain subtraction, so a baseline accuracy of
    # 0 is perfectly valid; only a missing baseline leaves the column empty.
    flat_acc = best_flat['Accuracy'] if best_flat is not None else float('nan')
    if pd.notna(flat_acc):
        df_comp['Accuracy Diff (%)'] = (df_comp['Best Accuracy'] - flat_acc) * 100
    else:
        df_comp['Accuracy Diff (%)'] = float('nan')

    print("Source Notebook: 04_results_and_analysis.ipynb (Combines the results)")

    display(df_comp.style.format({
        'Best Accuracy': '{:.4f}',
        'F1-Score': '{:.4f}',
        'Precision': '{:.4f}',
        'Recall': '{:.4f}',
        'AUC': '{:.4f}',
        'Accuracy Diff (%)': '{:+.2f}%'
    }, na_rep='N/A').hide(axis="index"))
else:
    print("Not enough data for championship comparison.")

Found 4 flat models.

=== TABLE 1: FLAT CLASSIFIERS (Baseline) ===
This table ranks standard models trained to predict the 20 fine-grained labels directly.


Architecture,Accuracy,F1-Weighted,Precision,Recall,AUC,Parameters
densenet121_3d,0.8158,0.7981,0.8359,0.8158,0.9916,2097460
resnet18_3d,0.8109,0.7944,0.8123,0.8109,0.9893,2097460
efficientnet3d_b0,0.7554,0.7371,0.7814,0.7554,0.99,2097460
enhanced,0.5685,0.5369,0.5906,0.5685,0.9616,2.1M



=== TABLE 2: HIERARCHICAL PERFORMANCE BREAKDOWN ===
Source: Metrics JSONs in ../models/


Architecture,Stage 1 Acc (Region),End-to-End Acc (Full System),E2E F1,E2E Precision,E2E Recall,E2E AUC
resnet18_3d,0.9958,0.7098,0.651,0.6894,0.7098,0.977
enhanced,0.9958,0.6598,0.6233,0.6303,0.6598,0.9527
efficientnet3d_b0,0.993,0.6092,0.543,0.5325,0.6092,0.9422
densenet121_3d,0.9944,0.6038,0.5561,0.5989,0.6038,0.9503



=== TABLE 3: THE CHAMPIONSHIP (Head-to-Head) ===
Source Notebook: 04_results_and_analysis.ipynb (Combines the results)


Model Type,Architecture,Best Accuracy,F1-Score,Precision,Recall,AUC,Accuracy Diff (%)
Flat,densenet121_3d,0.8158,0.7981,0.8359,0.8158,0.9916,+0.00%
Hierarchical,resnet18_3d,0.7098,0.651,0.6894,0.7098,0.977,-10.60%


In [3]:
# 4. Visual Comparison (Confusion Matrices)

def plot_confusion_matrix(y_true, y_pred, title, labels=None, save_path=None):
    """Draw an annotated confusion-matrix heatmap and optionally save it.

    Args:
        y_true: Ground-truth class values, passed to ``confusion_matrix``.
        y_pred: Predicted class values, passed to ``confusion_matrix``.
        title: Text shown as the plot title.
        labels: Optional tick labels for both axes. They are handed to the
            heatmap only, not to ``confusion_matrix`` -- presumably they must
            match the matrix's row/column order and count; verify against the
            file that provides them.
        save_path: When truthy, the figure is also written to this path
            before it is shown.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    cm = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(12, 10))

    # The built-in colorbar is disabled; a custom one is attached below.
    hm = sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='viridis',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
        square=True,
        cbar=False,
        linewidths=0,
        annot_kws={"size": 10}
    )

    # Set the tick-label font size directly instead of re-assigning the
    # existing tick labels to themselves.
    ax.tick_params(axis='both', labelsize=10)

    # Add custom colorbar in its own axes to the right of the heatmap
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.15)
    vmin = cm.min()
    vmax = cm.max()
    cbar = fig.colorbar(
        hm.collections[0],
        cax=cax,
        drawedges=False,
        ticks=np.linspace(vmin, vmax, 11)  # 11 evenly spaced ticks over the count range
    )
    cbar.solids.set_edgecolor("face")
    cbar.solids.set_rasterized(False)
    cbar.set_label("Count", fontsize=12)
    cbar.outline.set_visible(False)
    cbar.ax.tick_params(size=0, labelsize=10)

    ax.set_title(title, fontsize=14, fontweight='bold', pad=10)
    ax.set_xlabel("Predicted Label", fontsize=12, fontweight="bold", labelpad=4)
    ax.set_ylabel("True Label", fontsize=12, fontweight="bold", labelpad=4)

    # Disable rasterization for all artists
    for artist in fig.findobj():
        if hasattr(artist, "set_rasterized"):
            artist.set_rasterized(False)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, bbox_inches="tight", pad_inches=0.05, dpi=150)

    plt.show()


if best_rows:
    print("\n=== VISUAL COMPARISON: CONFUSION MATRICES ===")

    # best_rows is non-empty as soon as ONE model family has results, so each
    # family is checked on its own. best_flat may be unbound (or None) when no
    # flat model was found, hence it is only read when df_flat has rows;
    # best_hier is None when no hierarchical model was found.
    flat_choice = best_flat if not df_flat.empty else None
    hier_choice = best_hier

    # Flat
    if flat_choice is None:
        print("Warning: No flat model available; skipping flat confusion matrix.")
    else:
        flat_arch = flat_choice['Architecture']
        flat_dir_path = os.path.join(MODELS_DIR, 'flat-classifiers', flat_arch)
        flat_predictions_path = os.path.join(flat_dir_path, 'predictions.json')

        # Try to load and plot flat confusion matrix
        if os.path.exists(flat_predictions_path):
            try:
                flat_data = load_json(flat_predictions_path)
                y_true_flat = flat_data.get('y_true', [])
                y_pred_flat = flat_data.get('y_pred', [])
                labels_flat = flat_data.get('labels', None)

                plot_confusion_matrix(
                    y_true_flat,
                    y_pred_flat,
                    f'Flat Classifier ({flat_arch})',
                    labels=labels_flat,
                    save_path=os.path.join(flat_dir_path, 'confusion_matrix.png')
                )
            except Exception as e:
                print(f"Error loading flat predictions: {e}")
        else:
            print(f"Warning: Could not find flat predictions at {flat_predictions_path}")

    # Hierarchical
    if hier_choice is None:
        print("Warning: No hierarchical model available; skipping hierarchical confusion matrix.")
    else:
        hier_arch = hier_choice['Architecture']
        hier_dir_path = os.path.join(MODELS_DIR, 'hierarchical-classifiers', hier_arch)
        hier_predictions_path = os.path.join(hier_dir_path, 'stage2_predictions.json')

        # Try to load and plot hierarchical confusion matrix
        if os.path.exists(hier_predictions_path):
            try:
                hier_data = load_json(hier_predictions_path)
                y_true_hier = hier_data.get('y_true', [])
                y_pred_hier = hier_data.get('y_pred', [])
                labels_hier = hier_data.get('labels', None)

                # Unlike the flat plot, this image is written to MODELS_DIR itself.
                hier_save_path = os.path.join(MODELS_DIR, f'hierarchical_confusion_matrix_{hier_arch}.png')
                plot_confusion_matrix(
                    y_true_hier,
                    y_pred_hier,
                    f'Hierarchical Classifier ({hier_arch})',
                    labels=labels_hier,
                    save_path=hier_save_path
                )
            except Exception as e:
                print(f"Error loading hierarchical predictions: {e}")
        else:
            print(f"Warning: Could not find hierarchical predictions at {hier_predictions_path}")



=== VISUAL COMPARISON: CONFUSION MATRICES ===
