# Multi-Model Family Comparison Analysis

This notebook analyzes and visualizes results from the multi-model family experiments:
- **3 Model Families**: Llama, Qwen, Mistral
- **2 Models per Family**: Small (1B-1.5B) and Large (7B-8B)
- **2 Datasets**: TriviaQA and SQuAD

## Analysis Dimensions:
1. Model size effect within families
2. Model family comparison at similar sizes
3. Dataset effect (TriviaQA vs SQuAD)
4. Uncertainty calibration (G-NLL AUROC)
5. Accuracy vs uncertainty correlation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json

# Notebook-wide plot appearance: seaborn grid theme, then default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

## 1. Load Results

In [None]:
# Load the per-experiment AUROC results (one row per model/dataset run).
# NOTE: the path is relative to the notebook's working directory.
results_dir = Path('../../results/multi_model_auroc')
df = pd.read_csv(results_dir / 'auroc_results.csv')

# Fail early, with a readable message, if the file is unusable for the cells below.
# Every column listed here is indexed by name somewhere later in this notebook.
REQUIRED_COLUMNS = [
    'model_name', 'model_family', 'model_size', 'dataset',
    'G-NLL_AUROC', 'Accuracy', 'Mean_G-NLL', 'Num_examples',
]
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"auroc_results.csv is missing required columns: {missing_columns}")
if df.empty:
    raise ValueError("auroc_results.csv contains no experiment rows")

print(f"Loaded {len(df)} experiment results")
print(f"\nColumns: {df.columns.tolist()}")
print("\nFirst few rows:")
df.head()

## 2. Summary Statistics

In [None]:
def _print_banner(title: str) -> None:
    """Print a section title framed above and below by an 80-character rule."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)


def _summarize_by(frame: pd.DataFrame, key: str) -> pd.DataFrame:
    """Return per-group mean/std of AUROC and accuracy plus the row count.

    Groups `frame` by column `key`, rounds to 4 decimals, and flattens the
    two-level column index into names such as 'G-NLL_AUROC_mean' and
    'model_name_count'.
    """
    stats = frame.groupby(key).agg({
        'G-NLL_AUROC': ['mean', 'std'],
        'Accuracy': ['mean', 'std'],
        'model_name': 'count'
    }).round(4)
    stats.columns = ['_'.join(col) for col in stats.columns]
    return stats


_print_banner("Overall Statistics")
print(df[['G-NLL_AUROC', 'Accuracy', 'Mean_G-NLL']].describe())
print()

# The three tables below are reused by the summary-export cell, so keep their names.
_print_banner("By Model Family")
family_stats = _summarize_by(df, 'model_family')
print(family_stats)
print()

_print_banner("By Model Size")
size_stats = _summarize_by(df, 'model_size')
print(size_stats)
print()

_print_banner("By Dataset")
dataset_stats = _summarize_by(df, 'dataset')
print(dataset_stats)

## 3. Visualization: AUROC Comparison Matrix

In [None]:
# Heatmap input: rows are (family, size) pairs, columns are datasets, cells are mean AUROC
pivot_auroc = df.pivot_table(
    index=['model_family', 'model_size'],
    columns='dataset',
    values='G-NLL_AUROC',
    aggfunc='mean',
)

# Annotated heatmap on an explicit Axes; the color range is pinned to [0.5, 1.0]
fig, ax = plt.subplots(figsize=(10, 8))
heatmap_options = dict(
    annot=True,
    fmt='.4f',
    cmap='RdYlGn',
    vmin=0.5,
    vmax=1.0,
    cbar_kws={'label': 'G-NLL AUROC'},
)
sns.heatmap(pivot_auroc, ax=ax, **heatmap_options)
ax.set_title('G-NLL AUROC: Model Family × Size × Dataset', fontsize=14, weight='bold')
ax.set_xlabel('Dataset', fontsize=12)
ax.set_ylabel('Model (Family, Size)', fontsize=12)
fig.tight_layout()
fig.savefig(results_dir / 'auroc_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Saved: auroc_heatmap.png")

## 4. Model Family Comparison

In [None]:
# Two side-by-side bar charts: mean AUROC per family, and per family split by size
fig, (ax_family, ax_family_size) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: families ordered from highest to lowest mean AUROC
family_auroc = (
    df.groupby('model_family')['G-NLL_AUROC']
    .mean()
    .sort_values(ascending=False)
)
family_auroc.plot.bar(ax=ax_family, color='steelblue')
ax_family.set_title('Average G-NLL AUROC by Model Family', fontsize=12, weight='bold')

# Right panel: one bar per size within each family
family_size_auroc = (
    df.groupby(['model_family', 'model_size'])['G-NLL_AUROC']
    .mean()
    .unstack()
)
family_size_auroc.plot.bar(ax=ax_family_size, width=0.8)
ax_family_size.set_title('G-NLL AUROC by Model Family and Size', fontsize=12, weight='bold')
ax_family_size.legend(title='Size', fontsize=10)

# Axis formatting shared by both panels
for panel in (ax_family, ax_family_size):
    panel.set_xlabel('Model Family', fontsize=11)
    panel.set_ylabel('G-NLL AUROC', fontsize=11)
    panel.set_ylim([0.5, 1.0])
    panel.grid(axis='y', alpha=0.3)

fig.tight_layout()
fig.savefig(results_dir / 'family_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Saved: family_comparison.png")

## 5. Model Size Effect

In [None]:
# Analyze size effect within each family
SIZE_LABELS = ['Small', 'Large']


def _plot_small_vs_large(frame, metric, ax, title, ylabel, **bar_kwargs):
    """Draw grouped Small-vs-Large bars of `metric` per model family on `ax`.

    Returns the family x size pivot table of mean `metric`. If the data does
    not contain both 'Small' and 'Large' size labels, nothing is drawn and a
    warning is printed, so that a blank panel is not saved without explanation.
    Extra keyword arguments are forwarded to the pandas bar plot.
    """
    by_size = frame.pivot_table(
        values=metric,
        index='model_family',
        columns='model_size',
        aggfunc='mean'
    )
    if not set(SIZE_LABELS).issubset(by_size.columns):
        print(f"⚠️ Skipping '{title}': expected model_size values {SIZE_LABELS}, "
              f"found {by_size.columns.tolist()}")
        return by_size

    by_size[SIZE_LABELS].plot(kind='bar', ax=ax, width=0.8, **bar_kwargs)
    ax.set_title(title, fontsize=12, weight='bold')
    ax.set_xlabel('Model Family', fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.legend(fontsize=10)
    ax.grid(axis='y', alpha=0.3)
    return by_size


fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: AUROC, small vs large
size_comparison = _plot_small_vs_large(
    df, 'G-NLL_AUROC', axes[0],
    'AUROC: Small vs Large Models by Family', 'G-NLL AUROC'
)

# Plot 2: Accuracy, small vs large
acc_comparison = _plot_small_vs_large(
    df, 'Accuracy', axes[1],
    'Accuracy: Small vs Large Models by Family', 'Accuracy',
    color=['coral', 'darkred']
)

plt.tight_layout()
plt.savefig(results_dir / 'size_effect.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Saved: size_effect.png")

# Report the per-family AUROC gain from Small to Large (previously computed but never shown)
if set(SIZE_LABELS).issubset(size_comparison.columns):
    size_comparison['Improvement'] = size_comparison['Large'] - size_comparison['Small']
    print("\nAUROC improvement (Large - Small) by family:")
    print(size_comparison['Improvement'].round(4).to_string())

## 6. Dataset Effect (TriviaQA vs SQuAD)

In [None]:
# Per-dataset comparison: one bar group per dataset, one bar per model family
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Mean of each metric per (dataset, family), reshaped to dataset rows x family columns
dataset_family = df.groupby(['dataset', 'model_family'])['G-NLL_AUROC'].mean().unstack()
acc_dataset_family = df.groupby(['dataset', 'model_family'])['Accuracy'].mean().unstack()

# (axes, table, title, y-label) for the left (AUROC) and right (accuracy) panels
panel_specs = [
    (axes[0], dataset_family, 'G-NLL AUROC by Dataset and Model Family', 'G-NLL AUROC'),
    (axes[1], acc_dataset_family, 'Accuracy by Dataset and Model Family', 'Accuracy'),
]
for panel_ax, table, panel_title, y_label in panel_specs:
    table.plot(kind='bar', ax=panel_ax, width=0.8)
    panel_ax.set_title(panel_title, fontsize=12, weight='bold')
    panel_ax.set_xlabel('Dataset', fontsize=11)
    panel_ax.set_ylabel(y_label, fontsize=11)
    panel_ax.legend(title='Model Family', fontsize=10)
    panel_ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
fig.savefig(results_dir / 'dataset_effect.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Saved: dataset_effect.png")

## 7. Accuracy vs Uncertainty Calibration

In [None]:
# Scatter plot: Accuracy vs AUROC, one point per experiment row
fig, ax = plt.subplots(figsize=(12, 8))

# One scatter series per family so each gets its own color and legend entry
# (sort=False keeps families in order of first appearance in the results file)
for family, family_data in df.groupby('model_family', sort=False):
    ax.scatter(family_data['Accuracy'], family_data['G-NLL_AUROC'],
               label=family, s=100, alpha=0.7)

# Label each point as "<family>-<first letter of size>" over "<first 3 chars of dataset>".
# str() guards the slices against non-string cells (e.g. missing values).
for _, row in df.iterrows():
    label = f"{row['model_family']}-{str(row['model_size'])[:1]}\n{str(row['dataset'])[:3]}"
    ax.annotate(label, (row['Accuracy'], row['G-NLL_AUROC']),
                xytext=(5, 5), textcoords='offset points', fontsize=8, alpha=0.7)

ax.set_xlabel('Accuracy', fontsize=12)
ax.set_ylabel('G-NLL AUROC (Uncertainty Calibration)', fontsize=12)
ax.set_title('Accuracy vs Uncertainty Calibration by Model Family', fontsize=14, weight='bold')
ax.legend(title='Model Family', fontsize=10)
ax.grid(alpha=0.3)

# Add the y = x reference line over the range visible on both axes.
# Skip it when the accuracy and AUROC ranges do not overlap: the segment would
# otherwise lie outside the data and stretch the axes, squashing the points.
(x_lo, x_hi), (y_lo, y_hi) = ax.get_xlim(), ax.get_ylim()
diag_lo, diag_hi = max(x_lo, y_lo), min(x_hi, y_hi)
if diag_lo < diag_hi:
    # scalex/scaley=False keeps the current view instead of re-autoscaling around the line
    ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'k--', alpha=0.3, zorder=0,
            scalex=False, scaley=False)

plt.tight_layout()
plt.savefig(results_dir / 'accuracy_vs_auroc.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Saved: accuracy_vs_auroc.png")

## 8. Radar Chart: Model Family Performance Profiles

In [None]:
# Prepare data for radar chart: mean of each metric per model family
# NOTE(review): 'Num_examples' looks like a sample count rather than a performance
# metric — confirm it belongs on a "performance profile" chart.
metrics = ['G-NLL_AUROC', 'Accuracy', 'Num_examples']
family_profiles = df.groupby('model_family')[metrics].mean()

# Min-max normalize each metric to the 0-1 radial range used below.
# A metric with the same value for every family has no spread to normalize; place it
# at the neutral midpoint instead of leaving raw values that fall outside the 0-1 axis.
CONSTANT_METRIC_VALUE = 0.5
family_profiles_norm = family_profiles.copy()
for col in family_profiles.columns:
    min_val = family_profiles[col].min()
    max_val = family_profiles[col].max()
    if max_val > min_val:
        family_profiles_norm[col] = (family_profiles[col] - min_val) / (max_val - min_val)
    elif max_val == min_val:
        family_profiles_norm[col] = CONSTANT_METRIC_VALUE

# Create radar chart
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

# One evenly spaced spoke per metric; repeat the first angle to close each polygon
angles = [n / len(metrics) * 2 * np.pi for n in range(len(metrics))]
angles += angles[:1]

for family in family_profiles_norm.index:
    values = family_profiles_norm.loc[family].tolist()
    values += values[:1]  # close the polygon
    ax.plot(angles, values, 'o-', linewidth=2, label=family)
    ax.fill(angles, values, alpha=0.15)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(metrics)
ax.set_ylim(0, 1)
ax.set_title('Model Family Performance Profiles (Normalized)', fontsize=14, weight='bold', pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
ax.grid(True)

plt.tight_layout()
plt.savefig(results_dir / 'radar_chart.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Saved: radar_chart.png")

## 9. Key Findings Summary

In [None]:
rule = "=" * 80
print(rule)
print("KEY FINDINGS")
print(rule)
print()

# 1. Single experiment row with the highest AUROC (also read by the export cell)
best_model = df.loc[df['G-NLL_AUROC'].idxmax()]
print("1. Best Overall Model (G-NLL AUROC):")
print(f"   {best_model['model_name']} on {best_model['dataset']}")
print(f"   AUROC: {best_model['G-NLL_AUROC']:.4f}, Accuracy: {best_model['Accuracy']:.4f}")
print()

# 2. Family with the highest mean AUROC across all of its runs
family_mean_auroc = df.groupby('model_family')['G-NLL_AUROC'].mean()
best_family = family_mean_auroc.idxmax()
best_family_auroc = family_mean_auroc.max()
print("2. Best Model Family:")
print(f"   {best_family} (Average AUROC: {best_family_auroc:.4f})")
print()

# 3. Mean AUROC of Small vs Large runs; reported only when both labels occur
size_labels = df['model_size'].values
if 'Small' in size_labels and 'Large' in size_labels:
    small_auroc = df.loc[df['model_size'] == 'Small', 'G-NLL_AUROC'].mean()
    large_auroc = df.loc[df['model_size'] == 'Large', 'G-NLL_AUROC'].mean()
    absolute_gain = large_auroc - small_auroc
    relative_gain_pct = (large_auroc/small_auroc - 1)*100
    print("3. Size Effect:")
    print(f"   Small models: {small_auroc:.4f}")
    print(f"   Large models: {large_auroc:.4f}")
    print(f"   Improvement: {absolute_gain:.4f} ({relative_gain_pct:.1f}%)")
    print()

# 4. Mean AUROC per dataset
dataset_auroc = df.groupby('dataset')['G-NLL_AUROC'].mean()
print("4. Dataset Effect:")
for dataset_name, mean_auroc in dataset_auroc.items():
    print(f"   {dataset_name}: {mean_auroc:.4f}")
print()

print(rule)
print("Analysis complete!")
print(rule)

## 10. Export Summary Report

In [None]:
# Assemble the JSON report section by section.
# Relies on best_model, family_stats, size_stats and dataset_stats from earlier cells.
experiment_overview = {
    'total_experiments': len(df),
    'model_families': df['model_family'].unique().tolist(),
    'datasets': df['dataset'].unique().tolist(),
    'model_sizes': df['model_size'].unique().tolist()
}

best_overall = {
    'model': best_model['model_name'],
    'dataset': best_model['dataset'],
    'auroc': float(best_model['G-NLL_AUROC']),
    'accuracy': float(best_model['Accuracy'])
}
mean_auroc_by_family = df.groupby('model_family')['G-NLL_AUROC'].mean().to_dict()

grouped_statistics = {
    'by_family': family_stats.to_dict(),
    'by_size': size_stats.to_dict(),
    'by_dataset': dataset_stats.to_dict()
}

summary = {
    'experiment_overview': experiment_overview,
    'best_performers': {
        'overall': best_overall,
        'by_family': mean_auroc_by_family
    },
    'statistics': grouped_statistics
}

# Write the report next to the figures; default=str stringifies any value json cannot encode
summary_path = results_dir / 'analysis_summary.json'
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2, default=str)

print("✅ Saved: analysis_summary.json")
print("\nAll analysis complete!")