# 06 - Summary Report

This notebook compiles all results and generates the final summary for the project:

**"When Geometry Fails: Stress-Testing Git Re-Basin on Spurious vs Robust Features"**

## Core Hypothesis (Recap):
Permutation alignment (Git Re-Basin) can successfully connect models relying on the same feature type, but fails to connect models with mismatched mechanisms (spurious vs robust), producing measurable loss and semantic barriers.

In [None]:
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Add project root to path (must run before the project-local imports below)
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import (
    get_config, RESULTS_DIR, FIGURES_DIR, METRICS_DIR, CHECKPOINTS_DIR
)
from src.plotting import save_figure

# Fetch the project configuration via get_config() and echo the directory the
# saved results are read from.
# NOTE(review): `config` is not referenced again in the cells visible here --
# confirm whether get_config() is called for a side effect before removing it.
config = get_config()
print(f"Loading results from: {RESULTS_DIR}")

## 1. Load All Results

In [None]:
# Load all saved metrics into `results`, keyed by pipeline stage.
#
# Each source is (key in `results`, path of the JSON file, label used in the log
# line). Later cells test for their key in `results` and skip themselves when it
# is absent, so a missing file is reported here but is not an error.
RESULT_SOURCES = [
    ('training', METRICS_DIR / 'training_summary.json', 'training summary'),
    ('mechanism', METRICS_DIR / 'mechanism_verification.json', 'mechanism verification results'),
    ('rebasin', METRICS_DIR / 'rebasin_results.json', 'rebasin results'),
    ('interpolation', RESULTS_DIR / 'summary.json', 'interpolation summary'),
]

results = {}
for result_key, result_path, label in RESULT_SOURCES:
    if not result_path.exists():
        print(f"[WARNING] Missing {label}: {result_path}")
        continue
    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(result_path, 'r', encoding='utf-8') as f:
        results[result_key] = json.load(f)
    print(f"Loaded {label}")

print(f"\nLoaded {len(results)} result files")

## 2. Model Performance Summary

In [None]:
if 'mechanism' not in results:
    print("[WARNING] Mechanism verification results not found")
else:
    srs_results = results['mechanism']['srs_results']

    # Build one table row per model that has an entry in the SRS results;
    # models without an entry are simply left out of the table.
    performance_data = []
    for name in ('A1', 'A2', 'R1', 'R2'):
        if name not in srs_results:
            continue
        stats = srs_results[name]
        # Models whose name starts with "A" are labelled spurious, the rest robust.
        kind = 'Spurious' if name.startswith('A') else 'Robust'
        row = {'Model': name, 'Type': kind}
        row['ID Acc (%)'] = "{:.1f}".format(stats['id_accuracy'] * 100)
        row['OOD Acc (%)'] = "{:.1f}".format(stats['ood_accuracy'] * 100)
        row['OOD Drop (%)'] = "{:.1f}".format(stats['ood_drop'] * 100)
        row['SRS'] = "{:.4f}".format(stats['spurious_reliance_score'])
        performance_data.append(row)

    df_performance = pd.DataFrame(performance_data)

    rule = "=" * 70
    print("\n" + rule)
    print("MODEL PERFORMANCE SUMMARY")
    print(rule)
    print(df_performance.to_string(index=False))

## 3. Git Re-Basin Effectiveness

In [None]:
if 'rebasin' not in results:
    print("[WARNING] Rebasin results not found")
else:
    comparison = results['rebasin']['comparison']

    # One formatted row per model pair, in the order the pairs were saved.
    rebasin_data = [
        {
            'Pair': pair,
            'Type': stats['type'],
            'Pre Cosine Sim': format(stats['pre_cosine_sim'], '.4f'),
            'Post Cosine Sim': format(stats['post_cosine_sim'], '.4f'),
            'Change': format(stats['cosine_sim_change'], '+.4f'),
            'Pre Agreement (%)': format(stats['pre_agreement'] * 100, '.1f'),
            'Post Agreement (%)': format(stats['post_agreement'] * 100, '.1f'),
        }
        for pair, stats in comparison.items()
    ]
    df_rebasin = pd.DataFrame(rebasin_data)

    rule = "=" * 90
    print("\n" + rule)
    print("GIT RE-BASIN EFFECTIVENESS")
    print(rule)
    print(df_rebasin.to_string(index=False))

## 4. Barrier Analysis

In [None]:
# Tabulate pre/post-rebasin loss barriers per pair, then compare the
# same-mechanism pairs (A1-A2, R1-R2) against the mixed pair (A1-R1).
if 'interpolation' in results and 'barrier_comparison' in results['interpolation']:
    barriers = results['interpolation']['barrier_comparison']

    barrier_data = []
    for pair_name, data in barriers.items():
        barrier_data.append({
            'Pair': pair_name,
            'Type': data['type'],
            'Pre ID Barrier': f"{data['pre_id_loss_barrier']:.4f}",
            # Post-rebasin values may be absent; show them as "nan" in the table.
            'Post ID Barrier': f"{data.get('post_id_loss_barrier', float('nan')):.4f}",
            'Pre OOD Barrier': f"{data['pre_ood_loss_barrier']:.4f}",
            'Post OOD Barrier': f"{data.get('post_ood_loss_barrier', float('nan')):.4f}",
        })

    df_barriers = pd.DataFrame(barrier_data)
    print("\n" + "="*90)
    print("LOSS BARRIER ANALYSIS")
    print("="*90)
    print(df_barriers.to_string(index=False))

    # Compute statistics from the post-rebasin ID barriers that are actually present.
    same_mech = [barriers[p]['post_id_loss_barrier'] for p in ['A1-A2', 'R1-R2']
                 if 'post_id_loss_barrier' in barriers.get(p, {})]
    diff_mech = [barriers[p]['post_id_loss_barrier'] for p in ['A1-R1']
                 if 'post_id_loss_barrier' in barriers.get(p, {})]

    if same_mech and diff_mech:
        same_avg = float(np.mean(same_mech))
        diff_avg = float(np.mean(diff_mech))
        print(f"\nKey Statistics:")
        print(f"  Same-mechanism pairs avg barrier: {same_avg:.4f}")
        print(f"  Diff-mechanism pair barrier:      {diff_avg:.4f}")
        # A ratio is only meaningful for a positive denominator; dividing by a
        # zero barrier would print inf/nan (same guard as the summary figure).
        if same_avg > 0:
            print(f"  Ratio (diff/same):                {diff_avg/same_avg:.2f}x")
        else:
            print(f"  Ratio (diff/same):                n/a (same-mechanism barrier is not positive)")
    else:
        print("\n[WARNING] Post-rebasin ID barriers missing for A1-A2/R1-R2 or A1-R1; key statistics skipped")
else:
    print("[WARNING] Interpolation results not found")

## 5. Semantic Barrier (SRS Variation)

In [None]:
if not ('interpolation' in results and 'srs_interpolation' in results['interpolation']):
    print("[WARNING] SRS interpolation results not found")
else:
    srs_interp = results['interpolation']['srs_interpolation']

    rule = "=" * 70
    print("\n" + rule)
    print("SEMANTIC BARRIER ANALYSIS (A1-R1 Interpolation)")
    print(rule)
    print("\nSRS at endpoints:")

    # First saved value is reported as the R1 (alpha=0) end, last as the A1 (alpha=1) end.
    srs_vals = srs_interp['srs_values']
    srs_at_a1 = srs_vals[-1]
    srs_at_r1 = srs_vals[0]
    print(f"  A1 (spurious, alpha=1): {srs_at_a1:.4f}")
    print(f"  R1 (robust, alpha=0):   {srs_at_r1:.4f}")
    print("\nSemantic barrier metric:")
    print(f"  Max SRS variation: {srs_interp['semantic_barrier']:.4f}")
    print(f"  At alpha:          {srs_interp['semantic_barrier_alpha']:.2f}")

    # Plot SRS along the interpolation path, with dashed reference lines at both endpoints.
    fig, ax = plt.subplots(figsize=(10, 6))
    alphas = srs_interp['alphas']

    ax.plot(alphas, srs_vals, 'purple', linewidth=2, marker='o', markersize=8)
    for level, line_color, line_label in (
        (srs_at_r1, 'blue', 'R1 (robust)'),
        (srs_at_a1, 'red', 'A1 (spurious)'),
    ):
        ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5, label=line_label)

    ax.set_xlabel(r'$\alpha$ (0=R1, 1=A1)', fontsize=12)
    ax.set_ylabel('Spurious Reliance Score', fontsize=12)
    ax.set_title('Semantic Barrier: SRS Along A1-R1 Interpolation', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Text labels near each endpoint, nudged inward and away from the curve.
    for text, anchor, text_pos in (
        ('Robust\n(low SRS)', (0, srs_at_r1), (0.15, srs_at_r1 - 0.05)),
        ('Spurious\n(high SRS)', (1, srs_at_a1), (0.85, srs_at_a1 + 0.05)),
    ):
        ax.annotate(text, xy=anchor, xytext=text_pos, fontsize=10, ha='center')

    plt.tight_layout()
    save_figure(fig, 'semantic_barrier_summary')
    plt.show()

## 6. Create Final Summary Figure

In [None]:
# Create comprehensive summary figure (2x2 grid of panels).
#
# Every panel is drawn only from data that is actually present in `results`.
# A panel whose inputs are missing gets a short placeholder message instead of
# being left as an unexplained empty set of axes.


def _show_panel_placeholder(ax, title, message):
    """Give an otherwise empty panel its title and a centred explanatory message."""
    ax.set_title(title)
    ax.text(0.5, 0.5, message, transform=ax.transAxes,
            ha='center', va='center', fontsize=11, color='gray')
    ax.set_xticks([])
    ax.set_yticks([])


def _post_or_pre_barrier(entry):
    """Return a pair's post-rebasin ID loss barrier, or its pre-rebasin one if no post value was saved."""
    if 'post_id_loss_barrier' in entry:
        return entry['post_id_loss_barrier']
    return entry['pre_id_loss_barrier']


fig = plt.figure(figsize=(16, 12))
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
ax4 = fig.add_subplot(2, 2, 4)

TITLE_A = '(A) Model Performance: ID vs OOD'
TITLE_B = '(B) Spurious Reliance Score by Model'
TITLE_C = '(C) Loss Barriers: Pre vs Post Re-Basin'
TITLE_D = '(D) Key Finding: Mechanism Mismatch = Higher Barrier'
TITLE_D_NEUTRAL = '(D) Post-Rebasin Barrier: Same vs Different Mechanism'

# Panels 1 and 2 share the per-model mechanism results. Only models that have an
# entry are plotted, matching the tolerance of the performance table above.
srs = results.get('mechanism', {}).get('srs_results', {})
model_colors = {'A1': '#e74c3c', 'A2': '#e67e22', 'R1': '#3498db', 'R2': '#2ecc71'}
models = [m for m in model_colors if m in srs]

if models:
    x = np.arange(len(models))
    width = 0.35

    # Panel 1: Model performance comparison
    id_accs = [srs[m]['id_accuracy']*100 for m in models]
    ood_accs = [srs[m]['ood_accuracy']*100 for m in models]

    ax1.bar(x - width/2, id_accs, width, label='ID Acc', color='steelblue')
    ax1.bar(x + width/2, ood_accs, width, label='OOD Acc', color='coral')
    ax1.set_xticks(x)
    ax1.set_xticklabels(models)
    ax1.set_ylabel('Accuracy (%)')
    ax1.set_title(TITLE_A)
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # Panel 2: SRS comparison
    srs_scores = [srs[m]['spurious_reliance_score'] for m in models]
    ax2.bar(x, srs_scores, color=[model_colors[m] for m in models])
    ax2.set_xticks(x)
    ax2.set_xticklabels(models)
    ax2.set_ylabel('Spurious Reliance Score')
    ax2.set_title(TITLE_B)
    ax2.grid(True, alpha=0.3, axis='y')

    # Dashed reference line at the mean SRS of the plotted models.
    ax2.axhline(y=np.mean(srs_scores), color='gray', linestyle='--', alpha=0.5)
else:
    _show_panel_placeholder(ax1, TITLE_A, 'Mechanism verification results not found')
    _show_panel_placeholder(ax2, TITLE_B, 'Mechanism verification results not found')

# Panels 3 and 4 share the barrier comparison results.
barriers = results.get('interpolation', {}).get('barrier_comparison', {})
pairs = list(barriers)

# Panel 3: Barrier comparison
if pairs:
    x = np.arange(len(pairs))
    width = 0.35

    # NaN (not 0) for a missing value: that bar is not drawn, so an unmeasured
    # barrier cannot be mistaken for a barrier of zero.
    pre = [barriers[p].get('pre_id_loss_barrier', np.nan) for p in pairs]
    post = [barriers[p].get('post_id_loss_barrier', np.nan) for p in pairs]

    ax3.bar(x - width/2, pre, width, label='Pre-Rebasin', color='salmon')
    ax3.bar(x + width/2, post, width, label='Post-Rebasin', color='steelblue')
    ax3.set_xticks(x)
    ax3.set_xticklabels(pairs)
    ax3.set_ylabel('Loss Barrier')
    ax3.set_title(TITLE_C)
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')
else:
    _show_panel_placeholder(ax3, TITLE_C, 'Interpolation results not found')

# Panel 4: Key finding - barrier ratio
same_pairs = [p for p in ('A1-A2', 'R1-R2') if p in barriers]
if same_pairs and 'A1-R1' in barriers:
    # Post-rebasin barriers, falling back to the pre-rebasin value for a pair
    # that has no post-rebasin measurement.
    same_mech_barriers = [_post_or_pre_barrier(barriers[p]) for p in same_pairs]
    diff_mech_barrier = _post_or_pre_barrier(barriers['A1-R1'])

    categories = ['Same\nMechanism', 'Different\nMechanism']
    values = [float(np.mean(same_mech_barriers)), float(diff_mech_barrier)]
    colors = ['#2ecc71', '#e74c3c']

    bars = ax4.bar(categories, values, color=colors, edgecolor='black', linewidth=2)
    ax4.set_ylabel('Post-Rebasin Loss Barrier')
    # Only state the "higher barrier" finding in the title when the data shows it.
    ax4.set_title(TITLE_D if values[1] > values[0] else TITLE_D_NEUTRAL)
    ax4.grid(True, alpha=0.3, axis='y')

    # Add value labels
    for bar, val in zip(bars, values):
        ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                 f'{val:.4f}', ha='center', va='bottom', fontsize=12, fontweight='bold')

    # Add ratio annotation (skipped when the denominator is not positive)
    if values[0] > 0:
        ratio = values[1] / values[0]
        ax4.text(0.5, max(values) * 0.5, f'Ratio: {ratio:.1f}x', ha='center', fontsize=14,
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
else:
    _show_panel_placeholder(ax4, TITLE_D_NEUTRAL,
                            'Barrier results missing for A1-A2/R1-R2 or A1-R1')

fig.suptitle('When Geometry Fails: Git Re-Basin on Spurious vs Robust Features',
             fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
save_figure(fig, 'final_summary')
plt.show()

## 7. Key Findings for Blog Post

In [None]:
# Assemble the headline findings as text.
#
# Each finding is generated only when the data it needs is present, and the
# wording follows the numbers: when a result contradicts the hypothesis the
# finding says so instead of asserting the expected direction.
print("\n" + "="*70)
print("KEY FINDINGS FOR CLASS BLOG")
print("="*70)

findings = []


def _group_mean(stats, model_names, field):
    """Mean of `field` over the given models' entries in `stats`."""
    return float(np.mean([stats[name][field] for name in model_names]))


def _barrier_after_rebasin(entry):
    """Return a pair's post-rebasin ID loss barrier, or its pre-rebasin one if no post value was saved."""
    if 'post_id_loss_barrier' in entry:
        return entry['post_id_loss_barrier']
    return entry['pre_id_loss_barrier']


# Findings 1 and 2 both need the per-model mechanism results for all four models.
srs = results.get('mechanism', {}).get('srs_results', {})
if all(name in srs for name in ('A1', 'A2', 'R1', 'R2')):
    # Finding 1: Spurious models rely on patches
    spurious_srs = _group_mean(srs, ('A1', 'A2'), 'spurious_reliance_score')
    robust_srs = _group_mean(srs, ('R1', 'R2'), 'spurious_reliance_score')

    if spurious_srs > robust_srs > 0:
        srs_claim = (
            f"show {spurious_srs/robust_srs:.1f}x higher Spurious Reliance Score than robust models (R1, R2), "
            f"confirming they learn to rely on the colored patch shortcut."
        )
    elif spurious_srs > robust_srs:
        # A ratio is meaningless when the robust score is zero or negative.
        srs_claim = (
            f"show a higher mean Spurious Reliance Score ({spurious_srs:.4f}) than robust models "
            f"(R1, R2: {robust_srs:.4f}), confirming they learn to rely on the colored patch shortcut."
        )
    else:
        srs_claim = (
            f"do not show a higher mean Spurious Reliance Score ({spurious_srs:.4f}) than robust models "
            f"(R1, R2: {robust_srs:.4f}); shortcut reliance is not supported by these results."
        )
    findings.append(
        f"1. **Spurious Feature Reliance**: Models trained on spurious-aligned data (A1, A2) "
        + srs_claim
    )

    # Finding 2: OOD accuracy gap
    spurious_ood_drop = _group_mean(srs, ('A1', 'A2'), 'ood_drop') * 100
    robust_ood_drop = _group_mean(srs, ('R1', 'R2'), 'ood_drop') * 100

    findings.append(
        f"2. **OOD Generalization Gap**: Spurious models suffer {spurious_ood_drop:.1f}% accuracy drop "
        f"when patches are removed, while robust models only drop {robust_ood_drop:.1f}%."
    )
elif 'mechanism' in results:
    print("[WARNING] Mechanism results incomplete (need A1, A2, R1, R2); findings 1-2 skipped")

# Finding 3: Rebasin reduces barriers
comp = results.get('rebasin', {}).get('comparison', {})
if comp:
    avg_sim_increase = float(np.mean([comp[p]['cosine_sim_change'] for p in comp]))

    if avg_sim_increase > 0:
        findings.append(
            f"3. **Git Re-Basin Works**: Weight matching increases cosine similarity by "
            f"{avg_sim_increase:+.4f} on average, enabling more meaningful weight interpolation."
        )
    else:
        findings.append(
            f"3. **Git Re-Basin Did Not Increase Similarity**: Weight matching changed cosine similarity by "
            f"{avg_sim_increase:+.4f} on average, so alignment did not bring the models closer in weight space."
        )

# Finding 4: Different mechanisms = higher barriers
barriers = results.get('interpolation', {}).get('barrier_comparison', {})
if all(pair in barriers for pair in ('A1-A2', 'R1-R2', 'A1-R1')):
    same_mech = float(np.mean([_barrier_after_rebasin(barriers[p]) for p in ('A1-A2', 'R1-R2')]))
    diff_mech = float(_barrier_after_rebasin(barriers['A1-R1']))

    if diff_mech > same_mech > 0:
        findings.append(
            f"4. **Geometry Fails for Mechanism Mismatch**: Even after Re-Basin, spurious-robust pairs "
            f"have {diff_mech/same_mech:.1f}x higher loss barriers than same-mechanism pairs, "
            f"indicating that geometric alignment cannot bridge semantic differences."
        )
    elif diff_mech > same_mech:
        # Same-mechanism barrier is zero or negative: report values, not a ratio.
        findings.append(
            f"4. **Geometry Fails for Mechanism Mismatch**: Even after Re-Basin, spurious-robust pairs "
            f"have a higher loss barrier ({diff_mech:.4f}) than same-mechanism pairs ({same_mech:.4f}), "
            f"indicating that geometric alignment cannot bridge semantic differences."
        )
    else:
        findings.append(
            f"4. **No Extra Barrier for Mechanism Mismatch**: After Re-Basin, the spurious-robust pair's "
            f"loss barrier ({diff_mech:.4f}) is not higher than the same-mechanism average ({same_mech:.4f}), "
            f"so these results do not support the mechanism-mismatch hypothesis."
        )
elif barriers:
    print("[WARNING] Barrier results incomplete (need A1-A2, R1-R2, A1-R1); finding 4 skipped")

# Finding 5: Semantic barrier
srs_interp = results.get('interpolation', {}).get('srs_interpolation')
if srs_interp is not None:
    findings.append(
        f"5. **Semantic Barrier Evidence**: Along the A1-R1 interpolation path, SRS varies from "
        f"{srs_interp['srs_values'][0]:.3f} (robust) to {srs_interp['srs_values'][-1]:.3f} (spurious), "
        f"demonstrating that intermediate models inherit inconsistent feature dependencies."
    )

# Print findings
for finding in findings:
    print(f"\n{finding}")

In [None]:
# Qualitative insights appended after the computed findings. These are fixed,
# hand-written statements; nothing here is derived from `results`.
rule = "-" * 70
print("\n" + rule)
print("ADDITIONAL INSIGHTS")
print(rule)

additional = [
    (
        "6. **Same-Mechanism Connectivity**: Models sharing the same feature dependency "
        "(both spurious or both robust) can be smoothly interpolated after Re-Basin, "
        "with minimal loss barriers along the path."
    ),
    (
        "7. **Practical Implication**: Before merging or ensembling models, practitioners should "
        "verify that models rely on similar features. Geometric tools like Re-Basin cannot fix "
        "fundamental differences in what models have learned."
    ),
    (
        "8. **Future Directions**: This work suggests that loss barrier analysis post-Re-Basin "
        "could serve as a diagnostic tool for detecting when models have learned qualitatively "
        "different solutions to the same task."
    ),
]

# Each insight is preceded by a blank line.
for insight in additional:
    print()
    print(insight)

## 8. Export Final Summary

In [None]:
# Compile final summary: static project metadata plus the finding texts
# assembled in the cells above (`findings`, `additional`).
final_report = {
    'project_title': 'When Geometry Fails: Stress-Testing Git Re-Basin on Spurious vs Robust Features',
    'hypothesis': 'Permutation alignment (Git Re-Basin) can successfully connect models relying on the same feature type, but fails to connect models with mismatched mechanisms.',
    'key_findings': findings,
    'additional_insights': additional,
}

# Add numerical results if available. Each section is copied only when its key
# is actually present, so a partially populated metrics file does not abort the export.
mechanism = results.get('mechanism', {})
if 'srs_results' in mechanism:
    final_report['model_performance'] = mechanism['srs_results']
if 'group_statistics' in mechanism:
    final_report['group_statistics'] = mechanism['group_statistics']

barrier_comparison = results.get('interpolation', {}).get('barrier_comparison')
if barrier_comparison is not None:
    final_report['barrier_analysis'] = barrier_comparison

# Save final report (create the output directory if it does not exist yet).
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
report_path = RESULTS_DIR / 'final_report.json'
with open(report_path, 'w', encoding='utf-8') as f:
    # default=str stringifies any value json cannot serialise natively.
    json.dump(final_report, f, indent=2, default=str)

print(f"\nFinal report saved to: {report_path}")

## 9. List All Generated Outputs

In [None]:
# List the files produced by the pipeline, one section per output directory.
print("\n" + "="*70)
print("ALL GENERATED OUTPUTS")
print("="*70)

# (heading printed, directory scanned, filename pattern). The glob patterns are
# non-recursive, so only files directly inside each directory are listed.
OUTPUT_LISTINGS = [
    ("Checkpoints (results/checkpoints/):", CHECKPOINTS_DIR, '*.pt'),
    ("Figures (results/figures/):", FIGURES_DIR, '*.png'),
    ("Metrics (results/metrics/):", METRICS_DIR, '*.json'),
    ("Summary files (results/):", RESULTS_DIR, '*.json'),
]

for heading, directory, pattern in OUTPUT_LISTINGS:
    print(f"\n{heading}")
    matches = sorted(directory.glob(pattern))
    if not matches:
        # Make an empty section explicit rather than printing a bare heading.
        print("  (none)")
    for path in matches:
        print(f"  - {path.name}")

## 10. Conclusion

In [None]:
# Closing banner followed by the fixed conclusion text. The text is static; it
# is not computed from `results`.
rule = "=" * 70
print("\n" + rule)
print("EXPERIMENT COMPLETE")
print(rule)

CONCLUSION_TEXT = """
This experiment demonstrated that:

1. Git Re-Basin (weight matching) successfully aligns models in weight space,
   increasing cosine similarity and reducing pre-rebasin loss barriers.

2. However, when models rely on fundamentally different features (spurious vs
   robust), significant barriers remain even after alignment.

3. The "semantic barrier" - measured by Spurious Reliance Score variation
   along the interpolation path - reveals mechanism mismatch that pure
   geometric methods cannot resolve.

4. This has practical implications for model merging, ensembling, and
   understanding the structure of loss landscapes.

Key Takeaway:
-------------
Geometry (permutation alignment) is necessary but not sufficient for
meaningful model interpolation. Models must also share similar learned
representations and feature dependencies.

"When geometry fails, it's because the models have learned to see
the world in fundamentally different ways."
"""
print(CONCLUSION_TEXT)