In [6]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib.gridspec import GridSpec

# Notebook-wide Matplotlib defaults: font, render resolution, and no
# top/right axis spines on any figure created below.
for rc_key, rc_value in (
    ('font.family', 'DejaVu Sans'),
    ('figure.dpi', 150),
    ('axes.spines.top', False),
    ('axes.spines.right', False),
):
    plt.rcParams[rc_key] = rc_value

# ── Load data ──────────────────────────────────────────────────
# Reads the ablation results and flattens each experiment into one row of `df`,
# sorted by faithfulness (best first). Later cells rely on the columns built
# here: rank, label, config and color.
#
# The path is relative, so it is resolved against the kernel's working directory.
METRICS_PATH = 'results/metrics/ablation_final.json'

try:
    # Read as UTF-8 explicitly rather than relying on the platform's locale encoding.
    with open(METRICS_PATH, encoding='utf-8') as metrics_file:
        raw = json.load(metrics_file)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"Ablation metrics not found at '{METRICS_PATH}'. The path is relative to the "
        "kernel's working directory; check where the notebook was started and that the "
        "ablation run has written its results."
    ) from exc

if not raw:
    # Fail here with a clear message instead of a KeyError inside sort_values below.
    raise ValueError(f"'{METRICS_PATH}' contains no experiments; nothing to plot.")

rows = []
for entry in raw:
    cfg = entry['config']
    # Per-mode query counts; a missing mode is treated as zero occurrences.
    failure_modes = entry.get('failure_mode_distribution', {})
    rows.append({
        'exp_id':      cfg['exp_id'],
        'method':      cfg['retrieval_method'].upper(),
        'chunk_size':  cfg['chunk_size'],
        'overlap':     cfg['overlap'],
        'top_k':       cfg['top_k'],
        'faithfulness':      entry['avg_faithfulness'],
        'ctx_relevance':     entry['avg_context_relevance'],
        'ans_relevance':     entry['avg_answer_relevance'],
        'hallucination':     entry['hallucination_rate'],
        'abstention_rate':   entry['honest_abstention_rate'],
        'latency':           entry['avg_latency'],
        'correct':           failure_modes.get('correct', 0),
        'partial_context':   failure_modes.get('partial_context', 0),
        'honest_abstention': failure_modes.get('honest_abstention', 0),
        'total_queries':     entry['total_queries'],
    })

df = pd.DataFrame(rows).sort_values('faithfulness', ascending=False).reset_index(drop=True)
df['rank'] = df.index + 1  # 1-based position after the faithfulness sort

# Cast ids to str so the label concatenation also works for numeric experiment ids.
exp_id_text = df['exp_id'].astype(str)
chunk_text = df['chunk_size'].astype(str)
df['label'] = exp_id_text + '\n' + df['method'] + ' c' + chunk_text
df['config'] = df['method'] + '\nchunk=' + chunk_text

# One fixed colour per retrieval method, shared by every figure.
COLORS = {'BM25': '#1F77B4', 'DENSE': '#2CA02C', 'HYBRID': '#FF7F0E'}
df['color'] = df['method'].map(COLORS)

print(df[['exp_id','method','chunk_size','faithfulness','ctx_relevance','latency','rank']].to_string(index=False))

FileNotFoundError: [Errno 2] No such file or directory: 'results/metrics/ablation_final.json'

## Figure 1 — Leaderboard: Faithfulness & Context Relevance

In [None]:
# Two side-by-side horizontal bar charts (faithfulness, context relevance),
# one bar per experiment, coloured by retrieval method.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Figure 1 — Experiment Leaderboard', fontsize=14, fontweight='bold', y=1.02)

# Reverse once: barh stacks categories bottom-up, so the first row of df
# ends up as the topmost bar.
bottom_up = df.iloc[::-1]

panel_titles = {
    'faithfulness': 'Faithfulness Score',
    'ctx_relevance': 'Context Relevance Score',
}
for ax, (column, axis_title) in zip(axes, panel_titles.items()):
    scores = bottom_up[column]
    mean_score = df[column].mean()

    drawn = ax.barh(bottom_up['label'], scores,
                    color=bottom_up['color'], edgecolor='white', height=0.6)
    # Numeric value just past the end of each bar
    for rect, score in zip(drawn, scores):
        ax.text(score + 0.005, rect.get_y() + rect.get_height()/2,
                f'{score:.3f}', va='center', fontsize=9, fontweight='bold')

    ax.set_xlim(0, 1.12)  # headroom beyond 1.0 for the value labels
    ax.set_xlabel(axis_title, fontsize=11)

    # Dashed reference line at the mean across all experiments
    ax.axvline(x=mean_score, color='gray', linestyle='--', alpha=0.5, linewidth=1)
    ax.text(mean_score+0.005, -0.5, f'avg={mean_score:.3f}',
            color='gray', fontsize=8)

# One legend swatch per retrieval method, shown on the left panel only
method_swatches = []
for method_name, method_color in COLORS.items():
    method_swatches.append(mpatches.Patch(color=method_color, label=method_name))
axes[0].legend(handles=method_swatches, loc='lower right', fontsize=9)

plt.tight_layout()
plt.savefig('results/fig1_leaderboard.png', bbox_inches='tight', dpi=150)
plt.show()
print('Saved: results/fig1_leaderboard.png')


KeyboardInterrupt



## Figure 2 — Chunk Size vs Metrics (per Method)

In [None]:
# One panel per metric; within each panel, one line per retrieval method
# showing how the metric changes with chunk size.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Figure 2 — Chunk Size Effect per Retrieval Method', fontsize=14, fontweight='bold')

metrics3 = [('faithfulness','Faithfulness'), ('ctx_relevance','Context Relevance'), ('latency','Latency (s)')]
# Take the tick positions from the data instead of hard-coding [256, 512],
# so added or changed chunk sizes are still labelled correctly.
chunk_sizes = sorted(df['chunk_size'].unique())

for ax, (metric, ylabel) in zip(axes, metrics3):
    for method, color in COLORS.items():
        sub = df[df['method'] == method].sort_values('chunk_size')
        if sub.empty:
            continue
        # Plot whatever runs exist for this method (a single run renders as one
        # marker) rather than silently dropping methods without exactly two runs.
        ax.plot(sub['chunk_size'], sub[metric], 'o-', color=color,
                label=method, linewidth=2, markersize=8)
        for x_val, y_val in zip(sub['chunk_size'], sub[metric]):
            ax.annotate(f"{y_val:.2f}", (x_val, y_val),
                        textcoords='offset points', xytext=(5, 5), fontsize=8)
    ax.set_xticks(chunk_sizes)
    ax.set_xlabel('Chunk Size', fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.set_title(ylabel, fontsize=11)
    # Skip the legend when nothing was drawn; this avoids Matplotlib's
    # "no artists with labels" warning.
    if ax.get_legend_handles_labels()[0]:
        ax.legend(fontsize=9)
    # The score panels share a common axis range; latency keeps its automatic scale.
    if metric != 'latency':
        ax.set_ylim(0, 1.1)

plt.tight_layout()
plt.savefig('results/fig2_chunksize_effect.png', bbox_inches='tight', dpi=150)
plt.show()
print('Saved: results/fig2_chunksize_effect.png')

## Figure 3 — Heatmap: All Metrics per Experiment

In [None]:
# Heatmap of every metric (columns) for every experiment (rows, in rank order).
fig, ax = plt.subplots(figsize=(10, 5))
fig.suptitle('Figure 3 — Metrics Heatmap (All Experiments)', fontsize=14, fontweight='bold')

hm_cols = ['faithfulness', 'ctx_relevance', 'ans_relevance', 'hallucination', 'abstention_rate']
hm_labels = ['Faithfulness', 'Context\nRelevance', 'Answer\nRelevance', 'Hallucination\nRate', 'Abstention\nRate']
# Columns where a LOWER value is the better outcome (the radar chart inverts
# abstention the same way).
lower_is_better = ['hallucination', 'abstention_rate']
hm_df = df.set_index('exp_id')[hm_cols]

# A single RdYlGn map on the raw values would paint a 0.0 hallucination rate red,
# i.e. as the worst result. Colour a direction-normalised copy instead, so that
# green always means "good", and keep the real values as the cell annotations.
hm_quality = hm_df.copy()
hm_quality[lower_is_better] = 1 - hm_quality[lower_is_better]

sns.heatmap(hm_quality, annot=hm_df.to_numpy(), fmt='.3f', cmap='RdYlGn',
            linewidths=0.5, linecolor='white',
            ax=ax, vmin=0, vmax=1,
            xticklabels=hm_labels, annot_kws={'size': 11, 'weight': 'bold'},
            cbar_kws={'label': 'Quality (green = better; rate columns inverted)'})

ax.set_ylabel('')
# Annotate rank to the left of each row. Positions come from enumerate, so the
# labels line up with the heatmap rows whatever index df carries.
for row_pos, rank in enumerate(df['rank']):
    ax.text(-0.6, row_pos + 0.5, f'#{rank}', va='center', ha='center',
            fontsize=9, color='gray')

plt.tight_layout()
plt.savefig('results/fig3_heatmap.png', bbox_inches='tight', dpi=150)
plt.show()
print('Saved: results/fig3_heatmap.png')

## Figure 4 — Failure Mode Distribution (Stacked Bar)

In [None]:
# Stacked bars: share of queries per outcome category, one bar per experiment.
fig, ax = plt.subplots(figsize=(12, 5))
fig.suptitle('Figure 4 — Failure Mode Distribution per Experiment', fontsize=14, fontweight='bold')

# (df column, legend label, fill colour), listed bottom-to-top in the stack
stack_spec = [
    ('correct',           'Correct',           '#2ECC71'),
    ('partial_context',   'Partial Context',   '#F39C12'),
    ('honest_abstention', 'Honest Abstention', '#95A5A6'),
]

positions = np.arange(len(df))
bar_width = 0.5
n_queries = df['total_queries']

segments = []
counts_below = 0  # running query count of the segments already stacked
for column, legend_label, fill in stack_spec:
    segments.append(
        ax.bar(positions, df[column]/n_queries*100, bar_width,
               label=legend_label, color=fill, edgecolor='white',
               bottom=counts_below/n_queries*100)
    )
    counts_below = counts_below + df[column]

ax.set_xticks(positions)
ax.set_xticklabels(df['config'], fontsize=9)
ax.set_ylabel('Percentage of Queries (%)', fontsize=11)
ax.set_ylim(0, 115)  # room above 100% for the legend
ax.legend(fontsize=10, loc='upper right')
ax.axhline(y=100, color='black', linestyle=':', alpha=0.3)

# Percentage label centred in every segment taller than 5 points
for rect in (r for segment in segments for r in segment):
    h = rect.get_height()
    if h > 5:
        ax.text(rect.get_x() + rect.get_width()/2., rect.get_y() + h/2,
                f'{h:.0f}%', ha='center', va='center', fontsize=8, fontweight='bold', color='white')

plt.tight_layout()
plt.savefig('results/fig4_failure_modes.png', bbox_inches='tight', dpi=150)
plt.show()
print('Saved: results/fig4_failure_modes.png')

## Figure 5 — Quality vs Latency Trade-off (Bubble Chart)

In [2]:
# Bubble chart: x = average latency, y = faithfulness, bubble area scaled by
# context relevance, colour = retrieval method.
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle('Figure 5 — Quality vs Latency Trade-off', fontsize=14, fontweight='bold')

# Computed once; used for both the reference lines and their legend entries.
mean_faith = df['faithfulness'].mean()
mean_latency = df['latency'].mean()

for row in df.itertuples(index=False):
    # The +0.1 offset keeps a bubble visible when context relevance is 0.
    size = (row.ctx_relevance + 0.1) * 800
    ax.scatter(row.latency, row.faithfulness,
               s=size, color=COLORS[row.method],
               alpha=0.75, edgecolors='white', linewidth=1.5, zorder=3)
    ax.annotate(f"{row.exp_id}\n{row.method} c{row.chunk_size}",
                (row.latency, row.faithfulness),
                xytext=(8, 4), textcoords='offset points', fontsize=8)

ax.set_xlabel('Average Latency (seconds)', fontsize=12)
ax.set_ylabel('Faithfulness Score', fontsize=12)
# Keep the 0.5 floor for the usual case, but lower it when an experiment scores
# below it; a fixed floor would silently clip that bubble out of the plot.
y_floor = min(0.5, df['faithfulness'].min() - 0.05)
ax.set_ylim(y_floor, 1.1)
# No label= on the reference lines: the legend below is built from explicit handles.
ax.axhline(y=mean_faith, color='gray', linestyle='--', alpha=0.4)
ax.axvline(x=mean_latency, color='gray', linestyle=':', alpha=0.4)

method_handles = [mpatches.Patch(color=v, label=k) for k, v in COLORS.items()]
mean_handles = [
    plt.Line2D([0], [0], color='gray', linestyle='--', label=f'avg faith={mean_faith:.3f}'),
    plt.Line2D([0], [0], color='gray', linestyle=':', label=f'avg lat={mean_latency:.0f}s'),
]
ax.legend(handles=method_handles + mean_handles, fontsize=9)
ax.text(0.02, 0.97, 'Bubble size = Context Relevance', transform=ax.transAxes,
        fontsize=8, color='gray', va='top')

plt.tight_layout()
plt.savefig('results/fig5_quality_latency.png', bbox_inches='tight', dpi=150)
plt.show()
print('Saved: results/fig5_quality_latency.png')

NameError: name 'plt' is not defined

## Figure 6 — Method Comparison (Radar / Spider Chart)

In [None]:
# NOTE(review): FancyArrowPatch is not referenced anywhere in this cell; the import
# is kept only in case another cell depends on it. Confirm, then drop it or move
# it to the import cell.
from matplotlib.patches import FancyArrowPatch

# Radar chart comparing retrieval methods on five axes, each averaged over all
# of that method's experiments.
radar_metrics = ['faithfulness', 'ctx_relevance', 'ans_relevance', 'abstention_rate']

# Average per method. The per-experiment correct rate is computed as a column first
# and averaged with the other metrics; this gives the same values as a
# groupby().apply() on the whole frame, without the pandas deprecation warning
# about apply operating on the grouping column.
method_avg = (
    df.assign(correct_rate=df['correct'] / df['total_queries'])
      .groupby('method')[radar_metrics + ['correct_rate']]
      .mean()
)

categories = ['Faithfulness', 'Ctx Relevance', 'Ans Relevance', 'Correct Rate', 'Low Abstention']
methods = method_avg.index.tolist()

# Values per method, in the same order as `categories`
values_dict = {}
for m in methods:
    r = method_avg.loc[m]
    values_dict[m] = [
        r['faithfulness'],
        r['ctx_relevance'],
        r['ans_relevance'],
        r['correct_rate'],
        1 - r['abstention_rate'],   # invert: lower abstention = better
    ]

N = len(categories)
angles = np.linspace(0, 2*np.pi, N, endpoint=False).tolist()
angles += angles[:1]  # repeat the first angle so each polygon closes

fig, ax = plt.subplots(figsize=(7, 7), subplot_kw=dict(polar=True))
fig.suptitle('Figure 6 — Method Comparison (Radar)', fontsize=14, fontweight='bold')

for method in methods:
    # Repeat the first value to match the closed angle list
    vals = values_dict[method] + values_dict[method][:1]
    ax.plot(angles, vals, 'o-', linewidth=2, label=method, color=COLORS[method])
    ax.fill(angles, vals, alpha=0.1, color=COLORS[method])

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=10)
ax.set_ylim(0, 1)
ax.set_yticks([0.25, 0.5, 0.75, 1.0])
ax.set_yticklabels(['0.25','0.50','0.75','1.00'], fontsize=7, color='gray')
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('results/fig6_radar.png', bbox_inches='tight', dpi=150)
plt.show()
print('Saved: results/fig6_radar.png')

## Summary Table — Final Rankings

In [3]:
# Plain-text ranking table followed by headline findings. The checkable findings
# are derived from `df`, so they cannot drift out of sync with the loaded results.
rule = '=' * 75
print(rule)
print('  FINAL ABLATION SUMMARY')
print(rule)
print(f'{"Rank":<5} {"Exp":<8} {"Method":<8} {"Chunk":<6} {"Faith":>8} {"CtxRel":>8} {"AnsRel":>8} {"Lat(s)":>8}')
print('-' * 75)
for r in df.itertuples(index=False):
    print(f'{r.rank:<5} {r.exp_id:<8} {r.method:<8} {r.chunk_size:<6} {r.faithfulness:>8.3f} {r.ctx_relevance:>8.3f} {r.ans_relevance:>8.3f} {r.latency:>8.1f}')
print(rule)

# df is sorted by faithfulness (descending), so the first row is the winner.
best = df.iloc[0]
print(f'\nWINNER: {best["exp_id"]} — {best["method"]} chunk={best["chunk_size"]}')

# Only claim "zero hallucinations" when the data actually says so.
halluc_max = df['hallucination'].max()
if halluc_max == 0:
    print('ALL HALLUCINATION RATES: 0.000 (zero hallucinations across all experiments)')
else:
    n_nonzero = int((df['hallucination'] > 0).sum())
    print(f'HALLUCINATION RATES: up to {halluc_max:.3f} ({n_nonzero} of {len(df)} experiments above zero)')

# Mention ties so "wins" is not overstated when several runs share the top score.
n_tied = int((df['faithfulness'] == best['faithfulness']).sum()) - 1
tie_note = f', tied with {n_tied} other run(s)' if n_tied > 0 else ''
print(f'\nKey insight: {best["method"]} chunk={best["chunk_size"]} wins on faithfulness ({best["faithfulness"]:.3f}{tie_note})')

# NOTE(review): this is an interpretation and is not checked against df;
# re-confirm it whenever the ablation results are regenerated.
print('Key insight: Hybrid chunk=256 underperforms — RRF hurts with small chunks')

# Compare at the printed precision (3 decimals) to decide whether it is a plateau.
ans_min = df['ans_relevance'].min()
ans_max = df['ans_relevance'].max()
if round(ans_min, 3) == round(ans_max, 3):
    print(f'Key insight: All methods plateau at ans_relevance={ans_max:.3f} — generation stable')
else:
    print(f'Key insight: ans_relevance ranges from {ans_min:.3f} to {ans_max:.3f} across experiments')

  FINAL ABLATION SUMMARY
Rank  Exp      Method   Chunk     Faith   CtxRel   AnsRel   Lat(s)
---------------------------------------------------------------------------


NameError: name 'df' is not defined