# RAG Robustness Experiments - Analysis & Visualization

This notebook loads experiment results and provides comprehensive visualizations of RAG system performance across different noise types and retrieval strategies.

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plot appearance: dark-grid theme drawn with the "husl" colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Bigger default canvas and base font so the charts stay readable
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

## Load Experiment Results

In [None]:
# Load the experiment results file.
# The rest of this notebook reads the 'metadata', 'summary', 'examples' and
# 'detailed_results' entries of the loaded object.
results_path = '../results/experiment_results.json'

# Decode explicitly as UTF-8. Without an encoding, open() falls back to the
# platform's locale encoding, which can garble or reject non-ASCII text
# (e.g. in stored queries and answers) on some systems.
with open(results_path, 'r', encoding='utf-8') as f:
    results = json.load(f)

# Quick confirmation of what was loaded
print(f"Loaded results from: {results_path}")
print(f"Timestamp: {results['metadata']['timestamp']}")
print(f"\nNoise types: {results['metadata']['noise_types']}")
print(f"Retrieval strategies: {results['metadata']['retrieval_strategies']}")

## Overall Performance Summary

In [None]:
# `summary` is reused by the cells below; `overall` holds the run-wide metrics
summary = results['summary']
overall = summary['overall']
banner = "=" * 80

print(banner)
print("OVERALL PERFORMANCE SUMMARY")
print(banner)
print(f"\nTotal Queries: {overall['total_queries']}")
print(f"Average Precision@5: {overall['avg_precision_at_5']:.3f}")
print(f"Standard Deviation: {overall['std_precision_at_5']:.3f}")

## 1. Heatmap: Performance Across Noise Types and Strategies

In [None]:
# The heatmap axes come straight from the run metadata
noise_types = results['metadata']['noise_types']
strategies = results['metadata']['retrieval_strategies']

by_noise = summary['by_noise_type']

# Build the (noise type x strategy) matrix of average precision@5 scores.
# A combination with no entry in the summary is recorded as NaN rather than
# 0.0: seaborn leaves NaN cells blank, so "not evaluated" can no longer be
# mistaken for a genuine precision of zero.
heatmap_data = []

for noise_type in noise_types:
    row = []
    for strategy in strategies:
        if noise_type in by_noise and strategy in by_noise[noise_type]['strategies']:
            row.append(by_noise[noise_type]['strategies'][strategy]['avg_precision_at_5'])
        else:
            row.append(np.nan)
    heatmap_data.append(row)

# Wrap in a DataFrame with human-readable labels
# (e.g. 'context_dependent' -> 'Context Dependent'; a '_retrieval' suffix is dropped)
df_heatmap = pd.DataFrame(
    heatmap_data,
    index=[nt.replace('_', ' ').title() for nt in noise_types],
    columns=[s.replace('_retrieval', '').replace('_', ' ').title() for s in strategies]
)

# Draw the heatmap; the colour scale is pinned to [0, 1] so colours are
# comparable between runs
plt.figure(figsize=(14, 8))
sns.heatmap(
    df_heatmap,
    annot=True,
    fmt='.3f',
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    cbar_kws={'label': 'Precision@5'},
    linewidths=0.5
)
plt.title('RAG Performance Heatmap: Precision@5 by Noise Type and Strategy', fontsize=16, pad=20)
plt.xlabel('Retrieval Strategy', fontsize=13)
plt.ylabel('Noise Type', fontsize=13)
plt.tight_layout()
plt.savefig('../results/heatmap_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print("Heatmap saved to: results/heatmap_performance.png")

## 2. Bar Chart: Strategy Comparison

In [None]:
# Collect the mean and standard deviation of precision@5 for every strategy
# that has an entry in the per-strategy summary; strategies without one are
# left out of the chart
by_strategy = summary['by_strategy']
strategy_names = []
strategy_scores = []
strategy_stds = []

for strategy in strategies:
    if strategy in by_strategy:
        strategy_names.append(strategy.replace('_retrieval', '').replace('_', ' ').title())
        strategy_scores.append(by_strategy[strategy]['avg_precision_at_5'])
        strategy_stds.append(by_strategy[strategy]['std_precision_at_5'])

# One bar per strategy, with the standard deviation as the error bar
fig, ax = plt.subplots(figsize=(12, 7))
x_pos = np.arange(len(strategy_names))
bars = ax.bar(x_pos, strategy_scores, yerr=strategy_stds, capsize=5, alpha=0.8, edgecolor='black')

# Colour each bar by its score (red = low, green = high).
# set_facecolor is used instead of set_color: set_color also overwrites the
# edge colour, which would silently discard the black outline requested above.
colors = plt.cm.RdYlGn(np.array(strategy_scores))
for bar, color in zip(bars, colors):
    bar.set_facecolor(color)

# Print the score just above the top of each error bar
for i, (score, std) in enumerate(zip(strategy_scores, strategy_stds)):
    ax.text(i, score + std + 0.02, f'{score:.3f}', ha='center', va='bottom', fontsize=11, fontweight='bold')

ax.set_xlabel('Retrieval Strategy', fontsize=13)
ax.set_ylabel('Average Precision@5', fontsize=13)
ax.set_title('Retrieval Strategy Performance Comparison\n(averaged across all noise types)', fontsize=16, pad=20)
ax.set_xticks(x_pos)
ax.set_xticklabels(strategy_names, rotation=15, ha='right')
ax.set_ylim(0, 1.0)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('../results/strategy_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Bar chart saved to: results/strategy_comparison.png")

## 3. Grouped Bar Chart: Performance by Noise Type

In [None]:
# Grouped bar chart: one group per noise type, one bar per strategy
fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(noise_types))

# Size the bars from the number of strategies so that each group always fills
# 80% of its slot. With four strategies this yields the width of 0.2 that was
# previously hard-coded; with any other count the bars no longer overlap the
# neighbouring group or drift away from their tick label.
n_strategies = len(strategies)
group_width = 0.8
width = group_width / max(n_strategies, 1)

by_noise = summary['by_noise_type']

for position, strategy in enumerate(strategies):
    # Score of this strategy under each noise type; a combination with no
    # entry in the summary is drawn as a zero-height bar
    scores = []
    for noise_type in noise_types:
        if noise_type in by_noise and strategy in by_noise[noise_type]['strategies']:
            scores.append(by_noise[noise_type]['strategies'][strategy]['avg_precision_at_5'])
        else:
            scores.append(0.0)

    # Shift each strategy's bars one bar-width to the right of the previous one
    ax.bar(x + width * position, scores, width,
           label=strategy.replace('_retrieval', '').replace('_', ' ').title())

ax.set_xlabel('Noise Type', fontsize=13)
ax.set_ylabel('Precision@5', fontsize=13)
ax.set_title('Strategy Performance Across Different Noise Types', fontsize=16, pad=20)
# Centre each tick under its group: halfway between the first and last bar
ax.set_xticks(x + width * max(n_strategies - 1, 0) / 2)
ax.set_xticklabels([nt.replace('_', ' ').title() for nt in noise_types], rotation=15, ha='right')
ax.legend(loc='upper right', framealpha=0.9)
ax.set_ylim(0, 1.0)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('../results/noise_type_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Grouped bar chart saved to: results/noise_type_comparison.png")

## 4. Detailed Statistics Table

In [None]:
# Per-strategy statistics table: one row for each strategy that appears in
# the per-strategy summary, with metrics rendered to three decimals
three_dp = "{:.3f}".format
stats_data = []

for strategy in strategies:
    if strategy not in summary['by_strategy']:
        continue

    stats = summary['by_strategy'][strategy]
    row = {
        'Strategy': strategy.replace('_retrieval', '').replace('_', ' ').title(),
        'Queries': stats['num_queries'],
        'Avg P@5': three_dp(stats['avg_precision_at_5']),
        'Std Dev': three_dp(stats['std_precision_at_5']),
        'Min': three_dp(stats['min_precision_at_5']),
        'Max': three_dp(stats['max_precision_at_5']),
    }
    stats_data.append(row)

df_stats = pd.DataFrame(stats_data)
print("\nDetailed Strategy Statistics:")
print("=" * 80)
display(df_stats)

## 5. Example Failures for Each Noise Type

In [None]:
# Print up to two recorded failure examples per strategy, grouped by noise type
print("\n" + "=" * 80)
print("EXAMPLE FAILURES BY NOISE TYPE")
print("=" * 80 + "\n")

heavy_rule = '─' * 80
max_examples = 2  # examples shown per strategy

for noise_type in noise_types:
    # Nothing to show when the noise type has no (or an empty) examples entry
    if noise_type not in results['examples'] or not results['examples'][noise_type]:
        continue

    print(f"\n{heavy_rule}")
    print(f"NOISE TYPE: {noise_type.upper().replace('_', ' ')}")
    print(f"{heavy_rule}\n")

    for strategy, failures in results['examples'][noise_type].items():
        if not failures:
            continue

        strategy_label = strategy.replace('_retrieval', '').replace('_', ' ').title()
        print(f"\nStrategy: {strategy_label}")
        print("-" * 80)

        for i, failure in enumerate(failures[:max_examples], 1):
            print(f"\nExample {i}:")
            print(f"  Query ID: {failure['query_id']}")
            print(f"  Query: {failure['query']}")
            # Show the original wording only when the stored query differs from it
            if failure['query'] != failure['original_query']:
                print(f"  Original: {failure['original_query']}")
            print(f"  Query Type: {failure['query_type']}")

            # Precision@5 falls back to 0.0 when the 'p@5' entry is absent
            p_at_5 = failure['evaluation']['precision'].get('p@5', {})
            precision = p_at_5.get('precision_at_k', 0.0)
            print(f"  Precision@5: {precision:.3f}")

            # Long text is cut to a fixed-length preview
            answer_preview = failure['generated_answer'][:200]
            print(f"  Generated Answer: {answer_preview}...")

            passages = failure['retrieved_passages']
            if passages:
                top_text = passages[0]['text'][:150]
                print(f"  Top Retrieved Passage: {top_text}...")

            print()

## 6. Performance Degradation Analysis

In [None]:
# Compare every noisy condition against the 'clean' baseline, per strategy
print("\n" + "=" * 80)
print("PERFORMANCE DEGRADATION FROM CLEAN BASELINE")
print("=" * 80 + "\n")

by_noise = summary['by_noise_type']

# Every noise type of the run except the baseline itself. Taken from the run
# metadata (as in the other cells) rather than a fixed list, so a noise type
# added to the experiment is not silently left out of this analysis.
noisy_types = [nt for nt in noise_types if nt != 'clean']

degradation_data = []

for strategy in strategies:
    strategy_name = strategy.replace('_retrieval', '').replace('_', ' ').title()

    # Without a clean baseline for this strategy there is nothing to compare to
    if 'clean' not in by_noise or strategy not in by_noise['clean']['strategies']:
        continue

    baseline = by_noise['clean']['strategies'][strategy]['avg_precision_at_5']

    print(f"\n{strategy_name}:")
    print(f"  Baseline (clean): {baseline:.3f}")
    print(f"  Performance with noise:")

    for noise_type in noisy_types:
        if noise_type not in by_noise or strategy not in by_noise[noise_type]['strategies']:
            continue

        score = by_noise[noise_type]['strategies'][strategy]['avg_precision_at_5']
        # Relative drop in percent; positive means worse than the baseline.
        # A zero baseline would divide by zero, so it is reported as 0.
        degradation = ((baseline - score) / baseline * 100) if baseline > 0 else 0.0
        noise_label = noise_type.replace('_', ' ').title()

        # Print the signed change relative to the baseline. A hard-coded '-'
        # in front of the number rendered an improvement as e.g. "--5.0%".
        print(f"    {noise_label:20s}: {score:.3f} ({-degradation:+.1f}%)")

        degradation_data.append({
            'Strategy': strategy_name,
            'Noise Type': noise_label,
            'Degradation (%)': degradation
        })

# Visualize degradation: noise types on the x axis, one bar per strategy
if degradation_data:
    df_degradation = pd.DataFrame(degradation_data)
    df_pivot = df_degradation.pivot(index='Noise Type', columns='Strategy', values='Degradation (%)')

    plt.figure(figsize=(12, 7))
    df_pivot.plot(kind='bar', ax=plt.gca(), width=0.8)
    plt.title('Performance Degradation from Clean Baseline', fontsize=16, pad=20)
    plt.xlabel('Noise Type', fontsize=13)
    plt.ylabel('Degradation (%)', fontsize=13)
    plt.legend(title='Strategy', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=15, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('../results/performance_degradation.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nDegradation chart saved to: results/performance_degradation.png")

## 7. Query Type Analysis

In [None]:
# Analyze precision@5 by query type, pooled over all noise types and strategies
print("\n" + "=" * 80)
print("PERFORMANCE BY QUERY TYPE")
print("=" * 80 + "\n")

# query type -> list of precision@5 values of every result with that type
query_type_stats = {}

for strategies_dict in results['detailed_results'].values():
    for result_list in strategies_dict.values():
        for result in result_list:
            # Results without a query type are pooled under 'unknown'
            q_type = result.get('query_type', 'unknown')
            # Precision@5 falls back to 0.0 when the 'p@5' entry is absent
            precision = result['evaluation']['precision'].get('p@5', {}).get('precision_at_k', 0.0)
            query_type_stats.setdefault(q_type, []).append(precision)

# Count, mean and standard deviation per query type
query_type_avgs = {}
for q_type, precisions in query_type_stats.items():
    if precisions:
        query_type_avgs[q_type] = {
            'count': len(precisions),
            'avg': np.mean(precisions),
            'std': np.std(precisions)
        }

# Best-performing query types first
sorted_types = sorted(query_type_avgs.items(), key=lambda item: item[1]['avg'], reverse=True)

print(f"{'Query Type':<20} {'Count':<10} {'Avg P@5':<12} {'Std Dev':<10}")
print("-" * 60)
for q_type, type_stats in sorted_types:
    print(f"{q_type:<20} {type_stats['count']:<10} {type_stats['avg']:<12.3f} {type_stats['std']:<10.3f}")

# Visualize: one bar per query type, standard deviation as the error bar
if query_type_avgs:
    fig, ax = plt.subplots(figsize=(12, 7))

    types = [entry[0] for entry in sorted_types]
    avgs = [entry[1]['avg'] for entry in sorted_types]
    stds = [entry[1]['std'] for entry in sorted_types]

    bars = ax.bar(range(len(types)), avgs, yerr=stds, capsize=5, alpha=0.8, edgecolor='black')

    # Colour each bar by its mean (red = low, green = high).
    # set_facecolor is used instead of set_color: set_color also overwrites
    # the edge colour, which would discard the black outline requested above.
    colors = plt.cm.RdYlGn(np.array(avgs))
    for bar, color in zip(bars, colors):
        bar.set_facecolor(color)

    ax.set_xlabel('Query Type', fontsize=13)
    ax.set_ylabel('Average Precision@5', fontsize=13)
    ax.set_title('Performance by Query Type\n(averaged across all strategies and noise types)', fontsize=16, pad=20)
    ax.set_xticks(range(len(types)))
    ax.set_xticklabels(types, rotation=45, ha='right')
    ax.set_ylim(0, 1.0)
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('../results/query_type_performance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nQuery type chart saved to: results/query_type_performance.png")

## Summary

This notebook provides a comprehensive analysis of RAG system robustness across different noise types and retrieval strategies. The key analyses and visualizations are:

1. **Heatmap** - Overall performance matrix
2. **Bar Charts** - Strategy and noise type comparisons
3. **Statistics Table** - Per-strategy query counts and mean, standard deviation, minimum, and maximum Precision@5
4. **Example Failures** - Specific cases where the system struggled
5. **Degradation Analysis** - How performance degrades with different noise types
6. **Query Type Analysis** - Performance across different question types

All charts are saved as PNG files in the `results/` directory (`../results/` relative to this notebook).