# Agent Comparison Notebook

This notebook provides a comprehensive comparison of all implemented RL algorithms for autonomous parcel routing.

## Algorithms Compared:
1. **Q-Learning**: Classic off-policy temporal difference learning
2. **Double Q-Learning**: Reduces overestimation bias with dual Q-tables
3. **SARSA**: On-policy temporal difference learning
4. **SARSA(λ)**: SARSA with eligibility traces

## Comparison Metrics:
- Learning performance and convergence
- Final policy quality
- Sample efficiency
- Exploration behavior
- Robustness across different scenarios

In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path
from collections import defaultdict
import time

# Add src to path for imports
sys.path.append('../src')

from apr import WarehouseEnv, AgentEvaluator
from apr.agents import create_agent
from apr.train import run_episode

# Set up plotting: apply matplotlib's "seaborn-v0_8" style sheet (the name
# used for the seaborn styles since matplotlib 3.6) and make seaborn's
# "husl" palette the default colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython/Jupyter magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

# Seed NumPy's global random state so repeated runs draw the same numbers.
# NOTE(review): this only covers code that uses the global np.random state;
# confirm the agents/environment do not rely on their own generators.
np.random.seed(42)

## Environment and Agent Setup

In [None]:
# Create the shared environment (seeded with 42) and report its basic dimensions
env = WarehouseEnv(seed=42)
print(f"Environment: {env.n_rows}x{env.n_cols} warehouse")
print(f"Packages to collect: {len(env.packages_remaining)}")
print(f"Max steps per episode: {env.max_steps}")

# Visualize environment
env.reset()
env.render(mode='human')
plt.title('Warehouse Environment Layout')
plt.show()

# Hyperparameters shared by every algorithm so the comparison is like-for-like.
# Each entry below gets its own copy so later tweaks to one algorithm cannot
# leak into the others.
base_params = {'alpha': 0.1, 'gamma': 0.95, 'epsilon': 0.3, 'epsilon_decay': 0.999}

# Algorithm configurations:
#   name        - identifier passed to create_agent
#   params      - keyword arguments forwarded to the agent constructor
#   color       - matplotlib colour used for this algorithm in every plot
#   description - one-line summary printed in reports
algorithms = {
    'Q-Learning': {
        'name': 'q_learning',
        'params': dict(base_params),
        'color': 'skyblue',
        # NOTE(review): the description mentions experience replay; confirm the
        # q_learning agent actually uses it.
        'description': 'Off-policy TD learning with experience replay',
    },
    'Double Q-Learning': {
        'name': 'double_q_learning',
        'params': dict(base_params),
        'color': 'lightgreen',
        'description': 'Dual Q-tables to reduce overestimation bias',
    },
    'SARSA': {
        'name': 'sarsa',
        'params': dict(base_params),
        'color': 'lightcoral',
        'description': 'On-policy TD learning',
    },
    'SARSA(λ)': {
        'name': 'sarsa_lambda',
        'params': {**base_params, 'lambda_': 0.9},
        # 'lightyellow' is almost indistinguishable from the light plot
        # background (especially for the alpha=0.3 raw curves), so use a
        # darker yellow that stays legible.
        'color': 'goldenrod',
        'description': 'SARSA with eligibility traces',
    },
}

print(f"\nAlgorithms to compare: {list(algorithms.keys())}")
for name, config in algorithms.items():
    print(f"  {name}: {config['description']}")

## Training Performance Comparison

In [None]:
def train_agent(agent_name, config, episodes=800, log_interval=100):
    """
    Train a single agent on the module-level ``env`` and collect metrics.

    Args:
        agent_name: Display name used in progress messages.
        config: Algorithm entry providing ``'name'`` (identifier handed to
            ``create_agent``) and ``'params'`` (constructor keyword arguments).
        episodes: Number of training episodes to run.
        log_interval: Print a progress line every this many episodes.

    Returns:
        A dict with the trained ``'agent'``, the per-episode lists
        ``'rewards'``, ``'lengths'``, ``'epsilon_values'`` and
        ``'training_times'``, the wall-clock ``'total_time'`` in seconds, and
        ``'final_performance'`` (mean reward over the last 50 episodes).
    """
    print(f"\n🤖 Training {agent_name}...")

    # Build the agent from its registered name and hyperparameters
    agent = create_agent(
        config['name'],
        env.observation_space,
        env.action_space,
        **config['params']
    )

    # Per-episode histories
    reward_history = []
    length_history = []
    epsilon_history = []
    duration_history = []

    run_started = time.time()

    for episode_number in range(1, episodes + 1):
        tick = time.time()
        episode_reward = run_episode(env, agent, training=True)
        elapsed = time.time() - tick

        reward_history.append(episode_reward)
        # Fall back to 0 when the environment does not expose an episode length
        length_history.append(getattr(env, 'episode_length', 0))
        epsilon_history.append(agent.epsilon)
        duration_history.append(elapsed)

        # Periodic progress report over the most recent log_interval episodes
        if episode_number % log_interval == 0:
            recent_mean = np.mean(reward_history[-log_interval:])
            print(f"  Episode {episode_number:3d}: Avg Reward = {recent_mean:6.1f}, ε = {agent.epsilon:.3f}")

    total_time = time.time() - run_started
    final_performance = np.mean(reward_history[-50:])

    print(f"  ✅ {agent_name} completed in {total_time:.1f}s")
    print(f"     Final performance: {final_performance:.1f} (last 50 episodes)")

    return {
        'agent': agent,
        'rewards': reward_history,
        'lengths': length_history,
        'epsilon_values': epsilon_history,
        'training_times': duration_history,
        'total_time': total_time,
        'final_performance': final_performance,
    }

# Train every configured algorithm in turn (600 episodes each) and keep the
# metrics returned by train_agent, keyed by the algorithm's display name
print("🏁 Starting Algorithm Comparison Training")
print("=" * 50)

training_results = {}
for agent_name in algorithms:
    training_results[agent_name] = train_agent(
        agent_name,
        algorithms[agent_name],
        episodes=600,
    )

print("\n🎉 All training completed!")

## Learning Curves Analysis

In [None]:
# Plot comprehensive learning curves on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax1, ax2, ax3, ax4 = axes.ravel()

# Moving-average settings for the smoothed reward curves
window = 50
kernel = np.ones(window) / window

# 1. Raw learning curves (faint) with a smoothed overlay (bold)
for agent_name, results in training_results.items():
    color = algorithms[agent_name]['color']
    rewards = results['rewards']
    ax1.plot(rewards, alpha=0.3, color=color, linewidth=0.5)

    # 'valid' mode yields len(rewards) - window + 1 points, so the smoothed
    # curve is drawn starting at episode index window - 1
    smoothed = np.convolve(rewards, kernel, mode='valid')
    ax1.plot(range(window - 1, len(rewards)), smoothed,
             color=color, linewidth=2, label=agent_name)

ax1.set(xlabel='Episode', ylabel='Reward', title='Learning Curves (Raw + Smoothed)')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Epsilon decay comparison
for agent_name, results in training_results.items():
    ax2.plot(results['epsilon_values'],
             color=algorithms[agent_name]['color'], linewidth=2, label=agent_name)

ax2.set(xlabel='Epsilon' if False else 'Episode', ylabel='Epsilon', title='Exploration Decay')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 3. Final performance comparison
agent_names = list(training_results)
final_performances = [training_results[name]['final_performance'] for name in agent_names]
colors = [algorithms[name]['color'] for name in agent_names]

bars = ax3.bar(agent_names, final_performances, color=colors, alpha=0.8)
ax3.set(xlabel='Algorithm',
        ylabel='Final Performance (Last 50 Episodes)',
        title='Final Performance Comparison')
ax3.tick_params(axis='x', rotation=45)

# Write each bar's value just above it
for bar, perf in zip(bars, final_performances):
    x_center = bar.get_x() + bar.get_width() / 2
    ax3.text(x_center, bar.get_height() + 5,
             f'{perf:.1f}', ha='center', va='bottom', fontweight='bold')

# 4. Training time comparison
training_times = [training_results[name]['total_time'] for name in agent_names]
bars = ax4.bar(agent_names, training_times, color=colors, alpha=0.8)
ax4.set(xlabel='Algorithm', ylabel='Training Time (seconds)', title='Training Efficiency')
ax4.tick_params(axis='x', rotation=45)

for bar, time_val in zip(bars, training_times):
    x_center = bar.get_x() + bar.get_width() / 2
    ax4.text(x_center, bar.get_height() + 1,
             f'{time_val:.1f}s', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## Convergence Analysis

In [None]:
# Analyze convergence characteristics
def analyze_convergence(rewards, window=100):
    """
    Analyze convergence characteristics of a learning curve.

    Args:
        rewards: Sequence of per-episode rewards in training order.
        window: Length of the moving-average window (in episodes).

    Returns:
        ``{'error': 'Insufficient data'}`` when fewer than ``2 * window``
        rewards are supplied. Otherwise a dict with:

        - ``'convergence_episode'``: index into the moving-average series of
          the first point where the mean of the next ``window`` averages
          improves on the mean of the previous ``window`` averages by less
          than 5%; ``len(moving_avg)`` if no such point is found.
        - ``'sample_efficiency'``: number of episodes consumed by the first
          moving-average window that gets within 20% of the final
          performance; ``len(rewards)`` if that never happens.
        - ``'final_performance'``: mean reward over the last 50 episodes.
        - ``'stability'``: standard deviation of the last 100 rewards.
    """
    if len(rewards) < window * 2:
        return {'error': 'Insufficient data'}

    # Moving average; entry i is the mean of rewards[i:i + window]
    moving_avg = np.convolve(rewards, np.ones(window) / window, mode='valid')

    # Find convergence point (relative improvement < 5% between adjacent
    # blocks of `window` moving-average values)
    convergence_episode = len(moving_avg)
    for i in range(window, len(moving_avg) - window):
        current_avg = np.mean(moving_avg[i:i + window])
        previous_avg = np.mean(moving_avg[i - window:i])
        improvement = (current_avg - previous_avg) / abs(previous_avg) if previous_avg != 0 else 0

        if improvement < 0.05:  # Less than 5% improvement
            convergence_episode = i
            break

    # Sample efficiency: episodes needed to get within 20% of the final score.
    # Scaling a negative score by 0.8 would move the target *above* the final
    # performance (e.g. -100 -> -80), which can never be reached, so the
    # tolerance is applied on the worse side of the final score instead.
    final_perf = np.mean(rewards[-50:])
    target_perf = 0.8 * final_perf if final_perf >= 0 else 1.2 * final_perf

    # moving_avg[i] summarises episodes up to i + window, hence the offset
    sample_efficiency = next(
        (i + window for i, avg in enumerate(moving_avg) if avg >= target_perf),
        len(rewards),
    )

    return {
        'convergence_episode': convergence_episode,
        'sample_efficiency': sample_efficiency,
        'final_performance': final_perf,
        'stability': np.std(rewards[-100:])  # Stability in final 100 episodes
    }

# Analyze all algorithms
convergence_analysis = {
    agent_name: analyze_convergence(results['rewards'])
    for agent_name, results in training_results.items()
}

# analyze_convergence returns {'error': ...} for runs that are too short;
# only the remaining runs can be tabulated and plotted
valid_analysis = {
    name: analysis
    for name, analysis in convergence_analysis.items()
    if 'error' not in analysis
}

# Create convergence comparison table
convergence_data = [
    {
        'Algorithm': name,
        'Convergence Episode': analysis['convergence_episode'],
        'Sample Efficiency': analysis['sample_efficiency'],
        'Final Performance': f"{analysis['final_performance']:.1f}",
        'Stability (σ)': f"{analysis['stability']:.1f}"
    }
    for name, analysis in valid_analysis.items()
]

convergence_df = pd.DataFrame(convergence_data)
print("📊 CONVERGENCE ANALYSIS")
print("=" * 50)
print(convergence_df.to_string(index=False))

# Visualize convergence metrics
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

metrics = ['convergence_episode', 'sample_efficiency', 'stability']
titles = ['Episodes to Convergence', 'Sample Efficiency', 'Final Stability']

# Names and colours are both derived from the filtered analyses so each bar
# keeps its own algorithm's colour even when some runs were skipped
agent_names = list(valid_analysis)
colors = [algorithms[name]['color'] for name in agent_names]

for ax, metric, title in zip(axes, metrics, titles):
    values = [valid_analysis[name][metric] for name in agent_names]

    bars = ax.bar(agent_names, values, color=colors, alpha=0.8)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)

    # Add value labels slightly above each bar (1% of the tallest bar)
    label_offset = max(values) * 0.01 if values else 0
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_offset,
                f'{val:.0f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Comprehensive Agent Evaluation

In [None]:
# Evaluate the trained agents with the project's AgentEvaluator
print("🔍 Comprehensive Agent Evaluation")
print("=" * 40)

evaluator = AgentEvaluator(env, verbose=True)

# Map each algorithm's display name to the agent produced during training
trained_agents = {
    label: outcome['agent'] for label, outcome in training_results.items()
}

# Compare all agents: 100 episodes, with seeds 42, 123 and 456
evaluation_results = evaluator.compare_agents(
    trained_agents,
    num_episodes=100,
    seeds=[42, 123, 456]
)

# Display evaluation results
print("\n📊 COMPREHENSIVE EVALUATION RESULTS")
print("=" * 50)

comparison_stats = evaluation_results['comparison_stats']
agent_stats = comparison_stats['agent_statistics']

# One formatted table row per agent
eval_data = [
    {
        'Algorithm': agent_name,
        'Mean Reward': f"{summary['mean_reward']:.1f}",
        'Success Rate': f"{summary['success_rate']:.1%}",
        'Episode Length': f"{summary['episode_length']:.1f}",
        'State Coverage': f"{summary['state_coverage']:.1%}"
    }
    for agent_name, summary in agent_stats.items()
]

eval_df = pd.DataFrame(eval_data)
print(eval_df.to_string(index=False))

# Print rankings: every metric lists its agents in the order supplied by the
# evaluator, numbered from 1
print("\n🏆 RANKINGS:")
rankings = comparison_stats['rankings']
for metric, ranking in rankings.items():
    heading = metric.replace('_', ' ').title()
    print(f"\n{heading}:")
    for position, (agent_name, _) in enumerate(ranking, start=1):
        print(f"  {position}. {agent_name}")

## Evaluation Visualization

In [None]:
# Generate comprehensive evaluation visualization from the results returned
# by evaluator.compare_agents(...).
# NOTE(review): presumably this draws matplotlib figures inline; confirm
# against the AgentEvaluator implementation.
evaluator.visualize_evaluation(evaluation_results)

# Confirmation message printed once the call above has returned
print("✅ Comprehensive evaluation visualization complete!")

## Statistical Significance Analysis

In [None]:
# Pairwise significance tests between algorithms
from scipy import stats

print("📈 STATISTICAL SIGNIFICANCE ANALYSIS")
print("=" * 45)

# Each algorithm is represented by the rewards of its last 100 training episodes
algorithm_performances = {
    agent_name: results['rewards'][-100:]
    for agent_name, results in training_results.items()
}

algorithm_names = list(algorithm_performances)
n_algorithms = len(algorithm_names)

print("Pairwise t-test results (p-values):")
print("-" * 40)

# Symmetric matrix of p-values; entries never tested (the diagonal) stay at 1
significance_matrix = np.ones((n_algorithms, n_algorithms))

for i, alg1_name in enumerate(algorithm_names):
    alg1_perf = algorithm_performances[alg1_name]

    # Only visit pairs with j > i so each pair is tested once
    for j in range(i + 1, n_algorithms):
        alg2_name = algorithm_names[j]
        alg2_perf = algorithm_performances[alg2_name]

        # Independent two-sample t-test
        t_stat, p_value = stats.ttest_ind(alg1_perf, alg2_perf)
        significance_matrix[i, j] = p_value
        significance_matrix[j, i] = p_value

        # The algorithm with the higher mean reward is reported as better
        if np.mean(alg1_perf) > np.mean(alg2_perf):
            better = alg1_name
        else:
            better = alg2_name

        # Conventional star notation for the p-value
        if p_value < 0.001:
            significance = "***"
        elif p_value < 0.01:
            significance = "**"
        elif p_value < 0.05:
            significance = "*"
        else:
            significance = "ns"

        print(f"{alg1_name} vs {alg2_name}: p={p_value:.4f} {significance} (better: {better})")

# Heatmap of the p-values; the mask hides the upper triangle and the diagonal
# so each pair appears once
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(significance_matrix))
sns.heatmap(
    significance_matrix,
    annot=True,
    fmt='.4f',
    cmap='RdYlBu_r',
    xticklabels=algorithm_names,
    yticklabels=algorithm_names,
    mask=mask,
    cbar_kws={'label': 'p-value'},
)
plt.title('Statistical Significance Matrix (p-values)')
plt.tight_layout()
plt.show()

print("\nSignificance levels: *** p<0.001, ** p<0.01, * p<0.05, ns = not significant")

## Algorithm Characteristics Summary

In [None]:
print("🎯 ALGORITHM CHARACTERISTICS SUMMARY")
print("=" * 50)

# Evaluation statistics are optional: fall back to an empty mapping when the
# evaluation cell has not been run, instead of failing with a NameError
available_eval_stats = globals().get('agent_stats', {})

# Analyze each algorithm's strengths and weaknesses
for agent_name, config in algorithms.items():
    if agent_name in training_results:
        results = training_results[agent_name]
        convergence = convergence_analysis[agent_name]

        print(f"\n{agent_name}:")
        print(f"  Description: {config['description']}")
        print(f"  Final Performance: {results['final_performance']:.1f}")
        print(f"  Training Time: {results['total_time']:.1f}s")

        # Convergence metrics are missing when the run was too short to analyse
        if 'error' not in convergence:
            print(f"  Sample Efficiency: {convergence['sample_efficiency']} episodes")
            print(f"  Stability: {convergence['stability']:.1f}")

        # Get evaluation stats if available
        if agent_name in available_eval_stats:
            eval_stats = available_eval_stats[agent_name]
            print(f"  Success Rate: {eval_stats['success_rate']:.1%}")
            print(f"  State Coverage: {eval_stats['state_coverage']:.1%}")

# Overall recommendations
print("\n💡 RECOMMENDATIONS:")
print("-" * 20)

# Find best performer overall
best_performer = max(training_results.items(), key=lambda x: x[1]['final_performance'])
print(f"• Best Overall Performance: {best_performer[0]} ({best_performer[1]['final_performance']:.1f} reward)")

# Only runs with a successful convergence analysis can be ranked; min() on an
# empty list would raise ValueError, so the rankings are skipped in that case
valid_convergence = [
    (name, analysis)
    for name, analysis in convergence_analysis.items()
    if 'error' not in analysis
]

if valid_convergence:
    # Find most sample efficient (fewest episodes)
    most_efficient = min(valid_convergence, key=lambda x: x[1]['sample_efficiency'])
    print(f"• Most Sample Efficient: {most_efficient[0]} ({most_efficient[1]['sample_efficiency']} episodes)")

    # Find most stable (lowest reward standard deviation)
    most_stable = min(valid_convergence, key=lambda x: x[1]['stability'])
    print(f"• Most Stable: {most_stable[0]} (σ = {most_stable[1]['stability']:.1f})")
else:
    print("• Sample efficiency and stability rankings unavailable (insufficient training data)")

print("\n🔬 INSIGHTS:")
print("-" * 10)
print("• Double Q-Learning typically reduces overestimation bias")
print("• SARSA is more conservative due to on-policy learning")
print("• SARSA(λ) can improve credit assignment with eligibility traces")
print("• Q-Learning is often more sample efficient for deterministic environments")

## Save Comparison Results

In [None]:
# Save comprehensive comparison results as CSV files under ../comparison_results
results_dir = Path('../comparison_results')
results_dir.mkdir(exist_ok=True)

# Save training metrics: one row per algorithm. Convergence fields fall back
# to 'N/A' when analyze_convergence returned its error dict for that run.
training_summary = pd.DataFrame([
    {
        'Algorithm': name,
        'Final_Performance': results['final_performance'],
        'Training_Time': results['total_time'],
        'Convergence_Episode': convergence_analysis[name].get('convergence_episode', 'N/A'),
        'Sample_Efficiency': convergence_analysis[name].get('sample_efficiency', 'N/A'),
        'Stability': convergence_analysis[name].get('stability', 'N/A')
    }
    for name, results in training_results.items()
])

training_summary.to_csv(results_dir / 'algorithm_comparison_summary.csv', index=False)

# Save detailed training curves, one file per algorithm
for agent_name, results in training_results.items():
    agent_df = pd.DataFrame({
        'episode': range(1, len(results['rewards']) + 1),
        'reward': results['rewards'],
        'epsilon': results['epsilon_values'],
        'training_time': results['training_times']
    })

    # Derive a file stem from the display name: lower-case, spaces become
    # underscores, parentheses are dropped.
    # NOTE(review): 'SARSA(λ)' becomes 'sarsaλ', a non-ASCII file name;
    # consider using the algorithm's 'name' identifier instead.
    filename = agent_name.lower().replace(' ', '_').replace('(', '').replace(')', '')
    agent_df.to_csv(results_dir / f'{filename}_training_curve.csv', index=False)

# Save evaluation results. The guard checks the object that is actually
# written (eval_df), so a partially executed evaluation cell cannot trigger a
# NameError here.
if 'eval_df' in globals():
    eval_df.to_csv(results_dir / 'evaluation_comparison.csv', index=False)

print(f"💾 All comparison results saved to {results_dir}")
print("\n📁 Files saved:")
# glob() yields paths in arbitrary order; sort for a stable listing
for csv_path in sorted(results_dir.glob('*.csv')):
    print(f"  - {csv_path.name}")

print("\n✅ Algorithm comparison analysis complete!")