# ViZDoom Ablation Study - Results Analysis

**IEEE Report Data Generation**

This notebook analyzes all experiment results and generates tables/plots for the report.

---
## 1. Setup & Load Data

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Runtime detection: check whether the google.colab module is already loaded
import sys
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    # Local run: results are read from a folder relative to the working directory
    RESULTS_DIR = Path('results')
else:
    # Colab run: mount Google Drive and read the results stored there
    from google.colab import drive
    drive.mount('/content/drive')
    RESULTS_DIR = Path('/content/drive/MyDrive/vizdoom-ablation-results')

print(f"Results directory: {RESULTS_DIR}")
print(f"Exists: {RESULTS_DIR.exists()}")

In [None]:
# Load all experiment results
def load_all_results(results_dir):
    """Load all experiment metadata and summaries."""
    results = []
    
    for date_folder in sorted(results_dir.iterdir()):
        if not date_folder.is_dir() or date_folder.name.startswith('.'):
            continue
            
        for run_folder in sorted(date_folder.iterdir()):
            if not run_folder.is_dir():
                continue
                
            meta_file = run_folder / 'metadata.json'
            summary_file = run_folder / 'summary.json'
            csv_file = run_folder / 'training_log.csv'
            
            if not meta_file.exists():
                continue
                
            try:
                with open(meta_file) as f:
                    meta = json.load(f)
                
                # Load summary if exists
                summary = {}
                if summary_file.exists():
                    with open(summary_file) as f:
                        summary = json.load(f)
                
                # Load training CSV if exists
                training_df = None
                if csv_file.exists():
                    training_df = pd.read_csv(csv_file)
                
                # Categorize experiment
                agent = meta.get('agent_type', 'unknown')
                n_step = meta.get('n_step', 1)
                per = meta.get('buffer_prioritized', False)
                
                if agent == 'dqn' and n_step == 1 and not per:
                    phase = 'Phase1'
                    method = 'DQN'
                elif agent == 'deep_sarsa':
                    phase = 'Phase1'
                    method = 'DeepSARSA'
                elif n_step > 1:
                    phase = 'Phase2'
                    method = f'DQN_n{n_step}'
                elif per:
                    phase = 'Phase3a'
                    method = 'DQN_PER'
                elif agent == 'ddqn':
                    phase = 'Phase3b'
                    method = 'DDQN'
                elif 'dueling' in agent:
                    phase = 'Phase3c'
                    method = 'Dueling_DDQN'
                else:
                    phase = 'Other'
                    method = agent
                
                results.append({
                    'phase': phase,
                    'method': method,
                    'scenario': meta.get('scenario_short', meta.get('scenario', 'unknown')),
                    'seed': meta.get('seed', 0),
                    'n_step': n_step,
                    'per': per,
                    'lr': meta.get('learning_rate', 0.0001),
                    'final_reward': summary.get('final_avg_reward', None),
                    'best_eval': summary.get('best_eval_reward', None),
                    'total_episodes': summary.get('total_episodes', None),
                    'training_time': summary.get('training_time_seconds', None),
                    'run_dir': str(run_folder),
                    'training_df': training_df,
                    'date': date_folder.name
                })
            except Exception as e:
                print(f"Error loading {run_folder}: {e}")
                continue
    
    return results

# Load all results
all_results = load_all_results(RESULTS_DIR)
print(f"\nLoaded {len(all_results)} experiment runs")

# Convert to DataFrame (without training_df column)
df_results = pd.DataFrame([{k: v for k, v in r.items() if k != 'training_df'} for r in all_results])

# Summary metrics are None for runs without a summary.json. A column that is
# entirely None is stored with dtype=object, which the mean/std aggregations in
# later cells may reject, so force the metric columns to a numeric dtype
# (missing values become NaN).
for metric_col in ['final_reward', 'best_eval', 'total_episodes', 'training_time']:
    if metric_col in df_results.columns:
        df_results[metric_col] = pd.to_numeric(df_results[metric_col], errors='coerce')

print(f"\nColumns: {list(df_results.columns)}")

---
## 2. Experiment Overview

In [None]:
# Per (phase, method, scenario): number of runs plus mean/std of both metrics
rule = "=" * 70
print(rule)
print("EXPERIMENT OVERVIEW")
print(rule)

overview = (
    df_results
    .groupby(['phase', 'method', 'scenario'])
    .agg(
        runs=('seed', 'count'),
        final_mean=('final_reward', 'mean'),
        final_std=('final_reward', 'std'),
        best_mean=('best_eval', 'mean'),
        best_std=('best_eval', 'std'),
    )
    .round(2)
)
print(overview.to_string())

In [None]:
# Number of loaded runs in each phase, followed by the grand total
rule = "=" * 70
print("\n" + rule)
print("RUNS PER PHASE")
print(rule)

phase_counts = df_results.groupby('phase').size()
for phase_name, n_runs in phase_counts.items():
    print(f"{phase_name:15} | {n_runs} runs")

total_runs = len(df_results)
print(f"{'TOTAL':15} | {total_runs} runs")

---
## 3. Phase 1: DQN vs Deep SARSA

In [None]:
# Phase 1 comparison
phase1 = df_results[df_results['phase'] == 'Phase1'].copy()

print("=" * 70)
print("PHASE 1: DQN vs Deep SARSA")
print("=" * 70)

phase1_summary = phase1.groupby(['method', 'scenario']).agg({
    'final_reward': ['mean', 'std', 'count'],
    'best_eval': ['mean', 'std']
}).round(2)

phase1_summary.columns = ['final_mean', 'final_std', 'n', 'best_mean', 'best_std']
print(phase1_summary.to_string())

# Per-scenario winner by mean final reward. This is a descriptive comparison
# of means only; no significance test is performed.
print("\n--- Winner by Scenario ---")
for scenario in phase1['scenario'].unique():
    in_scenario = phase1[phase1['scenario'] == scenario]
    dqn_mean = in_scenario.loc[in_scenario['method'] == 'DQN', 'final_reward'].mean()
    sarsa_mean = in_scenario.loc[in_scenario['method'] == 'DeepSARSA', 'final_reward'].mean()

    # The mean is NaN when a method has no runs (or no recorded reward) for
    # this scenario; in that case there is nothing to compare, so do not
    # declare a winner.
    if pd.isna(dqn_mean) or pd.isna(sarsa_mean):
        print(f"{scenario:15} | Winner: {'n/a':12} | Diff: n/a")
        continue

    if dqn_mean > sarsa_mean:
        winner = 'DQN'
    elif sarsa_mean > dqn_mean:
        winner = 'DeepSARSA'
    else:
        winner = 'Tie'
    diff = abs(dqn_mean - sarsa_mean)
    print(f"{scenario:15} | Winner: {winner:12} | Diff: {diff:.2f}")

In [None]:
# Phase 1 Learning Curves: one panel per scenario, one curve per method.
# Each curve is the across-seed mean reward per episode, smoothed with a
# rolling average; the shaded band is the (smoothed) across-seed std.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

scenarios = ['Basic', 'TakeCover', 'Deathmatch']
colors = {'DQN': 'blue', 'DeepSARSA': 'orange'}
SMOOTH_WINDOW = 50  # number of episodes in the rolling-average window

for idx, scenario in enumerate(scenarios):
    ax = axes[idx]

    for method in ['DQN', 'DeepSARSA']:
        method_results = [r for r in all_results
                         if r['phase'] == 'Phase1'
                         and r['method'] == method
                         and r['scenario'] == scenario
                         and r['training_df'] is not None]

        if not method_results:
            continue

        # Collect one reward series per seed, indexed by episode
        all_rewards = []
        for r in method_results:
            df = r['training_df']
            if 'episode' in df.columns and 'reward' in df.columns:
                all_rewards.append(df.set_index('episode')['reward'])

        if all_rewards:
            # Align seeds on episode; sort so the rolling window and the plot
            # follow episode order even if the runs have different lengths.
            combined = pd.concat(all_rewards, axis=1).sort_index()
            mean_reward = combined.mean(axis=1)
            std_reward = combined.std(axis=1)

            # Smooth with rolling average (computed once and reused for the band)
            mean_smooth = mean_reward.rolling(SMOOTH_WINDOW, min_periods=1).mean()
            std_smooth = std_reward.rolling(SMOOTH_WINDOW, min_periods=1).mean()

            ax.plot(mean_smooth.index, mean_smooth.values,
                   label=method, color=colors[method], linewidth=2)
            ax.fill_between(mean_smooth.index,
                           mean_smooth.values - std_smooth.values,
                           mean_smooth.values + std_smooth.values,
                           alpha=0.2, color=colors[method])

    ax.set_title(f'{scenario}', fontsize=12, fontweight='bold')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Reward')
    # Only draw a legend when at least one curve was plotted on this panel
    if ax.get_legend_handles_labels()[0]:
        ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('Phase 1: DQN vs Deep SARSA Learning Curves', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('phase1_learning_curves.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: phase1_learning_curves.png")

---
## 4. Phase 2: N-Step Ablation

In [None]:
# Phase 2: N-step comparison
# Compare DQN (n=1) from Phase1 with DQN_n3 from Phase2

print("=" * 70)
print("PHASE 2: N-STEP ABLATION (n=1 vs n=3)")
print("=" * 70)

# Get baseline DQN (n=1) and relabel it so it groups separately from Phase 2 runs
baseline = df_results[(df_results['method'] == 'DQN') & (df_results['phase'] == 'Phase1')].copy()
baseline['method'] = 'DQN_n1'

# Get n-step results
nstep = df_results[df_results['phase'] == 'Phase2'].copy()

# Combine for comparison
nstep_comparison = pd.concat([baseline, nstep])

nstep_summary = nstep_comparison.groupby(['method', 'scenario']).agg({
    'final_reward': ['mean', 'std', 'count'],
    'best_eval': ['mean', 'std']
}).round(2)

nstep_summary.columns = ['final_mean', 'final_std', 'n', 'best_mean', 'best_std']
print(nstep_summary.to_string())

# Winner analysis (difference of mean final reward, n=3 minus n=1)
print("\n--- N-step Impact by Scenario ---")
for scenario in nstep_comparison['scenario'].unique():
    in_scenario = nstep_comparison[nstep_comparison['scenario'] == scenario]
    n1 = in_scenario.loc[in_scenario['method'] == 'DQN_n1', 'final_reward'].mean()
    n3 = in_scenario.loc[in_scenario['method'] == 'DQN_n3', 'final_reward'].mean()

    # Scenarios lacking either variant have a NaN mean and are skipped
    if pd.notna(n1) and pd.notna(n3):
        diff = n3 - n1
        # A relative change is undefined for a zero baseline; say so rather
        # than printing a misleading 0%.
        pct_text = f"{(diff / abs(n1)) * 100:+.1f}%" if n1 != 0 else "n/a"
        if diff > 0:
            better = "n=3 better"
        elif diff < 0:
            better = "n=1 better"
        else:
            better = "no difference"
        print(f"{scenario:15} | n=1: {n1:8.2f} | n=3: {n3:8.2f} | Diff: {diff:+8.2f} ({pct_text}) | {better}")

---
## 5. Phase 3: Extensions Comparison

In [None]:
# Phase 3 summary table: every Phase3* run alongside the Phase 1 DQN baseline
rule = "=" * 70
print(rule)
print("PHASE 3: EXTENSIONS vs BASELINE DQN")
print(rule)

# Rows whose phase label begins with 'Phase3'
is_phase3 = df_results['phase'].str.startswith('Phase3')
phase3 = df_results[is_phase3].copy()

# Phase 1 DQN rows, relabelled so they show up as the baseline in the table
is_baseline = (df_results['method'] == 'DQN') & (df_results['phase'] == 'Phase1')
baseline_dqn = df_results[is_baseline].copy()
baseline_dqn['method'] = 'DQN_baseline'

# Stack baseline and extensions, then aggregate per (method, scenario)
extensions_comparison = pd.concat([baseline_dqn, phase3])

ext_summary = (
    extensions_comparison
    .groupby(['method', 'scenario'])
    .agg(
        final_mean=('final_reward', 'mean'),
        final_std=('final_reward', 'std'),
        n=('final_reward', 'count'),
        best_mean=('best_eval', 'mean'),
        best_std=('best_eval', 'std'),
    )
    .round(2)
)
print(ext_summary.to_string())

In [None]:
# Bar chart comparison: mean final reward per method, one panel per scenario
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

scenarios = ['Basic', 'TakeCover', 'Deathmatch']
methods_order = ['DQN_baseline', 'DQN_n3', 'DQN_PER', 'DDQN', 'Dueling_DDQN']
# Fixed colour per method, so a method keeps its colour in every panel even
# when another method has no runs for that scenario.
method_colors = dict(zip(methods_order, ['gray', 'blue', 'green', 'orange', 'red']))

# Combine all methods once; this frame does not depend on the scenario
all_methods = pd.concat([baseline_dqn, nstep, phase3])

for idx, scenario in enumerate(scenarios):
    ax = axes[idx]

    scenario_data = all_methods[all_methods['scenario'] == scenario]

    means = []
    stds = []
    labels = []
    bar_colors = []

    for method in methods_order:
        method_data = scenario_data[scenario_data['method'] == method]['final_reward']
        if len(method_data) > 0:
            means.append(method_data.mean())
            # std is undefined for a single run; draw no error bar in that case
            stds.append(method_data.std() if len(method_data) > 1 else 0)
            labels.append(method.replace('DQN_', '').replace('_', '\n'))
            bar_colors.append(method_colors[method])

    if means:
        x = range(len(means))
        bars = ax.bar(x, means, yerr=stds, capsize=5, color=bar_colors, alpha=0.7)
        ax.set_xticks(x)
        ax.set_xticklabels(labels, fontsize=9)
        ax.set_ylabel('Final Reward')
        ax.set_title(f'{scenario}', fontsize=12, fontweight='bold')
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('All Methods Comparison by Scenario', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('all_methods_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: all_methods_comparison.png")

---
## 6. IEEE Report Tables

In [None]:
# Generate LaTeX tables for IEEE report
def _format_mean_std(values):
    """Format a metric column as a LaTeX 'mean +/- std' table cell.

    Args:
        values: pandas Series of metric values; missing entries are ignored.

    Returns:
        '-' when there are no values, the mean alone when there is exactly
        one value (a sample std is undefined for one run), otherwise the mean
        and std joined by the LaTeX plus-minus symbol, one decimal place each.
    """
    values = values.dropna()
    if len(values) == 0:
        return "-"
    if len(values) == 1:
        return f"{values.mean():.1f}"
    return f"{values.mean():.1f} $\\pm$ {values.std():.1f}"


print("=" * 70)
print("LATEX TABLES FOR IEEE REPORT")
print("=" * 70)

# Table 1: Phase 1 Results
print("\n% Table 1: DQN vs Deep SARSA")
print("\\begin{table}[h]")
print("\\centering")
print("\\caption{Phase 1: DQN vs Deep SARSA Comparison}")
print("\\begin{tabular}{|l|l|r|r|}")
print("\\hline")
print("Scenario & Method & Final Reward & Best Eval \\\\")
print("\\hline")

for scenario in ['Basic', 'TakeCover', 'Deathmatch']:
    for method in ['DQN', 'DeepSARSA']:
        data = phase1[(phase1['scenario'] == scenario) & (phase1['method'] == method)]
        # Methods with no runs for this scenario get no row
        if len(data) > 0:
            final = _format_mean_std(data['final_reward'])
            best = _format_mean_std(data['best_eval'])
            print(f"{scenario} & {method} & {final} & {best} \\\\")
    # Rule between scenario groups
    print("\\hline")

print("\\end{tabular}")
print("\\end{table}")

In [None]:
# Table 2: one row per method, one column per scenario (mean final reward)
for preamble_line in (
    "\n% Table 2: Extensions Comparison",
    "\\begin{table}[h]",
    "\\centering",
    "\\caption{DQN Extensions Comparison (Final Reward)}",
    "\\begin{tabular}{|l|r|r|r|}",
    "\\hline",
    "Method & Basic & TakeCover & Deathmatch \\\\",
    "\\hline",
):
    print(preamble_line)

all_methods_df = pd.concat([baseline_dqn, nstep, phase3])

for method in ['DQN_baseline', 'DQN_n3', 'DQN_PER', 'DDQN', 'Dueling_DDQN']:
    # Underscores are replaced by spaces in the printed method name
    cells = [method.replace('_', ' ')]
    for scenario in ['Basic', 'TakeCover', 'Deathmatch']:
        is_match = (all_methods_df['method'] == method) & (all_methods_df['scenario'] == scenario)
        subset = all_methods_df[is_match]
        # A dash marks method/scenario pairs with no runs
        cells.append(f"{subset['final_reward'].mean():.1f}" if len(subset) > 0 else "-")
    print(" & ".join(cells) + " \\\\")

for closing_line in ("\\hline", "\\end{tabular}", "\\end{table}"):
    print(closing_line)

---
## 7. Summary Statistics

In [None]:
# Final summary
print("=" * 70)
print("FINAL SUMMARY")
print("=" * 70)

print(f"\nTotal Experiments: {len(df_results)}")
print(f"Date Range: {df_results['date'].min()} to {df_results['date'].max()}")

# Best method per scenario
print("\n--- Best Method per Scenario (by Final Reward) ---")
all_methods_df = pd.concat([baseline_dqn, nstep, phase3])

for scenario in ['Basic', 'TakeCover', 'Deathmatch']:
    scenario_data = all_methods_df[all_methods_df['scenario'] == scenario]
    # Mean final reward per method, computed once; methods whose runs have no
    # recorded reward yield NaN and cannot be ranked, so drop them.
    method_means = scenario_data.groupby('method')['final_reward'].mean().dropna()
    if method_means.empty:
        continue
    best_method = method_means.idxmax()
    best_reward = method_means.max()
    print(f"{scenario:15} | Best: {best_method:20} | Reward: {best_reward:.2f}")

# Training time
print("\n--- Total Training Time ---")
# Sum only the runs that actually recorded a training time; if none did,
# print nothing instead of a misleading "0.0 hours".
recorded_times = pd.to_numeric(df_results['training_time'], errors='coerce').dropna()
if not recorded_times.empty:
    total_time = recorded_times.sum()
    hours = total_time / 3600
    print(f"Total: {hours:.1f} hours")

In [None]:
# Write the per-run summary table (without the run directory path) to CSV
kept_columns = [col for col in df_results.columns if col != 'run_dir']
export_df = df_results[kept_columns]
export_df.to_csv('experiment_results_summary.csv', index=False)
print("Exported: experiment_results_summary.csv")

# Display the exported rows ordered by phase, method, scenario and seed
print("\nFull Results Table:")
sort_keys = ['phase', 'method', 'scenario', 'seed']
display(export_df.sort_values(by=sort_keys))

---
## 8. Key Findings for Report

In [None]:
# Section banner
rule = "=" * 70
print(rule)
print("KEY FINDINGS FOR IEEE REPORT")
print(rule)

# Fill-in template for the report; bracketed items are placeholders
findings_template = """
1. PHASE 1: DQN vs Deep SARSA
   - [Fill based on results above]
   - Off-policy (DQN) vs On-policy (SARSA) comparison
   
2. PHASE 2: N-Step Returns
   - n=3 vs n=1 (baseline) comparison
   - TD/MC tradeoff: bias vs variance
   - [Fill: which scenarios benefited?]
   
3. PHASE 3: Extensions
   - PER: Priority sampling impact
   - DDQN: Overestimation bias reduction
   - Dueling: V/A separation benefit
   - [Fill: Rainbow ablation alignment?]

4. SCENARIO INSIGHTS
   - Basic: [short episodes, simple task]
   - TakeCover: [survival, longer episodes]
   - Deathmatch: [combat, complex rewards]
"""
print(findings_template)