# Double Q-Learning Training Notebook

This notebook demonstrates training a Double Q-Learning agent for autonomous parcel routing.

## Key Features:
- **Reduced Overestimation Bias**: Maintains two separate Q-tables so that choosing the greedy action and estimating its value do not rely on the same table
- **Better Convergence**: More stable learning than standard Q-Learning
- **Action Selection**: Combines both Q-tables for decision-making

In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

# Add src to path for imports
sys.path.append('../src')

from apr import WarehouseEnv
from apr.agents import create_agent
from apr.train import run_episode
from apr.logger import RunLogger

# Set up plotting
# NOTE(review): the 'seaborn-v0_8' style-sheet name is only registered by
# newer Matplotlib releases (3.6+, to my knowledge) — confirm the pinned
# Matplotlib version provides it, otherwise this call raises.
plt.style.use('seaborn-v0_8')
%matplotlib inline

## Environment and Agent Setup

In [None]:
# Instantiate the warehouse environment; seed=42 is forwarded to the constructor
env = WarehouseEnv(seed=42)

# Summarize grid size, outstanding packages and the per-episode step cap,
# one item per output line
print(
    f"Environment: {env.n_rows}x{env.n_cols} warehouse",
    f"Packages to collect: {len(env.packages_remaining)}",
    f"Max steps per episode: {env.max_steps}",
    sep="\n",
)

# Reset the environment, draw it, then title and display the current figure
env.reset()
env.render(mode='human')
plt.title('Initial Warehouse Layout')
plt.show()

In [None]:
# Hyperparameters handed to the agent factory, gathered in one place
hyperparams = dict(
    alpha=0.1,            # Learning rate
    gamma=0.95,           # Discount factor
    epsilon=0.3,          # Initial exploration rate
    epsilon_decay=0.999,  # Exploration decay
    epsilon_min=0.05,     # Minimum exploration
)

# Request the 'double_q_learning' agent for this environment's spaces
agent = create_agent(
    'double_q_learning',
    env.observation_space,
    env.action_space,
    **hyperparams,
)

# Report the agent and whether it exposes the two tables as Q1 / Q2
print(f"Created agent: {agent}")
print(f"Agent has dual Q-tables: Q1={hasattr(agent, 'Q1')}, Q2={hasattr(agent, 'Q2')}")

## Training Loop

In [None]:
# Training parameters
episodes = 1000
log_interval = 100
success_window = 100     # Number of episodes in the rolling success-rate window
success_threshold = 400  # An episode counts as successful when its reward exceeds this

# Metrics tracking (each list receives one entry per episode)
episode_rewards = []
episode_lengths = []
epsilon_values = []
success_rates = []

print("🚀 Starting Double Q-Learning Training")
print("=" * 50)

for episode in range(episodes):
    # Run one training episode; the returned value is recorded as its reward
    reward = run_episode(env, agent, training=True)

    # Track metrics
    episode_rewards.append(reward)
    # The environment may not expose episode_length; record 0 in that case
    episode_lengths.append(getattr(env, 'episode_length', 0))
    epsilon_values.append(agent.epsilon)

    # Rolling success rate. The check uses the number of rewards collected,
    # not the zero-based loop index: the first complete window is available
    # right after episode index success_window - 1, which an index-based
    # `episode >= 100` test would still report as 0.
    if len(episode_rewards) >= success_window:
        recent_rewards = episode_rewards[-success_window:]
        success_count = sum(1 for r in recent_rewards if r > success_threshold)
        success_rates.append(success_count / success_window)
    else:
        # Not enough history yet for a full window
        success_rates.append(0)

    # Periodic progress report averaged over the last log_interval episodes
    if (episode + 1) % log_interval == 0:
        avg_reward = np.mean(episode_rewards[-log_interval:])
        print(f"Episode {episode + 1:4d}: Avg Reward = {avg_reward:6.1f}, ε = {agent.epsilon:.3f}")

print("\n✅ Training completed!")
print(f"Final average reward (last 100 episodes): {np.mean(episode_rewards[-100:]):.1f}")
print(f"Final epsilon: {agent.epsilon:.3f}")

## Training Analysis

In [None]:
# Create comprehensive training plots on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Learning curve (smoothed with a moving average)
ax1 = axes[0, 0]
# Clamp the window to the number of recorded episodes. With mode='valid',
# np.convolve returns max(M, N) - min(M, N) + 1 samples, so a window longer
# than the data would produce an array that no longer matches the x-range
# below and plot() would raise on the length mismatch.
window = min(50, len(episode_rewards))
smoothed_rewards = np.convolve(episode_rewards, np.ones(window)/window, mode='valid')
# The first smoothed value corresponds to episode index window-1
ax1.plot(range(window-1, len(episode_rewards)), smoothed_rewards, 'b-', linewidth=2)
ax1.set_xlabel('Episode')
ax1.set_ylabel('Reward (smoothed)')
ax1.set_title('Double Q-Learning: Learning Curve')
ax1.grid(True, alpha=0.3)

# 2. Epsilon decay
ax2 = axes[0, 1]
ax2.plot(epsilon_values, 'r-', linewidth=2)
ax2.set_xlabel('Episode')
ax2.set_ylabel('Epsilon')
ax2.set_title('Exploration Rate Decay')
ax2.grid(True, alpha=0.3)

# 3. Success rate over time
ax3 = axes[1, 0]
ax3.plot(success_rates, 'g-', linewidth=2)
ax3.set_xlabel('Episode')
ax3.set_ylabel('Success Rate (100-episode window)')
ax3.set_title('Success Rate Evolution')
ax3.set_ylim(0, 1)
ax3.grid(True, alpha=0.3)

# 4. Reward distribution over the most recent (up to 200) episodes
ax4 = axes[1, 1]
final_rewards = episode_rewards[-200:]
final_mean = np.mean(final_rewards)  # computed once, used for the line and its label
ax4.hist(final_rewards, bins=30, alpha=0.7, edgecolor='black')
ax4.axvline(final_mean, color='red', linestyle='--',
           label=f'Mean: {final_mean:.1f}')
ax4.set_xlabel('Reward')
ax4.set_ylabel('Frequency')
ax4.set_title('Final Reward Distribution (Last 200 Episodes)')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Q-Table Analysis

In [None]:
# Analyze Q-tables
def _print_table_stats(table_stats):
    """Print the mean, std and min/max range stored in one statistics dict."""
    print(f"  Mean: {table_stats['mean']:.2f}")
    print(f"  Std:  {table_stats['std']:.2f}")
    print(f"  Range: [{table_stats['min']:.2f}, {table_stats['max']:.2f}]")


print("📊 Double Q-Learning Q-Table Analysis")
print("=" * 40)

# Summary statistics are printed only when the agent offers get_q_statistics()
if hasattr(agent, 'get_q_statistics'):
    stats = agent.get_q_statistics()

    print(f"Number of states visited: {stats['num_states']}")
    # Coverage is reported relative to the number of grid cells (rows * cols)
    print(f"State space coverage: {stats['num_states']/(env.n_rows*env.n_cols):.1%}")

    print("\nQ1 Table Statistics:")
    q1_stats = stats['q1_stats']
    _print_table_stats(q1_stats)

    print("\nQ2 Table Statistics:")
    q2_stats = stats['q2_stats']
    _print_table_stats(q2_stats)

    print("\nCombined Q-Table Statistics:")
    combined_stats = stats['combined_stats']
    _print_table_stats(combined_stats)

# Visualize Q-value distributions
if hasattr(agent, 'Q1') and hasattr(agent, 'Q2'):
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Flatten the per-state entries of each table into one list per table.
    # NOTE(review): agent.Q is read here although only Q1/Q2 are checked
    # above — confirm every agent exposing Q1/Q2 also exposes Q.
    q1_values = []
    q2_values = []
    combined_values = []

    for state in agent.Q1.keys():
        q1_values.extend(agent.Q1[state])
        q2_values.extend(agent.Q2[state])
        combined_values.extend(agent.Q[state])

    # One histogram per table, left to right: Q1, Q2, combined
    panels = (
        (q1_values, 'blue', 'Q1 Value Distribution'),
        (q2_values, 'red', 'Q2 Value Distribution'),
        (combined_values, 'green', 'Combined Q-Value Distribution'),
    )
    for panel_ax, (values, color, title) in zip(axes, panels):
        panel_ax.hist(values, bins=30, alpha=0.7, color=color, edgecolor='black')
        panel_ax.set_title(title)
        panel_ax.set_xlabel('Q-Value')
        panel_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## Test Trained Agent

In [None]:
print("🎯 Testing Trained Double Q-Learning Agent")
print("=" * 45)

# Evaluation run: agent.act is called with training=False
# (presumably disabling exploration — confirm against the agent implementation)
test_episodes = 50
test_rewards = []

for episode in range(test_episodes):
    env.reset()
    total_reward, steps, done = 0, 0, False

    # Step until the environment reports done or the step cap is reached
    while not done and steps < env.max_steps:
        state = env.agent_pos  # the agent's position is what gets passed to act()
        action = agent.act(state, training=False)
        _, reward, done, _ = env.step(action)
        total_reward += reward
        steps += 1

    test_rewards.append(total_reward)

# An episode counts as successful when its total reward exceeds 400
successful_episodes = sum(1 for r in test_rewards if r > 400)

# Print test results
print(f"Test Episodes: {test_episodes}")
print(f"Mean Reward: {np.mean(test_rewards):.1f} ± {np.std(test_rewards):.1f}")
print(f"Success Rate: {successful_episodes/test_episodes:.1%}")
print(f"Best Performance: {np.max(test_rewards):.1f}")
print(f"Worst Performance: {np.min(test_rewards):.1f}")

# Visualize test performance: per-episode rewards (left) and their histogram (right)
mean_line_style = dict(color='red', linestyle='--', label=f'Mean: {np.mean(test_rewards):.1f}')

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(test_rewards, 'o-', alpha=0.7)
plt.axhline(np.mean(test_rewards), **mean_line_style)
plt.xlabel('Test Episode')
plt.ylabel('Reward')
plt.title('Test Episode Performance')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.hist(test_rewards, bins=15, alpha=0.7, edgecolor='black')
plt.axvline(np.mean(test_rewards), **mean_line_style)
plt.xlabel('Reward')
plt.ylabel('Frequency')
plt.title('Test Reward Distribution')
plt.legend()

plt.tight_layout()
plt.show()

## Demonstration Episode

In [None]:
print("🎬 Demonstration Episode")
print("=" * 25)

# Labels used to print actions; indexed by the action value returned by act()
# NOTE(review): assumes actions 0-3 mean Up/Down/Left/Right — confirm with WarehouseEnv
action_names = ['Up', 'Down', 'Left', 'Right']
max_demo_steps = 20  # Limit to 20 steps for demo

# Fresh episode and bookkeeping for the walkthrough
env.reset()
done = False
steps = 0
total_reward = 0
trajectory = []  # (state, action, reward, next_state) per step

print(f"Initial state: Agent at {env.agent_pos}")
print(f"Packages to collect: {len(env.packages_remaining)}")
print(f"Dropoff location: {env.dropoff}")
print()

# Step the agent, logging each transition, until done or the cap is reached
while True:
    if done or steps >= max_demo_steps:
        break

    state = env.agent_pos
    action = agent.act(state, training=False)

    next_state, reward, done, info = env.step(action)
    trajectory.append((state, action, reward, next_state))
    total_reward += reward

    print(f"Step {steps + 1}: {state} → {action_names[action]} → {next_state} (reward: {reward:+.0f})")

    steps += 1

print(f"\nDemo completed after {steps} steps")
print(f"Total reward: {total_reward}")
print(f"Packages remaining: {len(env.packages_remaining)}")
print(f"Carrying packages: {env.carrying_packages}")

# Render final state
env.render(mode='human')
plt.title(f'Agent State After {steps} Steps (Reward: {total_reward})')
plt.show()

## Save Trained Agent

In [None]:
# Create save directory; parents=True also creates any missing intermediate
# directory instead of raising FileNotFoundError
save_dir = Path('../models')
save_dir.mkdir(parents=True, exist_ok=True)

# Save agent via its own save() method
agent_path = save_dir / 'double_q_learning_trained.pkl'
agent.save(agent_path)

# Use the number of episodes actually recorded, not the configured
# `episodes`: the two differ when training was interrupted or `episodes` was
# edited without re-running the loop, and a mismatched 'episode' column would
# make the DataFrame constructor raise.
completed_episodes = len(episode_rewards)

print(f"✅ Agent saved to: {agent_path}")
print(f"Final performance: {np.mean(episode_rewards[-100:]):.1f} average reward (last 100 episodes)")
print(f"Training episodes: {completed_episodes}")

# Save training metrics, one row per recorded episode (1-based episode number)
metrics_df = pd.DataFrame({
    'episode': range(1, completed_episodes + 1),
    'reward': episode_rewards,
    'epsilon': epsilon_values,
    'success_rate': success_rates
})

metrics_path = save_dir / 'double_q_learning_metrics.csv'
metrics_df.to_csv(metrics_path, index=False)

print(f"📊 Training metrics saved to: {metrics_path}")