# Q-Learning: Complete Training, Validation & Testing

This notebook provides a complete workflow for a Q-Learning agent:
1. **Training**: Learn optimal policy through experience
2. **Validation**: Rigorous testing to ensure proper learning
3. **Testing**: Evaluate final performance and behavior

## Q-Learning Algorithm
- **Type**: Off-policy temporal difference learning
- **Update Rule**: Q(s,a) ← Q(s,a) + α[r + γ max_{a'} Q(s',a') - Q(s,a)]
- **Key Feature**: Uses max over next actions (optimistic)

In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import time

# Add src to path for imports
sys.path.append('../src')

from apr import WarehouseEnv, RLAgentValidator, AgentEvaluator
from apr.agents import create_agent
from apr.train import run_episode
from apr.utils import ensure_outputs_directory

# Plot styling: apply the 'seaborn-v0_8' matplotlib style sheet to the
# figures drawn by the cells below.
plt.style.use('seaborn-v0_8')
# IPython magic (not plain Python): show matplotlib figures inline in the notebook.
%matplotlib inline

# Prepare the output locations before anything is trained or saved.
# NOTE(review): presumably creates the outputs directory tree that the
# "Save Results" cell writes into -- confirm against apr.utils.
ensure_outputs_directory()
print("✅ Output directories ready")

## 1. Environment & Agent Setup

In [None]:
# --- Environment ------------------------------------------------------------
# Built with a fixed seed (42); its basic dimensions are reported below.
env = WarehouseEnv(seed=42)
print(
    f"Environment: {env.n_rows}x{env.n_cols} warehouse",
    f"Packages to collect: {len(env.packages_remaining)}",
    f"Max steps per episode: {env.max_steps}",
    sep="\n",
)

# Reset, then draw the warehouse so the starting layout is visible.
env.reset()
env.render(mode='human')
plt.title('Q-Learning Training Environment')
plt.show()

# --- Agent ------------------------------------------------------------------
# Hyperparameters are collected in one place and forwarded as keyword
# arguments to the agent factory.
q_learning_params = dict(
    alpha=0.1,            # Learning rate
    gamma=0.95,           # Discount factor
    epsilon=0.3,          # Initial exploration rate
    epsilon_decay=0.999,  # Exploration decay
    epsilon_min=0.05,     # Minimum exploration
)
agent = create_agent(
    'q_learning',
    env.observation_space,
    env.action_space,
    **q_learning_params,
)

# Echo the configuration actually stored on the agent object.
print(
    "\n🤖 Created Q-Learning Agent:",
    f"  Algorithm: {type(agent).__name__}",
    f"  Learning rate (α): {agent.alpha}",
    f"  Discount factor (γ): {agent.gamma}",
    f"  Initial exploration (ε): {agent.epsilon}",
    sep="\n",
)

## 2. Training Phase

In [None]:
# Training parameters
training_episodes = 800
log_interval = 100  # print a progress line every `log_interval` episodes

print("🚀 Starting Q-Learning Training")
print("=" * 40)

# Per-episode metrics; every list receives exactly one entry per episode so
# they stay index-aligned for the plots and the CSV export further down.
episode_rewards = []
episode_lengths = []
epsilon_values = []
q_table_sizes = []
training_times = []

# Durations are measured with time.perf_counter(): it is monotonic and high
# resolution, so the deltas cannot be distorted by wall-clock adjustments the
# way time.time() differences can.
start_time = time.perf_counter()

for episode in range(training_episodes):
    episode_start = time.perf_counter()

    # Run one episode in training mode; its return value is recorded as the
    # episode reward.
    reward = run_episode(env, agent, training=True)

    # Track metrics
    episode_rewards.append(reward)
    # The environment may not expose `episode_length`; 0 is stored as a
    # placeholder in that case so the list stays aligned with the others.
    episode_lengths.append(getattr(env, 'episode_length', 0))
    epsilon_values.append(agent.epsilon)
    q_table_sizes.append(len(agent.Q))
    training_times.append(time.perf_counter() - episode_start)

    # Periodic progress report averaged over the most recent log window.
    if (episode + 1) % log_interval == 0:
        avg_reward = np.mean(episode_rewards[-log_interval:])
        avg_time = np.mean(training_times[-log_interval:])
        print(f"Episode {episode + 1:3d}: Reward = {avg_reward:6.1f}, ε = {agent.epsilon:.3f}, "
              f"Q-states = {len(agent.Q):3d}, Time = {avg_time:.3f}s")

# Elapsed seconds for the whole training run (reused by later cells).
total_training_time = time.perf_counter() - start_time

print("\n✅ Training Complete!")
print(f"  Total time: {total_training_time:.1f}s")
print(f"  Final performance: {np.mean(episode_rewards[-50:]):.1f} (last 50 episodes)")
print(f"  Q-table size: {len(agent.Q)} states")
print(f"  Final exploration: {agent.epsilon:.3f}")

## 3. Training Analysis

In [None]:
def _moving_average(values, window):
    """Return ``(x, y)`` points of a moving average over *values*.

    Args:
        values: Sequence of per-episode numbers.
        window: Requested averaging window. It is clamped to ``len(values)``
            so that runs shorter than the window still yield a valid curve.

    Returns:
        A tuple ``(x, y)`` where ``y`` is the 'valid'-mode moving average and
        ``x`` holds, for each point, the index of the last episode that
        contributes to it. Both are empty when *values* is empty.
    """
    series = np.asarray(values, dtype=float)
    if series.size == 0:
        return np.arange(0), series
    # With mode='valid', np.convolve swaps its operands when the kernel is
    # longer than the data, which would leave x and y with different lengths.
    width = max(1, min(window, series.size))
    averaged = np.convolve(series, np.ones(width) / width, mode='valid')
    return np.arange(width - 1, series.size), averaged


# Create comprehensive training analysis: a 2x3 grid of diagnostic plots.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
window = 50  # episodes per moving-average window

# 1. Learning curve: raw per-episode reward plus its moving average
ax1 = axes[0, 0]
ax1.plot(episode_rewards, alpha=0.3, color='skyblue', linewidth=0.5, label='Raw')
smoothed_x, smoothed = _moving_average(episode_rewards, window)
ax1.plot(smoothed_x, smoothed, 'b-', linewidth=2, label='Smoothed')
ax1.set_xlabel('Episode')
ax1.set_ylabel('Reward')
ax1.set_title('Q-Learning: Learning Curve')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Epsilon decay
ax2 = axes[0, 1]
ax2.plot(epsilon_values, 'r-', linewidth=2)
ax2.set_xlabel('Episode')
ax2.set_ylabel('Epsilon')
ax2.set_title('Exploration Rate Decay')
ax2.grid(True, alpha=0.3)

# 3. Q-table growth
ax3 = axes[0, 2]
ax3.plot(q_table_sizes, 'g-', linewidth=2)
ax3.set_xlabel('Episode')
ax3.set_ylabel('Number of States')
ax3.set_title('Q-Table Growth')
ax3.grid(True, alpha=0.3)

# 4. Episode lengths (moving average only)
ax4 = axes[1, 0]
lengths_x, lengths_smoothed = _moving_average(episode_lengths, window)
ax4.plot(lengths_x, lengths_smoothed, 'm-', linewidth=2)
ax4.set_xlabel('Episode')
ax4.set_ylabel('Episode Length')
ax4.set_title('Episode Length Over Time')
ax4.grid(True, alpha=0.3)

# 5. Training time per episode (moving average only)
ax5 = axes[1, 1]
time_x, time_smoothed = _moving_average(training_times, window)
ax5.plot(time_x, time_smoothed, 'orange', linewidth=2)
ax5.set_xlabel('Episode')
ax5.set_ylabel('Time per Episode (s)')
ax5.set_title('Training Efficiency')
ax5.grid(True, alpha=0.3)

# 6. Final reward distribution
ax6 = axes[1, 2]
final_rewards = episode_rewards[-200:]  # Last 200 episodes
ax6.hist(final_rewards, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
ax6.axvline(np.mean(final_rewards), color='red', linestyle='--',
            label=f'Mean: {np.mean(final_rewards):.1f}')
ax6.set_xlabel('Reward')
ax6.set_ylabel('Frequency')
ax6.set_title('Final Performance Distribution')
ax6.legend()

plt.tight_layout()
plt.show()

# Print training statistics
print("📊 Training Statistics:")
print(f"  Episodes: {training_episodes}")
print(f"  Total time: {total_training_time:.1f}s ({total_training_time/60:.1f} min)")
print(f"  Avg time per episode: {np.mean(training_times):.3f}s")
print(f"  Final 100-episode average: {np.mean(episode_rewards[-100:]):.1f}")
print(f"  Best single episode: {np.max(episode_rewards):.1f}")
print(f"  Q-table final size: {len(agent.Q)} states")
# NOTE(review): the denominator counts grid cells only; if Q-table keys encode
# more than the agent position this ratio can exceed 100% -- confirm.
print(f"  State space coverage: {len(agent.Q)/(env.n_rows*env.n_cols)*100:.1f}%")

## 4. Validation Phase

In [None]:
print("🔍 Q-Learning Agent Validation")
print("=" * 35)

# Validation runs on a separate, newly created agent rather than on the one
# trained above.
# NOTE(review): epsilon_decay / epsilon_min are not passed here, unlike the
# training agent's configuration -- confirm the factory defaults are intended.
validation_agent = create_agent(
    'q_learning',
    env.observation_space,
    env.action_space,
    alpha=0.1,
    gamma=0.95,
    epsilon=0.3,
)

# Full validation pass; `validator` and `validation_results` are reused by
# later cells.
validator = RLAgentValidator(validation_agent, env, verbose=True)
validation_results = validator.full_validation(
    training_episodes=300, test_episodes=50, n_seeds=3
)

# ---- Summary banner ----
print("\n" + "="*60)
print("📊 VALIDATION SUMMARY")
print("="*60)

summary = validation_results['summary']
print(
    f"Overall Assessment: {summary['overall_assessment']}",
    "",
    "Component Scores:",
    sep="\n",
)

# One line per component, prefixed with an icon derived from its score.
for component, score in summary['scores'].items():
    if score == "PASS":
        status_icon = "✅"
    elif score == "WARNING":
        status_icon = "⚠️"
    else:
        status_icon = "❌"
    print(f"  {status_icon} {component.capitalize()}: {score}")

# Warnings are listed only when at least one was reported.
reported_warnings = summary['warnings']
if reported_warnings:
    print("\n⚠️  Warnings:")
    print(*(f"  - {warning}" for warning in reported_warnings), sep="\n")

# ---- Key metrics pulled from the individual result sections ----
learning_result = validation_results['learning']
exploration = validation_results['exploration']
generalization = validation_results['generalization']

print(
    "\nKey Validation Metrics:",
    f"  Learning improvement: {learning_result['improvement']:.1f} reward vs random",
    f"  Statistical significance: p={learning_result['statistical_test']['p_value']:.4f}",
    f"  State coverage: {exploration['state_coverage']['coverage_percent']:.1f}%",
    f"  Generalization consistency: {generalization['consistency_score']:.3f}",
    sep="\n",
)

## 5. Validation Visualization

In [None]:
# Plot the validation results using the `validator` created in the validation
# cell above (that cell must have been run first so the name exists).
# NOTE(review): presumably draws from the results stored by full_validation()
# -- confirm in RLAgentValidator.
validator.visualize_results()
print("✅ Validation visualization complete!")

## 6. Testing Phase

In [None]:
print("🎯 Q-Learning Agent Testing")
print("=" * 30)

# The agent trained earlier in this notebook is the one under test.
evaluator = AgentEvaluator(env, verbose=True)

# Evaluate over several seeds so more than one scenario is covered.
evaluation_seeds = [42, 123, 456, 789, 999]
test_results = evaluator.evaluate_agent(
    agent,
    num_episodes=100,
    seeds=evaluation_seeds,
    render=False,
)

# ---- Aggregated statistics ----
print("\n📊 TEST RESULTS SUMMARY")
print("=" * 40)

agg_stats = test_results['aggregated_results']['overall_statistics']
print(
    f"Mean Reward: {agg_stats['mean_reward']:.1f} ± {agg_stats['std_reward']:.1f}",
    f"Success Rate: {agg_stats['mean_success_rate']:.1%}",
    f"Mean Episode Length: {agg_stats['mean_episode_length']:.1f}",
    f"State Coverage: {agg_stats['mean_state_coverage']:.1%}",
    sep="\n",
)

# ---- Per-seed breakdown ----
seed_results = test_results['per_seed_results']
print("\nPerformance Across Seeds:")
# Only the result records are needed; each carries its own 'seed' field.
for result in seed_results.values():
    seed_stats = result['statistics']
    seed = result['seed']
    mean_reward = seed_stats['mean_reward']
    success_rate = seed_stats['success_rate']
    print(f"  Seed {seed}: {mean_reward:.1f} reward, {success_rate:.1%} success")

## 7. Testing Visualization

In [None]:
# Plot the evaluation results; needs `evaluator` and `test_results` from the
# testing cell above, so that cell must have been run first.
# NOTE(review): the exact figures produced are defined by AgentEvaluator --
# confirm there.
evaluator.visualize_evaluation(test_results)
print("✅ Testing visualization complete!")

## 8. Agent Demonstration

In [None]:
print("🎬 Q-Learning Agent Demonstration")
print("=" * 35)

# Start a new episode; the walkthrough is capped at `max_demo_steps` moves.
env.reset()
max_demo_steps = 25
done, steps, total_reward = False, 0, 0

print(
    f"Initial state: Agent at {env.agent_pos}",
    f"Packages to collect: {env.packages_remaining}",
    f"Dropoff location: {env.dropoff}",
    "",
    sep="\n",
)

# Human-readable labels, indexed by the action value returned by the agent.
action_names = ['Up', 'Down', 'Left', 'Right']

while steps < max_demo_steps and not done:
    # NOTE(review): the agent is queried with env.agent_pos as its state --
    # confirm this matches the observation it was trained on.
    state = env.agent_pos
    # training=False is intended to disable exploration for the demo.
    action = agent.act(state, training=False)

    next_state, reward, done, info = env.step(action)
    total_reward += reward

    # One log line per move: position, chosen action, result, rewards.
    print(f"Step {steps + 1:2d}: {state} → {action_names[action]:5s} → {next_state} "
          f"(reward: {reward:+4.0f}, total: {total_reward:+4.0f})")

    steps += 1

# Outcome of the walkthrough.
print(
    f"\nDemo completed after {steps} steps",
    f"Final reward: {total_reward}",
    f"Episode completed: {done}",
    f"Packages remaining: {len(env.packages_remaining)}",
    f"Carrying packages: {env.carrying_packages}",
    sep="\n",
)

# Render the environment as it stands after the last executed step.
env.render(mode='human')
plt.title(f'Q-Learning Agent After {steps} Steps (Reward: {total_reward})')
plt.show()

## 9. Policy Analysis

In [None]:
from collections import Counter

print("🧠 Q-Learning Policy Analysis")
print("=" * 30)

# One fixed colour per action index, so a given action is drawn in the same
# colour on every run regardless of the order in which it was first seen.
action_colors = ['red', 'blue', 'green', 'orange']

# Analyze learned Q-values and policy.
# Relies on `action_names` defined in the demonstration cell above.
if hasattr(agent, 'Q') and len(agent.Q) > 0:
    # Extract Q-values, the greedy action and the greedy value per state.
    all_q_values = []
    policy = {}
    state_values = {}

    for state, q_vals in agent.Q.items():
        # Only array-valued entries are analysed; anything else is skipped.
        if isinstance(q_vals, np.ndarray):
            all_q_values.extend(q_vals)
            policy[state] = int(np.argmax(q_vals))
            state_values[state] = np.max(q_vals)

    if not all_q_values:
        # np.min / np.max raise on empty input, so report and stop here.
        print("⚠️  Q-table has no array-valued entries to analyze")
    else:
        print("Q-table Statistics:")
        print(f"  States learned: {len(agent.Q)}")
        print(f"  Q-value range: [{np.min(all_q_values):.1f}, {np.max(all_q_values):.1f}]")
        print(f"  Q-value mean: {np.mean(all_q_values):.1f}")
        print(f"  Q-value std: {np.std(all_q_values):.1f}")

        # Action distribution in the greedy policy, ordered by action index
        # so the listing and the bar chart are stable between runs.
        action_dist = Counter(policy.values())
        print("\nPolicy Action Distribution:")
        for action, count in sorted(action_dist.items()):
            percentage = count / len(policy) * 100
            print(f"  {action_names[action]:5s}: {count:3d} states ({percentage:4.1f}%)")

        # Visualize Q-value distribution
        plt.figure(figsize=(12, 4))

        plt.subplot(1, 2, 1)
        plt.hist(all_q_values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        plt.axvline(np.mean(all_q_values), color='red', linestyle='--',
                    label=f'Mean: {np.mean(all_q_values):.1f}')
        plt.xlabel('Q-Value')
        plt.ylabel('Frequency')
        plt.title('Q-Value Distribution')
        plt.legend()

        plt.subplot(1, 2, 2)
        actions = sorted(action_dist)
        counts = [action_dist[a] for a in actions]
        action_labels = [action_names[a] for a in actions]
        bar_colors = [action_colors[a % len(action_colors)] for a in actions]

        plt.bar(action_labels, counts, color=bar_colors, alpha=0.7)
        plt.xlabel('Action')
        plt.ylabel('Frequency in Policy')
        plt.title('Learned Policy Action Distribution')

        plt.tight_layout()
        plt.show()

else:
    print("⚠️  No Q-table available for analysis")

## 10. Save Results

In [None]:
from apr.utils import get_outputs_dir

# Create save paths
outputs_dir = get_outputs_dir()
models_dir = outputs_dir / 'models'
# parents=True so the cell also works when the outputs directory itself has
# not been created yet.
models_dir.mkdir(parents=True, exist_ok=True)

# Save trained agent
agent_path = models_dir / 'q_learning_complete.pkl'
agent.save(agent_path)
print(f"✅ Saved trained agent to: {agent_path}")

# Save training metrics.
# The episode index is sized from the recorded data rather than from
# `training_episodes`, so a partially completed (interrupted) training run
# can still be exported without a column-length mismatch.
training_df = pd.DataFrame({
    'episode': range(1, len(episode_rewards) + 1),
    'reward': episode_rewards,
    'epsilon': epsilon_values,
    'q_table_size': q_table_sizes,
    'episode_length': episode_lengths,
    'training_time': training_times,
})

metrics_path = models_dir / 'q_learning_training_metrics.csv'
training_df.to_csv(metrics_path, index=False)
print(f"📊 Saved training metrics to: {metrics_path}")

# Directory for validation/test artefacts. Only the test summary below is
# written here; the validation results themselves are not persisted by this
# cell.
validation_dir = outputs_dir / 'validation_results'
validation_dir.mkdir(parents=True, exist_ok=True)

# Save test results as a single-row summary table.
# Uses `agg_stats` from the testing cell above.
test_summary = pd.DataFrame([{
    'algorithm': 'Q-Learning',
    'mean_reward': agg_stats['mean_reward'],
    'std_reward': agg_stats['std_reward'],
    'success_rate': agg_stats['mean_success_rate'],
    'episode_length': agg_stats['mean_episode_length'],
    'state_coverage': agg_stats['mean_state_coverage'],
    'training_episodes': training_episodes,
    'training_time': total_training_time,
    'final_q_table_size': len(agent.Q),
}])

test_path = validation_dir / 'q_learning_test_summary.csv'
test_summary.to_csv(test_path, index=False)
print(f"🎯 Saved test summary to: {test_path}")

# Final summary (uses `summary` from the validation cell above).
print("\n" + "="*50)
print("🎉 Q-LEARNING COMPLETE WORKFLOW FINISHED!")
print("="*50)
print(f"Training Episodes: {training_episodes}")
print(f"Training Time: {total_training_time:.1f}s")
print(f"Final Performance: {np.mean(episode_rewards[-50:]):.1f} reward")
print(f"Test Performance: {agg_stats['mean_reward']:.1f} ± {agg_stats['std_reward']:.1f}")
print(f"Success Rate: {agg_stats['mean_success_rate']:.1%}")
print(f"Validation Status: {summary['overall_assessment']}")
print("\n✅ All results saved to outputs directory")
print("\n✅ All results saved to outputs directory")