In [None]:
import sys
sys.path.append('..')

import torch
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from src.agents import DQNAgent
from src.environment import NavigationEnv
from src.training import BasicTrainer
from src.utils import TrainingLogger

## Setup

In [None]:
# Pin the NumPy and PyTorch global RNGs to the same fixed seed so that
# repeated runs of the notebook start from the same random state.
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

## Create Environment and Agent

In [None]:
# Environment instance shared by the training and evaluation cells below.
env = NavigationEnv()

# DQN hyperparameters collected in one mapping so they can be read (and
# tweaked) in a single place before the agent is built.
# NOTE(review): state_dim=8 / action_dim=4 presumably match NavigationEnv's
# observation and action sizes -- confirm against the environment.
agent_kwargs = dict(
    state_dim=8,
    action_dim=4,
    learning_rate=1e-4,
    gamma=0.99,
    hidden_dims=[256, 256],
    device=device,
)
agent = DQNAgent(**agent_kwargs)

print("Agent created successfully")

## Train Agent

In [None]:
# Checkpoint location, named once so the directory that is created and the
# path handed to the trainer cannot drift apart.
SAVE_PATH = '../trained_models/basic'

# Create the save directory up front, before the trainer receives the path,
# so the location already exists whenever the trainer first touches it.
Path(SAVE_PATH).mkdir(parents=True, exist_ok=True)

trainer = BasicTrainer(
    agent=agent,
    env=env,
    num_episodes=500,
    batch_size=64,
    learning_starts=1000,
    save_path=SAVE_PATH,
    log_interval=10
)

# Run the training loop; `stats` is indexed below (and in the plotting cell)
# by the keys 'mean_reward', 'mean_length', 'training_time',
# 'episode_rewards' and 'episode_lengths'.
stats = trainer.train()

print("\nTraining completed!")
print(f"Mean reward (last 100): {stats['mean_reward']:.2f}")
print(f"Mean length (last 100): {stats['mean_length']:.1f}")
print(f"Training time: {stats['training_time']:.2f}s")

## Visualize Results

In [None]:
def _label_axis(ax, title, xlabel, ylabel):
    """Give one subplot its title, axis labels and a faint background grid."""
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)


# Output file for the 2x2 training summary figure.
FIGURE_PATH = '../results/figures/dqn_training.png'

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Episode rewards: faint raw curve, plus a moving average once there are
# more episodes than the smoothing window.
episode_rewards = stats['episode_rewards']
axes[0, 0].plot(episode_rewards, alpha=0.3)
window = 50
if len(episode_rewards) > window:
    moving_avg = np.convolve(episode_rewards,
                             np.ones(window) / window, mode='valid')
    # 'valid' mode drops the first window-1 points, so shift the x-axis to
    # line each average up with the last episode it covers.
    axes[0, 0].plot(range(window - 1, len(episode_rewards)),
                    moving_avg, linewidth=2, label=f'{window}-episode avg')
    axes[0, 0].legend()
_label_axis(axes[0, 0], 'Episode Rewards', 'Episode', 'Reward')

# Episode lengths
axes[0, 1].plot(stats['episode_lengths'], alpha=0.5)
_label_axis(axes[0, 1], 'Episode Lengths', 'Episode', 'Steps')

# Training losses (panel is left untouched when nothing was recorded)
if trainer.losses:
    axes[1, 0].plot(trainer.losses, alpha=0.5)
    _label_axis(axes[1, 0], 'Training Loss', 'Training Step', 'Loss')

# Q-values (panel is left untouched when nothing was recorded)
if trainer.q_values:
    axes[1, 1].plot(trainer.q_values, alpha=0.5)
    _label_axis(axes[1, 1], 'Average Q-Values', 'Training Step', 'Q-Value')

plt.tight_layout()
# savefig does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path(FIGURE_PATH).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(FIGURE_PATH, dpi=150, bbox_inches='tight')
plt.show()

## Test Trained Agent

In [None]:
# Greedy evaluation of the trained agent: run a fixed number of episodes
# with exploration switched off and report the per-episode and mean reward.
NUM_TEST_EPISODES = 10   # how many evaluation episodes to run
MAX_TEST_STEPS = 1000    # hard cap on steps per episode if it never ends

test_rewards = []

for episode in range(NUM_TEST_EPISODES):
    state, _ = env.reset()
    episode_reward = 0

    for step in range(MAX_TEST_STEPS):
        action = agent.select_action(state, epsilon=0.0)  # Greedy
        state, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward

        # Stop as soon as the environment signals either kind of episode end.
        if terminated or truncated:
            break

    test_rewards.append(episode_reward)
    print(f"Test Episode {episode + 1}: Reward = {episode_reward:.2f}")

# \u00b1 is the plus-minus sign; the escape keeps the source ASCII-only so the
# character cannot be mis-decoded into mojibake again.
print(f"\nAverage test reward: {np.mean(test_rewards):.2f} \u00b1 {np.std(test_rewards):.2f}")

In [None]:
# Final cell: close the environment now that training and evaluation are done.
# NOTE(review): presumably releases resources held by NavigationEnv -- confirm.
env.close()