# Project 2: Highway Driving Baseline

Train an agent to drive safely on a highway.

**Runtime:** ~3 minutes for baseline (50k steps)

## Setup

In [None]:
!pip install stable-baselines3 highway-env pygame -q

In [None]:
import gymnasium as gym
import highway_env
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt

## Create Environment

The highway environment features:
- 4 lanes (as set by `lanes_count` in the configuration cell below)
- Dynamic traffic
- 5 discrete actions, indexed 0-4: [left, idle, right, faster, slower]

In [None]:
# Create the environment; render_mode='rgb_array' makes env.render() return
# image frames that the visualization cells display with matplotlib.
env = gym.make('highway-fast-v0', render_mode='rgb_array')

print(f"Environment: {env.spec.id}")
print(f"Observation space: {env.observation_space.shape}")
print(f"  (5 vehicles x 5 features: presence, x, y, vx, vy)")
print(f"Action space: {env.action_space}")
# highway-env's DiscreteMetaAction orders its actions as
# LANE_LEFT, IDLE, LANE_RIGHT, FASTER, SLOWER -- index 0 is a lane change,
# not "idle".
print(f"  0=left, 1=idle, 2=right, 3=faster, 4=slower")

## Configure Environment (Optional)

Customize scenarios:

In [None]:
# Optional: override selected entries of the environment configuration.
# The observation and action settings are built separately for readability.
observation_settings = dict(
    type="Kinematics",
    vehicles_count=5,
    features=["presence", "x", "y", "vx", "vy"],
    absolute=False,
)
action_settings = dict(type="DiscreteMetaAction")

config = dict(
    observation=observation_settings,
    action=action_settings,
    lanes_count=4,
    vehicles_count=50,
    duration=40,  # seconds
    reward_speed_range=[20, 30],
    simulation_frequency=15,
    policy_frequency=1,
)

# Merge the overrides into the live config dict of the unwrapped env, then
# reset (presumably so the next episode is built from the new settings).
env.unwrapped.config.update(config)
env.reset()

print("\nEnvironment configured!")

## Visualize Environment

In [None]:
# Show the initial state: reset, grab one rendered frame, and display it.
obs, _ = env.reset()
img = env.render()

plt.figure(figsize=(12, 4))
plt.imshow(img)
plt.title("Highway Environment (Bird's Eye View)")
plt.axis('off')
plt.show()

print(f"\nObservation shape: {obs.shape}")
print(f"First vehicle (ego): {obs[0]}")
# With "absolute": False, highway-env expresses only the *other* vehicles
# relative to the ego; the ego's own row (row 0) stays in absolute
# coordinates, so it must not be labelled rel_*.
print(f"  [presence, x, y, vx, vy] (ego row is absolute; other rows are relative to the ego)")

## Train Baseline Agent

In [None]:
# Hyperparameters for the baseline DQN agent, kept in one place so they are
# easy to tweak between experiments.
dqn_hyperparams = dict(
    learning_rate=5e-4,
    buffer_size=15000,
    learning_starts=200,
    batch_size=32,
    gamma=0.8,
    train_freq=1,
    gradient_steps=1,
    target_update_interval=50,
    exploration_fraction=0.7,
    verbose=1,
    seed=42,
)

# Build the agent with an MLP policy on the environment created above.
model = DQN('MlpPolicy', env, **dqn_hyperparams)

print("\nDQN agent created. Ready to train!")

In [None]:
# Train the agent for the baseline step budget.
baseline_timesteps = 50_000

print("Training for 50k steps (~3 minutes)...\n")
model.learn(total_timesteps=baseline_timesteps)
print("\nTraining complete!")

## Evaluate Agent

In [None]:
# Detailed evaluation
def detailed_evaluate(model, env, n_episodes=20):
    """Roll out ``model`` deterministically in ``env`` and summarise the runs.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        env: Gymnasium-style environment whose ``reset()`` returns
            ``(obs, info)`` and whose ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        n_episodes: Number of evaluation episodes; must be at least 1.

    Returns:
        A dict of plain floats:

        * ``mean_reward`` / ``std_reward``: mean and (population) standard
          deviation of the per-episode total reward.
        * ``success_rate``: percentage of episodes in which no step
          reported a truthy ``info['crashed']``.
        * ``crash_rate``: percentage of episodes in which at least one step
          reported a truthy ``info['crashed']``.
        * ``avg_speed``: mean over episodes of the per-episode mean of
          ``info['speed']``, in whatever unit the environment reports;
          ``0.0`` if no step ever reported a speed.

    Raises:
        ValueError: If ``n_episodes`` is less than 1 (the statistics would
            otherwise be NaN).
    """
    if n_episodes < 1:
        raise ValueError(f"n_episodes must be >= 1, got {n_episodes}")

    rewards = []
    speeds = []
    crashes = []

    for _episode in range(n_episodes):
        obs, _info = env.reset()
        done = False
        episode_reward = 0.0
        episode_speeds = []
        crashed = False

        while not done:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward

            # Speed is optional in `info`; only record it when present.
            if 'speed' in info:
                episode_speeds.append(info['speed'])

            # A crash on any step marks the whole episode as crashed.
            if info.get('crashed', False):
                crashed = True

        rewards.append(episode_reward)
        crashes.append(crashed)
        # Episodes that never reported a speed are left out of the average.
        if episode_speeds:
            speeds.append(np.mean(episode_speeds))

    # Success is defined as "did not crash", so it is derived from `crashes`.
    successes = [not crashed for crashed in crashes]

    return {
        'mean_reward': float(np.mean(rewards)),
        'std_reward': float(np.std(rewards)),
        'success_rate': float(np.mean(successes) * 100),
        'crash_rate': float(np.mean(crashes) * 100),
        'avg_speed': float(np.mean(speeds)) if speeds else 0.0,
    }


# Evaluate the trained agent and print a summary.
# The episode count lives in one variable so the header below cannot drift
# out of sync with the actual number of episodes run.
n_eval_episodes = 20
results = detailed_evaluate(model, env, n_episodes=n_eval_episodes)

print(f"\n{'='*50}")
print(f"Evaluation Results ({n_eval_episodes} episodes):")
print(f"{'='*50}")
print(f"  Mean reward: {results['mean_reward']:.2f} ± {results['std_reward']:.2f}")
print(f"  Success rate: {results['success_rate']:.1f}%")
print(f"  Crash rate: {results['crash_rate']:.1f}%")
# highway-env reports vehicle speed in m/s (the same unit as
# reward_speed_range), so the value must not be labelled km/h.
print(f"  Average speed: {results['avg_speed']:.1f} m/s")
print(f"{'='*50}")

# 70% crash-free episodes is the bar this notebook uses for "good enough".
if results['success_rate'] >= 70:
    print(f"\n✓ Good performance! Agent drives safely.")
else:
    print(f"\n⚠ Agent needs more training.")
    print(f"  Try: model.learn(total_timesteps=150_000)")

## Visualize Trained Agent

In [None]:
# Roll out one deterministic episode, storing a rendered frame after each step.
max_frames = 200  # Max 200 frames
frames = []
episode_reward = 0
done = False
obs, _ = env.reset()

while not (done or len(frames) >= max_frames):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    episode_reward += reward
    frames.append(env.render())
    done = terminated or truncated

print(f"Episode reward: {episode_reward:.2f}")
print(f"Crashed: {info.get('crashed', False)}")

# Show three sample frames: the first, the middle, and the last one recorded.
sample_indices = [0, len(frames) // 2, len(frames) - 1]
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, idx in zip(axes, sample_indices):
    ax.imshow(frames[idx])
    ax.set_title(f"Frame {idx}")
    ax.axis('off')

plt.suptitle(f"Agent Driving (Reward: {episode_reward:.1f})")
plt.tight_layout()
plt.show()

## Next Steps

Now that you have a working baseline:

1. **Train longer:** Try 150k steps for better performance
2. **Reward shaping:** Modify rewards for safety/speed trade-off
3. **Traffic scenarios:** Increase vehicle density
4. **Continuous control:** Switch to PPO with continuous actions

See `project2_highway_README.md` for detailed ideas!