# Part 2 : Implementation of a continuous environment

In [61]:
import gymnasium as gym
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
from torch.utils.tensorboard import SummaryWriter
from gymnasium.wrappers import RecordEpisodeStatistics, RecordVideo
from matplotlib.animation import FFMpegWriter
import highway_env

### 1. Load environment config

In [62]:
with open("config_part2.pkl", "rb") as f:
    config = pickle.load(f)
# Create the continuous-action Racetrack environment
base_env = gym.make("racetrack-v0", render_mode="rgb_array")
base_env.unwrapped.configure(config)
# Record stats and videos: record 20 videos uniformly across training
env = RecordEpisodeStatistics(base_env)
env = RecordVideo(env,
                   video_folder="videos_racetrack/",
                   episode_trigger=lambda episode_id: episode_id % 20 == 0,
                   name_prefix="train_ep")


  logger.warn(


In [63]:
# Set seeds for reproducibility
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
env.action_space.seed(seed)
env.observation_space.seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [64]:
# Hyperparameters
gamma = 0.99            # discount factor
tau = 0.005             # target network update rate
actor_lr = 1e-4         # learning rate for actor
critic_lr = 1e-3        # learning rate for critic
buffer_capacity = 100000
batch_size = 64
num_episodes = 20
max_steps = config.get("duration") * config.get("policy_frequency")
writer = SummaryWriter(log_dir='racetrack_dqn/')

### 2. Replay buffer

In [65]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    def push(self, state, action, reward, done, next_state):
        self.buffer.append((state, action, reward, done, next_state))
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, dones, next_states = map(np.stack, zip(*batch))
        return states, actions, rewards, dones, next_states
    def __len__(self):
        return len(self.buffer)

### 3. Actor Network

In [66]:
class Actor(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, act_dim),
            nn.Tanh()  # output in [-1,1]
        )
    def forward(self, x):
        return self.net(x)

### 4. Critic Network

In [67]:
class Critic(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    def forward(self, obs, act):
        x = torch.cat([obs, act], dim=-1)
        return self.net(x)

### 5. DDPG Agent

In [68]:
class GaussianNoise:
    def __init__(self, dim, std):
        self.dim = dim
        self.std = std
    def sample(self):
        # retourne un vecteur de taille dim ~ N(0, std²)
        return np.random.randn(self.dim).astype(np.float32) * self.std
    
class DDPGAgent:
    def __init__(self, obs_dim, act_dim):
        # Main networks
        self.actor = Actor(obs_dim, act_dim).to(device)
        self.critic = Critic(obs_dim, act_dim).to(device)
        # Target networks
        self.targ_actor = Actor(obs_dim, act_dim).to(device)
        self.targ_critic = Critic(obs_dim, act_dim).to(device)
        # Copy weights
        self.targ_actor.load_state_dict(self.actor.state_dict())
        self.targ_critic.load_state_dict(self.critic.state_dict())
        # Optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        # Replay buffer
        self.buffer = ReplayBuffer(buffer_capacity)
        # Exploration noise std
        self.noise_std = 0.1
        self.noise = GaussianNoise(act_dim, self.noise_std)

        self.act_low, self.act_high = env.action_space.low, env.action_space.high
    
    def get_action(self, state, noise=True):
        # 0) Flatten
        state_arr = np.array(state, dtype=np.float32).ravel()
        # 1) Batch dim
        state_t = torch.from_numpy(state_arr).unsqueeze(0).to(device)

        # 2) Prédiction de l’Actor et extraction de la batch 0
        action = self.actor(state_t).cpu().detach().numpy()[0]

        # 4) Exploration
        if noise:
            action += self.noise.sample()

        # 5) Remise à l’échelle de [−1,1] → [act_low, act_high] + clipping
        action = np.clip(
            action * (self.act_high - self.act_low) + self.act_low,
            self.act_low,
            self.act_high
        )

        return action

    def update(self, step):
        if len(self.buffer) < batch_size:
            return
        # Sample a batch
        states, actions, rewards, dones, next_states = self.buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(device)
        actions = torch.FloatTensor(actions).to(device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        
        low = env.action_space.low
        high = env.action_space.high

        # Critic loss
        with torch.no_grad():
            next_actions = self.targ_actor(next_states)
            next_actions = low + (next_actions + 1.0) * 0.5 * (high - low)
            target_q = self.targ_critic(next_states, next_actions)
            y = rewards + gamma * (1 - dones) * target_q
        q = self.critic(states, actions)
        critic_loss = nn.MSELoss()(q, y)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss (maximize Q)
        pred_actions = self.actor(states)
        pred_actions = low + (pred_actions + 1.0) * 0.5 * (high - low)
        actor_loss = -self.critic(states, pred_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update target networks
        for param, targ_param in zip(self.actor.parameters(), self.targ_actor.parameters()):
            targ_param.data.copy_(tau * param.data + (1 - tau) * targ_param.data)
        for param, targ_param in zip(self.critic.parameters(), self.targ_critic.parameters()):
            targ_param.data.copy_(tau * param.data + (1 - tau) * targ_param.data)

                # Log losses to TensorBoard
        writer.add_scalar('loss/critic', critic_loss.item(), step)
        writer.add_scalar('loss/actor', actor_loss.item(), step)
        
    def store_transition(self, state, action, reward, done, next_state):
        self.buffer.push(state, action, reward, done, next_state)

In [70]:
obs, _ = env.reset()  
flat = np.array(obs, dtype=np.float32).ravel()
print("flat.shape:", flat.shape)   # ← doit être (6,) si vos next_states sont en 6 colonnes


flat.shape: (30,)


### 6. Training Loop

In [69]:
obs, _ = env.reset()            # gymnasium retourne (obs, info)
flat_obs = np.array(obs, dtype=np.float32).ravel()
print("flat_obs.shape =", flat_obs.shape)   # disons (30,)

obs_dim = flat_obs.shape[0]     # 30
act_dim = env.action_space.shape[0]
agent = DDPGAgent(obs_dim, act_dim)

global_step = 0
for ep in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0.0
    for step in range(int(max_steps)):
        action = agent.get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.store_transition(state, action, reward, done, next_state)
        agent.update(global_step)
        state = next_state
        total_reward += reward
        global_step += 1
        if done:
            break
    # Log episode reward
    writer.add_scalar('episode/reward', total_reward, ep)
    print(f"Episode {ep+1}/{num_episodes}: Reward = {total_reward:.2f}")

# Save final models
torch.save(agent.actor.state_dict(), "ddpg_actor.pth")
torch.save(agent.critic.state_dict(), "ddpg_critic.pth")

flat_obs.shape = (30,)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (320x6 and 30x256)

### 7. Evaluation comparison

In [None]:
eval_env = RecordEpisodeStatistics(
    gym.make("highway-fast-v0", render_mode="rgb_array").unwrapped.configure(config)
)
# Test trained agent
trained_rewards = []
agent.actor.eval()
for i in range(10):
    state, _ = eval_env.reset()
    r_sum = 0
    done = False
    while not done:
        a = agent.get_action(state, noise=False)
        state, r, term, trunc, _ = eval_env.step(a)
        r_sum += r
        done = term or trunc
    trained_rewards.append(r_sum)
# Test untrained (random) agent
random_rewards = []
for i in range(10):
    state, _ = eval_env.reset()
    r_sum = 0
    done = False
    while not done:
        a = eval_env.action_space.sample()
        state, r, term, trunc, _ = eval_env.step(a)
        r_sum += r
        done = term or trunc
    random_rewards.append(r_sum)

print("Average trained reward:", np.mean(trained_rewards))
print("Average random reward:", np.mean(random_rewards))
# Close environments
env.close()
eval_env.close()