# Partie 2 : Implémentation d'un algorithme DDPG sur un environnement continu Racetrack

In [173]:
import os
import pickle
import random
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import RecordEpisodeStatistics, RecordVideo, FlattenObservation
import highway_env
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt


### 1. Chargement de la configuration

In [174]:
# --- 1. Load the pickled environment configuration ---
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted config_part2.pkl.
with open("config_part2.pkl", "rb") as f:
    config = pickle.load(f)

# --- 2. Create the env and configure the underlying instance ---
env = gym.make("racetrack-v0", render_mode='rgb_array')
raw_env = env.unwrapped          # grab the instance without any wrappers
raw_env.configure(config)        # apply the loaded config

# --- 3. Flatten the observation into a 1-D vector ---
# This wraps raw_env directly, so any wrappers gym.make() added above are not part of the final env.
flat_env = FlattenObservation(raw_env)


# --- 4. Re-wrap so that only one episode out of 10 is recorded ---
env = RecordVideo(
    flat_env,
    video_folder="videos_racetrack_grid/",
    name_prefix="ddpg-racetrack_occupancy",
    episode_trigger=lambda episode_id: episode_id % 10 == 0
)

# --- 5. Infer state_dim & action_dim dynamically from a real reset ---
obs, _ = env.reset()
state = obs if not isinstance(obs, dict) else obs["observation"]
state_dim  = state.shape[0]               # expected to be 12*12*2 = 288
action_dim = env.action_space.shape[0]    # steering only → 1
act_max    = env.action_space.high        # upper action bound, later used for scaling/clipping

print(f"state_dim = {state_dim}, action_dim = {action_dim}")


### For the kinematics environment ###
# 5. Fetch a real observation to infer state_dim
# reset_out = env.reset()

# # gymnasium returns (obs, info)
# obs = reset_out[0] if isinstance(reset_out, tuple) else reset_out
# # if it is a dict, take the "observation" key
# if isinstance(obs, dict):
#     obs = obs["observation"]
  
# state_dim  = obs.shape[0]                # = 30 after flatten
# action_dim = env.action_space.shape[0]   # = 2
# act_max    = env.action_space.high

# Echo the dimensions that are actually in effect for the rest of the notebook.
print(f"→ state_dim = {state_dim}, action_dim = {action_dim}")

state_dim = 288, action_dim = 1
→ state_dim = 288, action_dim = 1


### 2. Réseaux Actor / Critic

In [175]:

class Actor(nn.Module):
    """Deterministic policy network: maps a state to a bounded action.

    Two hidden ReLU layers of 256 units feed a tanh output layer that is
    rescaled by ``a_max``, so every action component lies in
    ``[-a_max, a_max]``.

    Args:
        s_dim: Size of the (flattened) state vector.
        a_dim: Number of action components produced by the network.
        a_max: Per-component action magnitude (scalar or array-like that
            broadcasts against an ``(batch, a_dim)`` tensor).
    """

    def __init__(self, s_dim, a_dim, a_max):
        super().__init__()
        self.fc1 = nn.Linear(s_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        # One output unit per action component; this used to be hard-coded
        # to 1, which silently ignored ``a_dim``.
        self.fc3 = nn.Linear(256, a_dim)
        # Registered as a buffer so it follows ``.to(device)`` together with
        # the weights. ``persistent=False`` keeps it out of ``state_dict`` so
        # checkpoints saved before this change still load.
        self.register_buffer(
            "a_max",
            torch.as_tensor(a_max, dtype=torch.float32).clone(),
            persistent=False,
        )

    def forward(self, x):
        """Return the action for state batch ``x`` of shape ``(batch, s_dim)``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # tanh is in [-1, 1]; rescale to the environment's action range.
        return torch.tanh(self.fc3(x)) * self.a_max

class Critic(nn.Module):
    """Action-value network: scores a (state, action) pair with a scalar Q.

    Args:
        s_dim: Size of the state vector.
        a_dim: Size of the action vector.
    """

    def __init__(self, s_dim, a_dim):
        super().__init__()
        width = 256
        # State and action are concatenated, hence the summed input size.
        self.fc1 = nn.Linear(s_dim + a_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, 1)

    def forward(self, s, a):
        """Return Q(s, a) with shape ``(batch, 1)`` for batched ``s`` and ``a``."""
        joint = torch.cat((s, a), dim=1)
        hidden = torch.relu(self.fc1(joint))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

### 3. Replay Buffer

In [176]:

class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    Each stored item is a 5-tuple ``(state, action, reward, next_state, done)``.
    Once ``max_size`` items are held, adding a new one drops the oldest.
    """

    def __init__(self, max_size=int(1e6)):
        self.storage = deque(maxlen=max_size)

    def add(self, data):
        """Append one transition tuple to the buffer."""
        self.storage.append(data)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and stack them field-wise.

        Returns:
            A 5-tuple of 2-D arrays ``(states, actions, rewards, next_states,
            dones)``, each with one row per sampled transition.
        """
        picked = random.sample(self.storage, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        fields = (states, actions, rewards, next_states, dones)
        return tuple(np.vstack(field) for field in fields)

### 4. Bruit Ornstein–Uhlenbeck

In [177]:

class OUNoise:
    """Ornstein-Uhlenbeck process producing temporally correlated noise.

    Args:
        size: Number of noise components.
        mu: Value the process reverts to.
        theta: Strength of the pull back towards ``mu``.
        sigma: Scale of the Gaussian perturbation added at every call.
    """

    def __init__(self, size, mu=0., theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta, self.sigma = theta, sigma
        self.state = self.mu.copy()

    def reset(self):
        """Restart the process at ``mu`` (called at the start of an episode)."""
        self.state = self.mu.copy()

    def __call__(self):
        """Advance the process by one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        # Return a copy: handing out the internal array would let later calls
        # (or the caller) silently rewrite samples that were already returned.
        return self.state.copy()

### 5. Agent DDPG

In [178]:
class DDPGAgent:
    """DDPG agent: actor/critic networks, their targets, replay and noise.

    Args:
        s_dim: Size of the state vector.
        a_dim: Size of the action vector.
        a_max: Action magnitude bound; actions are clipped to
            ``[-a_max, a_max]``.
        device: Torch device on which the networks live.
    """

    def __init__(self, s_dim, a_dim, a_max, device):
        self.device = device
        # Keep the bound on the agent so select_action does not depend on a
        # module-level global being defined.
        self.a_max = np.asarray(a_max, dtype=np.float32)
        # Main and target networks.
        self.actor = Actor(s_dim, a_dim, a_max).to(device)
        self.actor_target = Actor(s_dim, a_dim, a_max).to(device)
        self.critic = Critic(s_dim, a_dim).to(device)
        self.critic_target = Critic(s_dim, a_dim).to(device)
        # Targets start as exact copies of the main networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        # Optimizers.
        self.opt_a = optim.Adam(self.actor.parameters(), lr=1e-5)
        self.opt_c = optim.Adam(self.critic.parameters(), lr=5e-4)
        # Replay buffer and exploration noise.
        self.buffer = ReplayBuffer()
        self.noise = OUNoise(a_dim)
        # Hyperparameters.
        self.gamma = 0.99       # discount factor
        self.tau = 5e-4         # soft-update rate of the target networks
        self.batch_size = 32

    def select_action(self, state, noise=True):
        """Return the actor's action for ``state``, clipped to the action bound.

        Args:
            state: 1-D state vector.
            noise: When true, add one exploration-noise sample before clipping.

        Returns:
            1-D numpy array of length ``a_dim``.
        """
        state_t = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        # Inference only: no autograd graph is needed here.
        with torch.no_grad():
            action = self.actor(state_t).cpu().numpy().flatten()
        if noise:
            action += self.noise()
        return np.clip(action, -self.a_max, self.a_max)

    @staticmethod
    def _soft_update(source, target, tau):
        """Blend ``source`` parameters into ``target``: t <- tau*s + (1-tau)*t."""
        with torch.no_grad():
            for src_p, tgt_p in zip(source.parameters(), target.parameters()):
                tgt_p.copy_(tau * src_p + (1 - tau) * tgt_p)

    def update(self):
        """Run one critic step, one actor step and a soft target update.

        Returns:
            ``(actor_loss, critic_loss)`` as floats, or ``(None, None)`` while
            the buffer holds fewer than ``batch_size`` transitions.
        """
        if len(self.buffer.storage) < self.batch_size:
            return None, None
        s, a, r, s2, done = self.buffer.sample(self.batch_size)
        s = torch.FloatTensor(s).to(self.device)
        a = torch.FloatTensor(a).to(self.device)
        r = torch.FloatTensor(r).to(self.device)
        s2 = torch.FloatTensor(s2).to(self.device)
        # 1 for non-terminal transitions, 0 for terminal ones: masks the
        # bootstrap term of the TD target.
        not_done = torch.FloatTensor(1 - done).to(self.device)

        # Critic update: regress Q(s, a) onto the one-step TD target computed
        # with the target networks.
        with torch.no_grad():
            a2 = self.actor_target(s2)
            q2 = self.critic_target(s2, a2)
            q_target = r + self.gamma * not_done * q2
        q = self.critic(s, a)
        loss_c = F.mse_loss(q, q_target)
        self.opt_c.zero_grad()
        loss_c.backward()
        self.opt_c.step()

        # Actor update: ascend the critic's estimate of the actor's actions.
        loss_a = -self.critic(s, self.actor(s)).mean()
        self.opt_a.zero_grad()
        loss_a.backward()
        self.opt_a.step()

        # Soft update of both target networks.
        self._soft_update(self.actor, self.actor_target, self.tau)
        self._soft_update(self.critic, self.critic_target, self.tau)

        return loss_a.item(), loss_c.item()

    def save(self, path):
        """Write the actor and critic weights to ``actor.pth`` / ``critic.pth`` in ``path``."""
        torch.save(self.actor.state_dict(), os.path.join(path, "actor.pth"))
        torch.save(self.critic.state_dict(), os.path.join(path, "critic.pth"))

    def load(self, path):
        """Load actor and critic weights from ``path`` and re-sync the targets."""
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only
        # machine (and vice versa).
        self.actor.load_state_dict(
            torch.load(os.path.join(path, "actor.pth"), map_location=self.device)
        )
        self.critic.load_state_dict(
            torch.load(os.path.join(path, "critic.pth"), map_location=self.device)
        )
        # Only the main networks are checkpointed; without this the targets
        # would keep their unrelated pre-load weights when training resumes.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())



### 6. Entraînement

In [179]:

def train(agent, env, max_episodes=200, max_steps=150, save_dir="models"):
    """Train ``agent`` on ``env`` and return the list of episode returns.

    Per-episode metrics are logged to TensorBoard, and the agent is saved to
    ``save_dir`` every 50 episodes and once more at the end. The TensorBoard
    writer and ``env`` are closed before returning, including when an
    exception interrupts training.

    Args:
        agent: Object exposing ``select_action``, ``update``, ``save``,
            ``buffer.add`` and ``noise.reset``.
        env: Gym/Gymnasium-style environment (4- or 5-tuple ``step``).
        max_episodes: Number of training episodes.
        max_steps: Maximum number of steps per episode.
        save_dir: Directory receiving the checkpoints (created if missing).

    Returns:
        List with the undiscounted return of every episode.
    """
    os.makedirs(save_dir, exist_ok=True)
    writer = SummaryWriter()
    rewards_history = []

    try:
        for ep in range(1, max_episodes + 1):
            # gymnasium.reset() → (obs, info)
            reset_out = env.reset()
            state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
            # NOTE(review): the distance metric below assumes state[0], state[1]
            # are the vehicle x, y and that "centering_position" is a track
            # reference point — confirm against the observation type in use.
            cx, cy = env.unwrapped.config["centering_position"]
            if isinstance(state, dict):
                state = state.get("observation", next(iter(state.values())))
            agent.noise.reset()

            ep_reward = 0
            ep_loss_a = 0.0
            ep_loss_c = 0.0
            update_count = 0
            collisions = 0
            dist_center = 0.0
            actions = []
            length = 0

            for _ in range(max_steps):
                action = agent.select_action(state)
                actions.append(action)

                step_out = env.step(action)
                length += 1
                # gymnasium.step() → (obs, reward, terminated, truncated, info)
                if len(step_out) == 5:
                    next_obs, reward, term, trunc, info = step_out
                    done = term or trunc
                    # Only a true termination ends the return; a time-limit
                    # truncation must still bootstrap from next_obs.
                    terminal = term
                else:
                    next_obs, reward, done, info = step_out
                    terminal = done
                if isinstance(next_obs, tuple):
                    next_obs = next_obs[0]
                if isinstance(next_obs, dict):
                    next_obs = next_obs.get("observation", next(iter(next_obs.values())))

                # Environment metrics.
                collisions += info.get("collision_count", 0)
                x, y = state[0], state[1]
                dist_center += ((x - cx) ** 2 + (y - cy) ** 2) ** 0.5

                agent.buffer.add((state, action, reward, next_obs, float(terminal)))
                state = next_obs
                ep_reward += reward

                la, lc = agent.update()

                if la is not None:
                    ep_loss_a += la
                    ep_loss_c += lc
                    update_count += 1

                if done:
                    break

            # ==== Post-episode ====
            # Mean losses (guarding against episodes without any update).
            avg_loss_a = ep_loss_a / update_count if update_count else 0.0
            avg_loss_c = ep_loss_c / update_count if update_count else 0.0

            # Action statistics; all zero when no step was taken.
            mean_steer = std_steer = 0.0
            mean_acceleration = std_acceleration = 0.0
            if actions:
                actions_arr = np.array(actions)  # shape (length, action_dim)
                mean_steer = actions_arr[:, 0].mean()
                std_steer = actions_arr[:, 0].std()
                if actions_arr.shape[1] > 1:
                    mean_acceleration = actions_arr[:, 1].mean()
                    std_acceleration = actions_arr[:, 1].std()
            avg_dist_center = dist_center / length if length else 0.0

            # ==== TensorBoard logging ====
            writer.add_scalar("Reward/episode", ep_reward, ep)
            writer.add_scalar("Loss/actor", avg_loss_a, ep)
            writer.add_scalar("Loss/critic", avg_loss_c, ep)
            writer.add_scalar("Episode/length", length, ep)
            writer.add_scalar("Env/collisions", collisions, ep)
            writer.add_scalar("Env/avg_dist_center", avg_dist_center, ep)

            writer.add_scalar("Action/mean_steer", mean_steer, ep)
            writer.add_scalar("Action/std_steer", std_steer, ep)
            writer.add_scalar("Action/mean_acceleration", mean_acceleration, ep)
            writer.add_scalar("Action/std_acceleration", std_acceleration, ep)

            # Console output and periodic checkpoint.
            print(f"[Episode {ep:03d}] Reward: {ep_reward:.2f} | Length: {length}")
            if ep % 50 == 0:
                agent.save(save_dir)

            rewards_history.append(ep_reward)

        # End of training: final checkpoint.
        agent.save(save_dir)
    finally:
        writer.close()
        env.close()   # IMPORTANT: forces the last .mp4 to be written

    return rewards_history

### 7. Évaluation

In [180]:

def evaluate(agent, env, episodes=10, max_steps=200, random_agent=False):
    """Roll out evaluation episodes and return the mean episode return.

    Args:
        agent: Object exposing ``select_action(state, noise=False)``.
        env: Gym/Gymnasium-style environment (4- or 5-tuple ``step``).
        episodes: Number of evaluation episodes.
        max_steps: Maximum number of steps per episode.
        random_agent: When true, actions are sampled from
            ``env.action_space`` instead of queried from ``agent``.

    Returns:
        Average undiscounted return over ``episodes`` episodes.
    """
    cumulative = 0.0
    for ep in range(1, episodes + 1):
        # gymnasium 0.26+: reset → (obs, info)
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        if isinstance(state, dict):
            state = state["observation"]

        ep_reward = 0.0
        for step in range(1, max_steps + 1):
            action = (
                env.action_space.sample()
                if random_agent
                else agent.select_action(state, noise=False)
            )

            transition = env.step(action)
            # gymnasium.step() → (obs, reward, terminated, truncated, info)
            if len(transition) == 5:
                state, reward, terminated, truncated, _ = transition
                finished = terminated or truncated
            else:
                state, reward, finished, _ = transition

            if isinstance(state, tuple):
                state = state[0]
            if isinstance(state, dict):
                state = state["observation"]

            ep_reward += reward

            # Stop early once the episode is over; otherwise run to max_steps.
            if finished:
                break

        print(f"[Eval Episode {ep:02d}] steps: {step}  reward: {ep_reward:.2f}")
        cumulative += ep_reward

    return cumulative / episodes


if __name__ == "__main__":
    # Use the GPU when one is available, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPGAgent(state_dim, action_dim, act_max, device)
    rewards = train(agent, env)
    # Final test: compare the trained policy with actions sampled from the action space.
    # NOTE(review): train() calls env.close() before returning, so both evaluations
    # below reuse an already-closed env — confirm this works or rebuild the env first.
    avg_ddpg = evaluate(agent, env, random_agent=False)
    avg_rand = evaluate(agent, env, random_agent=True)
    print(f"DDPG avg reward: {avg_ddpg:.2f} vs random avg reward: {avg_rand:.2f}")


[Episode 001] Reward: 6.04 | Length: 150
[Episode 002] Reward: 3.18 | Length: 150
[Episode 003] Reward: 26.35 | Length: 150
[Episode 004] Reward: 28.63 | Length: 150
[Episode 005] Reward: 45.78 | Length: 150
[Episode 006] Reward: 55.09 | Length: 150
[Episode 007] Reward: 42.28 | Length: 150
[Episode 008] Reward: 7.72 | Length: 44
[Episode 009] Reward: 50.45 | Length: 150
[Episode 010] Reward: 12.68 | Length: 150
[Episode 011] Reward: 42.98 | Length: 150
[Episode 012] Reward: 30.26 | Length: 150
[Episode 013] Reward: 31.29 | Length: 150
[Episode 014] Reward: 43.84 | Length: 150
[Episode 015] Reward: 35.81 | Length: 150
[Episode 016] Reward: 13.69 | Length: 150
[Episode 017] Reward: 5.27 | Length: 150
[Episode 018] Reward: 1.87 | Length: 150
[Episode 019] Reward: 91.19 | Length: 150
[Episode 020] Reward: 34.60 | Length: 53
[Episode 021] Reward: 99.60 | Length: 150
[Episode 022] Reward: 27.31 | Length: 52
[Episode 023] Reward: 110.78 | Length: 150
[Episode 024] Reward: 109.83 | Length: 15

In [None]:
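### To monitor training with TensorBoard, run this command in your terminal: tensorboard --logdir runs_event_racetrack/ (note: SummaryWriter() called without arguments logs to ./runs/ by default — point --logdir at the directory that actually contains the event files)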
