In [3]:
import gymnasium as gym
import numpy as np
import torch.nn as nn
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy, BaseFeaturesExtractor
from stable_baselines3.common.callbacks import EvalCallback
import pandas as pd
from tensorflow import keras

# ------------------ Custom Environment Wrapper ------------------

class CustomHighwayEnv(gym.Wrapper):
    """Gymnasium wrapper that replaces the wrapped env's reward with a fixed signal.

    The reward produced by the wrapped environment is discarded. Each step
    instead yields ``+1000`` when the step terminated the episode and ``-50``
    otherwise. Observation, termination flags and info are passed through
    unchanged.
    """

    def step(self, action):
        """Advance the wrapped environment by one step and override the reward.

        Args:
            action: Action forwarded verbatim to ``self.env.step``.

        Returns:
            The Gymnasium 5-tuple ``(obs, reward, terminated, truncated, info)``
            where only ``reward`` differs from what the wrapped env returned.
        """
        # Gymnasium's step contract is (obs, reward, terminated, truncated, info);
        # the environment's own reward is intentionally ignored.
        observation, _, terminated, truncated, info = self.env.step(action)

        # Only `terminated` drives the reward; a truncated (time-limit) step
        # still receives the per-step penalty.
        # NOTE(review): presumably `terminated` means a collision in highway-env,
        # which would make this reward favour crashing quickly — confirm intent.
        shaped_reward = +1000 if terminated else -50

        return observation, shaped_reward, terminated, truncated, info

# ------------------ Behavior Cloning ------------------

class BehaviorCloningPolicy:
    """Thin adapter around a saved Keras model that maps vehicles to an action index.

    The model is fed a ``(1, MAX_VEHICLES, FEATURES_PER_VEHICLE)`` array whose
    first row describes the ego vehicle and whose remaining rows describe
    surrounding vehicles (zero-padded when fewer are supplied).
    """

    # Fixed observation layout: number of vehicle rows (ego included) and the
    # number of features produced by `vehicle_to_observation` for each row.
    MAX_VEHICLES = 5
    FEATURES_PER_VEHICLE = 5

    def __init__(self, model_path):
        """Load the Keras model stored at ``model_path``."""
        self.model = keras.models.load_model(model_path)

    def predict(self, ego_vehicle, vehicles):
        """Return the index of the highest-scoring model output for this scene.

        Args:
            ego_vehicle: Object exposing ``position``, ``speed`` and ``heading``.
            vehicles: Iterable of surrounding vehicles with the same attributes.
                Only the first ``MAX_VEHICLES - 1`` (in iteration order) are
                used; any further vehicles are ignored.

        Returns:
            ``np.argmax`` over the first row of ``self.model.predict(...)``.
        """
        rows = [self.vehicle_to_observation(ego_vehicle)]
        rows.extend(self.vehicle_to_observation(vehicle) for vehicle in vehicles)

        # Truncate so that a crowded scene cannot break the fixed-size reshape
        # below (previously more than four other vehicles raised ValueError).
        # NOTE(review): assumes callers pass vehicles ordered by relevance
        # (e.g. nearest first) — confirm against the caller.
        del rows[self.MAX_VEHICLES:]

        # Zero-pad when fewer vehicles than rows are available.
        missing = self.MAX_VEHICLES - len(rows)
        rows.extend([0.0] * self.FEATURES_PER_VEHICLE for _ in range(missing))

        obs = np.array(rows).reshape(1, self.MAX_VEHICLES, self.FEATURES_PER_VEHICLE)
        action_probs = self.model.predict(obs)
        return np.argmax(action_probs[0])

    @staticmethod
    def vehicle_to_observation(vehicle):
        """Convert one vehicle into a 5-element feature list.

        The features are, in order: a constant ``1.0``, ``position[0] / 100``,
        ``position[1] / 5``, ``speed / 30`` and ``heading / (2 * pi)``.
        """
        return [
            1.0,  # constant first feature (presumably a "vehicle present" flag)
            vehicle.position[0] / 100.0,
            vehicle.position[1] / 5.0,
            vehicle.speed / 30.0,
            vehicle.heading / (2 * np.pi)
        ]

# ------------------ Custom Network and Policy ------------------

class D2RLNetwork(BaseFeaturesExtractor):
    """Features extractor made of three Linear+ReLU stages with one additive skip.

    The observation is flattened, projected to ``features_dim`` by ``hidden``,
    passed through ``d2rl1``, and the sum of the projection and the ``d2rl1``
    output is fed to ``d2rl2``, whose output is the extracted feature vector.
    """

    def __init__(self, observation_space, features_dim: int = 256):
        """Build the layers.

        Args:
            observation_space: Space whose ``shape`` determines the flattened
                input size.
            features_dim: Width of every stage and of the returned features.
        """
        super().__init__(observation_space, features_dim)

        # Number of scalar entries in a single (unbatched) observation.
        flat_obs_size = np.prod(observation_space.shape)

        self.flatten = nn.Flatten()
        # Input projection: flattened observation -> features_dim.
        self.hidden = nn.Sequential(nn.Linear(flat_obs_size, features_dim), nn.ReLU())
        # Two same-width stages; the second consumes the skip-connected sum.
        self.d2rl1 = nn.Sequential(nn.Linear(features_dim, features_dim), nn.ReLU())
        self.d2rl2 = nn.Sequential(nn.Linear(features_dim, features_dim), nn.ReLU())

    def forward(self, observations):
        """Return the ``features_dim``-wide features for a batch of observations."""
        projected = self.hidden(self.flatten(observations))
        refined = self.d2rl1(projected)
        # Additive skip connection around the first same-width stage.
        return self.d2rl2(projected + refined)

class D2RLPolicy(ActorCriticPolicy):
    """Actor-critic policy that uses ``D2RLNetwork`` as its default features extractor."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``ActorCriticPolicy`` with D2RL defaults.

        ``features_extractor_class`` and ``features_extractor_kwargs`` default to
        ``D2RLNetwork`` and ``dict(features_dim=256)``. They are only filled in
        when the caller did not supply them: passing them unconditionally next
        to ``**kwargs`` raises ``TypeError`` ("multiple values for keyword
        argument") as soon as a caller provides either one, e.g. through
        ``policy_kwargs``.
        """
        # NOTE(review): Stable-Baselines3 appears to replay these keys from the
        # saved constructor parameters when a policy is loaded on its own —
        # confirm; setdefault keeps that path working either way.
        kwargs.setdefault("features_extractor_class", D2RLNetwork)
        kwargs.setdefault("features_extractor_kwargs", dict(features_dim=256))
        super().__init__(*args, **kwargs)


In [4]:

# ------------------ Main Execution ------------------

if __name__ == "__main__":
    # ---- Configuration -------------------------------------------------
    ENV_ID = "highway-fast-v0"
    BEST_MODEL_DIR = "./logs/best_model"  # EvalCallback writes best_model.zip here
    N_TEST_EPISODES = 10000

    # Separate environments for training and evaluation: EvalCallback resets
    # and steps its env, which would corrupt an in-progress training rollout
    # if the same instance were shared.
    env = CustomHighwayEnv(gym.make(ENV_ID, render_mode="rgb_array"))
    eval_env = CustomHighwayEnv(gym.make(ENV_ID, render_mode="rgb_array"))

    # Load the pretrained behavior cloning model.
    # NOTE(review): bc_policy is loaded but not used anywhere below, and the
    # path is machine-specific — confirm whether it is still needed.
    model_path = "C:\\Users\\Ram\\highway\\perturbed_models\\perturbed_model_1.keras"
    bc_policy = BehaviorCloningPolicy(model_path)

    # Periodic evaluation during training; the best-scoring model is saved to
    # BEST_MODEL_DIR and evaluation results are logged to ./logs/results.
    eval_callback = EvalCallback(eval_env, best_model_save_path=BEST_MODEL_DIR,
                                 log_path='./logs/results', eval_freq=100, n_eval_episodes=5)

    # Train PPO with the custom D2RL policy
    model = PPO(D2RLPolicy, env, verbose=1, tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=1000, callback=eval_callback)

    # Reload the best checkpoint from the same directory the callback saved to
    # (previously a hard-coded absolute path that only matched on one machine).
    model = PPO.load(f"{BEST_MODEL_DIR}/best_model.zip", env=env)

    # Roll out the loaded model and record the total reward of each episode.
    episode_rewards = []
    for _ in range(N_TEST_EPISODES):
        # Gymnasium's reset returns (observation, info).
        obs, _info = env.reset()
        terminated = truncated = False
        episode_reward = 0
        # An episode ends on either termination or truncation; ignoring
        # `truncated` would keep stepping an episode that already hit its limit.
        while not (terminated or truncated):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _info = env.step(action)
            episode_reward += reward
            # No env.render() here: in "rgb_array" mode it only returns a frame,
            # which was being discarded on every step.
        episode_rewards.append(episode_reward)

    # Save episode rewards to CSV (column names kept exactly as before,
    # including the trailing space in "Episode ", for downstream readers).
    pd.DataFrame({
        "Episode ": list(range(1, len(episode_rewards) + 1)),
        "Reward": episode_rewards,
    }).to_csv("d2rl_PPO.csv", index=False)

    # Release simulator resources.
    env.close()
    eval_env.close()

    print("Training and testing complete!")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./tensorboard/PPO_1


  logger.warn(
  logger.warn(


Eval num_timesteps=100, episode_reward=630.00 +/- 150.33
Episode length: 8.40 +/- 3.01
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.4      |
|    mean_reward     | 630      |
| time/              |          |
|    total_timesteps | 100      |
---------------------------------
New best mean reward!




Eval num_timesteps=200, episode_reward=800.00 +/- 154.92
Episode length: 5.00 +/- 3.10
---------------------------------
| eval/              |          |
|    mean_ep_length  | 5        |
|    mean_reward     | 800      |
| time/              |          |
|    total_timesteps | 200      |
---------------------------------
New best mean reward!




Eval num_timesteps=300, episode_reward=810.00 +/- 66.33
Episode length: 4.80 +/- 1.33
---------------------------------
| eval/              |          |
|    mean_ep_length  | 4.8      |
|    mean_reward     | 810      |
| time/              |          |
|    total_timesteps | 300      |
---------------------------------
New best mean reward!




Eval num_timesteps=400, episode_reward=710.00 +/- 146.29
Episode length: 6.80 +/- 2.93
---------------------------------
| eval/              |          |
|    mean_ep_length  | 6.8      |
|    mean_reward     | 710      |
| time/              |          |
|    total_timesteps | 400      |
---------------------------------




Eval num_timesteps=500, episode_reward=820.00 +/- 97.98
Episode length: 4.60 +/- 1.96
---------------------------------
| eval/              |          |
|    mean_ep_length  | 4.6      |
|    mean_reward     | 820      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------
New best mean reward!




Eval num_timesteps=600, episode_reward=680.00 +/- 107.70
Episode length: 7.40 +/- 2.15
---------------------------------
| eval/              |          |
|    mean_ep_length  | 7.4      |
|    mean_reward     | 680      |
| time/              |          |
|    total_timesteps | 600      |
---------------------------------




Eval num_timesteps=700, episode_reward=640.00 +/- 247.79
Episode length: 8.20 +/- 4.96
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.2      |
|    mean_reward     | 640      |
| time/              |          |
|    total_timesteps | 700      |
---------------------------------




Eval num_timesteps=800, episode_reward=650.00 +/- 170.29
Episode length: 8.00 +/- 3.41
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8        |
|    mean_reward     | 650      |
| time/              |          |
|    total_timesteps | 800      |
---------------------------------




Eval num_timesteps=900, episode_reward=670.00 +/- 188.68
Episode length: 7.60 +/- 3.77
---------------------------------
| eval/              |          |
|    mean_ep_length  | 7.6      |
|    mean_reward     | 670      |
| time/              |          |
|    total_timesteps | 900      |
---------------------------------




Eval num_timesteps=1000, episode_reward=770.00 +/- 87.18
Episode length: 5.60 +/- 1.74
---------------------------------
| eval/              |          |
|    mean_ep_length  | 5.6      |
|    mean_reward     | 770      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------




Eval num_timesteps=1100, episode_reward=630.00 +/- 116.62
Episode length: 8.40 +/- 2.33
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.4      |
|    mean_reward     | 630      |
| time/              |          |
|    total_timesteps | 1100     |
---------------------------------




Eval num_timesteps=1200, episode_reward=610.00 +/- 159.37
Episode length: 8.80 +/- 3.19
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.8      |
|    mean_reward     | 610      |
| time/              |          |
|    total_timesteps | 1200     |
---------------------------------




Eval num_timesteps=1300, episode_reward=760.00 +/- 120.00
Episode length: 5.80 +/- 2.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 5.8      |
|    mean_reward     | 760      |
| time/              |          |
|    total_timesteps | 1300     |
---------------------------------




Eval num_timesteps=1400, episode_reward=330.00 +/- 143.53
Episode length: 14.40 +/- 2.87
---------------------------------
| eval/              |          |
|    mean_ep_length  | 14.4     |
|    mean_reward     | 330      |
| time/              |          |
|    total_timesteps | 1400     |
---------------------------------




Eval num_timesteps=1500, episode_reward=620.00 +/- 166.13
Episode length: 8.60 +/- 3.32
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.6      |
|    mean_reward     | 620      |
| time/              |          |
|    total_timesteps | 1500     |
---------------------------------




Eval num_timesteps=1600, episode_reward=800.00 +/- 63.25
Episode length: 5.00 +/- 1.26
---------------------------------
| eval/              |          |
|    mean_ep_length  | 5        |
|    mean_reward     | 800      |
| time/              |          |
|    total_timesteps | 1600     |
---------------------------------




Eval num_timesteps=1700, episode_reward=660.00 +/- 295.63
Episode length: 7.80 +/- 5.91
---------------------------------
| eval/              |          |
|    mean_ep_length  | 7.8      |
|    mean_reward     | 660      |
| time/              |          |
|    total_timesteps | 1700     |
---------------------------------




Eval num_timesteps=1800, episode_reward=730.00 +/- 60.00
Episode length: 6.40 +/- 1.20
---------------------------------
| eval/              |          |
|    mean_ep_length  | 6.4      |
|    mean_reward     | 730      |
| time/              |          |
|    total_timesteps | 1800     |
---------------------------------




Eval num_timesteps=1900, episode_reward=400.00 +/- 320.94
Episode length: 13.00 +/- 6.42
---------------------------------
| eval/              |          |
|    mean_ep_length  | 13       |
|    mean_reward     | 400      |
| time/              |          |
|    total_timesteps | 1900     |
---------------------------------




Eval num_timesteps=2000, episode_reward=640.00 +/- 159.37
Episode length: 8.20 +/- 3.19
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.2      |
|    mean_reward     | 640      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 11.2     |
|    ep_rew_mean     | 456      |
| time/              |          |
|    fps             | 41       |
|    iterations      | 1        |
|    time_elapsed    | 49       |
|    total_timesteps | 2048     |
---------------------------------
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Training and testing complete!


: 