In [31]:
import gymnasium as gym
import numpy as np
import torch.nn as nn
from stable_baselines3 import A2C
from stable_baselines3.common.policies import ActorCriticPolicy, BaseFeaturesExtractor
from stable_baselines3.common.callbacks import EvalCallback
import pandas as pd
from tensorflow import keras

In [32]:

# ------------------ Custom Environment Wrapper ------------------

class CustomHighwayEnv(gym.Wrapper):
    """Wrapper that discards the wrapped env's reward and substitutes its own.

    Every step yields ``step_reward`` unless the wrapped environment reports
    ``terminated``, in which case the step yields ``terminal_reward``.
    Truncation does not trigger the terminal reward.

    Args:
        env: The environment to wrap.
        terminal_reward: Reward emitted on the step where the wrapped env
            reports ``terminated``. Defaults to 1000.
        step_reward: Reward emitted on every other step. Defaults to -50.

    NOTE(review): the evaluation logs recorded with this wrapper show shorter
    episodes earning higher returns. If the wrapped env sets ``terminated`` on
    a collision, this shaping rewards crashing -- confirm that is intended.
    """

    def __init__(self, env, terminal_reward=1000, step_reward=-50):
        super().__init__(env)
        self.terminal_reward = terminal_reward
        self.step_reward = step_reward

    def step(self, action):
        """Step the wrapped env and return its 5-tuple with the reward replaced.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` in the same order
            the wrapped env produced them; only ``reward`` is changed.
        """
        # The wrapped env's own reward is intentionally ignored.
        obs, _env_reward, terminated, truncated, info = self.env.step(action)
        reward = self.terminal_reward if terminated else self.step_reward
        return obs, reward, terminated, truncated, info

# ------------------ Behavior Cloning ------------------

class BehaviorCloningPolicy:
    """Action selector backed by a pretrained Keras model.

    The model is fed a ``(1, MAX_VEHICLES, N_FEATURES)`` array whose first row
    describes the ego vehicle and whose remaining rows describe other
    vehicles, zero-padded when fewer are supplied.
    """

    # Dimensions of the model input built by ``predict``.
    MAX_VEHICLES = 5
    N_FEATURES = 5

    def __init__(self, model_path):
        """Load the Keras model stored at ``model_path``."""
        self.model = keras.models.load_model(model_path)

    def predict(self, ego_vehicle, vehicles):
        """Return the index of the highest-scoring action for the given scene.

        Args:
            ego_vehicle: Object exposing ``position``, ``speed`` and ``heading``.
            vehicles: Iterable of other vehicles with the same attributes.
                Only the first ``MAX_VEHICLES - 1`` are used, in the order
                given (presumably the caller orders them by relevance --
                verify against the caller).

        Returns:
            ``np.argmax`` over the first row of the model's output.
        """
        rows = [self.vehicle_to_observation(ego_vehicle)]
        # Cap the number of rows: without this, more than MAX_VEHICLES - 1
        # surrounding vehicles would make the reshape below raise ValueError.
        for vehicle in list(vehicles)[: self.MAX_VEHICLES - 1]:
            rows.append(self.vehicle_to_observation(vehicle))
        # Zero-pad absent vehicles so the input shape is always fixed.
        while len(rows) < self.MAX_VEHICLES:
            rows.append([0] * self.N_FEATURES)
        obs = np.array(rows).reshape(1, self.MAX_VEHICLES, self.N_FEATURES)
        action_probs = self.model.predict(obs)
        return np.argmax(action_probs[0])

    @staticmethod
    def vehicle_to_observation(vehicle):
        """Convert a vehicle into a 5-element feature row.

        The row is ``[1.0, x / 100, y / 5, speed / 30, heading / (2 * pi)]``.
        The leading 1.0 is constant for every real vehicle (padding rows use
        0), so it presumably acts as a presence flag -- TODO confirm against
        the data the model was trained on.
        """
        return [
            1.0,
            vehicle.position[0] / 100.0,
            vehicle.position[1] / 5.0,
            vehicle.speed / 30.0,
            vehicle.heading / (2 * np.pi)
        ]

# ------------------ Custom Network and Policy ------------------

class D2RLNetwork(BaseFeaturesExtractor):
    """Features extractor made of three Linear+ReLU stages with one skip sum.

    The flattened observation goes through ``hidden``; that activation goes
    through ``d2rl1``; the element-wise sum of those two activations goes
    through ``d2rl2``, whose output is the feature vector.

    Args:
        observation_space: Space whose ``shape`` determines the input width.
        features_dim: Width of every stage and of the returned features.
    """

    def __init__(self, observation_space, features_dim: int = 256):
        super().__init__(observation_space, features_dim)

        def dense_relu(n_inputs):
            # One Linear layer of width ``features_dim`` followed by ReLU.
            return nn.Sequential(nn.Linear(n_inputs, features_dim), nn.ReLU())

        # Submodules are created in a fixed order: flatten, hidden, d2rl1, d2rl2.
        self.flatten = nn.Flatten()
        self.hidden = dense_relu(np.prod(observation_space.shape))
        self.d2rl1 = dense_relu(features_dim)
        self.d2rl2 = dense_relu(features_dim)

    def forward(self, observations):
        """Map a batch of observations to a batch of feature vectors."""
        base = self.hidden(self.flatten(observations))
        skip_sum = base + self.d2rl1(base)
        return self.d2rl2(skip_sum)

class D2RLPolicy(ActorCriticPolicy):
    """Actor-critic policy that uses ``D2RLNetwork`` as its features extractor.

    ``features_extractor_class`` and ``features_extractor_kwargs`` default to
    ``D2RLNetwork`` with ``features_dim=256`` and may be overridden by the
    caller.
    """

    def __init__(self, *args, **kwargs):
        # Use setdefault rather than passing the keywords alongside **kwargs:
        # if the caller already supplies either key (for instance when the
        # policy is rebuilt from stored constructor arguments), a duplicate
        # keyword would raise TypeError.
        kwargs.setdefault("features_extractor_class", D2RLNetwork)
        kwargs.setdefault("features_extractor_kwargs", dict(features_dim=256))
        super().__init__(*args, **kwargs)


In [33]:

if __name__ == "__main__":
    # ---- Run configuration: everything a reader might want to tune ----
    ENV_ID = "highway-fast-v0"
    BEST_MODEL_DIR = "./logs_a2c/best_model"
    TOTAL_TIMESTEPS = 1000
    EVAL_FREQ = 100
    N_EVAL_EPISODES = 5
    N_TEST_EPISODES = 10000
    RENDER_DURING_TEST = True

    # Create environments.
    # NOTE(review): base_env is not used below; it is kept only in case other
    # cells of the notebook reference it.
    base_env = gym.make(ENV_ID)
    env = CustomHighwayEnv(gym.make(ENV_ID, render_mode="rgb_array"))
    # Evaluation gets its own environment instance. Sharing the training env
    # with EvalCallback would reset it in the middle of a training rollout.
    eval_env = CustomHighwayEnv(gym.make(ENV_ID))

    # Load the pretrained behavior cloning model
    # NOTE(review): machine-specific absolute path, and bc_policy is not used
    # in this cell -- confirm whether it is still needed.
    model_path = "C:\\Users\\Ram\\highway\\perturbed_models\\perturbed_model_1.keras"
    bc_policy = BehaviorCloningPolicy(model_path)

    # Callbacks for evaluation during training and tensorboard
    eval_callback = EvalCallback(eval_env, best_model_save_path=BEST_MODEL_DIR,
                                 log_path='./logs_a2c/results', eval_freq=EVAL_FREQ,
                                 n_eval_episodes=N_EVAL_EPISODES)

    # Train A2C with the custom D2RL policy
    model = A2C(D2RLPolicy, env, verbose=1, tensorboard_log="./tensorboard_a2c/")
    model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=eval_callback)

    # Reload the best checkpoint from the same directory the callback wrote
    # to, so the tested model is the one produced by this run.
    model = A2C.load(f"{BEST_MODEL_DIR}/best_model.zip", env=env)

    # Test the trained model and collect per-episode returns
    episode_data = {}
    for ep in range(N_TEST_EPISODES):
        obs, _info = env.reset()
        terminated = truncated = False
        episode_reward = 0
        # An episode ends on termination OR truncation; checking only the
        # former would keep stepping an environment that has hit its limit.
        while not (terminated or truncated):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _info = env.step(action)
            episode_reward += reward
            if RENDER_DURING_TEST:
                env.render()
        episode_data[ep + 1] = episode_reward

    # Save episode rewards to CSV
    # The "Episode " header (with trailing space) is kept as-is so existing
    # readers of this CSV keep working.
    pd.DataFrame({"Episode ": list(episode_data.keys()), "Reward": list(episode_data.values())}).to_csv("d2rl_A2C.csv", index=False)
    print("Training and testing complete!")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./tensorboard_a2c/A2C_1


  logger.warn(
  logger.warn(


Eval num_timesteps=100, episode_reward=630.00 +/- 215.87
Episode length: 8.40 +/- 4.32
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 8.4       |
|    mean_reward        | 630       |
| time/                 |           |
|    total_timesteps    | 100       |
| train/                |           |
|    entropy_loss       | -1.54     |
|    explained_variance | -6.06e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 19        |
|    policy_loss        | 556       |
|    value_loss         | 3.89e+05  |
-------------------------------------
New best mean reward!




Eval num_timesteps=200, episode_reward=620.00 +/- 222.71
Episode length: 8.60 +/- 4.45
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 8.6      |
|    mean_reward        | 620      |
| time/                 |          |
|    total_timesteps    | 200      |
| train/                |          |
|    entropy_loss       | -1.56    |
|    explained_variance | 2.58e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 39       |
|    policy_loss        | 1.31e+03 |
|    value_loss         | 7.44e+05 |
------------------------------------




Eval num_timesteps=300, episode_reward=830.00 +/- 102.96
Episode length: 4.40 +/- 2.06
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 4.4      |
|    mean_reward        | 830      |
| time/                 |          |
|    total_timesteps    | 300      |
| train/                |          |
|    entropy_loss       | -1.53    |
|    explained_variance | 2.2e-05  |
|    learning_rate      | 0.0007   |
|    n_updates          | 59       |
|    policy_loss        | 808      |
|    value_loss         | 5.42e+05 |
------------------------------------
New best mean reward!




Eval num_timesteps=400, episode_reward=690.00 +/- 222.26
Episode length: 7.20 +/- 4.45
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 7.2      |
|    mean_reward        | 690      |
| time/                 |          |
|    total_timesteps    | 400      |
| train/                |          |
|    entropy_loss       | -1.55    |
|    explained_variance | 6.14e-06 |
|    learning_rate      | 0.0007   |
|    n_updates          | 79       |
|    policy_loss        | 1.1e+03  |
|    value_loss         | 6.76e+05 |
------------------------------------




Eval num_timesteps=500, episode_reward=240.00 +/- 308.87
Episode length: 16.20 +/- 6.18
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 16.2      |
|    mean_reward        | 240       |
| time/                 |           |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.53     |
|    explained_variance | -2.03e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | 1e+03     |
|    value_loss         | 6.76e+05  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 9.04     |
|    ep_rew_mean     | 598      |
| time/              |          |
|    fps             | 37       |
|    iterations      | 100      |
|    time_elapsed    | 13       |
|    total_timesteps | 500      |
---------------------------------




Eval num_timesteps=600, episode_reward=280.00 +/- 455.63
Episode length: 15.40 +/- 9.11
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 15.4     |
|    mean_reward        | 280      |
| time/                 |          |
|    total_timesteps    | 600      |
| train/                |          |
|    entropy_loss       | -1.47    |
|    explained_variance | 9.78e-06 |
|    learning_rate      | 0.0007   |
|    n_updates          | 119      |
|    policy_loss        | 77       |
|    value_loss         | 2.17e+05 |
------------------------------------




Eval num_timesteps=700, episode_reward=140.00 +/- 356.93
Episode length: 18.20 +/- 7.14
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 18.2     |
|    mean_reward        | 140      |
| time/                 |          |
|    total_timesteps    | 700      |
| train/                |          |
|    entropy_loss       | -1.43    |
|    explained_variance | 3.74e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 139      |
|    policy_loss        | -189     |
|    value_loss         | 2.66e+04 |
------------------------------------




Eval num_timesteps=800, episode_reward=420.00 +/- 338.53
Episode length: 12.60 +/- 6.77
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 12.6      |
|    mean_reward        | 420       |
| time/                 |           |
|    total_timesteps    | 800       |
| train/                |           |
|    entropy_loss       | -1.41     |
|    explained_variance | -4.29e-06 |
|    learning_rate      | 0.0007    |
|    n_updates          | 159       |
|    policy_loss        | 4.5       |
|    value_loss         | 2.17e+05  |
-------------------------------------




Eval num_timesteps=900, episode_reward=660.00 +/- 124.10
Episode length: 7.80 +/- 2.48
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 7.8       |
|    mean_reward        | 660       |
| time/                 |           |
|    total_timesteps    | 900       |
| train/                |           |
|    entropy_loss       | -1.34     |
|    explained_variance | -1.07e-06 |
|    learning_rate      | 0.0007    |
|    n_updates          | 179       |
|    policy_loss        | 679       |
|    value_loss         | 5.38e+05  |
-------------------------------------




Eval num_timesteps=1000, episode_reward=620.00 +/- 102.96
Episode length: 8.60 +/- 2.06
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 8.6      |
|    mean_reward        | 620      |
| time/                 |          |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.23    |
|    explained_variance | 6.32e-06 |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 739      |
|    value_loss         | 5.32e+05 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 7.2      |
|    ep_rew_mean     | 690      |
| time/              |          |
|    fps             | 31       |
|    iterations      | 200      |
|    time_elapsed    | 31       |
|    total_timesteps | 1000     |
---------------------------------
Wrapping the env with a `Monitor` wrapper
Wrapping the

: 