In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install gymnasium
!pip install pygame



In [2]:
!pip install swig
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m31.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparin

In [3]:
import gymnasium as gym

gym.pprint_registry()

===== classic_control =====
Acrobot-v1             CartPole-v0            CartPole-v1
MountainCar-v0         MountainCarContinuous-v0 Pendulum-v1
===== phys2d =====
phys2d/CartPole-v0     phys2d/CartPole-v1     phys2d/Pendulum-v0
===== box2d =====
BipedalWalker-v3       BipedalWalkerHardcore-v3 CarRacing-v3
LunarLander-v3         LunarLanderContinuous-v3
===== toy_text =====
Blackjack-v1           CliffWalking-v0        FrozenLake-v1
FrozenLake8x8-v1       Taxi-v3
===== tabular =====
tabular/Blackjack-v0   tabular/CliffWalking-v0
===== mujoco =====
Ant-v2                 Ant-v3                 Ant-v4
Ant-v5                 HalfCheetah-v2         HalfCheetah-v3
HalfCheetah-v4         HalfCheetah-v5         Hopper-v2
Hopper-v3              Hopper-v4              Hopper-v5
Humanoid-v2            Humanoid-v3            Humanoid-v4
Humanoid-v5            HumanoidStandup-v2     HumanoidStandup-v4
HumanoidStandup-v5     InvertedDoublePendulum-v2 InvertedDoublePendulum-v4
InvertedDoublePendulu

In [45]:
import torch.nn as nn
import torch.optim as optim
import torch

class GaussianActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dims=(64, 64)):
        super().__init__()
        # shared layers
        self.share = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU()
        )

        # actor layers. actor_mean estimates the mean of the Gaussian distribution, and actor_log_std estimates the log_std
        self.actor_mean = nn.Linear(hidden_dims[1], output_dim)

        # log_std is safe from having negative values, therefore more stable than estimating std. Since action is bounded from -1 to 1, initialize it as e^-2
        self.actor_log_std = nn.Parameter(torch.ones(output_dim) * -2)

        # critic layer estimates value function
        self.critic = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.share(x)

        # actor's mean layer estimates mean.
        mean = self.actor_mean(x)

        # actor's log_std layer estimates log_std. Then, convert it to std
        log_std = self.actor_log_std
        std = torch.exp(log_std)

        # critic layer estimates value
        value = self.critic(x)

        return mean, std, value


In [75]:
from tqdm import tqdm
import numpy as np

class A2CAgent:
    def __init__(self, env_id, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5, num_envs = 8, vectorization_mode = "sync"):
        # using vectorized environments to boost training
        # sync is more stable, async is faster
        self.env = gym.make_vec(env_id, num_envs = num_envs, vectorization_mode=vectorization_mode)
        self.num_envs = num_envs
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # DEBUG NOTE must use single_action_space, not action_space since action_space.shape has dimension (num_env, num_actions)
        # self.policy_net = GaussianActorCritic(self.env.single_observation_space.shape[0], self.env.action_space.shape[0]).to(self.device)
        self.policy_net = GaussianActorCritic(self.env.single_observation_space.shape[0], self.env.single_action_space.shape[0]).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        mean, std, _ = self.policy_net(state)
        action_dist = torch.distributions.Normal(mean, std)
        action = action_dist.sample()

        # bipedal walker's action space is bounded to [-1, 1]. Apply tanh function to keep the action in range
        action = torch.tanh(action)
        return action

    # computing the gamma decaying rewards in Monte carlo
    def compute_return(self, rewards):
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return torch.stack(returns)

    # computing the n step rewards
    def compute_n_step_returns(self, rewards, next_value):

        # Bootstraps the future reward using value estimate
        R = next_value
        returns = []
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return torch.stack(returns)


    # mostly equal to A2C_cartpole
    def train(self):
        episode_rewards = []
        episode_steps = []
        step_sum = 0
        random_seed = 1
        torch.manual_seed(random_seed)

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            done = np.zeros(self.num_envs, dtype=bool)
            episode_reward = np.zeros(self.num_envs)
            values, rewards, log_probs = [], [], []
            done_mask = np.zeros(self.num_envs, dtype=bool)
            done_steps = np.zeros(self.num_envs)
            steps = 0

            while not np.all(done_mask) and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)

                # differs from A2C_cartpole when sampling action
                mean, std, value = self.policy_net(state_tensor)
                action_dist = torch.distributions.Normal(mean, std)
                action = action_dist.sample()

                # not exactly the log probability, but log(probability density) since it is a continuous space
                # sum it in the dimension of num_actions (multiplying probability)
                log_prob = action_dist.log_prob(action).sum(dim=-1)

                # need to move the tensor to the cpu to convert it to numpy
                next_state, reward, terminated, truncated, _ = self.env.step(action.cpu().numpy())
                done = np.logical_or(terminated, truncated)
                done_steps = np.where(np.logical_and(done, ~done_mask), steps, done_steps)
                done_mask = np.logical_or(done_mask, done)
                reward = np.where(done_mask, 0.0, reward)

                # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                values.append(value.squeeze())
                rewards.append(torch.tensor(reward, dtype=torch.float32).to(self.device))
                log_probs.append(log_prob)

                episode_reward += reward
                state = next_state

                # every n steps, calculate losses, update the actor & critic, then refresh the saved lists
                # if (steps % self.num_steps == 0) or np.any(done):
                if np.any(done):
                    with torch.no_grad():
                        next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                        _, _, next_value = self.policy_net(next_state_tensor)
                        done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                        next_value = next_value.squeeze() * (1 - done_tensor)

                    # returns = self.compute_n_step_returns(rewards, next_value)  # shape: (n_steps, num_envs)
                    returns = self.compute_return(rewards)
                    returns = returns.transpose(0, 1)  # shape: (num_envs, n_steps)
                    values = torch.stack(values).transpose(0, 1)  # shape: (num_envs, n_steps)
                    log_probs = torch.stack(log_probs).transpose(0, 1)  # shape: (num_envs, n_steps)
                    advantages = returns - values
                    # matching the dimensions fo log_probs

                    # print(steps, "values", values.shape, "returns", returns.shape, "log probs", log_probs.shape)

                    # calculate actor_loss by multiplying log probabilities to advantages. This will decrease the action probability of negative advantages, and vice-versa
                    actor_loss = - (log_probs * advantages.detach()).mean()

                    # updates the critic to find better estimate of values that matches the n-step reward
                    critic_loss = self.loss(returns, values)

                    print(steps, "\nLOSS", actor_loss, critic_loss, "R/V", returns, values)

                    loss = actor_loss + 0.4 * critic_loss
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    values = []
                    rewards = []
                    log_probs = []

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

            if episode % 20 == 0:
               print('Episode {}\tlengths: {}\treward: {}]\tfull length: {}'.format(episode, done_steps, episode_reward, steps))

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

        self.env.close()
        return np.array(episode_rewards)


In [76]:
import gymnasium as gym

env_id = 'LunarLanderContinuous-v3'
gamma = 0.99
num_episodes = 1
max_steps = 1000
num_steps = 8
lr = 7e-3

a2c_model =  A2CAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = num_steps)

rewards = a2c_model.train()

  0%|          | 0/1 [00:00<?, ?it/s]

71 
LOSS tensor(24.7913, device='cuda:0', grad_fn=<NegBackward0>) tensor(894.9126, device='cuda:0', grad_fn=<MseLossBackward0>) R/V tensor([[-7.3137e+00, -7.2056e+00, -9.5065e+00, -1.1881e+01, -1.1437e+01,
         -1.3863e+01, -1.3170e+01, -1.3660e+01, -1.3650e+01, -1.3412e+01,
         -1.5899e+01, -1.8395e+01, -2.0777e+01, -2.0804e+01, -2.2035e+01,
         -2.3475e+01, -2.2657e+01, -2.1051e+01, -1.8917e+01, -1.8464e+01,
         -1.6236e+01, -1.3996e+01, -1.5230e+01, -1.3091e+01, -1.2309e+01,
         -1.0105e+01, -1.0944e+01, -8.8366e+00, -9.1341e+00, -8.6338e+00,
         -6.6275e+00, -7.1739e+00, -6.1091e+00, -6.8650e+00, -4.8969e+00,
         -2.9626e+00, -3.7250e+00, -1.8606e+00, -3.2789e-02,  2.7975e-01,
         -7.4783e-01,  9.5235e-01, -1.1153e+00, -2.6298e+00, -3.6539e+00,
         -3.1986e+00, -1.6547e+00, -3.3880e-02,  1.7801e+00,  1.7302e+00,
          1.8226e+00,  3.5953e+00,  3.1707e+00,  3.3494e+00,  3.1456e+00,
          1.5281e+00,  3.2459e+00,  2.6427e+00,  4.316

100%|██████████| 1/1 [00:00<00:00,  2.35it/s]

119 
LOSS tensor(7.8479, device='cuda:0', grad_fn=<NegBackward0>) tensor(271.8023, device='cuda:0', grad_fn=<MseLossBackward0>) R/V tensor([[ 13.5675,  12.9365,  10.6420,  11.5526,  10.9891,   8.2531,   5.3555,
           6.2450,   7.0896,   6.0635,   6.8373,   6.1888,   5.6782,   3.8648,
           1.2203,   0.7525,  -0.9287],
        [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [-56.5082, -55.5832, -54.2372, -52.0556, -50.9956, -48.1263, -43.825




In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


In [None]:
import matplotlib.pyplot as plt

env = gym.make('LunarLanderContinuous-v3')
test_episodes = 10
test_rewards = []

for _ in range(test_episodes):
    state, _ = env.reset()
    done = False
    ep_reward = 0

    while not done:
        with torch.no_grad():
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(a2c_model.device)
            mean, std, _ = a2c_model.policy_net(state_tensor)
            action = mean

        next_state, reward, terminated, truncated, _ = env.step(action.cpu().numpy().squeeze(0))
        done = terminated or truncated
        ep_reward += reward
        state = next_state

    test_rewards.append(ep_reward)


print(f"Average reward over {test_episodes} test episodes: {np.mean(test_rewards):.2f}")

plt.plot(test_rewards)
plt.title("A2C Test Episode Rewards")
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.grid(True)
plt.show()


In [None]:
import gymnasium as gym
import numpy as np


env = gym.make('LunarLanderContinuous-v3')
test_episodes = 10
random_rewards = []

for ep in range(test_episodes):
    state, _ = env.reset()
    done = False
    ep_reward = 0

    while not done:
        action = env.action_space.sample()  # random continuous action
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        ep_reward += reward

    random_rewards.append(ep_reward)
    print(f"Episode {ep+1} reward: {ep_reward:.2f}")

env.close()

avg_reward = np.mean(random_rewards)
print(f"\n✅ Average reward over {test_episodes} random episodes: {avg_reward:.2f}")


In [None]:
# from tqdm import tqdm
# import torch.profiler

# n_envs = 1
# envs = [gym.make('LunarLanderContinuous-v3') for _ in range(n_envs)]

# n_states = envs[0].observation_space.shape[0]
# n_actions = envs[0].action_space.shape[0]

# actor_network = GaussianActor(n_states, n_actions)
# critic_network = Critic(n_states)

# state_normalizer = Normalizer(shape=(envs[0].observation_space.shape[0],))
# optimizer = optim.Adam(list(actor_network.parameters()) + list(critic_network.parameters()), lr=initial_lr)
# mse_loss = nn.MSELoss()

# max_step = 1600
# states, actions, log_probs, rewards, values, dones, entropies = [], [], [], [], [], [], []


# for ep in tqdm(range(episodes)):
#     states, rewards, dones, values, actions = [], [], [], [], []
#     ep_reward = 0
#     latest_debug_info = {}

#     for env in envs:
#         state, _ = env.reset()
#         done = False
#         steps = 0

#         while not done:
#             state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
#             mean, std = actor_network(state_tensor)
#             dist = torch.distributions.Normal(mean, std)
#             action = dist.sample()
#             log_prob = dist.log_prob(action).sum(dim=-1)
#             entropy = dist.entropy().sum(dim=-1)
#             value = critic_network(state_tensor)
#             steps += 1

#             next_state, reward, terminated, truncated, _ = env.step(action.detach().numpy().squeeze(0))
#             done = terminated or truncated or steps > max_step

#             states.append(state_tensor)
#             actions.append(action)
#             log_probs.append(log_prob)
#             values.append(value)
#             rewards.append(reward)
#             dones.append(done)
#             entropies.append(entropy)
#             ep_reward += reward

#             if len(rewards) >= num_steps or done:

#                 next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
#                 if dones[-1]:
#                     bootstrap_value = 0
#                 else:
#                     bootstrap_value = critic_network(next_state_tensor).item()

#                 returns = []
#                 R = bootstrap_value
#                 for i in reversed(range(len(rewards))):
#                     R = rewards[-1 - i] + gamma * R
#                     returns.insert(0, R)
#                 returns = torch.tensor(returns, dtype=torch.float32)

#                 values_tensor = torch.stack(values[-num_steps:]).squeeze(-1)
#                 log_probs_tensor = torch.stack(log_probs[-num_steps:]).squeeze(-1)
#                 entropies_tensor = torch.stack(entropies[-num_steps:]).squeeze(-1)
#                 returns_tensor = returns.clone().detach()
#                 advantages = returns - values_tensor
#                 clipped_advantages = torch.clamp(advantages, min=-10.0, max=10.0)

#                 policy_loss = -(log_probs_tensor * clipped_advantages.detach()).mean()
#                 value_loss = mse_loss(values_tensor, returns_tensor)
#                 entropy_loss = -entropies_tensor.mean()

#                 # total_loss = policy_loss + 0.5 * value_loss + 0.001 * entropy_loss
#                 total_loss = policy_loss + 0.4 * value_loss    #following rl-baselines3-zoo

#                 optimizer.zero_grad()
#                 total_loss.backward()
#                 grad_norm = torch.nn.utils.clip_grad_norm_(
#                     list(actor_network.parameters()) + list(critic_network.parameters()),
#                     max_norm=0.5
#                 )
#                 optimizer.step()


#                 latest_debug_info = {
#                     "values": values_tensor,
#                     "returns": returns,
#                     "log_probs": log_probs_tensor,
#                     "entropies": entropies_tensor,
#                     "policy_loss": policy_loss,
#                     "value_loss": value_loss,
#                     "grad_norm": grad_norm
#                 }


#                 states, actions, log_probs, rewards, values, dones, entropies = [], [], [], [], [], [], []

#             state = next_state


#     if ep % 10 == 0 and latest_debug_info:
#         a2c_debug_log(ep, steps, ep_reward, **latest_debug_info)

In [None]:
def a2c_debug_log(ep, steps, ep_reward, values, returns, log_probs, entropies, policy_loss, value_loss, grad_norm):
    advantages = returns - values

    print(f"\n[Episode {ep}] Debug Summary")
    print(f"# of Steps:         {steps:.2f}")
    print(f"Total Reward:       {ep_reward:.2f}")
    print(f"Mean V(s):          {values.mean().item():.4f}")
    print(f"Advantage Mean:     {advantages.mean().item():.4f}")
    print(f"Advantage Std:      {advantages.std().item():.4f}")
    print(f"Entropy (avg):      {entropies.mean().item():.4f}")
    print(f"Policy Loss:        {policy_loss.item():.4f}")
    print(f"Value Loss:         {value_loss.item():.4f}")
    print(f"Gradient Norm:      {grad_norm:.4f}")
    print(f"Log Prob Mean:      {log_probs.mean().item():.4f}")
    print("-" * 50)


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


# class Normalizer:
#     def __init__(self, shape, epsilon=1e-8):
#         self.shape = shape
#         self.mean = np.zeros(shape)
#         self.var = np.ones(shape)
#         self.count = epsilon

#     def update(self, x):
#         batch_mean = np.mean(x, axis=0)
#         batch_var = np.var(x, axis=0)

#         batch_size = x.shape[0]
#         self.count += batch_size
#         self.mean += (batch_mean - self.mean) * batch_size / self.count
#         self.var += (batch_var - self.var) * batch_size / self.count

#     def normalize(self, x):
#         return (x - self.mean) / np.sqrt(self.var + 1e-8)


def compute_gae(rewards, values, next_values, dones, gamma, gae_lambda):
    advantages = []
    advantage = 0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * next_values[i] * dones[i] - values[i]
        advantage = delta + gamma * gae_lambda * dones[i] * advantage
        advantages.insert(0, advantage)
    return advantages

