In [8]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install gymnasium
!pip install pygame
!pip install swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.1.1-py3-none-any.whl (965 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.4/965.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.1.1
Collecting pygame
  Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
Successfully installed pyga

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dims=(32, 32)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[1], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value


# def compute_gae(rewards, values, next_values, dones, gamma, gae_lambda):
#     advantages = []
#     advantage = 0
#     for i in reversed(range(len(rewards))):
#         delta = rewards[i] + gamma * next_values[i] * dones[i] - values[i]
#         advantage = delta + gamma * gae_lambda * dones[i] * advantage
#         advantages.insert(0, advantage)
#     return advantages



In [None]:
from tqdm import tqdm

class A2CAgent:
    def __init__(self, env, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5):
        self.env = env
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCritic(env.observation_space.shape[0], env.action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()

    # Choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # Computing the gamma decaying rewards
    def compute_return(self, rewards):
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return returns

    # Computing the n step rewards
    def compute_n_step_returns(self, rewards, next_value):
        # Bootstraps the future reward using value estimate
        R = next_value
        returns = []
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return torch.stack(returns)

    def train(self):
        episode_rewards = []

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            episode_reward = 0
            values = []
            rewards = []
            log_probs = []
            steps = 0
            done = False

            while not done and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
                action_probs, value = self.policy_net(state_tensor)
                action_dist = torch.distributions.Categorical(action_probs)
                action = action_dist.sample()
                log_prob = action_dist.log_prob(action)

                next_state, reward, terminated, truncated, _ = self.env.step(action.item())
                done = terminated or truncated

                # Saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                values.append(value.squeeze())
                rewards.append(reward)
                log_probs.append(log_prob)

                episode_reward += reward
                state = next_state

                # Every n steps, calculate losses, update the actor & critic, then refresh the saved lists
                if (steps % self.num_steps == 0) or done:
                    _, next_value = self.policy_net(torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(self.device))
                    next_value = next_value.squeeze()
                    # BUG ALERT
                    # MUST MULTIPLY (1 - done) to next_value to mask the bootstrapped next_value when the game is over. CRITICAL BUG THAT TOOK HOURS TO FIND
                    returns = self.compute_n_step_returns(rewards, next_value * (1 - done))
                    values = torch.stack(values)
                    log_probs = torch.stack(log_probs)
                    advantages = returns - values
                    # Calculate actor_loss by multiplying log probabilities to advantages. This will decrease the action probability of negative advantages, and vice-versa
                    actor_loss = - (log_probs * advantages.detach()).mean()
                    # Updates the critic to find better estimate of values that matches the n-step reward
                    critic_loss = self.loss(returns, values)
                    loss = actor_loss + 0.4 * critic_loss
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    values = []
                    rewards = []
                    log_probs = []

            episode_rewards.append(episode_reward)

        self.env.close()
        return np.array(episode_rewards)


In [None]:
import gymnasium as gym

env = gym.make('CartPole-v1')
num_episodes = 1000
max_steps = 500
lr = 1e-3

#num_steps = max_steps make it Monte-Carlo
a2c_model =  A2CAgent(env, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = max_steps)

rewards = a2c_model.train()

100%|██████████| 1000/1000 [00:21<00:00, 46.10it/s]


In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: 80.0
Episode 2 Reward: 72.0
Episode 3 Reward: 114.0
Episode 4 Reward: 127.0
Episode 5 Reward: 84.0
Episode 6 Reward: 101.0
Episode 7 Reward: 77.0
Episode 8 Reward: 86.0
Episode 9 Reward: 71.0
Episode 10 Reward: 89.0
Average Reward over 10 episodes: 90.1


In [None]:
import gymnasium as gym

env = gym.make('CartPole-v1')
num_episodes = 1000
max_steps = 500
lr = 1e-3

#num_steps = 5. Updates more frequently than Monte Carlo which takes more training time
a2c_model =  A2CAgent(env, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = 5)

rewards = a2c_model.train()

100%|██████████| 1000/1000 [03:36<00:00,  4.63it/s]


In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: 500.0
Episode 2 Reward: 500.0
Episode 3 Reward: 500.0
Episode 4 Reward: 500.0
Episode 5 Reward: 500.0
Episode 6 Reward: 500.0
Episode 7 Reward: 500.0
Episode 8 Reward: 500.0
Episode 9 Reward: 500.0
Episode 10 Reward: 500.0
Average Reward over 10 episodes: 500.0


In [None]:
### Comparing Monte Carlo A2C vs. n-step A2C
# Monte Carlo takes significantly smaller time to train, given the same amount of episodes. This results due to the less frequent update compared to n-step A2C.
# While Monte Carlo updates once, 5-step A2C updates up to 100(max_step / n) times, causing this difference.
# While the Monte Carlo A2C showed an average return of 90 per episode, the n step A2C reached the max return 500 in only 1000 epsiodes trained.
# When each epsiode takes significantly long time to simulate, n-step A2C will have notable advantage over Monte Carlo A2C.