In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools pip --upgrade
!pip install swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.1.1-py3-none-any.whl (965 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.4/965.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.1.1
Collecting pygame
  Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m128.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
Successfully installed py

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCritic(nn.Module):

    # initially started with (64, 64) hidden dimension, but emprically found out (32, 32) works better. (64, 64) might be too much power for simple game like cartpole
    def __init__(self, input_dim, output_dim, hidden_dims=(32, 32)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[1], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value


# def compute_gae(rewards, values, next_values, dones, gamma, gae_lambda):
#     advantages = []
#     advantage = 0
#     for i in reversed(range(len(rewards))):
#         delta = rewards[i] + gamma * next_values[i] * dones[i] - values[i]
#         advantage = delta + gamma * gae_lambda * dones[i] * advantage
#         advantages.insert(0, advantage)
#     return advantages



In [4]:
from tqdm import tqdm

class A2CAgent:
    def __init__(self, env, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5):
        self.env = env
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCritic(env.observation_space.shape[0], env.action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing the gamma decaying rewards
    def compute_return(self, rewards):
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return returns

    # computing the n step rewards
    def compute_n_step_returns(self, rewards, next_value):
        # bootstraps the future reward using value estimate
        R = next_value
        returns = []
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return torch.stack(returns)

    def train(self):
        episode_rewards = []

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            episode_reward = 0
            values = []
            rewards = []
            log_probs = []
            steps = 0
            done = False

            while not done and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
                action_probs, value = self.policy_net(state_tensor)
                action_dist = torch.distributions.Categorical(action_probs)
                action = action_dist.sample()
                log_prob = action_dist.log_prob(action)

                next_state, reward, terminated, truncated, _ = self.env.step(action.item())
                done = terminated or truncated

                # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                values.append(value.squeeze())
                rewards.append(reward)
                log_probs.append(log_prob)

                episode_reward += reward
                state = next_state

                # every n steps, calculate losses, update the actor & critic, then refresh the saved lists
                if (steps % self.num_steps == 0) or done:
                    _, next_value = self.policy_net(torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(self.device))
                    next_value = next_value.squeeze()
                    # BUG ALERT
                    # MUST MULTIPLY (1 - done) to next_value to mask the bootstrapped next_value when the game is over. CRITICAL BUG THAT TOOK HOURS TO FIND
                    returns = self.compute_n_step_returns(rewards, next_value * (1 - done))
                    values = torch.stack(values)
                    log_probs = torch.stack(log_probs)
                    advantages = returns - values
                    # calculate actor_loss by multiplying log probabilities to advantages. This will decrease the action probability of negative advantages, and vice-versa
                    actor_loss = - (log_probs * advantages.detach()).mean()
                    # updates the critic to find better estimate of values that matches the n-step reward
                    critic_loss = self.loss(returns, values)

                    # penalize using entropy to encourage exploration
                    entropy = action_dist.entropy().mean()
                    loss = actor_loss + 0.4 * critic_loss- 0.01 * entropy
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    values = []
                    rewards = []
                    log_probs = []

            episode_rewards.append(episode_reward)

        self.env.close()
        return np.array(episode_rewards)


In [5]:
import gymnasium as gym

env = gym.make('CartPole-v1')
num_episodes = 1000
max_steps = 500
lr = 1e-3

#num_steps = max_steps make it Monte-Carlo
a2c_model_mc =  A2CAgent(env, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = max_steps)

rewards = a2c_model_mc.train()

100%|██████████| 1/1 [00:00<00:00, 38.15it/s]

21 RETURN:  tensor([19.0272, 18.2093, 17.3831, 16.5486, 15.7057, 14.8542, 13.9942, 13.1254,
        12.2479, 11.3615, 10.4662,  9.5618,  8.6483,  7.7255,  6.7935,  5.8520,
         4.9010,  3.9404,  2.9701,  1.9900,  1.0000], grad_fn=<StackBackward0>) VALUE:  tensor([-0.0810, -0.0815, -0.0793, -0.0827, -0.0823, -0.0830, -0.0804, -0.0806,
        -0.0816, -0.0850, -0.0844, -0.0848, -0.0821, -0.0852, -0.0825, -0.0851,
        -0.0827, -0.0843, -0.0870, -0.0912, -0.1068], grad_fn=<StackBackward0>) ADVANTAGE:  tensor([19.1082, 18.2909, 17.4624, 16.6313, 15.7880, 14.9372, 14.0746, 13.2060,
        12.3295, 11.4465, 10.5505,  9.6466,  8.7304,  7.8107,  6.8760,  5.9371,
         4.9837,  4.0247,  3.0571,  2.0812,  1.1068], grad_fn=<SubBackward0>) tensor([[-0.5417],
        [-0.5277],
        [-0.9120],
        [-0.8915],
        [-0.5424],
        [-0.5278],
        [-0.5139],
        [-0.9297],
        [-0.9124],
        [-0.8947],
        [-0.5390],
        [-0.5241],
        [-0.9157],
   




In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_mc.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: 123.0
Episode 2 Reward: 125.0
Episode 3 Reward: 123.0
Episode 4 Reward: 127.0
Episode 5 Reward: 114.0
Episode 6 Reward: 135.0
Episode 7 Reward: 110.0
Episode 8 Reward: 104.0
Episode 9 Reward: 126.0
Episode 10 Reward: 115.0
Average Reward over 10 episodes: 120.2


In [None]:
import gymnasium as gym

env = gym.make('CartPole-v1')
num_episodes = 1000
max_steps = 500
lr = 1e-3

#num_steps = 5. Updates more frequently than Monte Carlo which takes more training time
a2c_model_n =  A2CAgent(env, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = 5)

rewards = a2c_model_n.train()

100%|██████████| 1000/1000 [02:58<00:00,  5.62it/s]


In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_n.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        print(action_probs)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


tensor([[0.4631, 0.5369]])
tensor([[0.7609, 0.2391]])
tensor([[0.4404, 0.5596]])
tensor([[0.7441, 0.2559]])
tensor([[0.4204, 0.5796]])
tensor([[0.7283, 0.2717]])
tensor([[0.4026, 0.5974]])
tensor([[0.7134, 0.2866]])
tensor([[0.3861, 0.6139]])
tensor([[0.6989, 0.3011]])
tensor([[0.3705, 0.6295]])
tensor([[0.6844, 0.3156]])
tensor([[0.3559, 0.6441]])
tensor([[0.6706, 0.3294]])
tensor([[0.3416, 0.6584]])
tensor([[0.6560, 0.3440]])
tensor([[0.3267, 0.6733]])
tensor([[0.6397, 0.3603]])
tensor([[0.3107, 0.6893]])
tensor([[0.6212, 0.3788]])
tensor([[0.2933, 0.7067]])
tensor([[0.5998, 0.4002]])
tensor([[0.2741, 0.7259]])
tensor([[0.5747, 0.4253]])
tensor([[0.2531, 0.7469]])
tensor([[0.5459, 0.4541]])
tensor([[0.2299, 0.7701]])
tensor([[0.5119, 0.4881]])
tensor([[0.2049, 0.7951]])
tensor([[0.4701, 0.5299]])
tensor([[0.7904, 0.2096]])
tensor([[0.5001, 0.4999]])
tensor([[0.1970, 0.8030]])
tensor([[0.4576, 0.5424]])
tensor([[0.7896, 0.2104]])
tensor([[0.5007, 0.4993]])
tensor([[0.1965, 0.8035]])
t

In [None]:
### Comparing Monte Carlo A2C vs. n-step A2C
# Monte Carlo takes significantly smaller time to train, given the same amount of episodes. This results due to the less frequent update compared to n-step A2C.
# While Monte Carlo updates once, 5-step A2C updates up to 100(max_step / n) times, causing this difference.
# While the Monte Carlo A2C showed an average return of 90 per episode, the n step A2C reached the max return 500 in only 1000 epsiodes trained.
# When each epsiode takes significantly long time to simulate, n-step A2C will have notable advantage over Monte Carlo A2C.

tensor([[0.2660, 0.2257, 0.2731, 0.2352]], grad_fn=<SoftmaxBackward0>)


100%|██████████| 1000/1000 [01:10<00:00, 14.20it/s]


tensor([[9.9943e-01, 4.8898e-04, 7.9276e-05, 4.1656e-08]])
2.0338323507709504
tensor([[9.9946e-01, 4.6518e-04, 7.4246e-05, 3.6973e-08]])
2.1811371717801364
tensor([[9.9949e-01, 4.4278e-04, 6.9579e-05, 3.2851e-08]])
2.23622086783152
tensor([[9.9951e-01, 4.2224e-04, 6.5362e-05, 2.9313e-08]])
2.282917824490397
tensor([[9.9953e-01, 4.0343e-04, 6.1552e-05, 2.6270e-08]])
2.3206429004874565
tensor([[9.9956e-01, 3.8620e-04, 5.8106e-05, 2.3647e-08]])
2.338467110229601
tensor([[9.9957e-01, 3.7041e-04, 5.4989e-05, 2.1379e-08]])
2.30420809961646
tensor([[9.9959e-01, 3.5595e-04, 5.2168e-05, 1.9414e-08]])
2.0940201751723464
tensor([[9.9961e-01, 3.4271e-04, 4.9613e-05, 1.7706e-08]])
1.1679417968564394
tensor([[9.9962e-01, 3.3059e-04, 4.7299e-05, 1.6219e-08]])
-0.8787000441500936
tensor([[9.9964e-01, 3.1952e-04, 4.5205e-05, 1.4923e-08]])
-2.019954707609486
tensor([[9.9965e-01, 3.0940e-04, 4.3309e-05, 1.3790e-08]])
-2.2913394482749823
tensor([[9.9966e-01, 3.0019e-04, 4.1596e-05, 1.2799e-08]])
-2.341326

  gym.logger.warn(
