In [None]:
### Comparing single environment vs. synchronized environments vs. asynchronized environments

# | Mode              | Training Time | Avg. Reward |
# |------------------|---------------|-------------|
# | Single Env       | 00:40         | 130.9       |
# | Sync 8 Envs      | 00:16         | 194.8       |
# | Async 8 Envs     | 00:35         | 307.5       |

# Training performance async >> sync > single, training time sync >> async > single. Expected async to train faster and sync to show better reward, but turned out opposite.
# Training time dominatnly depended on how many steps the model ran while training, faster training in terms of reward led to longer steps to train, and longer training time.
# My hypothesis is while async's environments that processed earlier gave information to slower training environments in their early episodes, and that led to better earlier policies and therefore longer steps in further episodes.

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools
!pip install swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.1.1-py3-none-any.whl (965 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.4/965.4 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.1.1
Collecting pygame
  Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m112.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
Successfully installed py

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCritic(nn.Module):

    # initially started with (64, 64) hidden dimension, but emprically found out (32, 32) works better. (64, 64) might be too much power for simple game like cartpole
    def __init__(self, input_dim, output_dim, hidden_dims=(32, 32)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[1], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value



In [None]:
from tqdm import tqdm

class A2CAgent:
    def __init__(self, env_id, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5, num_envs = 8, vectorization_mode = "sync"):
        # using vectorized environments to boost training
        # sync is more stable, async is faster
        self.env = gym.make_vec(env_id, num_envs = num_envs, vectorization_mode=vectorization_mode)
        self.num_envs = num_envs
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCritic(self.env.single_observation_space.shape[0], self.env.single_action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing the gamma decaying rewards
    def compute_return(self, rewards):
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return returns

    # computing the n step rewards
    def compute_n_step_returns(self, rewards, next_value):
        # bootstraps the future reward using value estimate
        R = next_value  # shape: (num_envs,)
        returns = []
        for r in reversed(rewards):  # each r: (num_envs,)
            R = r + self.gamma * R
            returns.insert(0, R)
        return torch.stack(returns)  # shape: (n_steps, num_envs)

    def train(self):
        episode_rewards = []
        episode_steps = []
        step_sum = 0
        random_seed = 543
        torch.manual_seed(random_seed)

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            done = np.zeros(self.num_envs, dtype=bool)
            episode_reward = np.zeros(self.num_envs)
            values, rewards, log_probs = [], [], []
            done_mask = np.zeros(self.num_envs, dtype=bool)
            steps = 0

            while not np.all(done_mask) and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
                action_probs, value = self.policy_net(state_tensor)
                action_dist = torch.distributions.Categorical(action_probs)
                action = action_dist.sample()
                log_prob = action_dist.log_prob(action)

                next_state, reward, terminated, truncated, _ = self.env.step(action.cpu().numpy())
                done = np.logical_or(terminated, truncated)
                done_mask = np.logical_or(done_mask, done)
                reward = np.where(done_mask, 0.0, reward)

                # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                values.append(value.squeeze())
                rewards.append(torch.tensor(reward, dtype=torch.float32).to(self.device))  # shape: (num_envs,)
                log_probs.append(log_prob)

                episode_reward += reward
                state = next_state

                # every n steps for each environment, calculate losses, update the actor & critic, then refresh the saved lists
                if (steps % self.num_steps == 0) or np.any(done):
                    with torch.no_grad():
                        next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                        _, next_value = self.policy_net(next_state_tensor)
                        done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                        next_value = next_value.squeeze() * (1 - done_tensor)

                    returns = self.compute_n_step_returns(rewards, next_value)  # shape: (n_steps, num_envs)
                    returns = returns.transpose(0, 1)  # shape: (num_envs, n_steps)
                    values = torch.stack(values).transpose(0, 1)  # shape: (num_envs, n_steps)
                    log_probs = torch.stack(log_probs).transpose(0, 1)  # shape: (num_envs, n_steps)
                    advantages = returns - values

                    # calculate actor_loss by multiplying log probabilities to advantages. This will decrease the action probability of negative advantages, and vice-versa
                    actor_loss = - (log_probs * advantages.detach()).mean()
                    # updates the critic to find better estimate of values that matches the n-step reward
                    critic_loss = self.loss(returns, values)
                    # penalize using entropy to encourage exploration
                    entropy = action_dist.entropy().mean()

                    loss = actor_loss + 0.4 * critic_loss- 0.01 * entropy
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    values = []
                    rewards = []
                    log_probs = []

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)
            step_sum += steps

        self.env.close()
        return np.array(episode_rewards), np.array(episode_steps)


In [41]:
class A2CAgent_single:
    def __init__(self, env, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5):
        self.env = env
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCritic(env.observation_space.shape[0], env.action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing the gamma decaying rewards
    def compute_return(self, rewards):
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return returns

    # computing the n step rewards
    def compute_n_step_returns(self, rewards, next_value):
        # bootstraps the future reward using value estimate
        R = next_value
        returns = []
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return torch.stack(returns)

    def train(self):
        episode_steps = []
        episode_rewards = []
        step_sum = 0

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            episode_reward = 0
            values = []
            rewards = []
            log_probs = []
            steps = 0
            done = False

            while not done and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
                action_probs, value = self.policy_net(state_tensor)
                action_dist = torch.distributions.Categorical(action_probs)
                action = action_dist.sample()
                log_prob = action_dist.log_prob(action)

                next_state, reward, terminated, truncated, _ = self.env.step(action.item())
                done = terminated or truncated

                # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                values.append(value.squeeze())
                rewards.append(reward)
                log_probs.append(log_prob)

                episode_reward += reward
                state = next_state

                # every n steps, calculate losses, update the actor & critic, then refresh the saved lists
                if (steps % self.num_steps == 0) or done:
                    _, next_value = self.policy_net(torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(self.device))
                    next_value = next_value.squeeze()
                    # BUG ALERT
                    # MUST MULTIPLY (1 - done) to next_value to mask the bootstrapped next_value when the game is over. CRITICAL BUG THAT TOOK HOURS TO FIND
                    returns = self.compute_n_step_returns(rewards, next_value * (1 - done))
                    values = torch.stack(values)
                    log_probs = torch.stack(log_probs)
                    advantages = returns - values
                    # calculate actor_loss by multiplying log probabilities to advantages. This will decrease the action probability of negative advantages, and vice-versa
                    actor_loss = - (log_probs * advantages.detach()).mean()
                    # updates the critic to find better estimate of values that matches the n-step reward
                    critic_loss = self.loss(returns, values)

                    # penalize using entropy to encourage exploration
                    entropy = action_dist.entropy().mean()
                    loss = actor_loss + 0.4 * critic_loss- 0.01 * entropy
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    values = []
                    rewards = []
                    log_probs = []

            episode_steps.append(steps)
            episode_rewards.append(episode_reward)
            step_sum += steps

        self.env.close()
        return np.array(episode_rewards), np.array(episode_steps)


In [42]:
import gymnasium as gym

env = gym.make("CartPole-v1")
num_episodes = 1000
max_steps = 500
lr = 1e-3

# total of 1000 episodes explored
a2c_model_single_env =  A2CAgent_single(env, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = max_steps)

rewards, steps = a2c_model_single_env.train()

100%|██████████| 1000/1000 [00:55<00:00, 17.96it/s]


In [None]:
env_id = "CartPole-v1"
num_episodes = 1000
max_steps = 500
lr = 1e-3

# total of 8000 episodes explored
a2c_model_multiple_env =  A2CAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = max_steps)

rewards_mul_env, steps_mul_env = a2c_model_multiple_env.train()

 34%|███▍      | 340/1000 [02:41<05:14,  2.10it/s]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# Assuming you already have:
# rewards, steps = agent.train()

episodes = range(1, len(rewards) + 1)

fig, ax1 = plt.subplots(figsize=(12, 5))

# --- Plot rewards (left y-axis)
color = 'tab:blue'
ax1.set_xlabel('Episode')
ax1.set_ylabel('Reward', color=color)
ax1.plot(episodes, rewards, color=color, label='Reward')
ax1.tick_params(axis='y', labelcolor=color)

# # --- Plot steps (right y-axis)
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Episode Length (Steps)', color=color)
ax2.plot(episodes, steps, color=color, linestyle='--', label='Steps')
ax2.tick_params(axis='y', labelcolor=color)

# --- Add titles and grid
plt.title('A2C Training Progress: Rewards and Episode Lengths')
fig.tight_layout()
plt.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Assuming you already have:
# rewards, steps = agent.train()

episodes = range(1, len(rewards_mul_env) + 1)

fig, ax1 = plt.subplots(figsize=(12, 5))

# --- Plot rewards (left y-axis)
# color = 'tab:blue'
# ax1.set_xlabel('Episode')
# ax1.set_ylabel('Reward', color=color)
# ax1.plot(episodes, rewards_mul_env, color=color, label='Reward')
# ax1.tick_params(axis='y', labelcolor=color)

# --- Plot steps (right y-axis)
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Episode Length (Steps)', color=color)
ax2.plot(episodes, steps_mul_env, color=color, linestyle='--', label='Steps')
ax2.tick_params(axis='y', labelcolor=color)

# --- Add titles and grid
plt.title('A2C Training Progress: Rewards and Episode Lengths')
fig.tight_layout()
plt.grid(True)
plt.show()


In [None]:
sum(steps), sum(steps_mul_env)

In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_single_env.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: 112.0
Episode 2 Reward: 116.0
Episode 3 Reward: 157.0
Episode 4 Reward: 125.0
Episode 5 Reward: 140.0
Episode 6 Reward: 116.0
Episode 7 Reward: 178.0
Episode 8 Reward: 117.0
Episode 9 Reward: 115.0
Episode 10 Reward: 133.0
Average Reward over 10 episodes: 130.9


In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_multiple_env.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


In [None]:
env_id = "CartPole-v1"
num_episodes = 125
max_steps = 500
lr = 1e-3

# total of 1000 episoded explored
a2c_model_multiple_env_125 =  A2CAgent(env_id, num_episodes=125, max_steps=max_steps, lr=lr, num_steps = max_steps, vectorization_mode= "sync")

rewards_125, steps_125 = a2c_model_multiple_env_125.train()

100%|██████████| 125/125 [00:16<00:00,  7.69it/s]


In [None]:
env_id = "CartPole-v1"
num_episodes = 125
max_steps = 500
lr = 1e-3

# total of 1000 episoded explored
a2c_model_multiple_env_125_async =  A2CAgent(env_id, num_episodes=125, max_steps=max_steps, lr=lr, num_steps = max_steps, vectorization_mode= "async")

rewards_125_async, steps_125_async = a2c_model_multiple_env_125_async.train()

100%|██████████| 125/125 [00:35<00:00,  3.55it/s]


In [None]:
sum(steps_125), sum(steps_125_async)

(np.int64(8683), np.int64(17795))

In [None]:
steps_125_async

array([ 56,  28,  35,  28,  23,  32,  79,  30,  20,  60,  47,  35,  34,
        29,  30,  35,  35,  34,  26,  31,  32,  31,  37,  64,  34,  45,
        61,  31,  73,  42,  41,  63,  44,  31,  63,  18,  32,  26,  51,
        37,  36,  35,  22,  40,  41,  44,  43,  48,  30,  27,  36,  85,
        29,  58,  79,  56,  67,  40,  96,  44, 102,  57, 127,  85,  64,
       155,  61,  87,  87,  98, 109, 108,  87, 116, 140, 121, 171, 241,
       101, 196, 153, 266, 202, 194, 194, 183, 142, 192, 219, 174, 288,
       338, 202, 162, 330, 262, 390, 446, 192, 202, 286, 302, 149, 181,
       269, 297, 471, 413, 440, 478, 394, 433, 250, 317, 245, 130, 142,
       215, 196, 327, 284, 499, 346, 211, 208])

In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_multiple_env_125.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: 196.0
Episode 2 Reward: 220.0
Episode 3 Reward: 192.0
Episode 4 Reward: 164.0
Episode 5 Reward: 204.0
Episode 6 Reward: 202.0
Episode 7 Reward: 162.0
Episode 8 Reward: 196.0
Episode 9 Reward: 197.0
Episode 10 Reward: 215.0
Average Reward over 10 episodes: 194.8


In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_multiple_env_125_async.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: 274.0
Episode 2 Reward: 330.0
Episode 3 Reward: 361.0
Episode 4 Reward: 280.0
Episode 5 Reward: 418.0
Episode 6 Reward: 327.0
Episode 7 Reward: 338.0
Episode 8 Reward: 226.0
Episode 9 Reward: 267.0
Episode 10 Reward: 254.0
Average Reward over 10 episodes: 307.5


In [None]:
env_id = "CartPole-v1"
num_episodes = 10
max_steps = 500
lr = 1e-3

# total of 80 episodes explored
a2c_model_multiple_env_10 =  A2CAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = max_steps)

rewards = a2c_model_multiple_env_10.train()

100%|██████████| 10/10 [00:10<00:00,  1.04s/it]

5000





In [None]:
import numpy as np
import torch

env = gym.make('CartPole-v1', render_mode='rgb_array')

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

frames = []
episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_multiple_env_10.policy_net(state)
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        if i == 0:
            frame = env.render()
            frames.append(frame)

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: 134.0
Episode 2 Reward: 132.0
Episode 3 Reward: 127.0
Episode 4 Reward: 122.0
Episode 5 Reward: 121.0
Episode 6 Reward: 107.0
Episode 7 Reward: 111.0
Episode 8 Reward: 101.0
Episode 9 Reward: 115.0
Episode 10 Reward: 123.0
Average Reward over 10 episodes: 119.3
