In [None]:
# LunarLander debug note
# Initially, used the same code I used for cartpole.
# Would not train, so printed out the probs. When the model is first made, it is fairly distributed like this [[0.2660, 0.2257, 0.2731, 0.2352]]. However, after training for 1000 steps,
# it converges to a certain action like this - tensor([[9.9943e-01, 4.8898e-04, 7.9276e-05, 4.1656e-08]]
# Applied entropy for more exploration. Didn't work
# Found much larger absolute value of advantages, returns, compared to cartpole. Especially, when terminated, the return is -100 which is the dominant cause for the returns.
# Normalizing advantage solved the problem above. Now it doesn't fixate in a certain action.
# Then, action_probability turned into a somewhat uniform distribution. Therefore, printed out the losses.
# Actor loss:  tensor(-1.9073e-08, device='cuda:0', grad_fn=<NegBackward0>) Critic loss:  tensor(15.9172, device='cuda:0', grad_fn=<MseLossBackward0>)
# Entropy:  tensor(1.3784, device='cuda:0', grad_fn=<MeanBackward0>)
# Found out that Actor loss is extremely small. Therefore, actor layer was barely getting updated.
# Reason: log prob is pretty much uniform when model is initialized and the advantage is normalized with mean 0. We calculate actor_loss = (advantages * log_probs).mean()
# This converges to the mean of the normal distribution (since log_probs is uniform), and therefore to 0.
# Try with normalizing the n-step rewards instead of the advantage.
# Reference https://github.com/nikhilbarhate99/Actor-Critic-PyTorch/blob/master/train.py
# The only difference between my model and the reference model was the reduction method of actor and critic loss. Reference model used sum, my model used mean.
# mean vs. sum was not the problem. The dimensions of log_probs, returns, state_values were all different in my code and was broadcasting very wierdly. That's why step-wise calculation of the reference code
# worked, but my code didn't. Should pay more attention to dimensions, and also debugging warnings because I was ignoring the dimension disparity.
# Trains well using mean. Maybe even better.
# Solved all the bugs and trained both sync and async models. Both took too much time and didn't train well.
# Compared to the reference code which ran in a single environment, I had my code running in 8 environments, and therefore decreased the learning rate from 0.02 -> 0.005.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: 

In [1]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools
!pip install swig
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379412 sha256=fb06db08284b83565f988532c29aa4bc5905aa0239f232e37dda892d10f2a008
  Stored in directory: /root/.cache/pip/wheels/ab

In [None]:
!pip uninstall -y box2d-py
!pip install box2d pygame swig
!pip install "gymnasium[box2d]" --no-deps

[0mCollecting box2d
  Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (573 bytes)
Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m102.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: box2d
Successfully installed box2d-2.3.10


In [2]:
# New implementation referring to https://github.com/nikhilbarhate99/Actor-Critic-PyTorch/blob/master/train.py

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

class ActorCritic(nn.Module):

    # Increased hidden dim to (128, 128) compared to cartpole as the input_dim is more complex
    def __init__(self, input_dim, output_dim, hidden_dims=(128, 128)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            # nn.Linear(hidden_dims[0], hidden_dims[1]),
            # nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[0], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[0], 1)

        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        state_value = self.critic_layer(x)

        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value.squeeze())

        return action.item()

    def compute_return(self, gamma):
        returns = []
        R = 0
        for r in reversed(self.rewards):
            R = r + gamma * R
            returns.insert(0, R)

        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        return returns

    def calculate_loss(self, gamma):
        rewards = self.compute_return(gamma).detach()

        # CRITICAL BUG - log_probs' shape was [90, 1], state_values' [90, 1, 1], rewards' [90] -- should read the warning messages more carefully from now on.
        # print("log_probs.shape", self.logprobs.shape)
        # print("state_values.shape", self.state_values.shape)
        # print("returns.shape", returns.shape)

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            value = value.to('cpu')
            advantage = reward - value.detach()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss


    def clearMemory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]



In [3]:
from tqdm import tqdm

class A2CAgent:
    def __init__(self, env, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5):
        random_seed = 543
        torch.manual_seed(random_seed)
        self.env = env
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCritic(env.observation_space.shape[0], env.action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

    def train(self):
        episode_rewards = []
        episode_steps = []

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            episode_reward = 0
            steps = 0
            done = False

            while not done and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
                action = self.policy_net(state_tensor)

                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                self.policy_net.rewards.append(reward)
                episode_reward += reward
                state = next_state

                # every n steps, calculate losses, update the actor & critic, then refresh the saved lists
                # if (steps % self.num_steps == 0) or done:
                if done:
                    self.optimizer.zero_grad()
                    loss = self.policy_net.calculate_loss(self.gamma)
                    loss.backward()
                    self.optimizer.step()
                    self.policy_net.clearMemory()

            if episode % 100 == 0:
                print(episode, "reward: ", episode_reward, "steps: ", steps)

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

        self.env.close()
        return np.array(episode_rewards), np.array(episode_steps)


In [4]:
import gymnasium as gym

env = gym.make('LunarLander-v3')
env.reset(seed=543)
num_episodes = 2000
max_steps = 1000
lr = 0.02


a2c_model_ll =  A2CAgent(env, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = 8)

state, _ = env.reset()
state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(a2c_model_ll.device)
rewards, steps = a2c_model_ll.train()


  0%|          | 2/2000 [00:01<19:16,  1.73it/s]

0 reward:  -305.98365456353747 steps:  90


  5%|▌         | 102/2000 [00:27<11:52,  2.67it/s]

100 reward:  -398.5175385073768 steps:  195


  6%|▋         | 127/2000 [00:37<09:14,  3.38it/s]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

plt.plot(range(len(rewards)), rewards)
plt.show()

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCritic(nn.Module):

    # reference model used only 1 layer. Will experiment both
    def __init__(self, input_dim, output_dim, hidden_dims=(128, 128)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            # nn.Linear(hidden_dims[0], hidden_dims[1]),
            # nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[1], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value


In [35]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR

class A2CAgent:
    def __init__(self, env_id, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5, num_envs = 8, vectorization_mode = "sync"):
        # using vectorized environments to boost training speed
        def make_env(env_id, seed, idx):
            def thunk():
                env = gym.make(env_id)
                env.reset(seed=seed + idx)
                return env
            return thunk
        env_fns = [make_env(env_id, seed=543, idx=i) for i in range(num_envs)]
        self.env = gym.vector.SyncVectorEnv(env_fns)
        self.num_envs = num_envs
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCritic(self.env.single_observation_space.shape[0], self.env.single_action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        # added scheduler after observing divergence after getting close to solving
        self.scheduler = StepLR(self.optimizer, step_size=100, gamma=0.9)
        self.loss = nn.MSELoss()

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing the gamma decaying rewards
    def compute_returns(self, rewards):
        """
        Args:
            rewards: torch.Tensor of shape [T, N] where
                    T = rollout steps, N = num_envs
        Returns:
            returns: torch.Tensor of shape [T, N], normalized
        """
        rewards = torch.stack(rewards)

        T, N = rewards.shape
        returns = torch.zeros_like(rewards)
        R = torch.zeros(N, device=rewards.device)
        for t in reversed(range(T)):
            R = rewards[t] + self.gamma * R
            returns[t] = R

        # Normalize returns across all timesteps and environments
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        return returns


    # computing the n step rewards
    def compute_n_step_returns(self, rewards, next_value):
        # bootstraps the future reward using value estimate
        R = next_value
        returns = []
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.stack(returns)
        if returns.numel() > 1:
            return (returns - returns.mean()) / (returns.std() + 1e-8)
        else:
            return returns * 0

    def train(self):
        episode_rewards = []
        episode_steps = []

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            done = np.zeros(self.num_envs, dtype=bool)
            episode_reward = np.zeros(self.num_envs)
            values, rewards, log_probs = [], [], []
            done_mask = np.zeros(self.num_envs, dtype=bool)
            done_steps = np.zeros(self.num_envs)
            steps = 0

            while not np.all(done_mask) and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
                action_probs, value = self.policy_net(state_tensor)
                action_dist = torch.distributions.Categorical(action_probs)
                action = action_dist.sample()
                log_prob = action_dist.log_prob(action)

                next_state, reward, terminated, truncated, _ = self.env.step(action.cpu().numpy())
                done = np.logical_or(terminated, truncated)
                done_steps = np.where(np.logical_and(done, ~done_mask), steps, done_steps)
                done_mask = np.logical_or(done_mask, done)
                # record when each environment is done
                reward = np.where(done_mask, 0.0, reward)

                # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                values.append(value.squeeze())
                rewards.append(torch.tensor(reward, dtype=torch.float32).to(self.device))  # shape: (num_envs,)
                log_probs.append(log_prob)

                episode_reward += reward
                state = next_state


                # every n steps for each environment, calculate losses, update the actor & critic, then refresh the saved lists
                # if (steps % self.num_steps == 0) or np.any(done):
                if np.any(done):
                    with torch.no_grad():
                        next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                        _, next_value = self.policy_net(next_state_tensor)
                        done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                        next_value = next_value.squeeze() * (1 - done_tensor)

                    returns = self.compute_returns(rewards)  # shape: (n_steps, num_envs)
                    returns = returns.transpose(0, 1)  # shape: (num_envs, n_steps)
                    values = torch.stack(values).transpose(0, 1)  # shape: (num_envs, n_steps)
                    log_probs = torch.stack(log_probs).transpose(0, 1)  # shape: (num_envs, n_steps)
                    advantages = returns - values

                    # calculate sum instead of mean
                    actor_loss = - (log_probs * advantages.detach()).sum()
                    critic_loss = self.loss(returns, values)

                    loss = actor_loss + critic_loss
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    # self.scheduler.step()

                    values = []
                    rewards = []
                    log_probs = []

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

            if episode % 20 == 0:
               print('Episode {}\tlengths: {}\treward: {}]\tfull length: {}'.format(episode, done_steps, episode_reward, steps))

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

        self.env.close()
        return np.array(episode_rewards), np.array(episode_steps)


In [38]:
import gymnasium as gym

env_id = 'LunarLander-v3'
num_episodes = 80
max_steps = 500
lr = 3e-3


a2c_model_ll =  A2CAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = 8)

rewards, steps = a2c_model_ll.train()



  1%|▏         | 1/80 [00:00<00:57,  1.38it/s]

Episode 0	lengths: [ 99.  98.  94.  70. 109.  63. 126.  69.]	reward: [-166.39995955  -51.20753231 -242.42402877  -89.14457934  -93.3466435
   37.57557693  -23.76910308   21.95506093]]	full length: 126


 26%|██▋       | 21/80 [00:11<00:50,  1.17it/s]

Episode 20	lengths: [  0. 167. 135. 200. 160. 130. 201. 211.]	reward: [ 113.22333902 -155.05465395   70.33711321   92.95241842 -173.71621362
   49.55416256   32.52047522   88.13744365]]	full length: 500


 51%|█████▏    | 41/80 [00:22<00:26,  1.45it/s]

Episode 40	lengths: [156. 125. 109.  99.  95. 128. 124. 156.]	reward: [ 40.43781212 108.08222072  93.48589167 136.50936247  94.33565674
 102.27260246 107.14904484 123.99814906]]	full length: 156


 76%|███████▋  | 61/80 [00:38<00:12,  1.49it/s]

Episode 60	lengths: [ 91. 122.  98.  97.  85. 122. 121. 122.]	reward: [104.27137147 113.65733053 146.81514964  68.16942896  27.57427676
  95.27770793  87.95648027 112.62542008]]	full length: 122


100%|██████████| 80/80 [00:57<00:00,  1.39it/s]


In [18]:
# env_id = 'LunarLander-v3'
# num_episodes = 250
# max_steps = 500
# lr = 3e-3

# a2c_model_ll_async =  A2CAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = 8, vectorization_mode="async")

# rewards_async, steps_async = a2c_model_ll_async.train()

In [39]:
import gymnasium as gym
import torch
import numpy as np
from gymnasium.wrappers import RecordVideo
import os

# Create folder to save the video
video_folder = "./video"
os.makedirs(video_folder, exist_ok=True)

# Wrap the environment with RecordVideo
env = gym.make('LunarLander-v3', render_mode='rgb_array')
env = RecordVideo(env, video_folder=video_folder, episode_trigger=lambda e: True)

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_ll.policy_net(state)
            # action_dist = torch.distributions.Categorical(action_probs)
            # action = action_dist.sample().item()
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


  logger.warn(


Episode 1 Reward: -189.19590152462467
Episode 2 Reward: -125.67309362208087
Episode 3 Reward: -122.27225200262494
Episode 4 Reward: -240.09653210736192
Episode 5 Reward: -209.4713997490183
Episode 6 Reward: -218.93050849994881
Episode 7 Reward: -311.4440448859407
Episode 8 Reward: -198.4887331130318
Episode 9 Reward: -217.7466268388127
Episode 10 Reward: -199.33068248707443
Average Reward over 10 episodes: -203.26497748305192


In [None]:
import gymnasium as gym
import torch
import numpy as np
from gymnasium.wrappers import RecordVideo
import os

# Create folder to save the video
video_folder = "./video"
os.makedirs(video_folder, exist_ok=True)

# Wrap the environment with RecordVideo
env = gym.make('LunarLander-v3', render_mode='rgb_array')
env = RecordVideo(env, video_folder=video_folder, name_prefix="async_demo", episode_trigger=lambda e: True)

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_ll_async.policy_net(state)
            # action_dist = torch.distributions.Categorical(action_probs)
            # action = action_dist.sample().item()
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")
