In [19]:
# LunarLander debug note
# Initially, used the same code I used for cartpole.
# Would not train, so printed out the probs. When the model is first made, it is fairly distributed like this [[0.2660, 0.2257, 0.2731, 0.2352]]. However, after training for 1000 steps,
# it converges to a certain action like this - tensor([[9.9943e-01, 4.8898e-04, 7.9276e-05, 4.1656e-08]]
# Applied entropy for more exploration. Didn't work
# Found much larger absolute value of advantages, returns, compared to cartpole. Especially, when terminated, the return is -100 which is the dominant cause for the returns.
# Normalizing advantage solved the problem above. Now it doesn't fixate in a certain action.
# Then, action_probability turned into a somewhat uniform distribution. Therefore, printed out the losses.
# Actor loss:  tensor(-1.9073e-08, device='cuda:0', grad_fn=<NegBackward0>) Critic loss:  tensor(15.9172, device='cuda:0', grad_fn=<MseLossBackward0>)
# Entropy:  tensor(1.3784, device='cuda:0', grad_fn=<MeanBackward0>)
# Found out that Actor loss is extremely small. Therefore, actor layer was barely getting updated.
# Reason: log prob is pretty much uniform when model is initialized and the advantage is normalized with mean 0. We calculate actor_loss = (advantages * log_probs).mean()
# This converges to the mean of the normal distribution (since log_probs is uniform), and therefore to 0.
# Try with normalizing the n-step rewards instead of the advantage.
# Reference https://github.com/nikhilbarhate99/Actor-Critic-PyTorch/blob/master/train.py
# The only difference between my model and the reference model was the reduction method of actor and critic loss. Reference model used sum, my model used mean.
# mean vs. sum was not the problem. The dimensions of log_probs, returns, state_values were all different in my code and was broadcasting very wierdly. That's why step-wise calculation of the reference code
# worked, but my code didn't. Should pay more attention to dimensions, and also debugging warnings because I was ignoring the dimension disparity.
# Trains well using mean. Maybe even better.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: 

In [7]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools pip --upgrade
!pip install swig
!pip install gymnasium[box2d]

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
[33m  DEPRECATION: Building 'box2d-py' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'box2d-py'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building w

In [8]:
!pip uninstall -y box2d-py
!pip install box2d pygame swig
!pip install "gymnasium[box2d]" --no-deps

[0mCollecting box2d
  Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (573 bytes)
Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m102.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: box2d
Successfully installed box2d-2.3.10


In [2]:
# New implementation following https://github.com/nikhilbarhate99/Actor-Critic-PyTorch/blob/master/train.py

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

class ActorCritic(nn.Module):

    # Increased hidden dim to (128, 128) compared to cartpole as the input_dim is more complex
    def __init__(self, input_dim, output_dim, hidden_dims=(128, 128)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            # nn.Linear(hidden_dims[0], hidden_dims[1]),
            # nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[0], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[0], 1)

        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        state_value = self.critic_layer(x)

        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value.squeeze())

        return action.item()

    def compute_return(self, gamma):
        returns = []
        R = 0
        for r in reversed(self.rewards):
            R = r + gamma * R
            returns.insert(0, R)

        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        return returns

    def calculate_loss(self, gamma):
        rewards = self.compute_return(gamma).detach()

        # CRITICAL BUG - log_probs' shape was [90, 1], state_values' [90, 1, 1], rewards' [90] -- should read the warning messages more carefully from now on.
        # print("log_probs.shape", self.logprobs.shape)
        # print("state_values.shape", self.state_values.shape)
        # print("returns.shape", returns.shape)

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            value = value.to('cpu')
            advantage = reward - value.detach()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss


    def clearMemory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]



In [3]:
from tqdm import tqdm

class A2CAgent:
    def __init__(self, env, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5):
        random_seed = 543
        torch.manual_seed(random_seed)
        self.env = env
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCritic(env.observation_space.shape[0], env.action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

    def train(self):
        episode_rewards = []
        episode_steps = []

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            episode_reward = 0
            steps = 0
            done = False

            while not done and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
                action = self.policy_net(state_tensor)

                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                self.policy_net.rewards.append(reward)
                episode_reward += reward
                state = next_state

                # every n steps, calculate losses, update the actor & critic, then refresh the saved lists
                # if (steps % self.num_steps == 0) or done:
                if done:
                    self.optimizer.zero_grad()
                    loss = self.policy_net.calculate_loss(self.gamma)
                    loss.backward()
                    self.optimizer.step()
                    self.policy_net.clearMemory()

            if episode % 100 == 0:
                print(episode, "reward: ", episode_reward, "steps: ", steps)

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

        self.env.close()
        return np.array(episode_rewards), np.array(episode_steps)


In [None]:
import gymnasium as gym

env = gym.make('LunarLander-v3')
env.reset(seed=543)
num_episodes = 2000
max_steps = 1000
lr = 0.02


a2c_model_ll =  A2CAgent(env, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = 8)

state, _ = env.reset()
state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(a2c_model_ll.device)
rewards, steps = a2c_model_ll.train()


In [None]:
import matplotlib.pyplot as plt

plt.plot(range(len(rewards)), rewards)
plt.show()

In [9]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCritic(nn.Module):

    # reference model used only 1 layer. Will experiment both
    def __init__(self, input_dim, output_dim, hidden_dims=(128, 128)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            # nn.Linear(hidden_dims[0], hidden_dims[1]),
            # nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[1], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value


In [14]:
from tqdm import tqdm

class A2CAgent:
    def __init__(self, env_id, num_episodes=1000, max_steps=500, gamma=0.99, lr=1e-3, num_steps = 5, num_envs = 8, vectorization_mode = "sync"):
        # using vectorized environments to boost training speed
        self.env = gym.make_vec(env_id, num_envs = num_envs, vectorization_mode=vectorization_mode)
        self.env.reset(seed=543)
        self.num_envs = num_envs
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.gamma = gamma
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCritic(self.env.single_observation_space.shape[0], self.env.single_action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing the gamma decaying rewards
    def compute_returns(self, rewards):
        """
        Args:
            rewards: torch.Tensor of shape [T, N] where
                    T = rollout steps, N = num_envs
        Returns:
            returns: torch.Tensor of shape [T, N], normalized
        """
        rewards = torch.stack(rewards)

        T, N = rewards.shape
        returns = torch.zeros_like(rewards)
        R = torch.zeros(N, device=rewards.device)
        for t in reversed(range(T)):
            R = rewards[t] + self.gamma * R
            returns[t] = R

        # Normalize returns across all timesteps and environments
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        return returns


    # computing the n step rewards
    def compute_n_step_returns(self, rewards, next_value):
        # bootstraps the future reward using value estimate
        R = next_value
        returns = []
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.stack(returns)
        if returns.numel() > 1:
            return (returns - returns.mean()) / (returns.std() + 1e-8)
        else:
            return returns * 0

    def train(self):
        episode_rewards = []
        episode_steps = []

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset()
            done = np.zeros(self.num_envs, dtype=bool)
            episode_reward = np.zeros(self.num_envs)
            values, rewards, log_probs = [], [], []
            done_mask = np.zeros(self.num_envs, dtype=bool)
            done_steps = np.zeros(self.num_envs)
            steps = 0

            while not np.all(done_mask) and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
                action_probs, value = self.policy_net(state_tensor)
                action_dist = torch.distributions.Categorical(action_probs)
                action = action_dist.sample()
                log_prob = action_dist.log_prob(action)

                next_state, reward, terminated, truncated, _ = self.env.step(action.cpu().numpy())
                done = np.logical_or(terminated, truncated)
                done_steps = np.where(np.logical_and(done, ~done_mask), steps, done_steps)
                done_mask = np.logical_or(done_mask, done)
                # record when each environment is done
                reward = np.where(done_mask, 0.0, reward)

                # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                values.append(value.squeeze())
                rewards.append(torch.tensor(reward, dtype=torch.float32).to(self.device))  # shape: (num_envs,)
                log_probs.append(log_prob)

                episode_reward += reward
                state = next_state


                # every n steps for each environment, calculate losses, update the actor & critic, then refresh the saved lists
                # if (steps % self.num_steps == 0) or np.any(done):
                if np.any(done):
                    with torch.no_grad():
                        next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                        _, next_value = self.policy_net(next_state_tensor)
                        done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                        next_value = next_value.squeeze() * (1 - done_tensor)

                    returns = self.compute_returns(rewards)  # shape: (n_steps, num_envs)
                    returns = returns.transpose(0, 1)  # shape: (num_envs, n_steps)
                    values = torch.stack(values).transpose(0, 1)  # shape: (num_envs, n_steps)
                    log_probs = torch.stack(log_probs).transpose(0, 1)  # shape: (num_envs, n_steps)
                    advantages = returns - values

                    # calculate sum instead of mean
                    actor_loss = - (log_probs * advantages.detach()).sum()
                    critic_loss = self.loss(returns, values)

                    loss = actor_loss + critic_loss
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    values = []
                    rewards = []
                    log_probs = []

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

            if episode % 20 == 0:
               print('Episode {}\tlengths: {}\treward: {}]\tfull length: {}'.format(episode, done_steps, episode_reward, steps))

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

        self.env.close()
        return np.array(episode_rewards), np.array(episode_steps)


In [12]:
import gymnasium as gym

env_id = 'LunarLander-v3'
num_episodes = 1000
max_steps = 1000
lr = 0.02


a2c_model_ll =  A2CAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = 8)

rewards, steps = a2c_model_ll.train()



  0%|          | 1/1000 [00:01<27:29,  1.65s/it]

Episode 0	lengths: [115. 112. 192.  92. 118.  60. 123.  84.]	reward: [-245.34536021 -191.66144945 -500.03720977  -11.33437027 -178.22374885
    9.63657237 -559.72998484   96.26161105]]	full length


  2%|▏         | 21/1000 [00:23<07:55,  2.06it/s]

Episode 20	lengths: [101. 167.  94. 102.  70.  97.  95.  69.]	reward: [-155.8002251  -243.81072087 -426.3456884    24.2093625  -445.11672289
 -281.91675918  -47.03076698 -378.58775796]]	full length


  4%|▍         | 41/1000 [00:42<29:40,  1.86s/it]

Episode 40	lengths: [ 300.  157.  315. 1000.  197.  204.  108. 1000.]	reward: [ 36.25827272  54.9444442  -95.51194537  99.16392461  92.59645224
 -94.05307007  41.41175421  25.86264929]]	full length


  6%|▌         | 61/1000 [01:16<39:03,  2.50s/it]

Episode 60	lengths: [ 683.  106.  127.  116.  412. 1000.  428.  148.]	reward: [131.71595876  84.68805584 105.18670653 106.84486447 118.98577649
 151.13403931 121.03806236 166.19470372]]	full length


  8%|▊         | 81/1000 [02:04<37:54,  2.47s/it]

Episode 80	lengths: [ 137.   94. 1000.   82.   86. 1000. 1000. 1000.]	reward: [ 121.24043136   98.28497141 -123.28026087   46.47805361   92.10324289
 -183.81792048  156.28795757  148.53421755]]	full length


 10%|█         | 101/1000 [02:36<12:27,  1.20it/s]

Episode 100	lengths: [154. 211. 109. 130. 124. 159. 191. 136.]	reward: [  87.26877879   76.60109185    1.07319248  -84.91215836   56.39140083
   65.89283573 -110.52570303  -96.25679812]]	full length


 12%|█▏        | 121/1000 [02:58<17:14,  1.18s/it]

Episode 120	lengths: [262. 135. 333. 392. 188. 306. 349. 119.]	reward: [102.6902332  116.68060071 137.06094067  47.44396511 -76.53314038
 135.41830985 132.38750216 -61.77191887]]	full length


 14%|█▍        | 141/1000 [03:27<25:42,  1.80s/it]

Episode 140	lengths: [ 89.  82. 133. 148. 120. 100. 127.  91.]	reward: [ 85.1159127   82.63594322  79.58670875 110.04740411 -42.28148933
  65.11869822 135.94782051  81.35190039]]	full length


 16%|█▌        | 161/1000 [03:57<13:16,  1.05it/s]

Episode 160	lengths: [279. 266. 146. 265. 185. 288. 274. 244.]	reward: [-678.58758938 -568.01361594 -190.00911658 -486.5083296  -653.58940941
 -723.4504952  -629.01211003 -674.01577993]]	full length


 18%|█▊        | 181/1000 [04:19<15:12,  1.11s/it]

Episode 180	lengths: [199. 323. 267. 241. 148. 246. 225. 164.]	reward: [ -62.72877465   -3.50574871  -89.90072411  -75.10888283 -153.97001547
 -100.95841971   72.44573928  -32.04543514]]	full length


 20%|██        | 201/1000 [04:56<33:41,  2.53s/it]

Episode 200	lengths: [ 386.  155.  151.  316. 1000.  486. 1000.  408.]	reward: [ 127.92530205  -74.14099929  -78.17412575  204.43273283 -142.81996895
  170.62510681 -145.79560213  114.35030478]]	full length


 22%|██▏       | 221/1000 [05:51<35:57,  2.77s/it]

Episode 220	lengths: [ 143. 1000.  190. 1000.  108. 1000.  150.  783.]	reward: [  75.93219279 -105.79053423  101.68530347 -117.26778223   77.13229268
  -82.99989202   91.23330814    1.12782198]]	full length


 24%|██▍       | 241/1000 [06:46<36:08,  2.86s/it]

Episode 240	lengths: [ 383. 1000. 1000. 1000.  242.  601. 1000. 1000.]	reward: [  74.63851506  -89.3659809  -105.53907031 -102.94729505  -55.07631686
  119.19244829    3.40923666 -124.26429833]]	full length


 26%|██▌       | 261/1000 [07:42<33:56,  2.76s/it]

Episode 260	lengths: [ 371.  102.  231. 1000. 1000. 1000.  286. 1000.]	reward: [-133.89641451   99.67162432   78.97629744 -112.28776086 -123.30926123
 -136.94298051  160.27291714 -138.39553816]]	full length


 28%|██▊       | 281/1000 [08:38<33:38,  2.81s/it]

Episode 280	lengths: [ 164.  156. 1000. 1000.  154. 1000.  166. 1000.]	reward: [ 139.76030029  131.06837481  -90.48632439 -120.44240772  -11.98157949
  -66.41738716  109.03068275 -124.38967718]]	full length


 30%|███       | 301/1000 [09:32<29:31,  2.53s/it]

Episode 300	lengths: [ 320.  224. 1000.  168. 1000. 1000. 1000. 1000.]	reward: [  72.33825265   92.75781625 -100.53717755  119.3523404  -119.69584177
  -54.63099816 -101.01475561  -34.0739966 ]]	full length


 32%|███▏      | 321/1000 [10:26<26:19,  2.33s/it]

Episode 320	lengths: [511. 282. 414. 156. 394. 548. 546. 145.]	reward: [178.59451195 109.10239143  31.06058293  94.8979631   10.57932359
 -14.29096039  55.8626344  -38.12067883]]	full length


 34%|███▍      | 341/1000 [11:15<19:46,  1.80s/it]

Episode 340	lengths: [132. 171. 199. 310. 191. 393. 302. 159.]	reward: [-58.5285509   95.03906084  88.16874432  74.88118691 -65.85876407
  64.29318345  41.58052574 127.55689323]]	full length


 36%|███▌      | 361/1000 [11:47<16:40,  1.56s/it]

Episode 360	lengths: [297.  92. 205. 434. 197. 296. 153. 136.]	reward: [ 80.86221921 137.15593677  91.40627644  81.59909336 -51.02244472
  78.06286163  93.93002093 -55.64185208]]	full length


 38%|███▊      | 381/1000 [12:37<29:05,  2.82s/it]

Episode 380	lengths: [205. 825. 196. 268.  92. 650. 164. 904.]	reward: [ 81.34834166  23.0333727  -65.91814556  72.10469494 128.4677137
  70.38575124 116.36489928  51.65949671]]	full length


 40%|████      | 401/1000 [13:26<24:53,  2.49s/it]

Episode 400	lengths: [ 292.  694.  163. 1000. 1000.  754. 1000.  612.]	reward: [  32.77006438   23.14179414 -106.84157152 -158.06209929 -174.40479767
 -249.20073494  100.12105731  -37.91379588]]	full length


 42%|████▏     | 421/1000 [14:14<21:27,  2.22s/it]

Episode 420	lengths: [112. 164. 330. 150. 346.  81. 113. 362.]	reward: [105.80808001 169.41386285 167.40635582 111.84117864 151.39108756
  80.73344567 -42.60983933 162.38802472]]	full length


 44%|████▍     | 441/1000 [15:01<19:03,  2.05s/it]

Episode 440	lengths: [ 74. 207. 342. 207. 513. 158. 234. 205.]	reward: [-59.27552048  95.4300757   63.30034576 118.06193808  88.79323914
 -68.19937428  52.05749778  51.58770277]]	full length


 46%|████▌     | 461/1000 [15:31<11:00,  1.23s/it]

Episode 460	lengths: [339. 250. 311. 269. 212. 261. 228. 240.]	reward: [ 174.460396     43.58129604    6.06236435  -77.55874279 -109.98656502
 -126.78305143 -117.78754448 -163.08289765]]	full length


 48%|████▊     | 481/1000 [16:07<21:29,  2.48s/it]

Episode 480	lengths: [ 184.  244.  832.  438.   71.  157. 1000.  144.]	reward: [  51.13555438   37.15444509 -109.1954511    59.38020324 -108.25502635
   82.95503617 -211.88334161   73.34869258]]	full length


 50%|█████     | 501/1000 [16:35<09:08,  1.10s/it]

Episode 500	lengths: [172. 214.  70. 121. 314.  87. 199. 146.]	reward: [119.78704893 129.39036292 -59.19749181 -52.50968776  97.56239334
 103.46110488  65.6335222   84.09609062]]	full length


 52%|█████▏    | 521/1000 [16:56<08:15,  1.03s/it]

Episode 520	lengths: [ 94.  76. 251. 114. 163.  59. 274. 166.]	reward: [-118.26326498 -127.04786604   -1.3836776   -74.37574693   70.84393706
 -178.54553924  140.99598524   63.65890687]]	full length


 54%|█████▍    | 541/1000 [17:16<08:11,  1.07s/it]

Episode 540	lengths: [174. 468.  74.  91. 133.  84.  78. 100.]	reward: [  58.72060986   89.51618692 -249.25307947 -233.20476807   44.65433467
 -128.67555112 -247.30756508  121.66489266]]	full length


 56%|█████▌    | 561/1000 [17:36<06:34,  1.11it/s]

Episode 560	lengths: [190. 131. 133.  91. 147.  84.  95. 146.]	reward: [ -65.06832145 -105.85827197   71.70869902  -48.75376526   97.37811616
 -282.84069006 -125.180682    114.80074004]]	full length


 58%|█████▊    | 581/1000 [18:17<15:28,  2.22s/it]

Episode 580	lengths: [1000.  343.  336.  825.  105.   83.   95.  152.]	reward: [-119.26616631   98.29182254   66.71973997   38.6068371    63.25992627
  -49.77663382  100.72464711  138.8558806 ]]	full length


 60%|██████    | 601/1000 [19:10<18:37,  2.80s/it]

Episode 600	lengths: [ 238.  150.  883.  383. 1000. 1000. 1000. 1000.]	reward: [-125.7155418  -159.28458166 -103.31086137   55.78530788  -85.40632807
 -148.62462273 -191.14069943 -137.78676338]]	full length


 62%|██████▏   | 621/1000 [20:00<17:23,  2.75s/it]

Episode 620	lengths: [ 237.  225.  158.  100. 1000.  231.  206.  350.]	reward: [  67.37747472   90.45867616  -43.84796599   32.26899408 -122.55685083
   82.11232101  -83.58658458 -236.19173889]]	full length


 64%|██████▍   | 641/1000 [20:49<12:19,  2.06s/it]

Episode 640	lengths: [438. 136.  93. 728. 134. 161. 215. 232.]	reward: [ 125.05805414  -91.83819208  -49.9161833   -30.41033699  -79.8770048
   80.49001576 -128.62000897  116.50498041]]	full length


 66%|██████▌   | 661/1000 [21:21<09:07,  1.61s/it]

Episode 660	lengths: [191.  95.  98. 229. 127. 324. 178. 195.]	reward: [ 130.55773653   74.43907236   54.83478175  -98.45494295 -117.59201963
   17.39386498   58.69168772  -89.21085109]]	full length


 68%|██████▊   | 681/1000 [21:46<07:14,  1.36s/it]

Episode 680	lengths: [ 80. 107. 206. 137. 353. 311. 254. 295.]	reward: [ 44.8232395  -67.00974626  71.83005556 119.15018302 150.72253742
  40.17496413  61.58742817 154.14449279]]	full length


 70%|███████   | 701/1000 [22:15<05:35,  1.12s/it]

Episode 700	lengths: [478. 415. 158. 140.  90.  99. 133. 294.]	reward: [108.42309629 117.61005584 140.55860475 136.5528924   67.34218513
  68.3949471  -49.03626577  80.85962704]]	full length


 72%|███████▏  | 721/1000 [22:47<06:31,  1.40s/it]

Episode 720	lengths: [340. 147.  79. 193.  66.  66.  79.  53.]	reward: [  71.52600322   58.03084623 -209.55964911  141.63479287 -219.79755649
 -151.89928434  -45.56016623 -189.07465092]]	full length


 74%|███████▍  | 741/1000 [23:20<10:46,  2.49s/it]

Episode 740	lengths: [290. 277. 296.  80.  85. 283. 372. 810.]	reward: [ -67.29206288   98.22519476  119.35863638   48.07049934   55.75295514
  125.61177833  -27.20088073 -102.26978281]]	full length


 76%|███████▌  | 761/1000 [24:02<06:42,  1.68s/it]

Episode 760	lengths: [134. 366. 724. 127. 238. 111. 164. 325.]	reward: [116.77773477  88.03754466  13.99733532 -83.16667117  57.60059822
 -35.87212923 115.74310745 174.02564522]]	full length


 78%|███████▊  | 781/1000 [24:29<03:58,  1.09s/it]

Episode 780	lengths: [396. 352. 159. 252. 264. 160. 225. 232.]	reward: [-134.38696067    3.60052413  -84.51360507   38.91071199   19.39984031
   20.85179663   77.16206825  -99.67072692]]	full length


 80%|████████  | 801/1000 [24:51<03:23,  1.02s/it]

Episode 800	lengths: [249. 232. 456. 237. 224. 253. 132. 156.]	reward: [ 69.12882639 205.77474962 108.01877663 -91.29077284 -83.2714081
 -88.28082838 -63.62172644 -90.14823556]]	full length


 82%|████████▏ | 821/1000 [25:16<03:28,  1.16s/it]

Episode 820	lengths: [222. 218. 392. 165. 339. 242. 369. 163.]	reward: [ 74.55110002 112.7158559  140.96138643 168.69438227 -79.87556682
  93.81456088 151.05397347  78.17752628]]	full length


 84%|████████▍ | 841/1000 [25:40<03:16,  1.23s/it]

Episode 840	lengths: [163. 314. 416. 256. 306. 211. 247. 208.]	reward: [ 45.75726139  14.74570441  -7.88837625 110.4376182    0.80634217
 -53.14588687  93.23230597 -74.18582462]]	full length


 86%|████████▌ | 861/1000 [26:03<02:42,  1.17s/it]

Episode 860	lengths: [302. 478. 253. 465. 257. 375. 498. 264.]	reward: [-108.71186244  152.16549746   90.95988728 -217.59617464   69.35195708
   46.76918596  -57.03299706 -124.94465846]]	full length


 88%|████████▊ | 881/1000 [26:31<02:39,  1.34s/it]

Episode 880	lengths: [407. 300. 277. 208. 476. 349. 288. 489.]	reward: [ 107.74613327   80.26340373 -112.73295543  -53.59094314   79.87784966
  170.74108868   90.65675995  150.19819008]]	full length


 90%|█████████ | 901/1000 [26:58<02:07,  1.29s/it]

Episode 900	lengths: [279. 244. 282. 462. 190. 142. 204. 265.]	reward: [  21.3207728    72.92064394 -114.72126275   16.15195808  145.59188791
  -50.25099448  -47.62084701  -70.13688599]]	full length


 92%|█████████▏| 921/1000 [27:23<01:22,  1.05s/it]

Episode 920	lengths: [386.  80. 136. 325. 118. 150. 134.  78.]	reward: [  8.72059442  75.05343801 -70.34026224  94.88434423 134.27390313
 -79.09664676 110.4687373   98.81491069]]	full length


 94%|█████████▍| 941/1000 [27:44<01:02,  1.05s/it]

Episode 940	lengths: [198. 224. 122. 390. 292. 237. 253. 385.]	reward: [-74.84835203  88.30448988 -72.80138677 135.6611101  176.10431465
  91.25914809  90.67378591 127.15762369]]	full length


 96%|█████████▌| 961/1000 [28:06<00:44,  1.15s/it]

Episode 960	lengths: [189. 188. 245. 240. 178. 337. 126. 201.]	reward: [136.20201953  73.94018374  96.70209153  88.408784   -52.37408652
 172.53432012 -43.3308698  -89.94665456]]	full length


 98%|█████████▊| 981/1000 [28:26<00:19,  1.02s/it]

Episode 980	lengths: [145.  80. 214. 241. 188. 378. 382.  85.]	reward: [-43.64471542  35.81622811  86.844281    99.04345521 -62.70261642
 155.28235398  51.19856435 101.08168946]]	full length


100%|██████████| 1000/1000 [28:48<00:00,  1.73s/it]


In [17]:
env_id = 'LunarLander-v3'
num_episodes = 1000
max_steps = 1000
lr = 0.02

a2c_model_ll_async =  A2CAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, num_steps = 8, vectorization_mode="async")

rewards_async, steps_async = a2c_model_ll_async.train()

  0%|          | 1/1000 [00:00<08:47,  1.89it/s]

Episode 0	lengths: [109. 105.  91.  86. 108.  71. 120.  88.]	reward: [-384.05631434 -244.53897027 -475.32608407 -237.94813173 -252.59032813
 -274.03278797 -330.4445024  -269.70100465]]	full length: 120


  2%|▏         | 21/1000 [01:02<55:04,  3.38s/it]

Episode 20	lengths: [ 196. 1000.  796.  218. 1000.  159. 1000. 1000.]	reward: [  82.64851266   36.39450977   -7.41036394  -48.31003876 -148.97778193
  -32.81208858 -198.13951374    6.12669533]]	full length: 1000


  4%|▍         | 41/1000 [01:22<13:59,  1.14it/s]

Episode 40	lengths: [ 86. 230. 109. 159.  87.  93.  92. 143.]	reward: [ -462.59093989 -1577.94356144  -607.13759573 -1055.76156587
  -367.76281493  -493.07035469  -435.92103231  -858.91124158]]	full length: 230


  6%|▌         | 61/1000 [01:40<13:16,  1.18it/s]

Episode 60	lengths: [ 87. 178.  86. 138.  98. 137.  78. 239.]	reward: [ -576.03430351 -1138.8067684   -535.20599488  -854.11062355
  -492.03414504  -989.66993346  -314.19442624 -1851.93755169]]	full length: 239


  8%|▊         | 81/1000 [01:58<14:35,  1.05it/s]

Episode 80	lengths: [ 75. 207. 126.  80. 102.  84.  90. 174.]	reward: [ -414.16012077 -1557.14632454  -834.5660788   -300.10120843
  -574.44005449  -500.04567563  -519.98510693  -909.04609315]]	full length: 207


 10%|█         | 101/1000 [02:13<10:06,  1.48it/s]

Episode 100	lengths: [177.  90.  90. 127. 100.  97.  95.  91.]	reward: [-1125.02442561  -500.26380887  -505.99300807  -690.90158127
  -484.1613093   -577.31650747  -530.5431576   -457.41590699]]	full length: 177


 12%|█▏        | 121/1000 [02:30<11:30,  1.27it/s]

Episode 120	lengths: [126.  79. 110.  78. 144. 126.  74. 165.]	reward: [-635.34711291 -358.60807026 -337.11017892 -393.39840076 -923.98736053
 -503.23676003 -382.96365575 -931.78235867]]	full length: 165


 14%|█▍        | 141/1000 [02:49<12:14,  1.17it/s]

Episode 140	lengths: [181. 241. 113. 105. 143. 138. 110.  75.]	reward: [-1004.74683975 -2002.88165614  -625.36933763  -604.30711679
  -767.86910851  -811.07509087  -674.91567238  -459.55873753]]	full length: 241


 16%|█▌        | 161/1000 [03:06<10:01,  1.40it/s]

Episode 160	lengths: [118. 101. 101. 106.  85. 146.  98.  82.]	reward: [-573.8544654  -625.44755596 -538.28693969 -540.80240612 -400.56939056
 -638.58159372 -365.28652018 -421.45363421]]	full length: 146


 18%|█▊        | 181/1000 [03:25<12:12,  1.12it/s]

Episode 180	lengths: [ 88. 194. 175. 118.  76.  75.  83. 218.]	reward: [ -537.91701816 -1289.54441715 -1335.90524056  -835.37844455
  -291.01067031  -388.99840621  -358.12932771 -1506.09880187]]	full length: 218


 20%|██        | 201/1000 [03:40<11:59,  1.11it/s]

Episode 200	lengths: [112.  78.  90. 163.  79.  93. 101.  83.]	reward: [-344.10018878 -516.75436063 -451.47057422 -812.47074513 -365.58444936
 -474.98576271 -509.10252341 -456.41762508]]	full length: 163


 22%|██▏       | 221/1000 [03:57<10:44,  1.21it/s]

Episode 220	lengths: [100. 130.  83.  85. 168.  78.  78. 104.]	reward: [-438.52111468 -726.46274799 -414.34656165 -362.20719961 -677.58437644
 -475.01365097 -272.50379489 -646.89626347]]	full length: 168


 24%|██▍       | 241/1000 [04:16<12:51,  1.02s/it]

Episode 240	lengths: [179.  92. 110. 329.  76. 182.  76.  83.]	reward: [-1357.48252832  -578.33687813  -550.062748   -2864.78439036
  -455.0147292  -1360.55624176  -345.55442762  -463.88710304]]	full length: 329


 26%|██▌       | 261/1000 [04:37<14:48,  1.20s/it]

Episode 260	lengths: [ 86. 104. 179. 195. 113. 102.  76. 136.]	reward: [-114.05910248 -206.37115741 -232.75354921 -395.45025727 -469.96378992
 -530.61670258 -311.16094141 -459.83693494]]	full length: 195


 28%|██▊       | 281/1000 [05:46<46:13,  3.86s/it]

Episode 280	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [-103.80873131 -125.93983482 -131.77512129 -108.96766129  -71.94045862
 -147.0252354   -82.21261674  -75.41661402]]	full length: 1000


 30%|███       | 301/1000 [07:07<50:06,  4.30s/it]

Episode 300	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [-112.72238691 -114.08257973 -125.7871353  -118.66178169 -108.21215252
 -148.91787382 -130.92372483 -141.95922252]]	full length: 1000


 32%|███▏      | 321/1000 [08:26<44:19,  3.92s/it]

Episode 320	lengths: [1000.  404.  163. 1000. 1000. 1000. 1000. 1000.]	reward: [-105.91387056   -9.69604308    5.24867955 -131.25136476  -80.84445754
 -132.11230226 -117.31658717 -105.56074613]]	full length: 1000


 34%|███▍      | 341/1000 [09:44<42:30,  3.87s/it]

Episode 340	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [ -98.80234954  -81.13904957 -101.5375782   -98.80752704  -95.41806789
 -131.1010752  -101.66802325 -106.36929191]]	full length: 1000


 36%|███▌      | 361/1000 [11:03<41:58,  3.94s/it]

Episode 360	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [-125.37832482 -124.32265362 -107.75263136  -96.57335201  -51.60568299
 -134.24102012 -150.19918119 -104.27719949]]	full length: 1000


 38%|███▊      | 381/1000 [12:19<39:12,  3.80s/it]

Episode 380	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [ -97.8834318  -110.0607733   -64.04976144  -97.31670689 -127.71550722
 -120.15630361 -102.01230769 -128.11051256]]	full length: 1000


 40%|████      | 401/1000 [13:35<37:48,  3.79s/it]

Episode 400	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [ -77.33219303 -100.761501   -121.24803773 -106.59239453  -99.50934654
  -73.45034814 -102.34248038  -90.80377348]]	full length: 1000


 42%|████▏     | 421/1000 [14:54<38:05,  3.95s/it]

Episode 420	lengths: [1000. 1000. 1000. 1000. 1000.  246. 1000.  172.]	reward: [-125.6751428  -136.52164014 -121.60852811 -114.61834222  -97.38230747
  -10.86311355 -145.75412635   -2.34751171]]	full length: 1000


 44%|████▍     | 441/1000 [16:10<36:09,  3.88s/it]

Episode 440	lengths: [1000. 1000. 1000. 1000.  709. 1000. 1000. 1000.]	reward: [-124.38287823 -131.25591348 -109.3124047   -85.10600116  -70.89103439
 -111.50592293 -146.91368623 -124.04390348]]	full length: 1000


 46%|████▌     | 461/1000 [17:27<34:20,  3.82s/it]

Episode 460	lengths: [1000.  832. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [ -85.03088912 -106.05297368 -121.96273146 -162.7281477  -116.61578999
  -87.07216886 -106.35855505  -92.22483346]]	full length: 1000


 48%|████▊     | 481/1000 [18:44<32:41,  3.78s/it]

Episode 480	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [-127.50867886 -119.49754911 -142.94080331 -106.05784875 -119.56905753
 -130.72519292 -140.35778265 -117.3400798 ]]	full length: 1000


 50%|█████     | 501/1000 [20:02<32:28,  3.90s/it]

Episode 500	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [-148.33848733 -106.13944292 -109.69188957 -126.4379701  -117.0449929
 -150.94980392 -142.39900136 -110.12102534]]	full length: 1000


 52%|█████▏    | 521/1000 [21:20<31:13,  3.91s/it]

Episode 520	lengths: [1000.  455.  784.  267. 1000.  397. 1000. 1000.]	reward: [-167.80537161  -76.54411736 -138.91259654  -44.74956744  -83.85821258
  -32.52678368 -153.31779996 -103.92165315]]	full length: 1000


 54%|█████▍    | 541/1000 [22:37<29:16,  3.83s/it]

Episode 540	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [ -98.4315443  -146.8360373  -138.73919074 -149.92435096 -133.57675605
  -99.53869552 -158.77597429 -140.40570733]]	full length: 1000


 56%|█████▌    | 561/1000 [23:55<27:44,  3.79s/it]

Episode 560	lengths: [ 263. 1000.  698. 1000. 1000. 1000. 1000. 1000.]	reward: [  -0.82392627 -118.89219745  -72.67126177 -122.77243524 -108.19115002
 -139.83415468 -147.12865315 -155.947741  ]]	full length: 1000


 58%|█████▊    | 581/1000 [25:14<28:08,  4.03s/it]

Episode 580	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [-132.52571491 -109.50593155 -123.95195609 -103.74374809 -129.33971572
 -118.30662941 -156.08269344  -98.86080416]]	full length: 1000


 60%|██████    | 601/1000 [26:32<25:50,  3.89s/it]

Episode 600	lengths: [1000.  897. 1000. 1000. 1000.  462. 1000. 1000.]	reward: [-127.51490491 -151.38590548 -180.0868826  -143.71513533 -148.83517231
  -44.64217862 -130.69619177 -131.8432758 ]]	full length: 1000


 62%|██████▏   | 621/1000 [27:51<24:28,  3.88s/it]

Episode 620	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [-115.81190946 -149.64022602 -141.37488358 -117.11607531 -188.56024124
 -161.33211079 -159.96060476 -167.20362474]]	full length: 1000


 64%|██████▍   | 641/1000 [29:09<23:40,  3.96s/it]

Episode 640	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000.  619.]	reward: [-107.42119823  -99.95123181 -162.60327712 -139.72996456 -107.23691177
 -101.14508185 -107.57401465  -90.39787444]]	full length: 1000


 66%|██████▌   | 661/1000 [30:27<22:06,  3.91s/it]

Episode 660	lengths: [1000. 1000.  151.  240.  824. 1000.  190.  914.]	reward: [ -94.22994952  -73.03564355   22.83749759  -36.87790023 -119.15365934
 -134.98197188  -30.29187938 -129.9644914 ]]	full length: 1000


 68%|██████▊   | 681/1000 [31:45<20:09,  3.79s/it]

Episode 680	lengths: [1000. 1000. 1000. 1000. 1000.  373.  557.  144.]	reward: [-124.9076301  -167.42043942 -125.90526925 -109.11310838 -108.56820878
  -53.82918903  -66.48287194  -13.96959938]]	full length: 1000


 70%|███████   | 701/1000 [33:03<19:46,  3.97s/it]

Episode 700	lengths: [1000. 1000.  923. 1000.  217.  300.  387. 1000.]	reward: [-134.31325276 -129.86751352 -133.21185951  -83.52078322   -3.19075195
  -50.81849813  -32.9090075  -113.28168364]]	full length: 1000


 72%|███████▏  | 721/1000 [34:20<18:01,  3.87s/it]

Episode 720	lengths: [1000. 1000. 1000.  208.  260. 1000. 1000.  322.]	reward: [-127.92133274 -107.77843997 -101.12177427   -1.55886637  -30.67612279
 -134.43234559 -134.66049343  -49.39066112]]	full length: 1000


 74%|███████▍  | 741/1000 [35:35<16:18,  3.78s/it]

Episode 740	lengths: [1000.  861. 1000. 1000.  588.  899.  232. 1000.]	reward: [-112.83056745 -121.22834973 -135.83444766  -83.30015513  -90.43588805
 -147.94335319  -16.25280227  -93.70333007]]	full length: 1000


 76%|███████▌  | 761/1000 [36:50<14:51,  3.73s/it]

Episode 760	lengths: [1000. 1000.  914. 1000. 1000. 1000.  746. 1000.]	reward: [-108.02040786 -117.08126796 -157.13048066  -88.40712798 -144.6296387
 -155.2548229  -122.04684502 -121.49398714]]	full length: 1000


 78%|███████▊  | 781/1000 [38:06<13:43,  3.76s/it]

Episode 780	lengths: [1000. 1000. 1000.  176. 1000. 1000. 1000.  152.]	reward: [-131.87365898 -163.36005007 -172.79696206   -9.9238602  -156.5885749
 -125.57787431 -133.05830114  -13.44881689]]	full length: 1000


 80%|████████  | 801/1000 [39:23<12:52,  3.88s/it]

Episode 800	lengths: [1000.  431. 1000. 1000. 1000. 1000.  405. 1000.]	reward: [-117.25783704  -43.10737909 -139.96676214 -150.78353279 -125.34889947
  -99.9548083   -33.0919144  -107.07932847]]	full length: 1000


 82%|████████▏ | 821/1000 [40:41<11:37,  3.90s/it]

Episode 820	lengths: [1000. 1000. 1000.  240. 1000. 1000. 1000.  985.]	reward: [ -73.50978099  -85.09655674 -125.84396878  -29.81979249 -108.54179216
 -132.27902463 -132.89961339 -130.8115245 ]]	full length: 1000


 84%|████████▍ | 841/1000 [41:59<10:10,  3.84s/it]

Episode 840	lengths: [ 311. 1000.  137.  121.  239. 1000.  953. 1000.]	reward: [ -15.60466556  -46.11310831   -4.71022413  -22.39613671  -18.46034174
 -163.84335683 -171.23959823 -131.36839132]]	full length: 1000


 86%|████████▌ | 861/1000 [43:15<08:37,  3.72s/it]

Episode 860	lengths: [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]	reward: [-138.68242825 -104.80301659 -105.61814214  -68.11996698  -79.27334072
 -121.85651721  -95.34440449 -134.90174182]]	full length: 1000


 88%|████████▊ | 881/1000 [44:32<07:50,  3.96s/it]

Episode 880	lengths: [ 333. 1000. 1000. 1000.  983.  765.  468. 1000.]	reward: [ -44.04476501 -158.58224446 -142.85835747 -185.98124971 -175.0697975
 -148.76592478  -95.55764368 -194.73055143]]	full length: 1000


 90%|█████████ | 901/1000 [45:50<06:27,  3.91s/it]

Episode 900	lengths: [ 417.  302. 1000. 1000.  836. 1000.  766. 1000.]	reward: [ -52.83557328  -25.79769614 -157.16363939 -143.26356869  -99.86224954
 -154.22643713  -92.88858716 -138.47737442]]	full length: 1000


 92%|█████████▏| 921/1000 [47:08<05:00,  3.80s/it]

Episode 920	lengths: [ 854.  215. 1000.  259.  603. 1000. 1000. 1000.]	reward: [ -96.10029362   -6.36942193  -94.73783624   -5.98232015  -81.00102342
 -103.70856738 -124.44304167 -139.39481338]]	full length: 1000


 94%|█████████▍| 941/1000 [48:26<03:50,  3.91s/it]

Episode 940	lengths: [ 597. 1000. 1000.  705.  161.  217.  228.  464.]	reward: [ -96.8805591  -194.48420117 -148.10633049 -122.27800902  -51.17079851
  -30.85328686   -4.36406524  -84.33573635]]	full length: 1000


 96%|█████████▌| 961/1000 [49:40<02:28,  3.80s/it]

Episode 960	lengths: [ 155.  363. 1000.  468.  148.  355.  116.  451.]	reward: [ -21.17706031  -37.88632335 -125.1789049   -69.86483816  -27.39609436
  -88.35731558  -48.01772539  -36.389359  ]]	full length: 1000


 98%|█████████▊| 981/1000 [50:58<01:17,  4.07s/it]

Episode 980	lengths: [ 322.  219.  565. 1000.  831. 1000.  728. 1000.]	reward: [ -55.8004386   -20.76729692  -68.17674496 -125.28326551 -116.31863609
  -99.77122249 -108.74760863  -98.86020141]]	full length: 1000


100%|██████████| 1000/1000 [52:13<00:00,  3.13s/it]


In [21]:
import gymnasium as gym
import torch
import numpy as np
from gymnasium.wrappers import RecordVideo
import os

# Create folder to save the video
video_folder = "./video"
os.makedirs(video_folder, exist_ok=True)

# Wrap the environment with RecordVideo
env = gym.make('LunarLander-v3', render_mode='rgb_array')
env = RecordVideo(env, video_folder=video_folder, episode_trigger=lambda e: True)

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_ll.policy_net(state)
            # action_dist = torch.distributions.Categorical(action_probs)
            # action = action_dist.sample().item()
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: -52.894699397403144
Episode 2 Reward: -27.627745976441886
Episode 3 Reward: -236.95609091731924
Episode 4 Reward: 135.61399384725155
Episode 5 Reward: -66.2182336378679
Episode 6 Reward: 19.699363217334394
Episode 7 Reward: 26.81668761972692
Episode 8 Reward: -15.518179226312213
Episode 9 Reward: 13.410176329457428
Episode 10 Reward: -11.25524196136928
Average Reward over 10 episodes: -21.49299701029434


In [22]:
import gymnasium as gym
import torch
import numpy as np
from gymnasium.wrappers import RecordVideo
import os

# Create folder to save the video
video_folder = "./video"
os.makedirs(video_folder, exist_ok=True)

# Wrap the environment with RecordVideo
env = gym.make('LunarLander-v3', render_mode='rgb_array')
env = RecordVideo(env, video_folder=video_folder, name_prefix="async_demo", episode_trigger=lambda e: True)

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = a2c_model_ll_async.policy_net(state)
            # action_dist = torch.distributions.Categorical(action_probs)
            # action = action_dist.sample().item()
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: -113.4247666969921
Episode 2 Reward: -136.1491438008048
Episode 3 Reward: -113.35021589916234
Episode 4 Reward: -108.9208804584625
Episode 5 Reward: -88.38824172533324
Episode 6 Reward: -127.98817557744319
Episode 7 Reward: -129.28597991379633
Episode 8 Reward: -117.62409761534742
Episode 9 Reward: -104.91873832786548
Episode 10 Reward: -92.15144943763181
Average Reward over 10 episodes: -113.2201689452839
