In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools
!pip install swig
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379367 sha256=02a8c98464434f527806005df839f6072c62b5b45505f25cc8776a42132aec23
  Stored in directory: /root/.cache/pip/wheels/ab

In [2]:
!pip uninstall -y box2d-py
!pip install box2d pygame swig
!pip install "gymnasium[box2d]" --no-deps

Found existing installation: box2d-py 2.3.5
Uninstalling box2d-py-2.3.5:
  Successfully uninstalled box2d-py-2.3.5
Collecting box2d
  Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (573 bytes)
Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: box2d
Successfully installed box2d-2.3.10


In [101]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCriticPPO(nn.Module):

    def __init__(self, input_dim, output_dim, hidden_dims=(64, 64)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[1], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value


In [105]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
import numpy as np
import copy

class PPOAgent:
    def __init__(self, env_id, num_episodes=1000, max_steps=500, epsilon=float('inf'), gamma=0.99, lambda_GAE=1, lr=1e-3, num_steps=0, num_envs=8, vectorization_mode = "sync", seed=123):
        # using vectorized environments to boost training speed
        self.env = gym.make_vec(env_id, num_envs=num_envs, vectorization_mode=vectorization_mode)
        self.num_envs = num_envs
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.epsilon = epsilon
        self.gamma = gamma
        self.lambda_GAE = lambda_GAE
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCriticPPO(self.env.single_observation_space.shape[0], self.env.single_action_space.n).to(self.device)
        self.old_policy_net = copy.deepcopy(self.policy_net)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        # added scheduler after observing divergence after getting close to solving
        self.scheduler = StepLR(self.optimizer, step_size=100, gamma=0.9)
        self.loss = nn.MSELoss()
        self.seed = seed

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.old_policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing GAE
    def compute_returns(self, rewards, values, next_value):
        rewards = torch.stack(rewards)
        values = torch.cat([values, next_value.unsqueeze(0)], dim=0)

        T, N = rewards.shape
        advantages = torch.zeros_like(rewards)
        gae = torch.zeros(N, device=rewards.device)
        for t in reversed(range(T)):
            # temporal difference error
            td = rewards[t] + self.gamma * values[t + 1] - values[t]
            # higher labmda -> more sampling, lower lambda -> more bootstrapping
            gae = td + self.gamma * self.lambda_GAE * gae
            advantages[t] = gae

        # compute returns by adding value to advantage
        returns = advantages + values[:-1]
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # normalize advantage across timesteps and environments
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        return returns, advantages

    # computing the gamma decaying rewards
    # def compute_returns(self, rewards, values, next_value):
    #     """
    #     Args:
    #         rewards: torch.Tensor of shape [T, N] where
    #                 T = rollout steps, N = num_envs
    #     Returns:
    #         returns: torch.Tensor of shape [T, N], normalized
    #     """
    #     rewards = torch.stack(rewards)

    #     T, N = rewards.shape
    #     returns = torch.zeros_like(rewards)
    #     R = torch.zeros(N, device=rewards.device)
    #     for t in reversed(range(T)):
    #         R = rewards[t] + self.gamma * R
    #         returns[t] = R

    #     # Normalize returns across all timesteps and environments
    #     returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    #     advantages = returns - values

    #     return returns, advantages


    def train(self):
        episode_rewards = []
        episode_steps = []

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset(seed=self.seed)
            done = np.zeros(self.num_envs, dtype=bool)
            episode_reward = np.zeros(self.num_envs)
            values, rewards, log_probs, old_log_probs, old_values = [], [], [], [], []
            done_mask = np.zeros(self.num_envs, dtype=bool)
            done_steps = np.zeros(self.num_envs)
            steps = 0

            while not np.all(done_mask) and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
                action_probs, value = self.policy_net(state_tensor)
                action_dist = torch.distributions.Categorical(action_probs)
                action = action_dist.sample()
                log_prob = action_dist.log_prob(action)

                with torch.no_grad():
                    old_action_probs, old_value = self.old_policy_net(state_tensor)
                    old_action_dist = torch.distributions.Categorical(old_action_probs)
                    old_log_prob = old_action_dist.log_prob(action)

                next_state, reward, terminated, truncated, _ = self.env.step(action.cpu().numpy())
                done = np.logical_or(terminated, truncated)
                done_steps = np.where(np.logical_and(done, ~done_mask), steps, done_steps)
                done_mask = np.logical_or(done_mask, done)
                # record when each environment is done
                reward = np.where(done_mask, 0.0, reward)

                # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                values.append(value.squeeze())
                rewards.append(torch.tensor(reward, dtype=torch.float32).to(self.device))  # shape: (num_envs,)
                log_probs.append(log_prob)
                old_log_probs.append(old_log_prob)
                old_values.append(old_value.squeeze())

                episode_reward += reward
                state = next_state


                # finish full trajectory, then update
                if self.num_steps == 0:
                    if np.any(done):
                        with torch.no_grad():
                            next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                            _, next_value = self.policy_net(next_state_tensor)
                            done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                            next_value = next_value.squeeze() * (1 - done_tensor)

                        values = torch.stack(values)  # shape: (n_steps, num_envs)
                        returns, advantages = self.compute_returns(rewards, values, next_value)  # shape: (n_steps, num_envs)
                        returns, values, advantages = returns.transpose(0, 1), values.transpose(0, 1), advantages.transpose(0, 1)  # shape: (num_envs, n_steps)
                        log_probs, old_log_probs = torch.stack(log_probs).transpose(0, 1), torch.stack(old_log_probs).transpose(0, 1)  # shape: (num_envs, n_steps)
                        prob_ratio = torch.exp(log_probs - old_log_probs)

                        surrogate = torch.min(prob_ratio * advantages, prob_ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * advantages)
                        actor_loss = -surrogate.mean()
                        critic_loss = self.loss(returns, values)

                        loss = actor_loss + critic_loss
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()
                        # self.scheduler.step()

                        values = []
                        rewards = []
                        log_probs = []
                        old_log_probs = []

                # every n steps for each environment, calculate losses, update the actor & critic, then refresh the saved lists
                else:
                    if (steps % self.num_steps == 0) or np.any(done):
                        with torch.no_grad():
                            next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                            _, next_value = self.policy_net(next_state_tensor)
                            done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                            next_value = next_value.squeeze() * (1 - done_tensor)

                        values = torch.stack(values)  # shape: (n_steps, num_envs)
                        old_values = torch.stack(old_values).transpose(0, 1)
                        returns, advantages = self.compute_returns(rewards, values, next_value)  # shape: (n_steps, num_envs)
                        returns, values, advantages = returns.transpose(0, 1), values.transpose(0, 1), advantages.transpose(0, 1)  # shape: (num_envs, n_steps)
                        log_probs, old_log_probs = torch.stack(log_probs).transpose(0, 1), torch.stack(old_log_probs).transpose(0, 1)  # shape: (num_envs, n_steps)
                        prob_ratio = torch.exp(log_probs - old_log_probs.detach())

                        clipped = (prob_ratio > 1 + self.epsilon) | (prob_ratio < 1 - self.epsilon)
                        clip_fraction = clipped.float().mean().item()
                        # print(f"Clipped fraction: {clip_fraction:.2%}")

                        surrogate = torch.min(prob_ratio * advantages, prob_ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * advantages)
                        actor_loss = -surrogate.mean()

                        # clipped_values = torch.clamp(values, old_values.detach() * (1 - self.epsilon), old_values.detach() * (1 + self.epsilon))
                        # critic_loss = min(self.loss(returns, values), self.loss(returns, clipped_values))
                        critic_loss = self.loss(returns, values)

                        loss = actor_loss + 0.5 * critic_loss
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()
                        # self.scheduler.step()
                        # if episode % 10 == 0:
                        #     print("VALUES", values, "\n", "RETURNS", returns)

                        values, rewards, log_probs, old_log_probs, old_values = [], [], [], [], []


            self.old_policy_net = copy.deepcopy(self.policy_net)

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

            if episode % 20 == 0:
               print('Episode {}\tlengths: {}\treward: {}]\tfull length: {}'.format(episode, done_steps, episode_reward, steps))
            if episode % 10 == 0:
                print(f"\n[Episode {episode}]")
                print(f"Reward (mean): {np.mean(episode_reward):.2f}")
                print(f"Actor Loss: {actor_loss.item():.4f} | Critic Loss: {critic_loss.item():.4f}")
                print(f"Prob Ratio - mean: {prob_ratio.mean().item():.4f}, max: {prob_ratio.max().item():.4f}, min: {prob_ratio.min().item():.4f}")

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

        self.env.close()
        return np.array(episode_rewards), np.array(episode_steps)


In [None]:
import gymnasium as gym

env_id = 'LunarLander-v3'
num_episodes = 500
max_steps = 500
lr = 1e-4


ppo_model =  PPOAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, epsilon=0.2, num_envs=8, num_steps=8)

rewards, steps = ppo_model.train()



  0%|          | 1/500 [00:00<04:14,  1.96it/s]

Episode 0	lengths: [ 64.  69. 109.  97.  89. 100.  90.  72.]	reward: [-167.85368813 -112.30575797  -19.21825992 -111.71412477 -243.36392964
  -20.24118961  -58.12143199  -61.3414641 ]]	full length: 109

[Episode 0]
Reward (mean): -99.27
Actor Loss: -0.0002 | Critic Loss: 0.9986
Prob Ratio - mean: 1.0005, max: 1.0094, min: 0.9910


  2%|▏         | 11/500 [00:06<05:23,  1.51it/s]


[Episode 10]
Reward (mean): -31.01
Actor Loss: -0.0012 | Critic Loss: 1.0522
Prob Ratio - mean: 0.9998, max: 1.0099, min: 0.9934


  4%|▍         | 21/500 [00:11<04:00,  1.99it/s]

Episode 20	lengths: [ 75.  71. 115.  92.  92. 101. 105.  71.]	reward: [-109.91888568   12.54096396  -54.51765037  -47.23984847 -252.27680638
 -152.98533693  -80.40698152   16.08809575]]	full length: 115

[Episode 20]
Reward (mean): -83.59
Actor Loss: 0.0020 | Critic Loss: 1.0547
Prob Ratio - mean: 1.0009, max: 1.0055, min: 0.9932


  6%|▌         | 31/500 [00:19<05:18,  1.47it/s]


[Episode 30]
Reward (mean): -58.89
Actor Loss: 0.0003 | Critic Loss: 1.1895
Prob Ratio - mean: 1.0003, max: 1.0107, min: 0.9826


  8%|▊         | 41/500 [00:25<05:14,  1.46it/s]

Episode 40	lengths: [ 81.  83. 129. 112. 103. 199.  89.  76.]	reward: [  16.27629173  -10.49972822   30.88871917 -104.60947435  -53.08857004
   89.4517117  -245.49828245   14.37917108]]	full length: 199

[Episode 40]
Reward (mean): -32.84
Actor Loss: -0.0061 | Critic Loss: 0.9782
Prob Ratio - mean: 1.0056, max: 1.0297, min: 0.9711


 10%|█         | 51/500 [00:31<05:12,  1.44it/s]


[Episode 50]
Reward (mean): -89.24
Actor Loss: -0.0008 | Critic Loss: 1.0184
Prob Ratio - mean: 0.9992, max: 1.0168, min: 0.9798


 12%|█▏        | 61/500 [00:38<05:14,  1.40it/s]

Episode 60	lengths: [ 80.  90. 180. 113. 111. 143.  90.  91.]	reward: [ -57.93899207  -83.64925274 -208.93097588 -100.12167251  -61.70605881
   10.25081273 -199.21157365   87.21567256]]	full length: 180

[Episode 60]
Reward (mean): -76.76
Actor Loss: -0.0028 | Critic Loss: 1.3052
Prob Ratio - mean: 1.0017, max: 1.0204, min: 0.9743


 14%|█▍        | 71/500 [00:46<05:57,  1.20it/s]


[Episode 70]
Reward (mean): -59.71
Actor Loss: -0.0005 | Critic Loss: 0.8676
Prob Ratio - mean: 0.9992, max: 1.0209, min: 0.9703


 16%|█▌        | 81/500 [00:55<06:40,  1.05it/s]

Episode 80	lengths: [108. 116. 179. 146. 123. 143. 174.  94.]	reward: [   9.11136123 -197.51816442 -276.41080272 -138.28411376  -75.71150763
 -182.67862781   48.22398545   50.81839938]]	full length: 179

[Episode 80]
Reward (mean): -95.31
Actor Loss: 0.0019 | Critic Loss: 0.9845
Prob Ratio - mean: 0.9962, max: 1.0310, min: 0.9628


 18%|█▊        | 91/500 [01:08<10:28,  1.54s/it]


[Episode 90]
Reward (mean): -57.75
Actor Loss: 0.0033 | Critic Loss: 0.9562
Prob Ratio - mean: 0.9981, max: 1.0400, min: 0.9729


 20%|██        | 101/500 [01:17<06:44,  1.02s/it]

Episode 100	lengths: [136. 116. 251. 146. 183. 189. 139. 192.]	reward: [  62.20572694   54.77111974  -57.42712938   18.07899594 -172.33669923
 -182.96143868  -82.34637169   39.85784717]]	full length: 251

[Episode 100]
Reward (mean): -40.02
Actor Loss: 0.0095 | Critic Loss: 1.0399
Prob Ratio - mean: 0.9986, max: 1.0474, min: 0.9650


 22%|██▏       | 111/500 [01:28<07:21,  1.13s/it]


[Episode 110]
Reward (mean): -74.00
Actor Loss: 0.0016 | Critic Loss: 0.8418
Prob Ratio - mean: 1.0008, max: 1.0192, min: 0.9753


 24%|██▍       | 121/500 [01:39<06:15,  1.01it/s]

Episode 120	lengths: [110. 161. 227. 124. 128. 223. 143. 138.]	reward: [  67.19934334  -12.76516794   23.70566946  -86.00856604   38.46255286
 -140.65763118   80.8899943    50.7060729 ]]	full length: 227

[Episode 120]
Reward (mean): 2.69
Actor Loss: 0.0006 | Critic Loss: 0.8874
Prob Ratio - mean: 1.0000, max: 1.0346, min: 0.9618


 26%|██▌       | 131/500 [01:55<09:47,  1.59s/it]


[Episode 130]
Reward (mean): -37.10
Actor Loss: 0.0010 | Critic Loss: 0.8136
Prob Ratio - mean: 1.0032, max: 1.0385, min: 0.9645


 28%|██▊       | 141/500 [02:11<11:07,  1.86s/it]

Episode 140	lengths: [225.   0. 135. 144. 188. 285. 214. 151.]	reward: [  99.80237338   74.75750321   18.47925088   -4.24190299  -98.6377475
   59.98680456 -217.99700208   54.08802943]]	full length: 500

[Episode 140]
Reward (mean): -1.72
Actor Loss: -0.0144 | Critic Loss: 0.9905
Prob Ratio - mean: 1.0103, max: 1.1321, min: 0.9231


 30%|███       | 151/500 [02:26<09:51,  1.70s/it]


[Episode 150]
Reward (mean): -33.07
Actor Loss: 0.0000 | Critic Loss: 0.9571
Prob Ratio - mean: 0.9983, max: 1.0475, min: 0.9620


 32%|███▏      | 161/500 [02:44<11:19,  2.00s/it]

Episode 160	lengths: [  0. 111. 157. 104. 149. 204. 175.   0.]	reward: [120.68614485  98.92749685 -84.72711435  56.66343257  19.39970616
 -18.44608254 170.88567528  96.76168424]]	full length: 500

[Episode 160]
Reward (mean): 57.52
Actor Loss: -0.0055 | Critic Loss: 1.0599
Prob Ratio - mean: 0.9964, max: 1.1097, min: 0.9017


 34%|███▍      | 171/500 [03:00<09:41,  1.77s/it]


[Episode 170]
Reward (mean): 45.20
Actor Loss: -0.0096 | Critic Loss: 0.7833
Prob Ratio - mean: 1.0089, max: 1.1395, min: 0.9250


 36%|███▌      | 181/500 [03:16<07:10,  1.35s/it]

Episode 180	lengths: [246. 142. 192. 141. 250. 262. 155. 232.]	reward: [  26.6531723   144.5542992    38.13452866  106.44026156   32.93601274
  -13.74532273  -90.47717048 -116.87950372]]	full length: 262

[Episode 180]
Reward (mean): 15.95
Actor Loss: -0.0014 | Critic Loss: 0.6927
Prob Ratio - mean: 0.9968, max: 1.0497, min: 0.9564


 38%|███▊      | 191/500 [03:31<07:43,  1.50s/it]


[Episode 190]
Reward (mean): -16.78
Actor Loss: 0.0022 | Critic Loss: 0.5561
Prob Ratio - mean: 1.0011, max: 1.0265, min: 0.9457


 40%|████      | 201/500 [03:49<08:01,  1.61s/it]

Episode 200	lengths: [182. 184. 229. 188. 150. 248. 110. 189.]	reward: [  52.58419708   37.06604105  -87.23396144   99.29518335   15.20711999
  -80.66591298   82.97095141 -107.08406302]]	full length: 248

[Episode 200]
Reward (mean): 1.52
Actor Loss: 0.0021 | Critic Loss: 0.6181
Prob Ratio - mean: 1.0074, max: 1.0893, min: 0.8762


 42%|████▏     | 211/500 [04:07<07:53,  1.64s/it]


[Episode 210]
Reward (mean): -7.45
Actor Loss: 0.0089 | Critic Loss: 1.8169
Prob Ratio - mean: 1.0111, max: 1.0407, min: 0.9684


 44%|████▍     | 221/500 [04:25<07:41,  1.66s/it]

Episode 220	lengths: [175. 152. 304. 169. 261. 360. 151. 199.]	reward: [  89.86735959  124.69108282 -107.88521081  109.48733763   68.84122199
   81.288607    -76.74526567   72.12195637]]	full length: 360

[Episode 220]
Reward (mean): 45.21
Actor Loss: 0.0072 | Critic Loss: 0.8084
Prob Ratio - mean: 1.0041, max: 1.0471, min: 0.9033


 46%|████▌     | 231/500 [04:44<08:23,  1.87s/it]


[Episode 230]
Reward (mean): 67.39
Actor Loss: -0.0029 | Critic Loss: 0.9765
Prob Ratio - mean: 1.0066, max: 1.1026, min: 0.9265


 48%|████▊     | 241/500 [05:04<07:56,  1.84s/it]

Episode 240	lengths: [196. 175. 240. 237. 184. 227. 167. 255.]	reward: [ 71.25428892 110.16454257 -96.5560944  -91.46226634  63.01679317
 -86.05406221 -72.80882115 100.30649125]]	full length: 255

[Episode 240]
Reward (mean): -0.27
Actor Loss: 0.0003 | Critic Loss: 0.8073
Prob Ratio - mean: 0.9998, max: 1.0324, min: 0.9329


 50%|█████     | 251/500 [05:25<09:02,  2.18s/it]


[Episode 250]
Reward (mean): -8.39
Actor Loss: -0.0080 | Critic Loss: 0.9386
Prob Ratio - mean: 1.0006, max: 1.0626, min: 0.9147


 52%|█████▏    | 261/500 [05:45<08:17,  2.08s/it]

Episode 260	lengths: [315.   0. 220. 214. 148. 312. 140. 270.]	reward: [  63.49178314  124.32548649 -108.90540368  -70.37579034    7.72297507
   44.51631574  -44.27585951  -96.0521518 ]]	full length: 500

[Episode 260]
Reward (mean): -9.94
Actor Loss: -0.0117 | Critic Loss: 0.8883
Prob Ratio - mean: 0.9955, max: 1.0253, min: 0.8236


 54%|█████▍    | 271/500 [06:06<07:22,  1.93s/it]


[Episode 270]
Reward (mean): 14.70
Actor Loss: 0.0214 | Critic Loss: 0.4785
Prob Ratio - mean: 0.9786, max: 1.0504, min: 0.9075


 56%|█████▌    | 281/500 [06:24<05:58,  1.64s/it]

Episode 280	lengths: [166. 152. 314. 227. 170. 174. 143. 310.]	reward: [146.16892652  64.65912465  57.99984384 -73.32625228 112.82790794
  83.27257737 -80.78585007  69.92201465]]	full length: 314

[Episode 280]
Reward (mean): 47.59
Actor Loss: -0.0015 | Critic Loss: 0.7089
Prob Ratio - mean: 1.0042, max: 1.0905, min: 0.9010


 58%|█████▊    | 291/500 [06:44<07:15,  2.09s/it]


[Episode 290]
Reward (mean): 7.82
Actor Loss: -0.0008 | Critic Loss: 0.8252
Prob Ratio - mean: 0.9954, max: 1.0187, min: 0.9483


 60%|██████    | 301/500 [07:05<06:37,  2.00s/it]

Episode 300	lengths: [313. 211. 307. 157. 169. 199. 170. 348.]	reward: [  24.39636583  146.18582264 -102.19473603   -2.99462985   40.73217125
 -123.84567255  102.22303642 -222.5030605 ]]	full length: 348

[Episode 300]
Reward (mean): -17.25
Actor Loss: -0.0020 | Critic Loss: 0.6685
Prob Ratio - mean: 1.0062, max: 1.0911, min: 0.9632


 62%|██████▏   | 311/500 [07:27<07:06,  2.25s/it]


[Episode 310]
Reward (mean): 50.38
Actor Loss: -0.0048 | Critic Loss: 0.8605
Prob Ratio - mean: 1.0057, max: 1.1192, min: 0.8941


 64%|██████▍   | 321/500 [07:47<06:13,  2.08s/it]

Episode 320	lengths: [173. 172.   0. 174. 471. 485. 207.   0.]	reward: [ 56.19164582 -31.65279165 139.21050101  77.71911511  78.87030846
   9.35084853 163.02595153  95.46558251]]	full length: 500

[Episode 320]
Reward (mean): 73.52
Actor Loss: 0.0114 | Critic Loss: 0.9695
Prob Ratio - mean: 1.0086, max: 1.1954, min: 0.9189


 66%|██████▌   | 331/500 [08:08<05:50,  2.07s/it]


[Episode 330]
Reward (mean): 20.24
Actor Loss: 0.0052 | Critic Loss: 0.6234
Prob Ratio - mean: 1.0115, max: 1.1552, min: 0.9317


 68%|██████▊   | 341/500 [08:29<05:32,  2.09s/it]

Episode 340	lengths: [  0.   0. 210. 217. 238. 280.   0. 274.]	reward: [  37.4110903   199.82178822 -102.18299642  -49.56216753   73.27709085
  108.2801022   171.37425046   81.63323202]]	full length: 500

[Episode 340]
Reward (mean): 65.01
Actor Loss: 0.0027 | Critic Loss: 0.5317
Prob Ratio - mean: 0.9999, max: 1.0842, min: 0.8840


 70%|███████   | 351/500 [08:49<04:44,  1.91s/it]


[Episode 350]
Reward (mean): 20.49
Actor Loss: -0.0021 | Critic Loss: 0.4305
Prob Ratio - mean: 0.9885, max: 1.1413, min: 0.9185


 72%|███████▏  | 361/500 [09:09<04:33,  1.97s/it]

Episode 360	lengths: [  0.   0. 223. 228. 180. 387. 200. 393.]	reward: [  10.24108112  125.88715188   83.39629896  119.17107966   73.0239187
 -111.99487718   87.69866983 -110.29764968]]	full length: 500

[Episode 360]
Reward (mean): 34.64
Actor Loss: 0.0029 | Critic Loss: 0.5141
Prob Ratio - mean: 0.9998, max: 1.0365, min: 0.9329


 74%|███████▍  | 371/500 [09:31<04:41,  2.19s/it]


[Episode 370]
Reward (mean): -10.52
Actor Loss: 0.0049 | Critic Loss: 0.2057
Prob Ratio - mean: 0.9942, max: 1.1837, min: 0.8529


 76%|███████▌  | 381/500 [09:52<04:05,  2.07s/it]

Episode 380	lengths: [235. 188. 320. 237. 282. 392. 241. 422.]	reward: [  84.66890578  119.14048831 -102.01977557   96.18744214   98.93191445
  -98.44069451  114.27922276   26.42679148]]	full length: 422

[Episode 380]
Reward (mean): 42.40
Actor Loss: 0.0012 | Critic Loss: 0.4379
Prob Ratio - mean: 1.0064, max: 1.1230, min: 0.8415


 78%|███████▊  | 391/500 [10:12<03:38,  2.01s/it]


[Episode 390]
Reward (mean): 76.07
Actor Loss: -0.0017 | Critic Loss: 0.5746
Prob Ratio - mean: 1.0315, max: 1.0734, min: 0.9699


 80%|████████  | 401/500 [10:33<03:33,  2.15s/it]

Episode 400	lengths: [222. 182.   0. 169.   0. 357. 166. 288.]	reward: [108.84792805 121.03055415 140.31965655  83.63628978 144.09797796
  58.88665244 124.51854866  75.29476461]]	full length: 500

[Episode 400]
Reward (mean): 107.08
Actor Loss: -0.0124 | Critic Loss: 0.5375
Prob Ratio - mean: 1.0020, max: 1.1415, min: 0.8707


 82%|████████▏ | 411/500 [10:55<03:19,  2.24s/it]


[Episode 410]
Reward (mean): 57.28
Actor Loss: -0.0049 | Critic Loss: 0.2723
Prob Ratio - mean: 0.9977, max: 1.0778, min: 0.8329


 84%|████████▍ | 421/500 [11:16<02:43,  2.07s/it]

Episode 420	lengths: [  0. 306.   0. 325.   0.   0. 177. 339.]	reward: [-43.79484087 114.38783137 122.11795208 -72.13869304 151.08476653
 127.29092339 130.33852983  70.64483997]]	full length: 500

[Episode 420]
Reward (mean): 74.99
Actor Loss: 0.0031 | Critic Loss: 0.5757
Prob Ratio - mean: 1.0092, max: 1.0892, min: 0.9080


 86%|████████▌ | 431/500 [11:38<02:38,  2.30s/it]


[Episode 430]
Reward (mean): 47.63
Actor Loss: -0.0010 | Critic Loss: 0.8727
Prob Ratio - mean: 1.0078, max: 1.1066, min: 0.8579


 88%|████████▊ | 441/500 [12:00<02:07,  2.16s/it]

Episode 440	lengths: [300.   0. 288. 365. 436. 355.   0.   0.]	reward: [-83.67264648 159.56514194  70.87474862  82.67831988 128.37851506
  58.5540066  164.95822921  87.58866523]]	full length: 500

[Episode 440]
Reward (mean): 83.62
Actor Loss: -0.0008 | Critic Loss: 0.7163
Prob Ratio - mean: 1.0138, max: 1.2885, min: 0.8486


 90%|█████████ | 451/500 [12:23<01:49,  2.23s/it]


[Episode 450]
Reward (mean): 61.22
Actor Loss: -0.0055 | Critic Loss: 0.7272
Prob Ratio - mean: 0.9987, max: 1.0955, min: 0.8883


 92%|█████████▏| 461/500 [12:44<01:28,  2.27s/it]

Episode 460	lengths: [309.   0. 269. 234. 206. 297. 233. 285.]	reward: [ 59.38963    160.99306874 -78.47757314  94.24199116 -57.57558018
  57.64165731 113.30390929 -80.896906  ]]	full length: 500

[Episode 460]
Reward (mean): 33.58
Actor Loss: 0.0000 | Critic Loss: 0.5503
Prob Ratio - mean: 1.0028, max: 1.0455, min: 0.9546


 94%|█████████▍| 471/500 [13:06<01:01,  2.12s/it]


[Episode 470]
Reward (mean): 29.12
Actor Loss: 0.0120 | Critic Loss: 1.0777
Prob Ratio - mean: 1.0043, max: 1.1285, min: 0.7898


 96%|█████████▌| 481/500 [13:28<00:40,  2.12s/it]

Episode 480	lengths: [381.   0. 278. 233. 293.   0. 248. 382.]	reward: [  86.83824918  134.00459355   86.16381746   86.95362427   73.09975959
  126.24677061  151.64064089 -108.56633362]]	full length: 500

[Episode 480]
Reward (mean): 79.55
Actor Loss: 0.0030 | Critic Loss: 0.7645
Prob Ratio - mean: 1.0016, max: 1.0333, min: 0.9782


 97%|█████████▋| 483/500 [13:33<00:38,  2.29s/it]

In [None]:
import gymnasium as gym
import torch
import numpy as np
from gymnasium.wrappers import RecordVideo
import os

# Create folder to save the video
video_folder = "./video"
os.makedirs(video_folder, exist_ok=True)

# Wrap the environment with RecordVideo
env = gym.make('LunarLander-v3', render_mode='rgb_array')
env = RecordVideo(env, video_folder=video_folder, episode_trigger=lambda e: True)

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = ppo_model.policy_net(state)
            # action_dist = torch.distributions.Categorical(action_probs)
            # action = action_dist.sample().item()
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


In [64]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class LinearValueEnv(gym.Env):
    def __init__(self, gamma=0.99, episode_length=100):
        super().__init__()
        self.gamma = gamma
        self.episode_length = episode_length
        self.current_step = 0

        # Observation: continuous scalar in [-1, 1]
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

        # Action: continuous scalar (not used)
        self.action_space = spaces.Discrete(1)

        self.state = None

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = np.random.uniform(-1.0, 1.0, size=(1,)).astype(np.float32)
        self.current_step = 0
        return self.state.copy(), {}

    def step(self, action):
        # Reward is simply the state value
        reward = float(self.state[0])
        self.current_step += 1

        terminated = self.current_step >= self.episode_length
        truncated = False
        return self.state.copy(), reward, terminated, truncated, {}

    def render(self):
        print(f"State: {self.state}")

    def close(self):
        pass




In [65]:
from gymnasium.envs.registration import register

register(
    id="LinearValue-v0",
    entry_point="__main__:LinearValueEnv",  # if you're running in a script
    max_episode_steps=100
)


In [100]:
env_id = "LinearValue-v0"
num_episodes = 1000
max_steps = 500
lr = 1e-4


ppo_model_value =  PPOAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, epsilon=0.2, num_envs=2, num_steps=0)

rewards, steps = ppo_model_value.train()


  0%|          | 1/1000 [00:00<12:09,  1.37it/s]

Episode 0	lengths: [100. 100.]	reward: [-72.96960783  84.58938628]]	full length: 100

[Episode 0]
Reward (mean): 5.81
Actor Loss: 0.3642 | Critic Loss: 0.9067
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  1%|          | 11/1000 [00:03<04:25,  3.73it/s]


[Episode 10]
Reward (mean): -41.61
Actor Loss: 0.2812 | Critic Loss: 1.0306
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  2%|▏         | 21/1000 [00:06<04:13,  3.86it/s]

Episode 20	lengths: [100. 100.]	reward: [-32.27667063  45.82619548]]	full length: 100

[Episode 20]
Reward (mean): 6.77
Actor Loss: 0.3617 | Critic Loss: 1.0112
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  3%|▎         | 31/1000 [00:08<04:14,  3.81it/s]


[Episode 30]
Reward (mean): 49.95
Actor Loss: 0.4339 | Critic Loss: 1.0572
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  4%|▍         | 41/1000 [00:11<04:05,  3.90it/s]

Episode 40	lengths: [100. 100.]	reward: [-12.85368513  34.16484547]]	full length: 100

[Episode 40]
Reward (mean): 10.66
Actor Loss: 0.3643 | Critic Loss: 1.0583
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  5%|▌         | 51/1000 [00:14<04:51,  3.26it/s]


[Episode 50]
Reward (mean): -12.35
Actor Loss: 0.3224 | Critic Loss: 0.9231
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  6%|▌         | 61/1000 [00:17<04:10,  3.75it/s]

Episode 60	lengths: [100. 100.]	reward: [ 41.3785345  -51.92305827]]	full length: 100

[Episode 60]
Reward (mean): -5.27
Actor Loss: 0.3325 | Critic Loss: 0.9675
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  7%|▋         | 71/1000 [00:19<03:58,  3.90it/s]


[Episode 70]
Reward (mean): -60.01
Actor Loss: 0.2350 | Critic Loss: 1.0500
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  8%|▊         | 81/1000 [00:22<03:56,  3.89it/s]

Episode 80	lengths: [100. 100.]	reward: [18.60985883 19.40882935]]	full length: 100

[Episode 80]
Reward (mean): 19.01
Actor Loss: 0.3707 | Critic Loss: 1.1324
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  9%|▉         | 91/1000 [00:25<04:43,  3.21it/s]


[Episode 90]
Reward (mean): 54.82
Actor Loss: 0.4316 | Critic Loss: 1.1794
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 10%|█         | 101/1000 [00:28<03:59,  3.75it/s]

Episode 100	lengths: [100. 100.]	reward: [36.77222595 51.44622749]]	full length: 100

[Episode 100]
Reward (mean): 44.11
Actor Loss: 0.4109 | Critic Loss: 1.1559
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 11%|█         | 111/1000 [00:30<03:50,  3.86it/s]


[Episode 110]
Reward (mean): 5.18
Actor Loss: 0.3404 | Critic Loss: 1.0804
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 12%|█▏        | 121/1000 [00:33<03:42,  3.95it/s]

Episode 120	lengths: [100. 100.]	reward: [-27.61505038  77.55969197]]	full length: 100

[Episode 120]
Reward (mean): 24.97
Actor Loss: 0.3734 | Critic Loss: 0.9805
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 13%|█▎        | 131/1000 [00:35<03:45,  3.85it/s]


[Episode 130]
Reward (mean): -27.30
Actor Loss: 0.2786 | Critic Loss: 0.9290
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 14%|█▍        | 141/1000 [00:39<04:24,  3.24it/s]

Episode 140	lengths: [100. 100.]	reward: [-91.24863052  44.33011508]]	full length: 100

[Episode 140]
Reward (mean): -23.46
Actor Loss: 0.2832 | Critic Loss: 0.8766
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 15%|█▌        | 151/1000 [00:41<03:39,  3.87it/s]


[Episode 150]
Reward (mean): -92.36
Actor Loss: 0.1577 | Critic Loss: 1.0184
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 16%|█▌        | 161/1000 [00:44<03:34,  3.91it/s]

Episode 160	lengths: [100. 100.]	reward: [ 3.80020684 92.52385944]]	full length: 100

[Episode 160]
Reward (mean): 48.16
Actor Loss: 0.4072 | Critic Loss: 1.0393
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 17%|█▋        | 171/1000 [00:46<03:33,  3.89it/s]


[Episode 170]
Reward (mean): -14.98
Actor Loss: 0.2918 | Critic Loss: 0.9436
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 18%|█▊        | 181/1000 [00:49<04:19,  3.15it/s]

Episode 180	lengths: [100. 100.]	reward: [-11.27119991   1.15310721]]	full length: 100

[Episode 180]
Reward (mean): -5.06
Actor Loss: 0.3075 | Critic Loss: 1.0708
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 19%|█▉        | 191/1000 [00:52<03:34,  3.77it/s]


[Episode 190]
Reward (mean): 41.69
Actor Loss: 0.3900 | Critic Loss: 1.0467
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 20%|██        | 201/1000 [00:55<03:23,  3.92it/s]

Episode 200	lengths: [100. 100.]	reward: [-81.71754402 -39.18708792]]	full length: 100

[Episode 200]
Reward (mean): -60.45
Actor Loss: 0.2027 | Critic Loss: 0.9949
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 21%|██        | 211/1000 [00:57<03:23,  3.87it/s]


[Episode 210]
Reward (mean): -59.89
Actor Loss: 0.2012 | Critic Loss: 1.0341
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 21%|██▏       | 213/1000 [00:58<03:35,  3.65it/s]


KeyboardInterrupt: 