In [None]:
"""
Implementing PPO
First mistake I made -> ran the action using the policy, then calculated the ratio using old_policy. The correct way is to run the action using old_policy, then calculate the ratio using the new policy.
Second mistake -> didn't run K epochs while calculating the surrogate objective.
Then, it started behaving like a trained model, though did not achieve goal.
Another fix. Added minibatching according to the original paper.
Minibatching made the training significantly slower due to more frequent optimizer steps.
Experimented both GAE and decaying returns. Both did not show good results. Will try debugging in Cartpole.
"""

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools
!pip install swig
!pip install gymnasium[box2d]



In [2]:
!pip uninstall -y box2d-py
!pip install box2d pygame swig
!pip install "gymnasium[box2d]" --no-deps

Found existing installation: box2d-py 2.3.5
Uninstalling box2d-py-2.3.5:
  Successfully uninstalled box2d-py-2.3.5


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCriticPPO(nn.Module):

    def __init__(self, input_dim, output_dim, hidden_dims=(64, 64)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[1], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value


In [27]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
import numpy as np
import copy
import time

class PPOAgent:
    def __init__(self, env_id, num_episodes=1000, max_steps=500, epsilon=float('inf'), gamma=0.99, lambda_GAE=0.95, lr=1e-3, num_steps=0, num_envs=8, num_epochs=4, minibatch_size=4, vectorization_mode = "sync", seed=123):
        # using vectorized environments to boost training speed
        self.env = gym.make_vec(env_id, num_envs=num_envs, vectorization_mode=vectorization_mode)
        self.num_envs = num_envs
        self.num_episodes = num_episodes
        self.num_epochs = num_epochs
        self.max_steps = max_steps
        self.minibatch_size = minibatch_size
        self.epsilon = epsilon
        self.gamma = gamma
        self.lambda_GAE = lambda_GAE
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCriticPPO(self.env.single_observation_space.shape[0], self.env.single_action_space.n).to(self.device)
        self.old_policy_net = copy.deepcopy(self.policy_net)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        # added scheduler after observing divergence after getting close to solving
        self.scheduler = StepLR(self.optimizer, step_size=100, gamma=0.9)
        self.loss = nn.MSELoss()
        self.seed = seed

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.old_policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing GAE
    def compute_returns(self, rewards, values, next_value):
        rewards = torch.stack(rewards)
        values = torch.cat([values, next_value.unsqueeze(0)], dim=0)

        T, N = rewards.shape
        advantages = torch.zeros_like(rewards)
        gae = torch.zeros(N, device=rewards.device)
        for t in reversed(range(T)):
            # temporal difference error
            td = rewards[t] + self.gamma * values[t + 1] - values[t]
            # higher labmda -> more sampling, lower lambda -> more bootstrapping
            gae = td + self.gamma * self.lambda_GAE * gae
            advantages[t] = gae

        # compute returns by adding value to advantage
        returns = advantages + values[:-1]
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # normalize advantage across timesteps and environments
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        return returns, advantages

    # computing the gamma decaying rewards
    # def compute_returns(self, rewards, values, next_value):
    #     """
    #     Args:
    #         rewards: torch.Tensor of shape [T, N] where
    #                 T = rollout steps, N = num_envs
    #     Returns:
    #         returns: torch.Tensor of shape [T, N], normalized
    #     """
    #     rewards = torch.stack(rewards)

    #     T, N = rewards.shape
    #     returns = torch.zeros_like(rewards)
    #     R = torch.zeros(N, device=rewards.device)
    #     for t in reversed(range(T)):
    #         R = rewards[t] + self.gamma * R
    #         returns[t] = R

    #     # Normalize returns across all timesteps and environments
    #     returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    #     advantages = returns - values

    #     return returns, advantages


def train(self):
    episode_rewards = []
    episode_steps = []

    for episode in tqdm(range(self.num_episodes)):
        start_episode_time = time.time()  # ⏱ total episode start
        env_time = 0                      # ⏱ initialize environment interaction time
        train_time = 0                    # ⏱ initialize training time

        state, _ = self.env.reset(seed=self.seed + episode)
        done = np.zeros(self.num_envs, dtype=bool)
        episode_reward = np.zeros(self.num_envs)
        old_states, old_actions, old_log_probs, old_rewards, old_values = [], [], [], [], []
        done_mask = np.zeros(self.num_envs, dtype=bool)
        done_steps = np.zeros(self.num_envs)
        steps = 0

        while not np.all(done_mask) and steps < self.max_steps:
            steps += 1
            state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
            old_states.append(state_tensor)

            # ⏱ start measuring env time
            start_env_time = time.time()
            with torch.no_grad():
                old_action_probs, old_value = self.old_policy_net(state_tensor)
                old_action_dist = torch.distributions.Categorical(old_action_probs)
                old_action = old_action_dist.sample()
                old_log_prob = old_action_dist.log_prob(old_action)

            next_state, old_reward, terminated, truncated, _ = self.env.step(old_action.cpu().numpy())
            env_time += time.time() - start_env_time  # ⏱ accumulate env time

            done = np.logical_or(terminated, truncated)
            done_steps = np.where(np.logical_and(done, ~done_mask), steps, done_steps)
            done_mask = np.logical_or(done_mask, done)
            # record when each environment is done
            old_reward = np.where(done_mask, 0.0, old_reward)

            # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
            old_values.append(old_value.squeeze())
            old_rewards.append(torch.tensor(old_reward, dtype=torch.float32).to(self.device))  # shape: (num_envs,)
            old_log_probs.append(old_log_prob)
            old_actions.append(old_action)

            episode_reward += old_reward
            state = next_state

            # finish full trajectory, then update
            if self.num_steps == 0:
                if np.any(done):
                    with torch.no_grad():
                        next_old_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                        _, next_old_value = self.policy_net(next_old_state_tensor)
                        done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                        next_old_value = next_old_value.squeeze() * (1 - done_tensor)

                    old_states = torch.stack(old_states).to(self.device)
                    old_values = torch.stack(old_values).to(self.device)
                    old_log_probs = torch.stack(old_log_probs).to(self.device)
                    old_actions = torch.stack(old_actions).to(self.device)
                    old_returns, old_advantages = self.compute_returns(old_rewards, old_values, next_old_value)

                    # flatten inputs
                    T, N = old_actions.shape[:2]
                    old_states = old_states.reshape(T * N, -1)
                    old_actions = old_actions.reshape(T * N)
                    old_log_probs = old_log_probs.reshape(T * N)
                    old_returns = old_returns.reshape(T * N)
                    old_advantages = old_advantages.reshape(T * N)

                    batch_size = old_states.shape[0]
                    minibatch_size = self.minibatch_size

                    # ⏱ measure train time
                    start_train_time = time.time()

                    # repeat in K epochs
                    for _ in range(self.num_epochs):
                        indices = torch.randperm(batch_size)

                        # mini batching
                        for i in range(0, batch_size, minibatch_size):
                            # selecting inputs for this mini batch
                            mb_idx = indices[i : i + minibatch_size]
                            mb_states = old_states[mb_idx]
                            mb_actions = old_actions[mb_idx]
                            mb_log_probs_old = old_log_probs[mb_idx]
                            mb_returns = old_returns[mb_idx]
                            mb_advantages = old_advantages[mb_idx]

                            # use current policy to find log_probs of the trajectory ran by old_policy
                            action_probs, values = self.policy_net(mb_states)
                            action_dist = torch.distributions.Categorical(action_probs)
                            mb_log_probs = action_dist.log_prob(mb_actions)

                            prob_ratio = torch.exp(mb_log_probs - mb_log_probs_old.detach())
                            values = values.squeeze(-1)
                            surrogate = torch.min(prob_ratio * mb_advantages, prob_ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * mb_advantages)
                            actor_loss = -surrogate.mean()
                            critic_loss = self.loss(mb_returns, values)

                            loss = actor_loss + 0.5 * critic_loss
                            self.optimizer.zero_grad()
                            loss.backward()
                            self.optimizer.step()

                    train_time += time.time() - start_train_time  # ⏱ add train time
                    old_states, old_actions, old_log_probs, old_rewards, old_values = [], [], [], [], []
                    # self.scheduler.step()

            # every n steps for each environment, calculate losses, update the actor & critic, then refresh the saved lists
            else:
                if (steps % self.num_steps == 0) or np.any(done):

                    with torch.no_grad():
                        next_old_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                        _, next_old_value = self.policy_net(next_old_state_tensor)
                        done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                        next_old_value = next_old_value.squeeze() * (1 - done_tensor)

                    old_states = torch.stack(old_states).to(self.device)
                    old_values = torch.stack(old_values).to(self.device)
                    old_log_probs = torch.stack(old_log_probs).to(self.device)
                    old_actions = torch.stack(old_actions).to(self.device)
                    old_returns, old_advantages = self.compute_returns(old_rewards, old_values, next_old_value)

                    # flatten inputs
                    T, N = old_actions.shape[:2]
                    old_states = old_states.reshape(T * N, -1)
                    old_actions = old_actions.reshape(T * N)
                    old_log_probs = old_log_probs.reshape(T * N)
                    old_returns = old_returns.reshape(T * N)
                    old_advantages = old_advantages.reshape(T * N)

                    batch_size = old_states.shape[0]
                    minibatch_size = self.minibatch_size

                    # ⏱ measure train time
                    start_train_time = time.time()

                    # repeat in K epochs
                    for _ in range(self.num_epochs):
                        indices = torch.randperm(batch_size)

                        # mini batching
                        for i in range(0, batch_size, minibatch_size):
                            # selecting inputs for this mini batch
                            mb_idx = indices[i : i + minibatch_size]
                            mb_states = old_states[mb_idx]
                            mb_actions = old_actions[mb_idx]
                            mb_log_probs_old = old_log_probs[mb_idx]
                            mb_returns = old_returns[mb_idx]
                            mb_advantages = old_advantages[mb_idx]

                            # use current policy to find log_probs of the trajectory ran by old_policy
                            action_probs, values = self.policy_net(mb_states)
                            action_dist = torch.distributions.Categorical(action_probs)
                            mb_log_probs = action_dist.log_prob(mb_actions)

                            prob_ratio = torch.exp(mb_log_probs - mb_log_probs_old.detach())
                            values = values.squeeze(-1)
                            surrogate = torch.min(prob_ratio * mb_advantages, prob_ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * mb_advantages)
                            actor_loss = -surrogate.mean()
                            critic_loss = self.loss(mb_returns, values)

                            loss = actor_loss + 0.5 * critic_loss
                            self.optimizer.zero_grad()
                            loss.backward()
                            self.optimizer.step()

                    train_time += time.time() - start_train_time  # ⏱ add train time
                    old_states, old_actions, old_log_probs, old_rewards, old_values = [], [], [], [], []
                    # self.scheduler.step()

        self.old_policy_net = copy.deepcopy(self.policy_net)
        episode_rewards.append(episode_reward)
        episode_steps.append(steps)

        if episode % 10 == 0:
            print(f"\n[Episode {episode}]")
            print(f"Reward (mean): {np.mean(episode_reward):.2f}")
            print(f"Actor Loss: {actor_loss.item():.4f} | Critic Loss: {critic_loss.item():.4f}")
            print(f"Env Time: {env_time:.2f}s | Train Time: {train_time:.2f}s | Total: {time.time() - start_episode_time:.2f}s")

    self.env.close()
    return np.array(episode_rewards), np.array(episode_steps)


In [None]:
import gymnasium as gym

env_id = 'LunarLander-v3'
num_episodes = 300
max_steps = 500
lr = 1e-4


ppo_model =  PPOAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, epsilon=0.2, num_envs=8, num_steps=32, minibatch_size=16)

rewards, steps = ppo_model.train()



  0%|          | 1/300 [00:00<04:44,  1.05it/s]


[Episode 0]
Reward (mean): -79.91
Actor Loss: -0.2480 | Critic Loss: 0.0765
Env Time: 0.18s | Train Time: 0.74s | Total: 0.95s


  4%|▎         | 11/300 [00:12<05:37,  1.17s/it]


[Episode 10]
Reward (mean): -93.07
Actor Loss: -0.0391 | Critic Loss: 1.3283
Env Time: 0.21s | Train Time: 0.87s | Total: 1.11s


  7%|▋         | 21/300 [00:33<09:01,  1.94s/it]


[Episode 20]
Reward (mean): -65.31
Actor Loss: -0.0768 | Critic Loss: 0.0860
Env Time: 0.36s | Train Time: 1.50s | Total: 1.92s


 10%|█         | 31/300 [01:04<15:04,  3.36s/it]


[Episode 30]
Reward (mean): 83.27
Actor Loss: 0.0847 | Critic Loss: 0.3503
Env Time: 0.74s | Train Time: 3.01s | Total: 3.84s


 14%|█▎        | 41/300 [01:45<17:24,  4.03s/it]


[Episode 40]
Reward (mean): -45.42
Actor Loss: -0.1455 | Critic Loss: 0.4381
Env Time: 0.81s | Train Time: 3.12s | Total: 4.04s


 17%|█▋        | 51/300 [02:26<16:56,  4.08s/it]


[Episode 50]
Reward (mean): -37.39
Actor Loss: 0.1478 | Critic Loss: 0.6866
Env Time: 0.81s | Train Time: 3.14s | Total: 4.05s


 20%|██        | 61/300 [03:07<16:10,  4.06s/it]


[Episode 60]
Reward (mean): -5.33
Actor Loss: 0.2784 | Critic Loss: 0.9136
Env Time: 0.81s | Train Time: 3.19s | Total: 4.11s


 24%|██▎       | 71/300 [03:47<15:29,  4.06s/it]


[Episode 70]
Reward (mean): -30.92
Actor Loss: -0.3160 | Critic Loss: 0.4654
Env Time: 0.81s | Train Time: 3.13s | Total: 4.05s


 27%|██▋       | 81/300 [04:28<14:48,  4.06s/it]


[Episode 80]
Reward (mean): -27.40
Actor Loss: 0.1365 | Critic Loss: 0.2028
Env Time: 0.82s | Train Time: 3.11s | Total: 4.03s


 30%|███       | 91/300 [05:08<13:56,  4.00s/it]


[Episode 90]
Reward (mean): -30.17
Actor Loss: 0.2723 | Critic Loss: 0.7108
Env Time: 0.79s | Train Time: 3.06s | Total: 3.96s


 34%|███▎      | 101/300 [05:49<13:25,  4.05s/it]


[Episode 100]
Reward (mean): -38.96
Actor Loss: 0.3184 | Critic Loss: 0.5561
Env Time: 0.81s | Train Time: 3.11s | Total: 4.03s


 37%|███▋      | 111/300 [06:29<12:39,  4.02s/it]


[Episode 110]
Reward (mean): -34.27
Actor Loss: 0.0169 | Critic Loss: 0.2468
Env Time: 0.80s | Train Time: 3.09s | Total: 3.99s


 38%|███▊      | 113/300 [06:37<12:37,  4.05s/it]

In [24]:
import gymnasium as gym
import torch
import numpy as np
from gymnasium.wrappers import RecordVideo
import os

# Create folder to save the video
video_folder = "./video"
os.makedirs(video_folder, exist_ok=True)

# Wrap the environment with RecordVideo
env = gym.make('LunarLander-v3', render_mode='rgb_array')
env = RecordVideo(env, video_folder=video_folder, episode_trigger=lambda e: True)

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = ppo_model.policy_net(state)
            # action_dist = torch.distributions.Categorical(action_probs)
            # action = action_dist.sample().item()
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


Episode 1 Reward: -209.6376773931259
Episode 2 Reward: -253.3317896907504
Episode 3 Reward: -127.91441076284075
Episode 4 Reward: -182.79585312779028
Episode 5 Reward: -119.73963978202433
Episode 6 Reward: -174.02446610394807
Episode 7 Reward: -147.8954565560111
Episode 8 Reward: -149.09936367264245
Episode 9 Reward: -141.18966495671833
Episode 10 Reward: -198.60302177229488
Average Reward over 10 episodes: -170.42313438181466


In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class LinearValueEnv(gym.Env):
    def __init__(self, gamma=0.99, episode_length=100):
        super().__init__()
        self.gamma = gamma
        self.episode_length = episode_length
        self.current_step = 0

        # Observation: continuous scalar in [-1, 1]
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

        # Action: continuous scalar (not used)
        self.action_space = spaces.Discrete(1)

        self.state = None

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = np.random.uniform(-1.0, 1.0, size=(1,)).astype(np.float32)
        self.current_step = 0
        return self.state.copy(), {}

    def step(self, action):
        # Reward is simply the state value
        reward = float(self.state[0])
        self.current_step += 1

        terminated = self.current_step >= self.episode_length
        truncated = False
        return self.state.copy(), reward, terminated, truncated, {}

    def render(self):
        print(f"State: {self.state}")

    def close(self):
        pass




In [None]:
from gymnasium.envs.registration import register

register(
    id="LinearValue-v0",
    entry_point="__main__:LinearValueEnv",  # if you're running in a script
    max_episode_steps=100
)


In [None]:
env_id = "LinearValue-v0"
num_episodes = 1000
max_steps = 500
lr = 1e-4


ppo_model_value =  PPOAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, epsilon=0.2, num_envs=2, num_steps=0)

rewards, steps = ppo_model_value.train()
