In [107]:
"""
Implementing PPO
First mistake I made -> ran the action using the policy, then calculated the ratio using old_policy. The correct way is to run the action using old_policy, then calculate the ratio using the new policy.
Second mistake -> didn't run K epochs while calculating the surrogate objective.
Then, it started behaving like a trained model, though did not achieve goal.
Another fix. Added minibatching according to the original paper.
Minibatching made the training significantly slower due to more frequent optimizer steps.
Experimented with both GAE and decaying returns. Neither showed good results. Will try debugging in Cartpole.
Tried using single environment and no minibatching like the reference model -> didn't work.
Switched to time step based update instead of episode based updates -> didn't work.
Went back to decaying returns -> didn't work.
Found dimension bugs when calculating returns -> didn't fix.
Debugged by resetting decaying reward in terminated states -> fixed!
"""

"\nImplementing PPO\nFirst mistake I made -> ran the action using the policy, then calculated the ratio using old_policy. The correct way is to run the action using old_policy, then calculate the ratio using the new policy.\nSecond mistake -> didn't run K epochs while calculating the surrogate objective.\nThen, it started behaving like a trained model, though did not achieve goal.\nAnother fix. Added minibatching according to the original paper.\nMinibatching made the training significantly slower due to more frequent optimizer steps.\nExperimented both GAE and decaying returns. Both did not show good results. Will try debugging in Cartpole.\nTried using single environment and no minibatching like the reference model.\n"

In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): nothing else visible in this notebook reads from the mount — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools
!pip install swig
!pip install gymnasium[box2d]



In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCriticPPO(nn.Module):
    """Actor-critic network with a shared MLP trunk and two small heads.

    The trunk maps an observation to a ``hidden_dims[1]``-wide feature vector.
    The actor head turns those features into a categorical distribution over
    ``output_dim`` actions; the critic head turns them into a scalar value.

    Args:
        input_dim: Size of the observation vector.
        output_dim: Number of discrete actions.
        hidden_dims: Widths of the two trunk layers.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(64, 64)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU()
        )
        # Both heads consume the trunk output, so their first layer must accept
        # hidden_dims[1] features (not the raw observation size).
        trunk_dim = hidden_dims[1]
        self.actor_layer = nn.Sequential(
            nn.Linear(trunk_dim, hidden_dims[1]),
            nn.ReLU(),
            nn.Linear(hidden_dims[1], output_dim)
        )
        self.critic_layer = nn.Sequential(
            nn.Linear(trunk_dim, hidden_dims[1]),
            nn.ReLU(),
            nn.Linear(hidden_dims[1], 1)
        )

    def forward(self, x):
        """Return ``(action_probs, value)`` for a batch (or single) observation.

        ``action_probs`` is a softmax over the last dimension with
        ``output_dim`` entries; ``value`` has a trailing dimension of size 1.
        """
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value


In [110]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
import numpy as np
import copy
import time

class PPOAgent:
    """Clipped-surrogate PPO trainer for discrete-action Gymnasium environments.

    Rollouts are sampled with a frozen snapshot of the policy
    (``old_policy_net``); the live ``policy_net`` is then optimised for
    ``num_epochs`` full-batch passes on the clipped surrogate objective, an MSE
    value loss and an entropy bonus. Data is gathered in segments of
    ``num_steps`` environment steps. Every segment starts from a fresh
    ``env.reset()`` and, per (sub-)environment, only the first episode of a
    segment contributes reward: once an env reports ``done`` its later rewards
    in that segment are zeroed and flagged as done.

    Notes:
        * ``gym`` (gymnasium) must be available under that name when the class
          is instantiated.
        * ``max_steps``, ``minibatch_size`` and ``lambda_GAE`` are stored but
          not used by the current update rule; ``scheduler`` is created but
          never stepped.
        * With the default ``epsilon=inf`` the probability ratio is not clipped.
        * NOTE(review): transitions recorded after an env's first ``done`` in a
          segment remain in the training batch with reward 0 — confirm that
          this padding is intended.
    """

    # Number of num_steps-long rollout segments gathered between two updates.
    SEGMENTS_PER_UPDATE = 4

    def __init__(self, env_id, total_timesteps=1e5, max_steps=500, epsilon=float('inf'), gamma=0.99, lambda_GAE=0.95,
                 lr=1e-3, num_steps=0, num_envs=8, num_epochs=4, minibatch_size=4, vectorization_mode="sync", seed=123):
        """Build the environment(s), the networks and the optimiser.

        Args:
            env_id: Gymnasium environment id.
            total_timesteps: Number of (vectorised) environment steps to run.
            max_steps: Stored only; unused by the training loop.
            epsilon: PPO clipping range for the probability ratio.
            gamma: Discount factor.
            lambda_GAE: Stored only; unused by the training loop.
            lr: Adam learning rate.
            num_steps: Length of one rollout segment; must be positive before
                ``train()`` is called.
            num_envs: 1 builds a plain env, otherwise a vector env.
            num_epochs: Optimisation passes per update.
            minibatch_size: Stored only; unused by the training loop.
            vectorization_mode: Passed to ``gym.make_vec``.
            seed: Seeds torch and the first environment reset (``None`` skips
                seeding).
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.seed = seed
        # Seed before the networks are created so weight init is reproducible.
        if seed is not None:
            torch.manual_seed(seed)
        # using vectorized environments to boost training speed
        if num_envs == 1:
            self.env = gym.make(env_id)
            self.policy_net = ActorCriticPPO(self.env.observation_space.shape[0], self.env.action_space.n).to(self.device)
        else:
            self.env = gym.make_vec(env_id, num_envs=num_envs, vectorization_mode=vectorization_mode)
            self.policy_net = ActorCriticPPO(self.env.single_observation_space.shape[0], self.env.single_action_space.n).to(self.device)
        self.num_envs = num_envs
        self.total_timesteps = int(total_timesteps)
        self.num_epochs = num_epochs
        self.max_steps = max_steps
        self.minibatch_size = minibatch_size
        self.epsilon = epsilon
        self.gamma = gamma
        self.lambda_GAE = lambda_GAE
        self.lr = lr
        self.num_steps = num_steps
        self.old_policy_net = copy.deepcopy(self.policy_net)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        # added scheduler after observing divergence after getting close to solving
        self.scheduler = StepLR(self.optimizer, step_size=500 * num_epochs, gamma=0.9)
        self.loss = nn.MSELoss()

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        """Sample an action for ``state`` from the frozen behaviour policy."""
        probs, _ = self.old_policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing the gamma decaying rewards
    def compute_returns(self, rewards, values, next_value, is_dones):
        """Compute normalised discounted returns and the matching advantages.

        Args:
            rewards: List of length T of per-step reward tensors (one entry
                per environment, or a scalar for a single environment).
            values: Tensor of value estimates holding T * N elements.
            next_value: Accepted for interface compatibility; not used, so the
                tail of the rollout is not bootstrapped.
            is_dones: List of length T of per-step done flags (bool arrays or
                scalars). A set flag stops the return from flowing back across
                that step.

        Returns:
            ``(returns, advantages)``, both shaped like ``values``. ``returns``
            is standardised over all steps and environments;
            ``advantages = returns - values``.
        """
        num_rows = len(rewards)
        # Explicit [T, N] layout instead of squeeze(), which collapses the
        # wrong axis whenever T == 1 or N == 1.
        rewards = torch.stack(rewards).reshape(num_rows, -1)
        dones = torch.as_tensor(
            np.asarray(is_dones), dtype=torch.float32, device=rewards.device
        ).reshape(num_rows, -1)

        returns = torch.zeros_like(rewards)
        running = torch.zeros(rewards.shape[1], device=rewards.device)
        for t in reversed(range(num_rows)):
            # The done mask resets the running return at episode boundaries.
            running = rewards[t] + self.gamma * running * (1 - dones[t])
            returns[t] = running

        # normalize returns across all timesteps and environments
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        returns = returns.reshape(values.shape)
        advantages = returns - values

        return returns, advantages

    def train(self):
        """Run the collect/update loop until ``total_timesteps`` steps are taken.

        Returns:
            np.ndarray with one row per rollout segment holding the per-env sum
            of the (masked) rewards collected in that segment.

        Raises:
            ValueError: If ``num_steps`` is not positive; the rollout loop
                could never advance in that case.
        """
        if self.num_steps <= 0:
            raise ValueError(f"num_steps must be a positive integer, got {self.num_steps!r}")

        update_interval = self.num_steps * self.SEGMENTS_PER_UPDATE
        episode_rewards = []
        recent_rewards = []
        steps = 0
        old_states, old_actions, old_log_probs, old_rewards, old_values, is_dones = [], [], [], [], [], []
        # Only the first reset is seeded; re-seeding every segment would replay
        # the same initial states over and over.
        reset_seed = self.seed

        while steps < self.total_timesteps:
            start_episode_time = time.time()  # wall-clock start of this segment
            env_time = 0                      # time spent acting / stepping the env
            train_time = 0                    # time spent in the optimisation epochs

            state, _ = self.env.reset(seed=reset_seed)
            reset_seed = None
            episode_reward = np.zeros(self.num_envs)
            done_mask = np.zeros(self.num_envs, dtype=bool)

            for segment_step in range(self.num_steps):
                if steps >= self.total_timesteps:
                    break
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
                old_states.append(state_tensor)

                start_env_time = time.time()
                with torch.no_grad():
                    old_action_probs, old_value = self.old_policy_net(state_tensor)
                    old_action_dist = torch.distributions.Categorical(old_action_probs)
                    old_action = old_action_dist.sample()
                    old_log_prob = old_action_dist.log_prob(old_action)

                next_state, old_reward, terminated, truncated, _ = self.env.step(old_action.cpu().numpy())
                env_time += time.time() - start_env_time

                done = np.logical_or(terminated, truncated)
                # A plain env must be reset by hand. Vector envs reset finished
                # sub-environments themselves, and their reset() has no
                # per-index argument.
                if self.num_envs == 1 and done:
                    next_state, _ = self.env.reset()

                # Once an env has finished in this segment, mask its rewards.
                done_mask = np.logical_or(done_mask, done)
                old_reward = np.where(done_mask, 0.0, old_reward)

                old_values.append(old_value.squeeze())
                old_rewards.append(torch.tensor(old_reward, dtype=torch.float32).to(self.device))
                old_log_probs.append(old_log_prob)
                old_actions.append(old_action)
                # The env is reset at the start of the next segment, so the last
                # step of a segment is always a boundary for the return
                # recursion; otherwise rewards of the following segment would
                # leak into an unfinished episode.
                segment_ends = segment_step == self.num_steps - 1
                is_dones.append(np.ones_like(done_mask) if segment_ends else done_mask)

                episode_reward += old_reward
                state = next_state

                if steps % update_interval == 0:
                    with torch.no_grad():
                        next_old_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                        _, next_old_value = self.policy_net(next_old_state_tensor)
                        done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                        next_old_value = next_old_value.squeeze() * (1 - done_tensor)

                    old_states = torch.stack(old_states).to(self.device)
                    old_values = torch.stack(old_values).to(self.device)
                    old_log_probs = torch.stack(old_log_probs).to(self.device)
                    old_actions = torch.stack(old_actions).to(self.device)
                    old_returns, old_advantages = self.compute_returns(old_rewards, old_values, next_old_value, is_dones)

                    # Flatten (time, env) into one batch axis. Explicit reshapes
                    # keep the observation axis even when it has size 1.
                    old_states = old_states.reshape(-1, old_states.shape[-1])
                    old_actions = old_actions.reshape(-1)
                    old_log_probs = old_log_probs.reshape(-1)
                    old_returns = old_returns.reshape(-1)
                    old_advantages = old_advantages.reshape(-1)

                    start_train_time = time.time()

                    for _ in range(self.num_epochs):
                        # Re-evaluate the stored trajectory under the current policy.
                        action_probs, values = self.policy_net(old_states)
                        action_dist = torch.distributions.Categorical(action_probs)
                        log_probs = action_dist.log_prob(old_actions)

                        prob_ratio = torch.exp(log_probs - old_log_probs.detach())
                        values = values.squeeze(-1)
                        surrogate = torch.min(
                            prob_ratio * old_advantages,
                            prob_ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * old_advantages
                        )
                        actor_loss = -surrogate.mean()

                        critic_loss = self.loss(old_returns, values)

                        entropy = action_dist.entropy().mean()

                        loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy

                        self.optimizer.zero_grad()
                        loss.mean().backward()
                        self.optimizer.step()

                    train_time += time.time() - start_train_time
                    old_states, old_actions, old_log_probs, old_rewards, old_values, is_dones = [], [], [], [], [], []
                    # Freeze the freshly updated weights as the next behaviour policy.
                    self.old_policy_net = copy.deepcopy(self.policy_net)

            episode_rewards.append(episode_reward)
            recent_rewards.append(np.mean(episode_reward))
            if len(recent_rewards) > 10:
                recent_rewards.pop(0)

            # Log every batch of self.num_steps * num_envs timesteps
            if steps % (self.num_envs * self.num_steps * 10) == 0:
                print(f"\n[Timesteps: {steps}]")
                print(f"Reward (mean over envs): {np.mean(episode_reward):.2f} | Avg Reward (last 10): {np.mean(recent_rewards):.2f}")
                print(f"Actor Loss: {actor_loss.item():.4f} | Critic Loss: {critic_loss.item():.4f} | Entropy Loss: {entropy.item():.4f}")
                print(f"Env Time: {env_time:.2f}s | Train Time: {train_time:.2f}s | Total: {time.time() - start_episode_time:.2f}s")
                print(f"Action std: {action_probs.std().item():.4f}")

        self.env.close()
        return np.array(episode_rewards)


In [109]:
import gymnasium as gym

# CartPole run: one environment, 1000-step rollout segments, 80 optimisation
# passes per update and a clipping range of 0.2.
env_id = 'CartPole-v1'
max_steps = 1000
lr = 3e-4

ppo_model = PPOAgent(
    env_id,
    max_steps=max_steps,
    lr=lr,
    epsilon=0.2,
    num_envs=1,
    num_steps=1000,
    num_epochs=80,
    minibatch_size=16,
)

# One row of summed rewards per rollout segment.
rewards = ppo_model.train()




[Timesteps: 10000]
Reward (mean over envs): 42.00 | Avg Reward (last 10): 24.20
Actor Loss: 0.0084 | Critic Loss: 0.9889 | Entropy Loss: 0.6536
Env Time: 1.06s | Train Time: 0.00s | Total: 1.19s
Action std: 0.1386

[Timesteps: 20000]
Reward (mean over envs): 24.00 | Avg Reward (last 10): 61.90
Actor Loss: 0.0874 | Critic Loss: 0.9738 | Entropy Loss: 0.6115
Env Time: 1.04s | Train Time: 0.29s | Total: 1.92s
Action std: 0.1961

[Timesteps: 30000]
Reward (mean over envs): 63.00 | Avg Reward (last 10): 80.50
Actor Loss: 0.0001 | Critic Loss: 0.9772 | Entropy Loss: 0.5898
Env Time: 1.07s | Train Time: 0.00s | Total: 1.20s
Action std: 0.2187

[Timesteps: 40000]
Reward (mean over envs): 232.00 | Avg Reward (last 10): 107.70
Actor Loss: 0.0186 | Critic Loss: 0.9392 | Entropy Loss: 0.5811
Env Time: 1.04s | Train Time: 0.29s | Total: 1.92s
Action std: 0.2258

[Timesteps: 50000]
Reward (mean over envs): 57.00 | Avg Reward (last 10): 107.00
Actor Loss: 0.0600 | Critic Loss: 0.9668 | Entropy Loss:

In [111]:
import gymnasium as gym
import torch
import numpy as np
from gymnasium.wrappers import RecordVideo
import os

# Evaluate the trained policy greedily on CartPole and record every episode.
video_folder = "./video"
os.makedirs(video_folder, exist_ok=True)

# RecordVideo needs frames, hence render_mode='rgb_array'; the trigger records all episodes.
env = RecordVideo(
    gym.make('CartPole-v1', render_mode='rgb_array'),
    video_folder=video_folder,
    episode_trigger=lambda e: True,
)

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

episode_rewards = []

for i in range(num_episodes):
    observation, _ = env.reset()
    episode_reward = 0
    done = False

    while not done:
        # Add a batch axis so the network sees shape [1, obs_dim].
        state = torch.tensor(observation, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            action_probs, _ = ppo_model.policy_net(state)
        # Greedy action (most probable) rather than sampling from the distribution.
        action = torch.argmax(action_probs, dim=1).item()

        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        done = terminated or truncated

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


  logger.warn(


Episode 1 Reward: 179.0
Episode 2 Reward: 473.0
Episode 3 Reward: 169.0
Episode 4 Reward: 251.0
Episode 5 Reward: 291.0
Episode 6 Reward: 200.0
Episode 7 Reward: 500.0
Episode 8 Reward: 166.0
Episode 9 Reward: 227.0
Episode 10 Reward: 240.0
Average Reward over 10 episodes: 269.6


In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class LinearValueEnv(gym.Env):
    """Toy environment whose reward at every step equals its observation.

    A scalar state is drawn uniformly from [-1, 1] on reset and stays fixed for
    the whole episode. Each step returns that state as the observation and its
    value as the reward; the action is ignored. The episode terminates after
    ``episode_length`` steps.

    Args:
        gamma: Stored on the instance; not used by the environment itself.
        episode_length: Number of steps before ``terminated`` becomes True.
    """

    def __init__(self, gamma=0.99, episode_length=100):
        super().__init__()
        self.gamma = gamma
        self.episode_length = episode_length
        self.current_step = 0

        # Observation: continuous scalar in [-1, 1]
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

        # Action: a single dummy discrete action; step() ignores it
        self.action_space = spaces.Discrete(1)

        self.state = None

    def reset(self, seed=None, options=None):
        """Draw a new state and return ``(observation, info)``."""
        # super().reset seeds self.np_random; sampling from it (instead of the
        # global numpy RNG) is what makes ``seed`` actually take effect.
        super().reset(seed=seed)
        self.state = self.np_random.uniform(-1.0, 1.0, size=(1,)).astype(np.float32)
        self.current_step = 0
        return self.state.copy(), {}

    def step(self, action):
        """Advance one step; returns ``(obs, reward, terminated, truncated, info)``."""
        # Reward is simply the state value
        reward = float(self.state[0])
        self.current_step += 1

        terminated = self.current_step >= self.episode_length
        truncated = False
        return self.state.copy(), reward, terminated, truncated, {}

    def render(self):
        """Print the current state."""
        print(f"State: {self.state}")

    def close(self):
        """Nothing to release."""
        pass




In [None]:
from gymnasium.envs.registration import register

# Register the toy environment under an id so it can be created by name.
# NOTE(review): re-running this cell registers the same id again — confirm the
# resulting override warning is acceptable.
register(
    id="LinearValue-v0",
    entry_point="__main__:LinearValueEnv",  # "module:attribute" form; assumes LinearValueEnv is defined in __main__ (script or notebook session)
    max_episode_steps=100
)


In [None]:
# Train on the toy value environment.
env_id = "LinearValue-v0"
num_episodes = 1000
max_steps = 500
lr = 1e-4
# PPOAgent is budgeted in environment steps, not episodes, and needs a positive
# rollout length. One segment per 100-step LinearValue episode keeps the
# original "num_episodes" intent.
num_steps = 100
total_timesteps = num_episodes * num_steps

ppo_model_value = PPOAgent(
    env_id,
    total_timesteps=total_timesteps,
    max_steps=max_steps,
    lr=lr,
    epsilon=0.2,
    num_envs=2,
    num_steps=num_steps,
)

# train() returns a single array of per-segment rewards.
rewards = ppo_model_value.train()
