# PPO with CleanRL

```{note}
Now that we studied the theory behind PPO, the best way to understand how it works is to implement it from scratch.<br>
Implementing an architecture from scratch is the best way to understand it, and it’s a good habit. We have already done it for a value-based method with Q-Learning and a Policy-based method with Reinforce.
```

## Env

In [1]:
import time
import gymnasium as gym

def make_env(env_id):
    def thunk():
        env = gym.make(env_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        return env
    
    return thunk

In [2]:
env_id = "CartPole-v1"
num_envs = 3
seed = 1

envs = gym.vector.SyncVectorEnv([make_env(env_id) for _ in range(num_envs)])
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"
envs

SyncVectorEnv(3)

## Agent

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class Agent(nn.Module):
    def __init__(self, envs):
        super().__init__()
        self.critic = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0),
        )
        self.actor = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, envs.single_action_space.n), std=0.01),
        )

    def get_value(self, x):
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        logits = self.actor(x)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(x)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = Agent(envs).to(device)

In [5]:
learning_rate = 2.5e-4
optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)

## Start the game

In [6]:
# the number of steps to run in each environment per policy rollout
num_steps = 128

# ALGO Logic: Storage setup
obs = torch.zeros((num_steps, num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((num_steps, num_envs) + envs.single_action_space.shape).to(device)
logprobs = torch.zeros((num_steps, num_envs)).to(device)
rewards = torch.zeros((num_steps, num_envs)).to(device)
dones = torch.zeros((num_steps, num_envs)).to(device)
values = torch.zeros((num_steps, num_envs)).to(device)

In [7]:
# start the game
global_step = 0
start_time = time.time()
next_obs, _ = envs.reset(seed=seed) # shape (num_envs, single_observation_space.shape)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(num_envs).to(device)

## Main loop

In [8]:
total_timesteps = 5000
num_minibatches = 4
batch_size = int(num_envs * num_steps)
minibatch_size = int(batch_size // num_minibatches)
num_iterations = total_timesteps // batch_size

gamma = 0.99
gae_lambda = 0.95  # Use GAE for advantage computation
global_step = 0

In [9]:
for iteration in range(1, num_iterations + 1):
    
    # fill obs, actions, logprobs, rewards, dones, values
    for step in range(num_steps):
        global_step += num_envs
        # s_t
        obs[step] = next_obs
        dones[step] = next_done
        
        # a_t, v_t
        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob

        # s_{t+1}, r_t
        next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
        next_done = np.logical_or(terminations, truncations)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)
        
        
    # advantage computation
    with torch.no_grad():
        # v_{T+1}
        next_value = agent.get_value(next_obs).reshape(1, -1)
        advantages = torch.zeros_like(rewards).to(device)
        lastgaelam = 0
        for t in reversed(range(num_steps)):
            if t == num_steps - 1:
                nextnonterminal = 1.0 - next_done
                nextvalues = next_value
            else:
                nextnonterminal = 1.0 - dones[t + 1]
                nextvalues = values[t + 1]
            # r_t + gamma * v_{t+1} - v{t}
            delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
            # update lastgaelam & compute advantages
            advantages[t] = lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
        # Q values
        returns = advantages + values

        
    # flatten the batch
    b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)
    
    # Optimizing the policy and value network
    update_epochs = 4
    clip_coef = 0.2
    ent_coef = 0.01
    vf_coef = 0.5
    max_grad_norm = 0.5
    
    b_inds = np.arange(batch_size)
    clipfracs = []
    for epoch in range(update_epochs):
        np.random.shuffle(b_inds)
        for start in range(0, batch_size, minibatch_size):
            end = start + minibatch_size
            mb_inds = b_inds[start:end]

            _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = logratio.exp()

            mb_advantages = b_advantages[mb_inds]

            # Policy loss
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Value loss
            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

            entropy_loss = entropy.mean()
            loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()
            
            if start == 0:
                print(iteration, epoch, loss.item())

1 0 15.195906639099121
1 1 14.586302757263184
1 2 15.095952987670898
1 3 16.335002899169922
2 0 10.235828399658203
2 1 8.145291328430176
2 2 9.919469833374023
2 3 8.310291290283203
3 0 14.384485244750977
3 1 9.557671546936035
3 2 11.550650596618652
3 3 13.196294784545898
4 0 15.475090980529785
4 1 11.329744338989258
4 2 11.575265884399414
4 3 12.216898918151855
5 0 17.111989974975586
5 1 20.04167366027832
5 2 18.0026912689209
5 3 17.139039993286133
6 0 17.672100067138672
6 1 17.039413452148438
6 2 20.281112670898438
6 3 17.588523864746094
7 0 13.127963066101074
7 1 13.810617446899414
7 2 11.982034683227539
7 3 14.697120666503906
8 0 19.245304107666016
8 1 18.96194839477539
8 2 18.185470581054688
8 3 17.695661544799805
9 0 22.96338653564453
9 1 23.906734466552734
9 2 21.928604125976562
9 3 22.954378128051758
10 0 16.186859130859375
10 1 17.45125961303711
10 2 19.65501594543457
10 3 15.642524719238281
11 0 16.93822479248047
11 1 15.163423538208008
11 2 13.592549324035645
11 3 13.92671775