# PPO

### Setup

In [85]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_
import numpy as np

import random
import math
import time

import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output


import gymnasium as gym

In [104]:
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
)

N_OBSERVATIONS = env.observation_space.shape[0]
N_ACTIONS = env.action_space.n

TEST = False
TRAIN = False
SAVE = False

### Hyperparameters

In [86]:
# Hyperparameters
N_AGENTS = 8
T = 512
GAMMA = 0.99
EPSILON = 0.2
H = 0.01 # Entropy constant
V = 1.0 # Value constant
LR = 3e-4
BATCH_SIZE = 64
N_EPOCHS = 4
N_TRAIN_LOOPS = 200
MAX_GRAD_NORM = 0.5
layer_dim = 128

### Model

In [103]:
class PolicyAndValueNetwork(nn.Module):
    def __init__(self, n_observations, n_actions, layer_dim):
        super().__init__()
        self.shared_layers = nn.Sequential(
            nn.Linear(n_observations, layer_dim),
            nn.ReLU()
        )
        self.policy_head = nn.Sequential(
            nn.Linear(layer_dim, layer_dim),
            nn.ReLU(),
            nn.Linear(layer_dim, n_actions)
        )
        self.value_head = nn.Sequential(
            nn.Linear(layer_dim, layer_dim),
            nn.ReLU(),
            nn.Linear(layer_dim, 1)
        )

    def forward(self, x):
        shared_output = self.shared_layers(x)
        action_logits = self.policy_head(shared_output)
        value = self.value_head(shared_output)
        return torch.cat([action_logits, value], dim=-1)

### Select action

Select action based on action logits.

In [88]:
def select_action(action_logits):
    action_probs = torch.softmax(action_logits, dim=0)
    action = torch.multinomial(action_probs, num_samples=1).item()
    return action

### PPO Dataset

In [128]:
class PPODataset(Dataset):
	def __init__(self, n_agents, n_timesteps, n_observations, n_actions, gamma):
		# Store variables
		self.n_agents = n_agents
		self.n_timesteps = n_timesteps
		self.n_observations = n_observations
		self.n_actions = n_actions
		self.gamma = gamma

		# Create tensors
		self.states = torch.zeros((n_agents, n_timesteps, n_observations))
		self.actions = torch.zeros((n_agents, n_timesteps))
		self.rewards = torch.zeros((n_agents, n_timesteps))
		self.logprobs = torch.zeros((n_agents, n_timesteps, n_actions))
		self.target_values = torch.zeros((n_agents, n_timesteps))
		self.advantages = torch.zeros((n_agents, n_timesteps))

		# Store episode ends
		self.episode_ends = [[] for _ in range(n_agents)]

	def add_step(self, agent, t, state, action, reward, logprob):
		self.states[agent, t] = state
		self.actions[agent, t] = action
		self.rewards[agent, t] = reward
		self.logprobs[agent, t] = logprob

	def mark_episode_end(self, agent, t):
		self.episode_ends[agent].append(t)

	def compute_advantages_and_target_values(self, model):
		with torch.no_grad():
			for agent in range(self.n_agents):
				# Iterate from last to first
				for t in range(self.n_timesteps-1, -1, -1):
					# Get step and reward and compute predicted value
					state = self.states[agent, t]
					reward = self.rewards[agent, t]
					pred_value = model(state)[-1]

					# If step is terminal
					if t in self.episode_ends[agent]:
						target_value = reward
					# If step is last but not terminal
					elif t == self.n_timesteps - 1:
						target_value = pred_value
					# non-terminal non-last step
					else:
						# Get previous target value (of t+1) and compute
						previous_target_value = self.target_values[agent][t+1]
						target_value = reward + self.gamma * previous_target_value
					
					# Compute advantage
					advantage = target_value - pred_value

					# Store target value and advantage
					self.target_values[agent, t] = target_value
					self.advantages[agent, t] = advantage

	def process_data(self):
		# Flatten all tensors but keep last dimension of states and lobprobs
		self.states = self.states.view(-1, self.n_observations)
		self.actions = self.actions.view(-1)
		self.rewards = self.rewards.view(-1)
		self.logprobs = self.logprobs.view(-1, self.n_actions)
		self.target_values = self.target_values.view(-1)
		self.advantages = self.advantages.view(-1)

	def __len__(self):
		return self.n_agents * self.n_timesteps
	
	def __getitem__(self, i):
		# Don't use before calling compute_advantages_and_target_values and flatten_data
		state = self.states[i]
		action = self.actions[i]
		logprob = self.logprobs[i]
		target_value = self.target_values[i]
		advantage = self.advantages[i]

		return state, action, logprob, target_value, advantage

### Loss function(s)

We have a 3-in-1 loss function. We want to minimize

- -L_CLIP  + V * L_V - H * L_H

I'll write each of the separate functions then combine them in one that will be backpropagated.

In [92]:
def clipped_objective_fn(log_probs, pi_batch, action_batch, advantage_batch):
    # Compute pi_new
    pi_new = log_probs[torch.arange(len(action_batch)), action_batch]

    # Compute p_ratio, ratio of new and old probabilities
    # note that it's unnormalized
    p_ratio = torch.exp(pi_new - pi_batch)

    # Compute unclipped and clipped surrogate objectives
    unclipped_surrogate_objective = p_ratio * advantage_batch
    clipped_surrogate_objective = torch.clamp(p_ratio, 1. - EPSILON, 1. + EPSILON) * advantage_batch

    # Compute elementwise minimum of two and return
    clipped_objective = torch.mean(torch.min(unclipped_surrogate_objective, clipped_surrogate_objective))
    return clipped_objective

In [93]:
def entropy_bonus_fn(log_probs):
    # Compute probabilities
    probs = torch.exp(log_probs)

    # Compute and return entropy
    entropy = -torch.mean(torch.sum(probs * log_probs, dim=-1))
    return entropy

In [94]:
def value_loss_fn(pred_values, target_value_batch):
    # Compute and return loss
    smoothl1 = nn.SmoothL1Loss(reduction='mean')
    loss = smoothl1(pred_values, target_value_batch)
    return loss

In [95]:
def loss_fn(model, state_batch, action_batch, pi_batch, target_value_batch, advantage_batch):
    # Get pred log probs and values
    output = model(state_batch)
    log_probs, pred_values = torch.log_softmax(output[:, :-1], dim=-1), output[:, -1]

    # Compute individual losses
    clipped_objective = clipped_objective_fn(log_probs, pi_batch, action_batch, advantage_batch)
    value_loss = value_loss_fn(pred_values, target_value_batch)
    entropy_bonus = entropy_bonus_fn(log_probs)

    # Compute and return total loss
    loss = -clipped_objective + V * value_loss - H * entropy_bonus
    return loss, clipped_objective.item(), V * value_loss.item(), H * entropy_bonus.item()

### Train loop

Not complete but almost the only thing left is to fix and add wrappers to the envs.

In [100]:
def train_loop(model, env, loss_fn, optimizer, batch_size, n_epochs, n_trainsteps, n_agents, n_timesteps, n_observations, n_actions, gamma):
	losses = []
	avg_rewards = []
	
	# ADD CODE TO TRANSFORM ENVIRONMENT INTO PYTORCH WHATEVER AND MAKE COPIES
	# Initilize all environments
	envs = [[env] for _ in range(n_agents)]
	for agent_env in envs:
		state, _ = agent_env[0].reset()
		terminated, truncated = False, False
		agent_env += [state, terminated, truncated]
			
	for i in range(n_trainsteps):
		# Initialize dataset
		dataset = PPODataset(n_agents, n_timesteps, n_observations, n_actions, gamma)

		# Collect data
		with torch.no_grad():
			for agent in range(n_agents):
				env = envs[agent][0]
				state, terminated, truncated = envs[agent][1:]
	
				for t in range(n_timesteps):
					# Compute and split model output
					output = model(state)
					action_logits = output[:-1]

					# Select and perform action
					action = select_action(action_logits)
					next_state, reward, terminated, truncated, _ = env.step(action)

					# Compute pi
					logprob = torch.log_softmax(action_logits, dim=-1)[action]

					# Store data
					dataset.add_step(agent, t, state, action, reward, logprob)

					# If terminated reset env and mark end, otherwise update state
					if terminated or truncated:
						dataset.mark_episode_end(agent, t)
						state, _ = env.reset()
						terminated, truncated = False, False
					else:
						state = next_state

		# Compute target values and advantages and process data
		dataset.compute_advantages_and_target_values(model)
		dataset.process_data()

		# Create dataloader
		dataloader = DataLoader(
			dataset=dataset,
			batch_size=batch_size,
			shuffle=True,
		)

		# Train model on policy for N_EPOCHS
		for _ in range(n_epochs):
			for state_batch, action_batch, pi_batch, target_value_batch, advantage_batch in dataloader:
				# Compute loss and optimize
				loss, clipped_objective, value_loss, entropy_bonus = loss_fn(
					model,
					state_batch,
					action_batch,
					pi_batch,
					target_value_batch,
					advantage_batch
				)
				loss.backward()
				# Clip gradients
				clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
				optimizer.step()
				optimizer.zero_grad()

				# Append losses
				losses.append(loss.item())

	return losses, avg_rewards

### Train

In [96]:
model = PolicyAndValueNetwork()
optimizer = optim.Adam(model.parameters())

In [97]:
def update_plot(data, title="", xlabel="", ylabel="", grid=True, sleep=0.01):
    clear_output(wait=True)
    plt.plot(data)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(grid)
    plt.show()
    time.sleep(sleep)

In [101]:
if TRAIN:
    losses, avg_rewards = train_loop(model, loss_fn, optimizer)

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ...

KeyboardInterrupt: 

In [77]:
plt.plot(losses, label="losses")
plt.legend()
plt.grid(True)
plt.show()

NameError: name 'losses' is not defined

In [122]:
test_env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode='human'
)

test_model = PolicyAndValueNetwork(N_OBSERVATIONS, N_ACTIONS, 128)
test_model.load_state_dict(torch.load('best_lunar_lander.pth'))

n = 1
total_rewards = 0.
for _ in range(n):
    state, info = test_env.reset()
    terminated, truncated = False, False
    while not (terminated or truncated):
        tensor_state = torch.from_numpy(state)
        logits = test_model(tensor_state)
        action_logits = logits[:-1]
        action = select_action(action_logits)
        state, reward, terminated, truncated, info = test_env.step(action)
        total_rewards += reward

print(total_rewards / n)
test_env.close()

  test_model.load_state_dict(torch.load('best_lunar_lander.pth'))


300.6601765585789
