In this assignment, we will use
* causality return
* value baseline

to perform policy gradient updates in the Lunar Lander environment.

You can start from the trainer implementation in the [class demo](https://colab.research.google.com/drive/1tClkELFzT9WH7GFs-98VXKupmS149qIf#scrollTo=sQfNraq6e1ZR) and modify it.
* Complete the return and advantage implementation.
* Update the trainer to call appropriate return/advantage computation for policy gradient updates.
* Implement the critic network, its loss function and back propagation logic.

Reinforcement learning is sensitive to hyperparameters. The Lunar Lander environment is considered solved at an episode return of 200. Our simple implementations should generally be able to reach returns > 100. Test hyperparameters such as learning rate, max episode length, number of epochs, number of steps per epoch, and the return/advantage computation to see which tweaks help.



In [1]:
#@title Install dependencies for LunarLander.
!apt-get install swig > /dev/null 2>&1
!pip3 install numpy > /dev/null 2>&1
!pip3 install pandas > /dev/null 2>&1
!pip3 install matplotlib > /dev/null 2>&1
!pip3 install seaborn > /dev/null 2>&1 
!pip3 install gymnasium > /dev/null 2>&1
!pip3 install gymnasium[box2d] > /dev/null 2>&1
!pip3 install gymnasium[classic-control] > /dev/null 2>&1
!pip3 install pygame > /dev/null 2>&1
!pip3 install gym-box2d > /dev/null 2>&1

zsh:1: no matches found: gymnasium[box2d]
zsh:1: no matches found: gymnasium[classic-control]


In [4]:
#@title Import required libraries.

import pandas as pd
import random
import seaborn as sns

from IPython import display
import matplotlib.pyplot as plt

from gymnasium.spaces import Discrete, Box

In [None]:
from torch import nn
from torch import optim
from torch.distributions import Categorical

import torch
import torch.nn.functional as F
import numpy as np
import gymnasium as gym


class ActorCriticNN(nn.Module):
    """Shared-trunk network with a policy head and a state-value head.

    The trunk is a stack of ``Linear`` + ``ReLU`` pairs whose widths are
    ``[input_size, *hidden_sizes]``. Two linear heads read the trunk output:
    one emits ``n_actions`` action logits, the other a single value estimate.
    """

    def __init__(self, input_size, hidden_sizes, n_actions) -> None:
        super().__init__()

        widths = [input_size, *hidden_sizes]
        trunk = []
        # Walk consecutive (in, out) width pairs to build the hidden stack.
        for fan_in, fan_out in zip(widths, widths[1:]):
            trunk.append(nn.Linear(fan_in, fan_out))
            trunk.append(nn.ReLU())

        self.backend = nn.Sequential(*trunk)

        feature_dim = widths[-1]
        self.action_layer = nn.Linear(in_features=feature_dim, out_features=n_actions)
        self.value_layer = nn.Linear(in_features=feature_dim, out_features=1)

    def forward(self, x):
        """Return ``(action_logits, state_values)`` for the input ``x``."""
        features = self.backend(x)
        return self.action_layer(features), self.value_layer(features)
    

class ReinforceNN(nn.Module):
    """Plain MLP policy network that outputs action logits only.

    Layer widths are ``[input_size, *hidden_sizes, n_actions]``. Every linear
    layer is followed by ``ReLU`` except the last one, which is followed by
    ``Identity`` so the outputs stay unbounded logits.
    """

    def __init__(self, input_size, hidden_sizes, n_actions) -> None:
        super().__init__()

        widths = [input_size, *hidden_sizes, n_actions]
        final_idx = len(widths) - 2
        modules = []
        for idx, (fan_in, fan_out) in enumerate(zip(widths, widths[1:])):
            modules.append(nn.Linear(fan_in, fan_out))
            # Only the output layer skips the non-linearity.
            modules.append(nn.Identity() if idx == final_idx else nn.ReLU())

        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Return ``(action_logits, None)``; there is no value estimate."""
        return self.model(x), None

In [5]:
# Trains a policy using policy gradient update.
class PolicyGradientTrainer():
    """Train a discrete-action policy with (undiscounted) policy gradient.

    ``method`` selects how each step's log-probability is weighted:

    * ``'reinforce'``: the whole-episode return, identical for every step.
    * ``'causality'``: the reward-to-go from that step onwards.
    * ``'advantage'``: the one-step TD advantage
      ``r_t + V(s_{t+1}) - V(s_t)`` using a learned critic, which is trained
      jointly by regressing ``V(s_t)`` onto ``r_t + V(s_{t+1})``.

    NOTE(review): newer Gymnasium releases appear to register Lunar Lander as
    'LunarLander-v3'; pass ``env_name`` explicitly if the default is rejected.
    """

    def __init__(self, env_name: str = 'LunarLander-v2', lr: float = 1e-2,
                 value_lr: float = 1e-3, max_eps_length: int = 500,
                 n_trajectories_per_epoch: int = 5000,
                 hidden_sizes: list[int] = [32], method: str = 'reinforce',
                 verbose: bool = False):
        """Create the environment, the policy (and critic) network, and the optimizer.

        Args:
            env_name: Gymnasium environment id; must have a discrete action space.
            lr: Learning rate for the policy parameters (and shared trunk).
            value_lr: Learning rate for the critic head; only used when
                ``method == 'advantage'``.
            max_eps_length: Maximum number of steps sampled per trajectory.
            n_trajectories_per_epoch: Trajectories sampled per gradient step.
            hidden_sizes: Widths of the hidden layers.
            method: One of ``'reinforce'``, ``'causality'``, ``'advantage'``.
            verbose: Print the mean episode return after every epoch.

        Raises:
            ValueError: If ``method`` is unknown, or if ``max_eps_length`` or
                ``n_trajectories_per_epoch`` is smaller than 1.
        """
        if method not in {'reinforce', 'causality', 'advantage'}:
            raise ValueError('Unsupported method.')
        if max_eps_length < 1 or n_trajectories_per_epoch < 1:
            raise ValueError(
                'max_eps_length and n_trajectories_per_epoch must be >= 1.')

        # Copy so the shared default list can never be aliased or mutated.
        hidden_sizes = list(hidden_sizes)

        self.method = method
        self.env = gym.make(env_name, render_mode="rgb_array")
        obs_dim = self.env.observation_space.shape[0]
        n_acts = self.env.action_space.n

        if method == 'advantage':
            self.model = ActorCriticNN(obs_dim, hidden_sizes, n_acts)
            # Give the critic head its own learning rate; everything else
            # (shared trunk + policy head) uses the policy learning rate.
            critic_params = list(self.model.value_layer.parameters())
            critic_ids = {id(p) for p in critic_params}
            actor_params = [p for p in self.model.parameters()
                            if id(p) not in critic_ids]
            self.optimizer = optim.Adam(
                [{'params': actor_params},
                 {'params': critic_params, 'lr': value_lr}],
                lr=lr,
            )
        else:
            self.model = ReinforceNN(obs_dim, hidden_sizes, n_acts)
            self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

        self.max_eps_length = max_eps_length
        self.n_trajectories_per_epoch = n_trajectories_per_epoch
        self.verbose = verbose

    @staticmethod
    def _rewards_to_go(rewards):
        """Return the list whose t-th entry is ``sum(rewards[t:])``."""
        returns = []
        running = 0.0
        for r in reversed(rewards):
            running += r
            returns.append(running)
        # The accumulation ran backwards in time; restore chronological order
        # so that returns[t] lines up with the action taken at step t.
        returns.reverse()
        return returns

    @staticmethod
    def _td_targets_and_advantages(rewards, values, bootstrap_value):
        """Compute one-step TD advantages and critic targets for a trajectory.

        Args:
            rewards: Per-step rewards ``r_t``.
            values: Per-step critic estimates ``V(s_t)``.
            bootstrap_value: Estimate of the value after the last step
                (0 when the episode ended).

        Returns:
            ``(advantages, targets)`` as numpy arrays, where
            ``targets[t] = r_t + V(s_{t+1})`` and
            ``advantages[t] = targets[t] - V(s_t)``.
        """
        rewards = np.asarray(rewards, dtype=np.float64)
        values = np.asarray(values, dtype=np.float64)
        next_values = np.append(values[1:], bootstrap_value)
        targets = rewards + next_values
        return targets - values, targets

    def _state_value(self, obs):
        """Return the critic's value for ``obs`` as a float (0.0 without a critic)."""
        with torch.no_grad():
            _, value = self.model(torch.as_tensor(obs, dtype=torch.float32))
        return 0.0 if value is None else value.item()

    def sample_step(self, obs):
        """Sample one action from the current policy and step the environment.

        Returns:
            ``(action, reward, done, value, next_obs)`` where ``done`` is true
            when the environment reported termination or truncation and
            ``value`` is the critic estimate for ``obs`` (0.0 without a critic).
        """
        # Rollouts are only used to collect data; gradients are recomputed in
        # train_epoch, so tracking them here would just waste memory.
        with torch.no_grad():
            logits, value = self.model(torch.as_tensor(obs, dtype=torch.float32))
        action = Categorical(logits=logits).sample().item()
        obs_prime, rew, terminated, truncated, _ = self.env.step(action)
        done = terminated or truncated
        value = 0.0 if value is None else value.item()

        return action, rew, done, value, obs_prime

    def sample_trajectory(self, n=500):
        """Roll out one episode of at most ``min(n, max_eps_length)`` steps.

        Returns:
            ``(obs, actions, rewards, returns, advantages, value_targets)``,
            all aligned per time step. ``returns`` are rewards-to-go;
            ``advantages`` and ``value_targets`` are one-step TD quantities.
        """
        obs, _ = self.env.reset()

        tj_obs = []
        tj_actions = []
        tj_rewards = []
        tj_values = []

        done = False
        for _ in range(min(n, self.max_eps_length)):
            action, reward, done, value, obs_prime = self.sample_step(obs)
            tj_obs.append(obs)
            tj_actions.append(action)
            tj_rewards.append(reward)
            tj_values.append(value)
            obs = obs_prime

            if done:
                break

        tj_returns = self._rewards_to_go(tj_rewards)

        # If the rollout was cut off by the step budget rather than by the
        # environment, bootstrap from the critic's estimate of the state we
        # stopped in; a finished episode has no future value.
        bootstrap = 0.0 if done else self._state_value(obs)
        tj_adv, tj_exp_values = self._td_targets_and_advantages(
            tj_rewards, tj_values, bootstrap)

        return (tj_obs, tj_actions, tj_rewards, tj_returns, tj_adv, tj_exp_values)

    def compute_loss(self, log_ps, values, A, exp_values):
        """Return the policy-gradient loss, plus the critic MSE when available.

        Args:
            log_ps: Log-probabilities of the taken actions.
            values: Critic predictions, or ``None`` when there is no critic.
            A: Per-step weights (return, reward-to-go, or advantage).
            exp_values: Regression targets for ``values``; ignored when
                ``values`` is ``None``.
        """
        loss_action = -1 * (log_ps * A).mean()
        if values is None:
            return loss_action

        loss_value = F.mse_loss(values, exp_values, reduction='mean')
        return loss_action + loss_value

    def train_epoch(self, i_epoch: int = 0):
        """Sample a batch of trajectories and take one gradient step.

        Args:
            i_epoch: Epoch index, used only for the verbose log line.

        Returns:
            The scalar loss of this update.
        """
        epoch_obs = []
        epoch_actions = []
        epoch_weights = []
        epoch_exp_values = []
        epoch_tj_total_return = []

        for _ in range(self.n_trajectories_per_epoch):
            (tj_obs, tj_actions, tj_rewards, tj_returns,
             tj_adv, tj_exp_values) = self.sample_trajectory(self.max_eps_length)
            total_return = np.sum(tj_rewards)

            # Pick the per-step weight that matches the configured method.
            if self.method == 'advantage':
                tj_weights = tj_adv
            elif self.method == 'causality':
                tj_weights = tj_returns
            else:
                tj_weights = [total_return] * len(tj_rewards)

            epoch_obs.extend(tj_obs)
            epoch_actions.extend(tj_actions)
            epoch_weights.extend(tj_weights)
            epoch_exp_values.extend(tj_exp_values)
            epoch_tj_total_return.append(total_return)

        if self.verbose:
            print(f'epoch={i_epoch} mean return', np.mean(epoch_tj_total_return))

        epoch_obs_tensor = torch.as_tensor(np.array(epoch_obs), dtype=torch.float32)
        epoch_actions_tensor = torch.as_tensor(np.array(epoch_actions), dtype=torch.int64)
        epoch_weights_tensor = torch.as_tensor(np.array(epoch_weights), dtype=torch.float32)
        epoch_exp_values_tensor = torch.as_tensor(np.array(epoch_exp_values), dtype=torch.float32)

        self.optimizer.zero_grad()
        logits, values = self.model(epoch_obs_tensor)
        log_ps = Categorical(logits=logits).log_prob(epoch_actions_tensor)
        if values is not None:
            # Drop only the trailing size-1 dim so a batch of one stays 1-D.
            values = values.squeeze(-1)
        loss = self.compute_loss(log_ps, values, epoch_weights_tensor, epoch_exp_values_tensor)
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def train(self, epochs=50):
        """Run ``epochs`` training epochs and return the list of per-epoch losses."""
        epoch_losses = []
        for i in range(1, epochs+1):
            epoch_loss = self.train_epoch(i)
            epoch_losses.append(epoch_loss)
        return epoch_losses

    def eval(self, render_every: int = 1):
        """Evaluate the policy for one episode (not implemented yet; returns None)."""
        pass

In [7]:
# Smoke test: run a single tiny training epoch with the advantage method and
# print the resulting loss.
trainer = PolicyGradientTrainer(
    max_eps_length=5,
    n_trajectories_per_epoch=5,
    method='advantage',
)
# train_epoch takes the epoch index (used for logging) as its argument.
loss = trainer.train_epoch(1)
print(loss)

DependencyNotInstalled: Box2D is not installed, run `pip install gymnasium[box2d]`