In [1]:
# Guide: https://youtu.be/HR8kQMTO8bk
# PPO paper: https://arxiv.org/pdf/1707.06347.pdf

In [2]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch

from torch import nn
from torch import optim
from torch.distributions.categorical import Categorical

In [3]:
DEVICE = 'cpu'

In [4]:
## Policy and value model
class ActorCriticNetwork(nn.Module):
    def __init__(self, obs_space_size, action_space_size):
        super().__init__()
        
        # Params: An Observation
        # Returns: Features
        self.feature_layers = nn.Sequential(
            nn.Linear(obs_space_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )
        
        # Params: Features
        # Returns: An Action - according to agent's policy
        self.policy_layers = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_space_size)
        )
        
        # Params: Features
        # Returns: 1 integer denoting how valuable the current state is
        self.value_layers = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def value(self, obs):
        feat = self.feature_layers(obs)
        value = self.value_layers(feat)
        return value

    def policy(self, obs):
        feat = self.feature_layers(obs)
        policy_logits = self.policy_layers(feat)
        return policy_logits

    def forward(self, obs):
        feat = self.feature_layers(obs)
        policy_logits = self.policy_layers(feat)
        value = self.value_layers(feat)
        return policy_logits, value

In [5]:
# PPO Trainer
class PPO():
    def __init__(self,
                 actorCritic,
                 epsilon = 0.2,
                 target_kl_div = 0.01,
                 max_policy_iters = 69,
                 value_iters = 69,
                 policy_lr = 3e-4,
                 value_lr = 3e-4):
        self.ac = actorCritic
        self.epsilon = epsilon
        self.target_kl_div = target_kl_div
        self.max_policy_iters = max_policy_iters
        self.value_iters = value_iters
        
        policy_params = list(self.ac.feature_layers.parameters()) + list(self.ac.policy_layers.parameters())       
        self.policy_optim = optim.Adam(policy_params, lr = policy_lr)
        
        value_params  = list(self.ac.feature_layers.parameters()) + list(self.ac.value_layers.parameters())
        self.value_optim = optim.Adam(value_params, lr = value_lr)
        
    '''
    PPO formula:
        L^CPI(theta)  = E_t[ pi_theta (a_t|s_t) / pi_theta(old) (a_t|s_t) * A_hat_t]
                      = E_t[ratio_t(theta) * A_hat_t]

        L^CLIP(theta) = E_t[min( ratio_t(theta) * A_hat_t,
                                 clip(ratio_t(theta), 1-epsilon, 1+epsilon) * A_hat_t )]
    '''
    def train_policy(self, obs, acts, old_log_probs, advantages):
        for _ in range(self.max_policy_iters):
            self.policy_optim.zero_grad()

            new_logits = self.ac.policy(obs)
            new_logits = Categorical(logits = new_logits)
            new_log_probs = new_logits.log_prob(acts)

            ratio = torch.exp(new_log_probs - old_log_probs)
#           pi_theta / pi_theta(old)
#           = e^ln(pi_theta) / e^ln(pi_theta(old)) 
#           = e^[ln(pi_theta) - ln(pi_theta(old))]

            clipped_ratio = ratio.clamp(1 - self.epsilon, 1 + self.epsilon)

            ratio_loss = ratio * advantages
            clipped_loss = clipped_ratio * advantages

            neg_L_CLIP = -torch.min(ratio_loss, clipped_loss).mean()
#             We minimize (-L^CLIP) instead of maximizing (L^CLIP)
#        cause I only know how to minimize a loss funciton using pyTorch

            neg_L_CLIP.backward()
            self.policy_optim.step()

#           Calculates the KL_divergence, check if we've updated the policy enough already
            kl_div = (old_log_probs - new_log_probs).mean()
            if kl_div >= self.target_kl_div:
                break

    def train_value(self, obs, returns):
        for _ in range(self.value_iters):
            self.value_optim.zero_grad()

            values = self.ac.value(obs)
            value_loss = (returns - values)**2
            value_loss = value_loss.mean()

            value_loss.backward()
            self.value_optim.step()

In [6]:
# Helper functions
def discount_rewards(rewards, gamma = 0.99):
    '''
    discount_R(t) = sum_{i=0}^{inf} gamma^i * R(t+i)
                  = R(t) + sum_{i=1}^{inf} gamma^i * R(t+i)
                  = R(t) + gamma * discount_R(t+1)
    '''
    
    new_rewards = [float(rewards[-1])]
    for t in reversed(range(len(rewards)-1)):
        new_rewards.append(float(rewards[t]) + gamma * new_rewards[-1])
    return np.array(new_rewards[::-1])

def advantage_estimates(rewards, values, gamma = 0.99):
    '''
    A_hat_t = sum_{i=0}^{T} gamma^i delta_t+i
            = delta_t + sum_{i=1}^{T} gamma^i delta_t+i
            = delta_t + gamma * A_hat_t+1
    where delta_t = R(t) + (gamma)V(s_t+1) - V(s_t)
    '''
    advantage = [float(rewards[-1] + gamma * 0 - values[-1])]
    for t in reversed(range(len(rewards)-1)):
        delta_t = float(rewards[t] + gamma * values[t+1] - values[t])
        advantage.append(delta_t + gamma * advantage[-1])
    return np.array(advantage[::-1])

In [7]:
def run_episode(model, env, max_steps = 201):
    """
    Params: model + environment

    Run's an episode 
    with at most (max_steps) steps
    and actions taken probabilistically from current (old) policy 

    Returns: Training data - (number_of_steps, observations)
            + Total Rewards
        
    Does NOT do any training.
    """
    
    train_data = [ [], [], [], [], [] ]
    # obs, act, reward, advantage_estimates, act_log_probs
    obs = env.reset()
    ep_reward = 0
    
    for step in range(max_steps):
        # Act according to current policy
        logits, val = model(torch.tensor([obs], 
                                         dtype = torch.float32, device = DEVICE))
        
        # Gets a categorical distribution of potential actions
        act_distribution = Categorical(logits = logits)
        
        # Pick an action according to the distribution
        act = act_distribution.sample()
        # How likely was that action chosen
        act_log_prob = act_distribution.log_prob(act).item()
        
        act = act.item()
        val = val.item()
        
        nxt_obs, reward, done, _ = env.step(act)
        
        # Records data for training
        for i, item in enumerate((obs, act, reward, val, act_log_prob)):
            train_data[i].append(item)
        
        obs = nxt_obs
        ep_reward += reward        
        if done:
            break
    
    # Train data filtering & formatting
    train_data = [np.asarray(data) for data in train_data]
    train_data[3] = advantage_estimates(train_data[2], train_data[3])
    
    return train_data, ep_reward

In [8]:
env = gym.make('CartPole-v0')
#   Note:
# CartPole's activation space is a Descrete Value, 
# as such 'env.action_space.shape' returns '()'

model = ActorCriticNetwork(env.observation_space.shape[0], env.action_space.n)
model = model.to(DEVICE)

In [9]:
# Test run an episode
train_data, reward = run_episode(model, env)
# assert(np.array(train_data[0]).shape[1] == env.observation_space.shape[0])

  logits, val = model(torch.tensor([obs],


In [10]:
### Init PPO and params
n_episodes = 100
print_frequency = n_episodes // 5
np.random.seed(69)

ppo = PPO(model,
         epsilon = 0.2,
         target_kl_div = 0.02,
         max_policy_iters = 40,
         value_iters = 40,
         policy_lr = 3,
         value_lr = 3
         )

In [11]:
## Training loop
ep_rewards = []
for episode in range(n_episodes):
    # Perform rollout
    train_data, returns = run_episode(model, env)
#     train_data: obs, act, reward, value, act_log_probs
    ep_rewards.append(reward)
    
    permute_idxs = np.random.permutation(len(train_data[0]))
    # Policy data formatting
    obs = torch.tensor(train_data[0][permute_idxs], 
                       dtype = torch.float32, device = DEVICE)
    acts = torch.tensor(train_data[1][permute_idxs], 
                        dtype = torch.int32, device = DEVICE)
    advantages = torch.tensor(train_data[3][permute_idxs], 
                              dtype = torch.float32, device = DEVICE)
    act_log_probs = torch.tensor(train_data[4][permute_idxs], 
                                 dtype = torch.float32, device = DEVICE)
    
    # Value data formatting
    returns = discount_rewards(train_data[2])[permute_idxs]
    returns = torch.tensor(returns,
                           dtype = torch.float32, device = DEVICE)
    
    # Train model
    ppo.train_policy(obs, acts, act_log_probs, advantages)
    ppo.train_value(obs, returns)
    
    if (episode == 0) or ((episode+1) % print_frequency == 0) or (episode == n_episodes-1):
        print(f'Episode {episode+1} | Avg reward {np.mean(ep_rewards[-print_frequency:])}')

Episode 1 | Avg reward 12.0
Episode 20 | Avg reward 12.0
Episode 40 | Avg reward 12.0
Episode 60 | Avg reward 12.0
Episode 80 | Avg reward 12.0
Episode 100 | Avg reward 12.0


In [12]:
'''
PPO formula:
    L^CPI(theta)  = E_t[ pi_theta (a_t|s_t) / pi_theta(old) (a_t|s_t) * A_hat_t]
                  = E_t[ratio_t(theta) * A_hat_t]
    
    L^CLIP(theta) = E_t[min( ratio_t(theta) * A_hat_t,
                             clip(ratio_t(theta), 1-epsilon, 1+epsilon) * A_hat_t )]
'''
pass