In [1]:
import os
import random
import warnings
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output

import tensorflow as tf
from tensorflow import keras
from keras import layers

# Number of outer training iterations; each one collects ITERS environment
# steps and then runs a PPO update on them.
EPISODE = 1000
# Optimisation passes over the collected rollout per update.
EPOCHS = 4
# Environment steps collected per rollout (also the rollout buffer capacity).
ITERS = 64
# Transitions per minibatch gradient step.
BATCH_SIZE = 16
# Discount factor passed to the agent.
GAMMA = 0.99
# GAE smoothing coefficient passed to the agent.
LAMBDA = 0.95
# PPO clip range: the probability ratio is clipped to [1 - EPSILON, 1 + EPSILON].
EPSILON = 0.1
# Small numerical-stability constant.
# NOTE(review): not referenced anywhere in the visible code, which hardcodes 1e-8.
EPS = 1e-8
# Compared against the latest running reward in the training cell to decide
# when to stop training early.
TEST_EPISODE = 50

# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
class PPOBuffer:
    """Fixed-capacity storage for the transitions of one PPO rollout.

    Transitions are written sequentially with `store` and read back by index
    with `sample`. The underlying arrays are preallocated and reused;
    `reset_buffer` only rewinds the write cursor.

    Args:
        maxlen: maximum number of transitions the buffer can hold.
        state_shape: number of features in a (flat) state vector.
    """

    def __init__(self, maxlen, state_shape):
        self.maxlen = maxlen
        # Next free slot; equals the number of transitions stored since the last reset.
        self.idx = 0

        self.states = np.zeros((maxlen, state_shape))
        self.actions = np.zeros(maxlen, dtype=np.int32)
        self.log_probs = np.zeros(maxlen, dtype=np.float32)
        # One slot longer than the other arrays: the final entry is reserved for
        # the bootstrap value of the state that follows the last stored step.
        self.critic_values = np.zeros(maxlen+1, dtype=np.float32)
        self.rewards = np.zeros(maxlen)
        # 1.0 while the episode continues, 0.0 on the step that ended it.
        self.mask_done_ep = np.zeros(maxlen)

    def store(self, state, action, log_prob, critic_value, reward, done):
        """Append one transition at the current write position.

        Raises:
            IndexError: if the buffer already holds `maxlen` transitions.
        """
        if self.idx >= self.maxlen:
            raise IndexError(
                f"PPOBuffer is full (maxlen={self.maxlen}); "
                "call reset_buffer() before storing more transitions"
            )

        self.states[self.idx] = state
        self.actions[self.idx] = action
        self.log_probs[self.idx] = log_prob
        self.critic_values[self.idx] = critic_value
        self.rewards[self.idx] = reward
        self.mask_done_ep[self.idx] = 1 - int(done)
        self.idx += 1

    def reset_buffer(self):
        """Rewind the write cursor so the next rollout overwrites the old data."""
        self.idx = 0

    def sample(self, idxs):
        """Return the states, action indices and log-probabilities at `idxs`.

        Returns:
            states_batch: array of the selected states.
            actions_batch: list of `[row, action]` pairs, where `row` is the
                position inside the batch; this is the index layout expected by
                `tf.gather_nd` to pick the taken action's probability per row.
            log_probs_batch: array of the stored log-probabilities.
        """
        states_batch = self.states[idxs]
        # this structure is used to gather the action probabilities below
        actions_batch = [[i,  self.actions[idx]] for i,idx in enumerate(idxs)]
        log_probs_batch =  self.log_probs[idxs]

        return states_batch, actions_batch, log_probs_batch

    def __len__(self):
        """Number of transitions stored since the last reset."""
        # The original returned len(self.buffer), an attribute that never existed.
        return self.idx

In [3]:
class PPO:
    """Discrete-action PPO agent with a shared actor-critic network.

    The network has a common dense trunk and two heads: a softmax policy over
    `n_actions` and a scalar state-value estimate.

    Args:
        state_shape: number of features in a (flat) state vector.
        n_actions: number of discrete actions.
        hidden_units: sizes of the dense hidden layers, in order.
        optimizer: Keras optimizer used to apply the gradients.
        epsilon: clip range; the probability ratio is clipped to
            [1 - epsilon, 1 + epsilon].
        gamma: discount factor.
        llambda: GAE smoothing coefficient.
    """

    def __init__(self, 
                 state_shape,
                 n_actions,
                 hidden_units,
                 optimizer,
                 epsilon, 
                 gamma,
                 llambda
    ):
        
        # ppo parameters
        self.clip_max = 1 + epsilon
        self.clip_min = 1 - epsilon
        self.c1 = 1      # weight of the critic (value) loss
        self.c2 = 0.01   # weight of the entropy bonus
        self.gamma = gamma
        self.llambda = llambda
        
        # policy-critic network
        self.n_actions = n_actions
        self.actor_critic = self.get_actor_critic_model(state_shape, n_actions, hidden_units)
        self.optim = optimizer
        
    def get_actor_critic_model(self, state_shape, n_actions, hidden_units):
        """Build the shared-trunk model returning `[action_probs, state_value]`."""
        inputs = layers.Input(shape=(state_shape,))
        x = inputs
        # One ReLU layer per entry; for a two-entry list this is the same
        # network the original hard-coded.
        for units in hidden_units:
            x = layers.Dense(units, activation='relu')(x)
        actor = layers.Dense(n_actions, activation='softmax', name='actor')(x)
        critic = layers.Dense(1, activation='linear', name='critic')(x)

        return keras.Model(inputs=inputs, outputs=[actor,critic])


    def select_action(self, state):
        """Sample an action from the current policy for a single state.

        Returns:
            (action, action_probs, action_log_probs, critic_value), where the
            last three are tensors with the batch dimension squeezed out.
        """
        state = tf.expand_dims(state, 0)
        
        actions_prob, critic_val = self.actor_critic(state)
        action = np.random.choice(self.n_actions, p=np.squeeze(actions_prob.numpy()))
        
        return action, tf.squeeze(actions_prob), tf.math.log(tf.squeeze(actions_prob)), tf.squeeze(critic_val)
    
    def compute_advantages(self, rewards, critic_values, masks_done_ep, last_state):
        """Compute Generalised Advantage Estimates for one rollout.

        Args:
            rewards: per-step rewards, length T.
            critic_values: value estimates, length T + 1. The last entry is
                OVERWRITTEN IN PLACE with the critic's value of `last_state`.
            masks_done_ep: per-step mask, 0 where the episode ended, else 1.
            last_state: state following the final stored step (bootstrap).

        Returns:
            float32 array of length T with the advantages.
        """
        critic_values[-1] = tf.squeeze(self.actor_critic(tf.expand_dims(last_state, 0))[1]).numpy()
        # TD residuals; the mask removes the bootstrap term across episode ends.
        deltas = rewards + self.gamma * critic_values[1:] * masks_done_ep - critic_values[:-1]
        a_t = 0
        advantages = np.zeros_like(rewards, dtype=np.float32)
        # Backward recursion: A_t = delta_t + gamma * lambda * mask_t * A_{t+1}.
        for t in reversed(range(rewards.shape[0])):
            a_t = deltas[t] + self.gamma * self.llambda * masks_done_ep[t] * a_t
            advantages[t] = a_t
        
        return advantages
    
    @tf.function
    def train(self, states, actions, log_probabilities, advantages, returns):
        """Run one gradient step of the clipped PPO objective on a minibatch.

        Args:
            states: batch of states.
            actions: `[row, action]` index pairs used with `tf.gather_nd` to
                select the taken action's probability in each row.
            log_probabilities: log-probabilities of the taken actions under the
                policy that collected the data.
            advantages: advantage estimate per transition.
            returns: value-function regression target per transition.
        """
        with tf.GradientTape() as tape:
            probs, values = self.actor_critic(states)
            log_probs = tf.math.log(tf.gather_nd(probs, actions))
            
            # actor loss
            ratio = tf.math.exp(log_probs - log_probabilities)
            l_clip = - tf.math.minimum(
                    ratio * advantages,                                                  # normal ratio
                    tf.clip_by_value(ratio, self.clip_min, self.clip_max) * advantages,  # clip term
                )

            # critic loss
            # The critic head outputs (batch, 1). Align the targets to that
            # shape: with (batch,) targets the huber loss broadcasts to
            # (batch, batch) and regresses every value against every return.
            returns = tf.reshape(returns, tf.shape(values))
            l_vf = keras.losses.huber(returns, values)
            
            # entropy term
            s = - tf.reduce_sum(probs * tf.math.log(probs + 1e-8), axis=-1)
            
            loss = tf.reduce_mean (
                l_clip + self.c1 * l_vf - self.c2 * s
            )
            
        gradients = tape.gradient(loss, self.actor_critic.trainable_variables)
        self.optim.apply_gradients(zip(gradients, self.actor_critic.trainable_variables))
        
    def test(self, env):
        """Play one episode with the sampling policy and return its total reward."""
        state, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            action, _, _, _ = self.select_action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            # The episode ends on a terminal state or a time-limit truncation;
            # checking only the first flag can step past the time limit.
            done = terminated or truncated
            # Accumulate the actual reward, not a step count.
            total_reward += reward
        return total_reward
        
    def save_weight(self, name):
        """Save the actor-critic weights to the file path `name`."""
        self.actor_critic.save_weights(name)

In [None]:
env = gym.make('LunarLander-v2')
state_shape = env.observation_space.shape[0]
n_actions = env.action_space.n

optimizer = keras.optimizers.legacy.Adam(learning_rate=0.001)

ppo_buffer = PPOBuffer(maxlen=ITERS, state_shape=state_shape)
agent = PPO(state_shape=state_shape,
          n_actions=n_actions,
          hidden_units=[32,16],
          optimizer=optimizer,
          epsilon=EPSILON, 
          gamma=GAMMA,
          llambda=LAMBDA
)

# Evaluation bookkeeping. `running_rewards` was read below but never defined,
# so the cell failed with NameError on a fresh kernel. Each entry is the mean
# of the most recent evaluation episodes.
# NOTE(review): the window size is a choice made here; tune as needed.
RUNNING_WINDOW = 10
recent_test_rewards = deque(maxlen=RUNNING_WINDOW)
running_rewards = []

# Loop-invariant: number of minibatches per epoch and their size.
batch_idx_size = (ITERS // BATCH_SIZE, BATCH_SIZE)

for episode in range(1, EPISODE+1):
    state, _ = env.reset()
    
    # collect one rollout of ITERS steps
    for t in range(ITERS):
        action, actions_prob, actions_log_prob, critic_val = agent.select_action(state=state)

        next_state, reward, terminated, truncated, _ = env.step(action)
        # A time-limit truncation also ends the episode. It is treated like
        # termination here because the next stored state is a fresh reset.
        done = terminated or truncated

        ppo_buffer.store(state, action, actions_log_prob[action].numpy(), critic_val.numpy(), reward, done)

        if done:
            state, _ = env.reset()
            continue
        
        state = next_state
            
    # compute advantages using gae
    advantages = agent.compute_advantages(ppo_buffer.rewards, ppo_buffer.critic_values, ppo_buffer.mask_done_ep, next_state)
    # value targets must use the un-normalised advantages
    discounted_returns = advantages + ppo_buffer.critic_values[:-1]
    advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)
    
    for epoch in range(EPOCHS):
        # fresh random partition of the rollout into minibatches
        idxs = np.random.choice(ITERS, size=batch_idx_size, replace=False)
        for batch_idxs in idxs:
            states_batch, actions_batch, log_probs_batch = ppo_buffer.sample(batch_idxs)
            advantages_batch = advantages[batch_idxs]
            returns_batch = discounted_returns[batch_idxs]
  
            agent.train(states_batch, actions_batch, log_probs_batch, advantages_batch, returns_batch)

    ppo_buffer.reset_buffer()

    # Evaluate the updated policy. Reusing `env` is safe because every outer
    # iteration starts with env.reset().
    recent_test_rewards.append(agent.test(env))
    running_rewards.append(float(np.mean(recent_test_rewards)))
    
    if running_rewards[-1] > TEST_EPISODE:
        print(f'Solved at {episode} espisode: running_reward={running_rewards[-1]:.2f}')
        # save_weights does not create missing parent directories
        os.makedirs("checkpoints", exist_ok=True)
        agent.save_weight(f"checkpoints/ppo_lunarlander_final.h5")
        break
            