In [10]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
import tensorflow_probability as tfp
import numpy as np

In [11]:
class PPOMemory:
    """Rollout buffer that keeps one Python list per transition field.

    The buffer holds states, actions, (log-)probabilities, critic values,
    rewards and done flags in insertion order, and can hand them back as
    NumPy arrays together with shuffled mini-batch index arrays.
    """

    def __init__(self, batch_size):
        """Create an empty buffer producing mini-batches of ``batch_size``."""
        self.batch_size = batch_size
        # Start from the same empty state that clear_memory() produces.
        self.clear_memory()

    def generate_batches(self):
        """Return the stored data as arrays plus shuffled index batches.

        Returns:
            A 7-tuple ``(states, actions, probs, vals, rewards, dones,
            batches)``. The first six entries are ``np.array`` views of the
            stored lists; ``batches`` is a list of ``int64`` index arrays,
            each at most ``batch_size`` long, that together cover every
            stored transition exactly once in random order.
        """
        total = len(self.states)
        order = np.arange(total, dtype=np.int64)
        np.random.shuffle(order)

        # Slice the shuffled order into consecutive chunks of batch_size.
        batches = [
            order[start:start + self.batch_size]
            for start in range(0, total, self.batch_size)
        ]

        return (
            np.array(self.states),
            np.array(self.actions),
            np.array(self.probs),
            np.array(self.vals),
            np.array(self.rewards),
            np.array(self.dones),
            batches,
        )

    def store_memory(self, state, action, probs, vals, reward, done):
        """Append a single transition to the end of every field list."""
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(probs)
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)

    def clear_memory(self):
        """Drop all stored transitions by rebinding each field to a new list."""
        self.states, self.actions, self.probs = [], [], []
        self.vals, self.rewards, self.dones = [], [], []

In [12]:
class ActorNetwork(keras.Model):
    """Policy network: two ReLU hidden layers and a softmax output layer.

    The output layer has ``n_actions`` units, so a forward pass yields one
    probability per action for every input row.
    """

    def __init__(self, n_actions, fc1_dims=256, fc2_dims=256):
        """Build the three dense layers (hidden widths default to 256)."""
        super().__init__()
        self.fc1 = Dense(units=fc1_dims, activation='relu')
        self.fc2 = Dense(units=fc2_dims, activation='relu')
        self.fc3 = Dense(units=n_actions, activation='softmax')

    def call(self, state):
        """Run ``state`` through fc1 -> fc2 -> fc3 and return the result."""
        hidden = self.fc2(self.fc1(state))
        return self.fc3(hidden)

In [13]:
class CriticNetwork(keras.Model):
    """Value network: two ReLU hidden layers and a single linear output unit."""

    def __init__(self, fc1_dims=256, fc2_dims=256):
        """Build the three dense layers (hidden widths default to 256)."""
        super().__init__()
        self.fc1 = Dense(units=fc1_dims, activation='relu')
        self.fc2 = Dense(units=fc2_dims, activation='relu')
        # No activation on the head: the output is an unbounded scalar.
        self.fc3 = Dense(units=1, activation=None)

    def call(self, state):
        """Run ``state`` through fc1 -> fc2 -> fc3 and return the result."""
        hidden = self.fc2(self.fc1(state))
        return self.fc3(hidden)

In [15]:
class PPOAgent:
    """PPO agent with a categorical policy and a separate value network.

    The agent owns an ``ActorNetwork``, a ``CriticNetwork`` and a
    ``PPOMemory`` rollout buffer. Transitions are recorded with
    ``store_transition`` and consumed (then discarded) by ``learn``.
    """

    def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003, gae_lambda=0.95, policy_clip=0.2, batch_size=64, n_epochs=10, chkpt_dir='models/ppo/'):
        """Create the networks, their Adam optimizers and the rollout buffer.

        Args:
            n_actions: Number of discrete actions (size of the actor output).
            input_dims: Accepted for interface compatibility; not used here.
            gamma: Discount factor for future rewards.
            alpha: Learning rate shared by both Adam optimizers.
            gae_lambda: Lambda of the generalized advantage estimator.
            policy_clip: Clipping range epsilon for the probability ratio.
            batch_size: Mini-batch size handed to the rollout buffer.
            n_epochs: Number of passes over the buffer per ``learn`` call.
            chkpt_dir: Path prefix the model names are appended to when
                saving/loading (plain string concatenation, so it should
                end with a path separator).
        """
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda

        self.chkpt_dir = chkpt_dir

        # compile() is used only to attach an optimizer to each model.
        self.actor = ActorNetwork(n_actions)
        self.critic = CriticNetwork()
        self.actor.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=alpha))
        self.critic.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=alpha))

        self.memory = PPOMemory(batch_size)

    def store_transition(self, state, actions, probs, vals, reward, done):
        """Record one environment transition in the rollout buffer."""
        self.memory.store_memory(state, actions, probs, vals, reward, done)

    def save_model(self):
        """Save both networks under ``chkpt_dir + 'actor'`` / ``'critic'``."""
        self.actor.save(self.chkpt_dir + 'actor')
        self.critic.save(self.chkpt_dir + 'critic')

    def load_model(self):
        """Replace both networks with the models stored under ``chkpt_dir``."""
        self.actor = keras.models.load_model(self.chkpt_dir + 'actor')
        self.critic = keras.models.load_model(self.chkpt_dir + 'critic')

    def choose_action(self, observation):
        """Sample an action for a single observation.

        Args:
            observation: One environment observation; it is wrapped in a
                list to form a batch of size one.

        Returns:
            ``(action, log_prob, value)``: the sampled action, its
            log-probability under the current policy, and the critic's
            output row for the observation (first row of the batch).
        """
        state = tf.convert_to_tensor([observation])

        # The actor ends in a softmax, so its output must be passed as
        # `probs`; the first positional argument of Categorical is `logits`.
        probs = self.actor(state)
        dist = tfp.distributions.Categorical(probs=probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        value = self.critic(state)

        # Strip the batch dimension that was added above.
        action = action.numpy()[0]
        value = value.numpy()[0]
        log_prob = log_prob.numpy()[0]

        return action, log_prob, value

    def _compute_advantages(self, rewards, values, dones):
        """Compute GAE(lambda) advantages with one backward pass.

        Args:
            rewards: Per-step rewards, length N.
            values: Per-step critic values; any shape holding N elements
                (e.g. ``(N,)`` or ``(N, 1)``), flattened internally.
            dones: Per-step terminal flags, length N.

        Returns:
            ``float32`` array of shape ``(N,)``. The last entry is always 0
            because no successor value is stored for the final step.
        """
        rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
        values = np.asarray(values, dtype=np.float32).reshape(-1)
        not_done = 1.0 - np.asarray(dones, dtype=np.float32).reshape(-1)

        n_steps = len(rewards)
        advantage = np.zeros(n_steps, dtype=np.float32)

        # A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        # The (1 - done_t) factor masks both the bootstrap value and the
        # running sum, so advantages never leak across episode boundaries.
        gae = 0.0
        for t in reversed(range(n_steps - 1)):
            delta = rewards[t] + self.gamma * values[t + 1] * not_done[t] - values[t]
            gae = delta + self.gamma * self.gae_lambda * not_done[t] * gae
            advantage[t] = gae
        return advantage

    def learn(self):
        """Run ``n_epochs`` of clipped-PPO updates, then clear the buffer.

        Advantages and value targets are computed once from the stored
        rollout; every epoch draws a fresh shuffle of mini-batches from the
        buffer and applies one optimizer step per mini-batch to each network.
        """
        # The rollout does not change between epochs, so compute these once.
        # Values are flattened because choose_action returns them with a
        # trailing axis of length 1; leaving that axis in place would
        # broadcast `advantage + values` to a (batch, batch) matrix.
        values = np.asarray(self.memory.vals, dtype=np.float32).reshape(-1)
        advantage = self._compute_advantages(
            self.memory.rewards, values, self.memory.dones)
        returns = advantage + values

        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, _, _, _, batches = \
                self.memory.generate_batches()

            for batch in batches:
                # Constant inputs for this mini-batch.
                states = tf.convert_to_tensor(state_arr[batch])
                old_probs = tf.convert_to_tensor(old_prob_arr[batch])
                actions = tf.convert_to_tensor(action_arr[batch])
                batch_advantage = tf.convert_to_tensor(advantage[batch])
                batch_returns = tf.convert_to_tensor(returns[batch])

                # Persistent because two separate gradients are taken.
                with tf.GradientTape(persistent=True) as tape:
                    probs = self.actor(states)
                    dist = tfp.distributions.Categorical(probs=probs)
                    new_probs = dist.log_prob(actions)

                    critic_value = tf.squeeze(self.critic(states), 1)

                    # Both terms are log-probabilities, so exp of the
                    # difference is the new/old probability ratio.
                    prob_ratio = tf.math.exp(new_probs - old_probs)

                    weighted_probs = batch_advantage * prob_ratio
                    clipped_probs = tf.clip_by_value(
                        prob_ratio, 1 - self.policy_clip, 1 + self.policy_clip)
                    weighted_clipped_probs = clipped_probs * batch_advantage

                    # Clipped surrogate objective, negated for minimization.
                    actor_loss = -tf.math.minimum(weighted_probs, weighted_clipped_probs)
                    actor_loss = tf.math.reduce_mean(actor_loss)

                    critic_loss = keras.losses.MSE(batch_returns, critic_value)

                actor_params = self.actor.trainable_variables
                critic_params = self.critic.trainable_variables
                actor_grads = tape.gradient(actor_loss, actor_params)
                critic_grads = tape.gradient(critic_loss, critic_params)
                # Release the resources held by the persistent tape.
                del tape

                self.actor.optimizer.apply_gradients(zip(actor_grads, actor_params))
                self.critic.optimizer.apply_gradients(zip(critic_grads, critic_params))

        self.memory.clear_memory()