## Vanilla Policy Gradient Agent

This tutorial contains a simple example of how to build a policy-gradient-based agent that can solve the CartPole problem.

In [1]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Build the CartPole-v0 environment; the training loop calls env.reset() and env.step() on it.
env = gym.make('CartPole-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### The Policy-Based Agent

In [3]:
# Hyperparameters

# Discount factor: the training loop passes it to discount_rewards() when
# turning an episode's per-step rewards into discounted returns.
gamma = 0.99

In [4]:
def discount_rewards(rewards, gamma=0.99):
    """Compute the discounted return for every step of one episode.

    Element ``t`` of the result is
    ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``,
    accumulated by walking the sequence from its last step to its first.

    Args:
        rewards: 1D sequence (list or array) of per-step rewards. Integer
            inputs are accepted; the result is always floating point.
        gamma: discount factor applied once per step of look-ahead.

    Returns:
        A numpy array with the same shape as ``rewards``. The dtype is the
        input's dtype when that is already floating point, otherwise float64.
    """
    rewards = np.asarray(rewards)
    # Allocate a floating-point buffer explicitly. Inheriting the dtype of an
    # integer input (as np.zeros_like would) silently truncates every
    # fractional discounted value on assignment.
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    discounted = np.zeros(rewards.shape, dtype=out_dtype)

    running_add = 0.0
    # Go backwards so each step reuses the already-discounted tail sum.
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add

    return discounted

In [5]:
# discount_rewards([1, 1, 1])

In [6]:
class Agent(object):
    """Policy network plus the graph ops needed to train it with policy gradients.

    Constructing an Agent adds the following to the default TensorFlow graph
    and exposes them as attributes:

    * ``state_in``      - float32 placeholder of shape ``[None, state_dim]``.
    * ``output``        - softmax over ``n_actions`` for each fed state.
    * ``chosen_action`` - argmax of ``output`` along axis 1.
    * ``reward_ph``     - float32 placeholder of shape ``[None]``; multiplies
      the log-probabilities in the loss.
    * ``action_ph``     - int32 placeholder of shape ``[None]``; the index of
      the action taken for each fed state.
    * ``loss``          - negative mean of ``log(pi(action)) * reward``.
    * ``gradients``     - gradients of ``loss`` w.r.t. ``tf.trainable_variables()``.
    * ``gradient_phs``  - one float32 placeholder per trainable variable, used
      to feed externally accumulated gradients back in.
    * ``update_batch``  - Adam step that applies whatever is fed to
      ``gradient_phs`` to the trainable variables.
    """

    def __init__(self, learning_rate, state_dim, n_actions, n_hidden):
        """Build the network and training ops.

        Args:
            learning_rate: learning rate handed to ``tf.train.AdamOptimizer``.
            state_dim: size of the second axis of the state placeholder.
            n_actions: number of units in the softmax output layer.
            n_hidden: number of units in the ReLU hidden layer.
        """
        # --- forward pass: state -> ReLU hidden layer -> softmax over actions ---
        self.state_in = tf.placeholder(tf.float32, shape=[None, state_dim])
        hidden_layer = slim.fully_connected(
            self.state_in,
            n_hidden,
            activation_fn=tf.nn.relu,
            biases_initializer=None,
        )
        self.output = slim.fully_connected(
            hidden_layer,
            n_actions,
            activation_fn=tf.nn.softmax,
            biases_initializer=None,
        )
        self.chosen_action = tf.argmax(self.output, 1)

        # --- training inputs ---
        self.reward_ph = tf.placeholder(tf.float32, shape=[None])
        self.action_ph = tf.placeholder(tf.int32, shape=[None])

        # self.output holds one row per fed state. Pair every row number with
        # the action index fed for that row and gather those entries, i.e. the
        # graph equivalent of numpy's A[np.arange(A.shape[0]), actions].
        n_rows = tf.shape(self.action_ph)[0]
        row_ids = tf.range(n_rows)
        row_action_pairs = tf.stack([row_ids, self.action_ph], axis=1)
        taken_action_probs = tf.gather_nd(self.output, row_action_pairs)

        # --- loss: negative mean of log-probability weighted by the fed reward ---
        log_probs = tf.log(taken_action_probs)
        weighted_log_probs = log_probs * self.reward_ph
        self.loss = -tf.reduce_mean(weighted_log_probs)

        # --- gradient plumbing ---
        # Gradients are computed by one op and applied by another, connected
        # only through placeholders, so whatever is fed to gradient_phs is
        # what the optimizer applies.
        trainable = tf.trainable_variables()
        self.gradient_phs = [
            tf.placeholder(tf.float32, name='grad{}_ph'.format(i))
            for i in range(len(trainable))
        ]

        self.gradients = tf.gradients(self.loss, trainable)
        adam = tf.train.AdamOptimizer(learning_rate)
        grads_and_vars = zip(self.gradient_phs, trainable)
        self.update_batch = adam.apply_gradients(grads_and_vars)

### Training the Agent

In [12]:
# Start from an empty default graph: Agent collects tf.trainable_variables(),
# so variables left behind by an earlier run of this cell must not linger.
tf.reset_default_graph()

agent = Agent(learning_rate=1e-2, state_dim=4, n_actions=2, n_hidden=8)

n_epochs = 5000     # number of episodes to play
max_episodes = 999  # cap on the number of environment steps within one episode
update_freq = 5     # apply the accumulated gradients once every `update_freq` episodes

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    total_reward = []  # undiscounted return of each episode
    total_length = []  # number of steps taken in each episode

    # One zero-filled accumulator per trainable variable. A plain list is used
    # because the variables have different shapes; packing them into a single
    # np.array would be a ragged array, which recent NumPy versions reject.
    grad_buffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]

    for i in range(n_epochs):
        s = env.reset()
        running_reward = 0
        states, actions, rewards = [], [], []
        for j in range(max_episodes):
            # Sample the action *index* from the policy's output distribution.
            # Sampling a probability value and searching for it afterwards
            # would always resolve ties (e.g. [0.5, 0.5]) to the first action.
            a_dist = sess.run(agent.output, feed_dict={agent.state_in: [s]})
            a = np.random.choice(len(a_dist[0]), p=a_dist[0])

            s1, reward, done, _ = env.step(a)
            states.append(s)
            actions.append(a)
            rewards.append(reward)
            s = s1
            running_reward += reward

            if done:
                break

        # The rollout is over, either because the environment reported `done`
        # or because the step cap was reached; both cases contribute gradients
        # and statistics. Episode lengths vary, so each episode is fed on its own.
        if states:
            discounted_rewards = discount_rewards(rewards, gamma)
            feed_dict = {
                agent.reward_ph: discounted_rewards,
                agent.action_ph: actions,
                agent.state_in: states
            }
            grads = sess.run(agent.gradients, feed_dict)
            for idx, grad in enumerate(grads):
                grad_buffer[idx] += grad

        # Update the network after every `update_freq` completed episodes
        # (i is zero-based, hence i + 1), then clear the accumulators in place.
        if (i + 1) % update_freq == 0:
            feed_dict = dict(zip(agent.gradient_phs, grad_buffer))
            sess.run(agent.update_batch, feed_dict)
            for buf in grad_buffer:
                buf.fill(0)

        total_reward.append(running_reward)
        total_length.append(len(rewards))

        # Report the mean return over the most recent (up to) 100 episodes.
        if i % 100 == 0:
            print('Mean reward:', np.mean(total_reward[-100:]))

Mean reward: 26.0
Mean reward: 29.28
Mean reward: 35.91
Mean reward: 45.16
Mean reward: 47.6
Mean reward: 63.67
Mean reward: 91.75
Mean reward: 113.76
Mean reward: 140.32


KeyboardInterrupt: 