# REINFORCE in TensorFlow

This notebook implements a basic REINFORCE algorithm, a.k.a. policy gradient, for the CartPole environment.

It has been deliberately written to be as simple and human-readable as possible.


The notebook assumes that you have [openai gym](https://github.com/openai/gym) installed.

In case you're running on a server, [use xvfb](https://github.com/openai/gym#rendering-on-a-server)

In [None]:
#If you are running on a server, launch xvfb to record game videos
#Please make sure you have xvfb installed
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline

In [None]:
import gym

env = gym.make("CartPole-v0")

# gym compatibility: gym.make may hand back the environment inside a wrapper
# (e.g. TimeLimit); when an inner `.env` exists, use that one directly.
env = getattr(env, "env", env)

env.reset()

# Sizes of the observation and action spaces, used to shape the policy network.
state_dim = env.observation_space.shape
n_actions = env.action_space.n

# Display one rendered frame of the freshly reset environment.
plt.imshow(env.render("rgb_array"))

# Building the policy network

For REINFORCE algorithm, we'll need a model that predicts action probabilities given states.

For numerical stability, please __do not include the softmax layer into your network architecture__. 

We'll use softmax or log-softmax where appropriate.

In [None]:
import tensorflow as tf

# Graph inputs. REINFORCE only needs <s, a, R> for every step of a session:
#   states             - batch of observations, one row per step
#   actions            - id of the action taken at each step
#   cumulative_rewards - discounted return that followed each step
states = tf.placeholder(tf.float32, shape=(None,) + state_dim, name="states")
actions = tf.placeholder(tf.int32, name="action_ids")
cumulative_rewards = tf.placeholder(tf.float32, name="cumulative_returns")

In [None]:
import keras
import keras.layers as L

# Number of units in each hidden Dense layer of the policy network.
nodes = 32

In [None]:
# Policy network: state -> Dense+ReLU -> Dense+ReLU -> one logit per action.
#
# Each ReLU follows a Dense layer. A ReLU placed straight after the input
# would zero every negative component of the observation before the network
# sees it, throwing away the sign of the state variables.
network = keras.models.Sequential()
network.add(L.InputLayer(state_dim))
network.add(L.Dense(nodes))
network.add(L.ReLU())
network.add(L.Dense(nodes))
network.add(L.ReLU())
# Final layer outputs raw logits; softmax is deliberately kept out of the model.
network.add(L.Dense(n_actions))

logits = network(states)

# Probabilities and log-probabilities are both derived from the logits;
# log_softmax is used instead of log(softmax) for numerical stability.
policy = tf.nn.softmax(logits)
log_policy = tf.nn.log_softmax(logits)

In [None]:
def get_action_proba(s):
    """Return the action probabilities the policy assigns in one state `s`.

    The state is wrapped into a batch of one, evaluated through `policy`,
    and the single resulting row is returned. `policy.eval` is called without
    an explicit session, so a default TensorFlow session must be active.
    """
    batch_of_probas = policy.eval({states: [s]})
    return batch_of_probas[0]

#### Loss function and updates

**NOTE**: The cumulative reward $R(s,a)$ is called $G(s,a)$ in Sutton '16.

We now need to define objective and update over policy gradient.

Our objective function is

$$ J \approx  { 1 \over N } \sum  _{s_i,a_i} \pi_\theta (a_i | s_i) \cdot G(s_i,a_i) $$


Following the REINFORCE algorithm, we can define our objective as follows: 

$$ \hat J \approx { 1 \over N } \sum  _{s_i,a_i} \log \pi_\theta (a_i | s_i) \cdot G(s_i,a_i) $$

When you compute the gradient of that function with respect to the network weights $ \theta $, you get exactly the policy gradient.

In [None]:
# Pick log pi(a_i | s_i) for the action that was actually taken in each sample.
#
# `log_policy` has one row per sample of the batch (axis 0) and one column per
# action, so the value wanted for sample i sits at position [i, actions[i]].
# Pairing every row number with its action id yields exactly those positions.
# `actions` therefore has to have the same length as the batch.
#
# Example:
#   s.run(indices, feed_dict={states: [[0,1,2,3], [0,10,20,30], [0,100,200,300]],
#                             actions: [0, 42, 777]})
#
#   array([[  0,   0],
#          [  1,  42],
#          [  2, 777]], dtype=int32)
batch_rows = tf.range(tf.shape(log_policy)[0])  # [0, 1, ..., batch_size - 1]
indices = tf.stack([batch_rows, actions], axis=-1)

# tf.gather_nd reads one element of `log_policy` per [row, column] pair.
log_policy_for_actions = tf.gather_nd(log_policy, indices)

In [None]:
# Policy objective from the last formula: the batch mean (not the sum) of
# log pi(a_i | s_i) * G(s_i, a_i). `log_policy_for_actions` already holds the
# log-probabilities of the actions that were taken.
weighted_log_probs = log_policy_for_actions * cumulative_rewards
J = tf.reduce_mean(weighted_log_probs)

In [None]:
# Entropy regularizer: H = -sum_a pi(a|s) * log pi(a|s).
# The sum runs over the action axis only, so this holds one entropy value
# per sample of the batch.
entropy = -tf.reduce_sum(policy * log_policy, axis=1, name="entropy")

In [None]:
# All trainable variables registered in the graph so far,
# i.e. the weights of the policy network.
all_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

# Weight of the entropy bonus relative to the policy-gradient objective.
entropy_coef = 0.1

# Maximizing J is the same as minimizing -J. Subtracting the entropy rewards
# policies that stay spread over the actions.
#
# `entropy` carries one value per sample, so it is averaged over the batch
# here. Without the mean the loss would be a vector, and the optimizer would
# implicitly sum it, which is a batch-size-scaled objective rather than the
# mean the objective above calls for. Reducing here keeps the loss a scalar.
loss = -J - entropy_coef * tf.reduce_mean(entropy)

update = tf.train.AdamOptimizer().minimize(loss, var_list=all_weights)

### Computing cumulative rewards

In [None]:
def get_cumulative_rewards(rewards, gamma = 0.99):
    """
    Compute the discounted return that follows every step of a session.

    Parameters
    ----------
    rewards : sequence of numbers, length T
        Immediate rewards r_0, ..., r_{T-1} in the order they were received.
        Must support len() and integer indexing.
    gamma : float
        The discounting factor applied per time step.

    Returns
    -------
    cumulative_reward : numpy.ndarray, shape (T,)
        The discounted return (a.k.a. G(s,a) in Sutton '16) for each step t:
        G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
        so the array ends with
        >>> [..., r_{T-3} + gamma*r_{T-2} + gamma^2*r_{T-1}, r_{T-2} + gamma*r_{T-1}, r_{T-1}]
    """
    n_steps = len(rewards)
    returns = np.zeros(n_steps)

    # Walk backwards through the session using G_t = r_t + gamma * G_{t+1},
    # where the return after the final step is zero.
    following_return = np.float64(0.0)
    for t in reversed(range(n_steps)):
        returns[t] = rewards[t] + gamma * following_return
        following_return = returns[t]

    return returns

In [None]:
# Sanity checks for get_cumulative_rewards: output length, then a few
# hand-computed discounted returns as (rewards, gamma, expected) triples.
assert len(get_cumulative_rewards(range(100))) == 100

known_cases = [
    ([0,0,1,0,0,1,0], 0.9, [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0]),
    ([0,0,1,-2,3,-4,0], 0.5, [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0]),
    ([0,0,1,2,3,4,0], 0, [0, 0, 1, 2, 3, 4, 0]),
]
for case_rewards, case_gamma, expected_returns in known_cases:
    assert np.allclose(get_cumulative_rewards(case_rewards, gamma=case_gamma), expected_returns)

print("looks good!")

In [None]:
def train_step(_states,_actions,_rewards):
    """
    Run one policy-gradient update on a single finished session.

    The per-step rewards are first turned into discounted returns with
    `get_cumulative_rewards`; the session is then fed to the `update` op
    as one batch.

    Parameters
    ----------
    _states : list
        The states visited during the session, in order
    _actions : list
        The action taken in each of those states
    _rewards : list
        The reward received after each of those actions
    """
    feed = {
        states: _states,
        actions: _actions,
        cumulative_rewards: get_cumulative_rewards(_rewards),
    }
    update.run(feed)

### Playing the game

In [None]:
def generate_session(t_max=1000):
    """
    Play one session in `env` with the REINFORCE agent, then train on it.

    Actions are sampled from the current policy for at most `t_max` steps or
    until the environment reports `done`. The recorded session is passed to
    `train_step` once it is over.

    Returns the sum of the rewards collected during the session.
    """

    # session history; kept under names distinct from the graph placeholders
    visited_states, taken_actions, received_rewards = [], [], []

    s = env.reset()

    for t in range(t_max):

        # pi(a|s) for the current state
        action_probas = get_action_proba(s)

        # sample one action id according to those probabilities
        a = np.random.choice(n_actions, 1, p=action_probas)[0]

        new_s, r, done, info = env.step(a)

        # remember the transition so the agent can train on it later
        visited_states.append(s)
        taken_actions.append(a)
        received_rewards.append(r)

        if done:
            break
        s = new_s

    train_step(visited_states, taken_actions, received_rewards)

    return sum(received_rewards)

In [None]:
# Open a session and initialize all variables. An InteractiveSession installs
# itself as the default session, which `policy.eval` and `update.run` rely on.
s = tf.InteractiveSession()
s.run(tf.global_variables_initializer())

# Up to 100 rounds of 100 sessions each; every session also trains the agent.
for epoch in tqdm_notebook(range(100), desc='session'):

    session_rewards = [generate_session() for _ in range(100)]
    mean_reward = np.mean(session_rewards)

    print ("mean reward:%.3f"%(mean_reward))

    # stop early once the average session reward is high enough
    if mean_reward > 300:
        print ("You Win!")
        break

### Results & video

In [None]:
# Record sessions: rebind the global `env` (the one generate_session plays in)
# to a fresh CartPole wrapped in a Monitor writing to ./videos, play 100
# sessions, then close the monitor. `sessions` holds each session's total reward.
import gym.wrappers

recorded_env = gym.make("CartPole-v0")
env = gym.wrappers.Monitor(recorded_env, directory="videos", force=True)
sessions = [generate_session() for _ in range(100)]
env.close()

In [None]:
# Show video
from IPython.display import HTML
import os

# os.listdir returns names in arbitrary order, so the names are sorted to make
# the pick below deterministic: the lexicographically last .mp4 is shown.
# NOTE(review): whether that is the most recent recording depends on how the
# recorder names its files. Try other indices to watch a different video.
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1]))

In [None]:
# Credentials passed to submit_cartpole in the next cell; fill them in before submitting.
# NOTE(review): avoid saving or committing the notebook with a real token in this cell.
EMAIL = ''
TOKEN = ''

In [None]:
from submit import submit_cartpole
# Hand the session generator and the credentials to the submit helper.
# generate_session trains the agent on every call, so any sessions played
# here also update the policy.
# NOTE(review): presumably the helper plays sessions and uploads the rewards; confirm in submit.py.
submit_cartpole(generate_session, EMAIL, TOKEN)

In [None]:
# That's all, thank you for your attention!
# Haven't had enough? There's an actor-critic waiting for you in the honor section.
# But make sure you've seen the videos first.