# Cartpole: REINFORCE Monte Carlo Policy Gradients
![](https://camo.githubusercontent.com/6c525864040e12c833041d8fffde019ab3c3546e/687474703a2f2f6e6575726f2d6564756361746f722e636f6d2f77702d636f6e74656e742f75706c6f6164732f323031372f30392f44514e2e676966)

## Step 1: Import librarires

In [1]:
import tensorflow as tf
import numpy as np
import gym

  from ._conv import register_converters as _register_converters


## Step 2: Create Environment

In [2]:
env = gym.make("CartPole-v0")
env = env.unwrapped

# Policy gradient has high variance, seed for reproducability
env.seed(1)

[1]

In [21]:
# move left, right
env.action_space

Discrete(2)

In [17]:
# Observation space
print(env.observation_space.low)
print(env.observation_space.high)

[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [28]:
# Maybe, state is define by coordinates of 
# pole bottom and top points
print(env.reset())

[ 0.02725216 -0.04481721 -0.04304738  0.00151751]


## Step 3: Setup hyper-parameters

In [22]:
# ENV hyper parameters
state_size = 4
action_size = env.action_space.n

# Training hyper-parameters
max_episodes = 10000
learning_rate = 0.01
gamma = 0.95

## Step 4: Define preprocessing funtions

In [23]:
def discount_and_normalize_rewards(episode_rewards):
    """
    Take the rewards and perform discount
    """
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    cumulative = 0.0
    
    # G_t = \sum_{k=0}^{T}\gamma^k R_{t+k+1}
    for i in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[i]
        discounted_episode_rewards[i] = cumulative
        
    # transform to Z-score
    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = \
    (discounted_episode_rewards - mean)/(std)
    
    return discounted_episode_rewards  

## Step 5: Create Policy Gradient Neural Network model
![](https://camo.githubusercontent.com/302679523d9151a5a9ee8093a480096f8cf28f33/68747470733a2f2f7261772e67697468756275736572636f6e74656e742e636f6d2f73696d6f6e696e6974686f6d61732f446565705f7265696e666f7263656d656e745f6c6561726e696e675f436f757273652f6d61737465722f506f6c6963792532304772616469656e74732f43617274706f6c652f6173736574732f636174706f6c652e706e67)

- state which is an array of 4 values will be used as an input
- NN has 3 FC layers (10-2-2), the last 2 = number of actions
- Last layer's activation = softmax = action probability distribution

In [25]:
with tf.name_scope('inputs'):
    # input
    input_ = tf.placeholder(
        dtype=tf.float32, 
        shape=[None, state_size], 
        name="input_"
    )
    # output
    actions = tf.placeholder(
        dtype=tf.float32, 
        shape=[None, action_size], 
        name="actions"
    )
    discounted_episode_rewards_ = tf.placeholder(
        dtype=tf.float32,
        shape=[None, ],
        name="discounted_episode_rewards"
    )
    
    # Add this placeholder for having this variable in tensorboard
    mean_reward_ = tf.placeholder(
        dtype=tf.float32,
        name="mean_reward_"
    )
    
    # first FC layer
    with tf.name_scope("fc1"):
        fc1 = tf.contrib.layers.fully_connected(
            inputs=input_,
            num_outputs = 10,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.contrib.layers.xavier_initializer()
        )
    
    # second FC layer
    with tf.name_scope("fc2"):
        fc2 = tf.contrib.layers.fully_connected(
            inputs=fc1,
            num_outputs = action_size,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.contrib.layers.xavier_initializer()
        )
    
    # third FC layer
    with tf.name_scope("fc3"):
        fc3 = tf.contrib.layers.fully_connected(
            inputs=fc2,
            num_outputs = action_size,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.contrib.layers.xavier_initializer()
        )

    # softamx activator
    with tf.name_scope("softmax"):
        action_distribution = tf.nn.softmax(logits=fc3)
    
    # loss function
    with tf.name_scope("loss"):
        # tf.nn.softmax_cross_entropy_with_logits computes 
        # the cross entropy of the result 
        # after applying the softmax function
        # If you have single-class labels, 
        # where an object can only belong to one class, 
        # you might now consider using 
        # tf.nn.sparse_softmax_cross_entropy_with_logits 
        # so that you don't have to convert your labels 
        # to a dense one-hot array.
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits = fc3, 
            labels = actions
        )
        loss = tf.reduce_mean(neg_log_prob * 
                              discounted_episode_rewards_)
    
    # training operator
    with tf.name_scope("train"):
        train_opt = tf.train.AdamOptimizer(learning_rate)\
                      .minimize(loss)

In [41]:
action_distribution

<tf.Tensor 'inputs_1/softmax/Softmax:0' shape=(?, 2) dtype=float32>

## Step 6: Set up Tensorboard

In [32]:
# Setup TensorBoard Writer
writer = tf.summary.FileWriter("output1")

## Losses
tf.summary.scalar("Loss", loss)

## Reward mean
tf.summary.scalar("Reward_mean", mean_reward_)

write_op = tf.summary.merge_all()

## Step 7: Train agent

In [35]:
from datetime import datetime
allRewards = []
total_rewards = 0
maximumRewardRecorded = 0
episode = 0
episode_states, episode_actions, episode_rewards = [],[],[]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for episode in range(max_episodes):
        
        episode_rewards_sum = 0
        
        # reset the game
        state = env.reset()
        
        # env.render
        total_steps = 0
        while True:
            # tracking step number
            total_steps += 1
            # Choose an action
            # Remember that WE'RE NOT IN A DETERMINISTIC ENV
            # WE'RE IN STOCHASTIC ENV, WE OUTPUT PROBABILITIES
            
            action_probability_distribution = \
            sess.run(
                action_distribution,
                feed_dict={input_: state.reshape([1,4])}
            )
            
            # Select an action w.r.t the actions probability
            action = np.random.choice(
                a=range(action_probability_distribution.shape[1]),
                p=action_probability_distribution.ravel()
            )
            
            # Perform the action
            new_state, reward, done, info = env.step(action)
            
            # Store s, a
            episode_states.append(state)
            
            # For actions because we output only one (the index) 
            # we need 2 (1 is for the action taken)
            # We need [0., 1.] (if we take right) 
            # not just the index
            action_ = np.zeros(action_size)
            action_[action] = 1
            episode_actions.append(action_)
            
            episode_rewards.append(reward)
            
            if done:
                # Calculate sum reward
                episode_rewards_sum = np.sum(episode_rewards)
                allRewards.append(episode_rewards_sum)
                total_rewards = np.sum(allRewards)
                
                # Mean reward
                mean_reward = np.divide(total_rewards, episode+1)
                maximumRewardRecorded = np.amax(allRewards)
                
                if episode%100 == 0:
                    print("Episode: %d" % episode,
                          "\tSteps: %d" % total_steps,
                          "\tReward: %0.2f" % episode_rewards_sum,
                          "\tMean Reward: %0.2f" % mean_reward,
                          "\tMax reward so far: %0.2f" % maximumRewardRecorded
                         )
                
                # Calculate discounted reward
                discounted_episode_rewards = \
                discount_and_normalize_rewards(episode_rewards)
                
                # Feed forward, gradient and backpropagation
                loss_, _ = sess.run(
                    [loss, train_opt],
                    feed_dict={
                        input_: np.vstack(np.array(episode_states)),
                        actions: np.vstack(np.array(episode_actions)),
                        discounted_episode_rewards_: discounted_episode_rewards
                    }
                )
                
                # Write TF summaries
                summary = sess.run(
                    write_op,
                    feed_dict={
                        input_: np.vstack(np.array(episode_states)),
                        actions: np.vstack(np.array(episode_actions)),
                        discounted_episode_rewards_: discounted_episode_rewards,
                        mean_reward_: mean_reward
                    }
                )
                
                writer.add_summary(summary, episode)
                writer.flush()
                
                # Reset the transition stores
                episode_states, episode_actions, episode_rewards = \
                [],[],[]
                
                break
            
            # Assign new state
            state = new_state

Episode: 0 	Steps: 18 	Reward: 18.00 	Mean Reward: 18.00 	Max reward so far: 18.00
Episode: 100 	Steps: 15 	Reward: 15.00 	Mean Reward: 22.00 	Max reward so far: 84.00
Episode: 200 	Steps: 47 	Reward: 47.00 	Mean Reward: 23.13 	Max reward so far: 84.00
Episode: 300 	Steps: 17 	Reward: 17.00 	Mean Reward: 23.43 	Max reward so far: 84.00
Episode: 400 	Steps: 10 	Reward: 10.00 	Mean Reward: 23.68 	Max reward so far: 84.00
Episode: 500 	Steps: 20 	Reward: 20.00 	Mean Reward: 23.23 	Max reward so far: 84.00
Episode: 600 	Steps: 15 	Reward: 15.00 	Mean Reward: 23.28 	Max reward so far: 84.00
Episode: 700 	Steps: 29 	Reward: 29.00 	Mean Reward: 23.01 	Max reward so far: 90.00
Episode: 800 	Steps: 17 	Reward: 17.00 	Mean Reward: 23.02 	Max reward so far: 90.00
Episode: 900 	Steps: 37 	Reward: 37.00 	Mean Reward: 22.99 	Max reward so far: 90.00
Episode: 1000 	Steps: 12 	Reward: 12.00 	Mean Reward: 22.89 	Max reward so far: 90.00
Episode: 1100 	Steps: 17 	Reward: 17.00 	Mean Reward: 22.86 	Max r

Episode: 9500 	Steps: 28 	Reward: 28.00 	Mean Reward: 22.28 	Max reward so far: 117.00
Episode: 9600 	Steps: 18 	Reward: 18.00 	Mean Reward: 22.27 	Max reward so far: 117.00
Episode: 9700 	Steps: 14 	Reward: 14.00 	Mean Reward: 22.28 	Max reward so far: 117.00
Episode: 9800 	Steps: 16 	Reward: 16.00 	Mean Reward: 22.27 	Max reward so far: 117.00
Episode: 9900 	Steps: 34 	Reward: 34.00 	Mean Reward: 22.27 	Max reward so far: 117.00
