# Reinforcement Learning for Cart Pole game

Import libraries:

In [1]:
import numpy as np
import tensorflow as tf
import gym
import matplotlib.pyplot as plt
% matplotlib notebook

  from ._conv import register_converters as _register_converters


Defining hyperparameters:

In [6]:
num_steps = 10**7      # Maxiaml number of steps in episode
num_episodes = 200      # Number of episodes to play
learning_rate = 0.01
gamma = 0.99           # Discount factor for reward
num_Hidden = 200       # number of nodes in the hidden layer

env = gym.make('CartPole-v0')    # Choose a game and create an environment
obs_dim = env.reset().shape      # obervation dimension
num_actions = env.action_space.n # number of actions (this works only for descrete action space, which is the case here)

### Define functions

In [3]:
def discount_rewards(r):
    '''Takes a 1D rewards (one episode) and discounts it and also standardize
    the rewards to be unit normal (helps control the gradient estimator variance)'''
    
    # Discounting
    dis_r = np.zeros_like(r)
    running_sum = 0
    for t in reversed(range(len(r))):
        running_sum = running_sum + gamma*r[t]
        dis_r[t] = running_sum
    
    # Normailzing
    dis_r = (dis_r - np.mean(dis_r))/np.std(dis_r)
        
    return dis_r

### Build model

We start by creating the simplest model: Observation as an input, one hidden layer and one node in the output layer - corresponding to the action of moving right (1) or left (0)  

In [4]:
'''Create placeholders for inputs'''
with tf.name_scope("inputs"):
    # A place holder for input observations
    input_ = tf.placeholder(tf.float32, shape = (None, obs_dim[0]), name = "input")
    # A place holder for actions in a full episode
    actions = tf.placeholder(tf.float32, shape = (None, num_actions), name = "actions")
    # A place holder for discounted rewards in a full episode
    dis_rewards = tf.placeholder(tf.float32, shape = (None, 1), name = "dis_rewards")

'''Fully connected layers'''
with tf.name_scope("FC"):
    fc1 = tf.layers.dense(inputs = input_, units = num_Hidden ,activation = tf.nn.relu, name = "fc1" )
    fc2 = tf.layers.dense(inputs = fc1, units = num_actions ,activation = None, name = "fc2" )
    
# Operate with softmax on fc2 outputs to get a probability distribution
action_prob_dist = tf.nn.softmax(logits = fc2, name = "softamx")

'''Define loss'''
# Fist define reular softmax cross entropy loss
CE_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels = actions, logits = fc2, name = "CE_loss")
# Modulate the loss based on our discounted reward - this is where reinforcment learning happens,
# we favor actions that produced high reward
loss = tf.reduce_mean(CE_loss * dis_rewards)

'''Define optimizer'''
training_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

### Train model

In [7]:
ep_action, ep_obs, ep_reward = [], [], []  # Allocate space for episode actions, observations and rewards
tot_ep_reward = [] # Total episode reward

''' Run TF session '''
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    ''' Run episodes '''
    for ep in range(num_episodes): 
        obs = env.reset()  # Reset and save first observation
        ep_obs.append(obs) # append observation

        ''' Run steps '''
        for t in range(1,num_steps):
            # Propagate forward to compute action probability distribution
            apd = np.squeeze(sess.run(action_prob_dist, feed_dict = {input_ : obs.reshape((1,4))}))
            action = np.random.choice(np.arange(num_actions), p = apd)   # Sample an action based on the pdf
            obs, reward, done, info = env.step(action)  # Take action and save observation, reward and done boolean
            
            # Convert action to one hot
            action_oh = np.zeros((1,num_actions))
            action_oh[0,action] = 1
            
            ep_action.append(action_oh)  # append action
            ep_obs.append(obs)           # append observation
            ep_reward.append(reward)     # append reward

            if done: 
                # Stack vertically episode parameters to one np.array
                ep_action = np.vstack(ep_action)
                ep_obs = np.vstack(ep_obs)
                ep_reward = np.vstack(ep_reward)

                # Discount rewards
                dis_rewards_arr = discount_rewards(ep_reward)
                # Compute loss and optimize
                sess.run([loss, training_opt], feed_dict = {input_ : ep_obs[:-1], actions : ep_action, dis_rewards : dis_rewards_arr})
                
                tot_ep_reward.append(np.sum(ep_reward))  # Compute total reward for episode
                ep_action, ep_obs, ep_reward = [], [], []  # Clear episode values for next episode
                # print info
                print("----------------------------------------")
                print("Episode ended after {} steps".format(t+1))
                print("Accumulated reward in this episode {} steps".format(tot_ep_reward[ep]))
                break

env.close()

----------------------------------------
Episode ended after 20 steps
Accumulated reward in this episode 19.0 steps
----------------------------------------
Episode ended after 18 steps
Accumulated reward in this episode 17.0 steps
----------------------------------------
Episode ended after 18 steps
Accumulated reward in this episode 17.0 steps
----------------------------------------
Episode ended after 38 steps
Accumulated reward in this episode 37.0 steps
----------------------------------------
Episode ended after 42 steps
Accumulated reward in this episode 41.0 steps
----------------------------------------
Episode ended after 21 steps
Accumulated reward in this episode 20.0 steps
----------------------------------------
Episode ended after 44 steps
Accumulated reward in this episode 43.0 steps
----------------------------------------
Episode ended after 41 steps
Accumulated reward in this episode 40.0 steps
----------------------------------------
Episode ended after 23 steps
Ac

----------------------------------------
Episode ended after 93 steps
Accumulated reward in this episode 92.0 steps
----------------------------------------
Episode ended after 12 steps
Accumulated reward in this episode 11.0 steps
----------------------------------------
Episode ended after 16 steps
Accumulated reward in this episode 15.0 steps
----------------------------------------
Episode ended after 19 steps
Accumulated reward in this episode 18.0 steps
----------------------------------------
Episode ended after 27 steps
Accumulated reward in this episode 26.0 steps
----------------------------------------
Episode ended after 53 steps
Accumulated reward in this episode 52.0 steps
----------------------------------------
Episode ended after 28 steps
Accumulated reward in this episode 27.0 steps
----------------------------------------
Episode ended after 48 steps
Accumulated reward in this episode 47.0 steps
----------------------------------------
Episode ended after 60 steps
Ac

----------------------------------------
Episode ended after 25 steps
Accumulated reward in this episode 24.0 steps
----------------------------------------
Episode ended after 12 steps
Accumulated reward in this episode 11.0 steps
----------------------------------------
Episode ended after 39 steps
Accumulated reward in this episode 38.0 steps
----------------------------------------
Episode ended after 22 steps
Accumulated reward in this episode 21.0 steps
----------------------------------------
Episode ended after 53 steps
Accumulated reward in this episode 52.0 steps
----------------------------------------
Episode ended after 18 steps
Accumulated reward in this episode 17.0 steps
----------------------------------------
Episode ended after 21 steps
Accumulated reward in this episode 20.0 steps
----------------------------------------
Episode ended after 41 steps
Accumulated reward in this episode 40.0 steps
----------------------------------------
Episode ended after 56 steps
Ac