# Reinforcement Learning for Cart Pole game

Import libraries:

In [1]:
import numpy as np
import tensorflow as tf
import gym
import matplotlib.pyplot as plt
% matplotlib notebook

  from ._conv import register_converters as _register_converters


Defining hyperparameters:

In [2]:
num_episodes = 1000      # Number of episodes to play
learning_rate = 0.01
gamma = 0.99           # Discount factor for reward
num_Hidden = 10       # number of nodes in the hidden layer

env = gym.make('CartPole-v0')    # Choose a game and create an environment
obs_dim = env.reset().shape      # obervation dimension
num_actions = env.action_space.n # number of actions (this works only for descrete action space, which is the case here)

### Define functions

In [3]:
def discount_rewards(r):
    '''Takes a 1D rewards (one episode) and discounts it and also standardize
    the rewards to be unit normal (helps control the gradient estimator variance)'''
    
    # Discounting
    dis_r = np.zeros_like(r)
    running_sum = 0
    for t in reversed(range(len(r))):
        running_sum = gamma*running_sum + r[t]
        dis_r[t] = running_sum
    
    # Normailzing
    dis_r = (dis_r - np.mean(dis_r))/np.std(dis_r)
        
    return dis_r

### Build model

We start by creating the simplest model: Observation as an input, one hidden layer and one node in the output layer - corresponding to the action of moving right (1) or left (0)  

In [4]:
'''Create placeholders for inputs'''
with tf.name_scope("inputs"):
    # A place holder for input observations
    input_ = tf.placeholder(tf.float32, shape = (None, obs_dim[0]), name = "input")
    # A place holder for actions in a full episode
    actions = tf.placeholder(tf.float32, shape = (None, num_actions), name = "actions")
    # A place holder for discounted rewards in a full episode
    dis_rewards = tf.placeholder(tf.float32, shape = (None, 1), name = "dis_rewards")

'''Fully connected layers'''
with tf.name_scope("FC"):
    fc1 = tf.layers.dense(inputs = input_, units = num_Hidden ,activation = tf.nn.relu, name = "fc1" )
    fc2 = tf.layers.dense(inputs = fc1, units = num_actions ,activation = tf.nn.relu, name = "fc2" )
    fc3 = tf.layers.dense(inputs = fc2, units = num_actions ,activation = None, name = "fc3" )
    
# Operate with softmax on fc2 outputs to get a probability distribution
action_prob_dist = tf.nn.softmax(logits = fc3, name = "softamx")

'''Define loss'''
# Fist define reular softmax cross entropy loss
CE_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels = actions, logits = fc3, name = "CE_loss")
# Modulate the loss based on our discounted reward - this is where reinforcment learning happens,
# we favor actions that produced high reward
loss = tf.reduce_mean(CE_loss * dis_rewards)

'''Define optimizer'''
training_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

'''Define saver for saving and restoring model'''
saver = tf.train.Saver()

### Train model

In [5]:
ep_action, ep_obs, ep_reward = [], [], []  # Allocate space for episode actions, observations and rewards
tot_ep_reward = [] # Total episode reward

''' Run TF session '''
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    ''' Run episodes '''
    for ep in range(num_episodes): 
        obs = env.reset()  # Reset and save first observation
        ep_obs.append(obs) # append observation

        ''' Run steps '''
        while True:
            # Propagate forward to compute action probability distribution
            apd = np.squeeze(sess.run(action_prob_dist, feed_dict = {input_ : obs.reshape((1,4))}))
            action = np.random.choice(np.arange(num_actions), p = apd)   # Sample an action based on the pdf
            obs, reward, done, info = env.step(action)  # Take action and save observation, reward and done boolean
            
            # Convert action to one hot
            action_oh = np.zeros((1,num_actions))
            action_oh[0,action] = 1
            
            ep_action.append(action_oh)  # append action
            ep_obs.append(obs)           # append observation
            ep_reward.append(reward)     # append reward

            if done: 
                # Stack vertically episode parameters to one np.array
                ep_action = np.vstack(ep_action)
                ep_obs = np.vstack(ep_obs)
                ep_reward = np.vstack(ep_reward)

                # Discount rewards
                dis_rewards_arr = discount_rewards(ep_reward)
                # Compute loss and optimize
                sess.run([loss, training_opt],
                         feed_dict = {input_ : ep_obs[:-1], actions : ep_action, dis_rewards : dis_rewards_arr})
                
                tot_ep_reward.append(np.sum(ep_reward))  # Compute total reward for episode
                
                 # print info
                print("-------------------------------------------------")
                print("Episode {}".format(ep))
                print("Episode ended after {} steps".format(ep_action.shape[0]))
                print("Accumulated reward in this episode {}".format(tot_ep_reward[ep]))
                print("Mean reward so far {:0.2f}".format(np.mean(tot_ep_reward)))
                print("Maximal reward so far {}".format(np.max(tot_ep_reward)))
                
                ep_action, ep_obs, ep_reward = [], [], []  # Clear episode values for next episode
                      
                break

    saver.save(sess, "models/model.ckpt") # save model for later
env.close()

-------------------------------------------------
Episode 0
Episode ended after 10 steps
Accumulated reward in this episode 10.0
Mean reward so far 10.00
Maximal reward so far 10.0
-------------------------------------------------
Episode 1
Episode ended after 16 steps
Accumulated reward in this episode 16.0
Mean reward so far 13.00
Maximal reward so far 16.0
-------------------------------------------------
Episode 2
Episode ended after 20 steps
Accumulated reward in this episode 20.0
Mean reward so far 15.33
Maximal reward so far 20.0
-------------------------------------------------
Episode 3
Episode ended after 30 steps
Accumulated reward in this episode 30.0
Mean reward so far 19.00
Maximal reward so far 30.0
-------------------------------------------------
Episode 4
Episode ended after 22 steps
Accumulated reward in this episode 22.0
Mean reward so far 19.60
Maximal reward so far 30.0
-------------------------------------------------
Episode 5
Episode ended after 12 steps
Accumu

-------------------------------------------------
Episode 48
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.63
Maximal reward so far 66.0
-------------------------------------------------
Episode 49
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.48
Maximal reward so far 66.0
-------------------------------------------------
Episode 50
Episode ended after 12 steps
Accumulated reward in this episode 12.0
Mean reward so far 21.29
Maximal reward so far 66.0
-------------------------------------------------
Episode 51
Episode ended after 9 steps
Accumulated reward in this episode 9.0
Mean reward so far 21.06
Maximal reward so far 66.0
-------------------------------------------------
Episode 52
Episode ended after 28 steps
Accumulated reward in this episode 28.0
Mean reward so far 21.19
Maximal reward so far 66.0
-------------------------------------------------
Episode 53
Episode ended after 18 steps
Ac

-------------------------------------------------
Episode 102
Episode ended after 11 steps
Accumulated reward in this episode 11.0
Mean reward so far 20.21
Maximal reward so far 73.0
-------------------------------------------------
Episode 103
Episode ended after 45 steps
Accumulated reward in this episode 45.0
Mean reward so far 20.45
Maximal reward so far 73.0
-------------------------------------------------
Episode 104
Episode ended after 24 steps
Accumulated reward in this episode 24.0
Mean reward so far 20.49
Maximal reward so far 73.0
-------------------------------------------------
Episode 105
Episode ended after 25 steps
Accumulated reward in this episode 25.0
Mean reward so far 20.53
Maximal reward so far 73.0
-------------------------------------------------
Episode 106
Episode ended after 29 steps
Accumulated reward in this episode 29.0
Mean reward so far 20.61
Maximal reward so far 73.0
-------------------------------------------------
Episode 107
Episode ended after 21 

-------------------------------------------------
Episode 150
Episode ended after 64 steps
Accumulated reward in this episode 64.0
Mean reward so far 21.28
Maximal reward so far 148.0
-------------------------------------------------
Episode 151
Episode ended after 15 steps
Accumulated reward in this episode 15.0
Mean reward so far 21.24
Maximal reward so far 148.0
-------------------------------------------------
Episode 152
Episode ended after 20 steps
Accumulated reward in this episode 20.0
Mean reward so far 21.24
Maximal reward so far 148.0
-------------------------------------------------
Episode 153
Episode ended after 15 steps
Accumulated reward in this episode 15.0
Mean reward so far 21.19
Maximal reward so far 148.0
-------------------------------------------------
Episode 154
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.15
Maximal reward so far 148.0
-------------------------------------------------
Episode 155
Episode ended afte

-------------------------------------------------
Episode 201
Episode ended after 22 steps
Accumulated reward in this episode 22.0
Mean reward so far 21.30
Maximal reward so far 148.0
-------------------------------------------------
Episode 202
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.27
Maximal reward so far 148.0
-------------------------------------------------
Episode 203
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.23
Maximal reward so far 148.0
-------------------------------------------------
Episode 204
Episode ended after 22 steps
Accumulated reward in this episode 22.0
Mean reward so far 21.23
Maximal reward so far 148.0
-------------------------------------------------
Episode 205
Episode ended after 19 steps
Accumulated reward in this episode 19.0
Mean reward so far 21.22
Maximal reward so far 148.0
-------------------------------------------------
Episode 206
Episode ended afte

-------------------------------------------------
Episode 246
Episode ended after 46 steps
Accumulated reward in this episode 46.0
Mean reward so far 21.85
Maximal reward so far 148.0
-------------------------------------------------
Episode 247
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.81
Maximal reward so far 148.0
-------------------------------------------------
Episode 248
Episode ended after 19 steps
Accumulated reward in this episode 19.0
Mean reward so far 21.80
Maximal reward so far 148.0
-------------------------------------------------
Episode 249
Episode ended after 32 steps
Accumulated reward in this episode 32.0
Mean reward so far 21.84
Maximal reward so far 148.0
-------------------------------------------------
Episode 250
Episode ended after 12 steps
Accumulated reward in this episode 12.0
Mean reward so far 21.80
Maximal reward so far 148.0
-------------------------------------------------
Episode 251
Episode ended afte

Maximal reward so far 148.0
-------------------------------------------------
Episode 298
Episode ended after 12 steps
Accumulated reward in this episode 12.0
Mean reward so far 21.48
Maximal reward so far 148.0
-------------------------------------------------
Episode 299
Episode ended after 13 steps
Accumulated reward in this episode 13.0
Mean reward so far 21.46
Maximal reward so far 148.0
-------------------------------------------------
Episode 300
Episode ended after 17 steps
Accumulated reward in this episode 17.0
Mean reward so far 21.44
Maximal reward so far 148.0
-------------------------------------------------
Episode 301
Episode ended after 15 steps
Accumulated reward in this episode 15.0
Mean reward so far 21.42
Maximal reward so far 148.0
-------------------------------------------------
Episode 302
Episode ended after 26 steps
Accumulated reward in this episode 26.0
Mean reward so far 21.44
Maximal reward so far 148.0
-------------------------------------------------
Ep

-------------------------------------------------
Episode 345
Episode ended after 21 steps
Accumulated reward in this episode 21.0
Mean reward so far 21.66
Maximal reward so far 148.0
-------------------------------------------------
Episode 346
Episode ended after 22 steps
Accumulated reward in this episode 22.0
Mean reward so far 21.66
Maximal reward so far 148.0
-------------------------------------------------
Episode 347
Episode ended after 27 steps
Accumulated reward in this episode 27.0
Mean reward so far 21.68
Maximal reward so far 148.0
-------------------------------------------------
Episode 348
Episode ended after 13 steps
Accumulated reward in this episode 13.0
Mean reward so far 21.65
Maximal reward so far 148.0
-------------------------------------------------
Episode 349
Episode ended after 13 steps
Accumulated reward in this episode 13.0
Mean reward so far 21.63
Maximal reward so far 148.0
-------------------------------------------------
Episode 350
Episode ended afte

-------------------------------------------------
Episode 397
Episode ended after 19 steps
Accumulated reward in this episode 19.0
Mean reward so far 21.45
Maximal reward so far 148.0
-------------------------------------------------
Episode 398
Episode ended after 16 steps
Accumulated reward in this episode 16.0
Mean reward so far 21.43
Maximal reward so far 148.0
-------------------------------------------------
Episode 399
Episode ended after 12 steps
Accumulated reward in this episode 12.0
Mean reward so far 21.41
Maximal reward so far 148.0
-------------------------------------------------
Episode 400
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.39
Maximal reward so far 148.0
-------------------------------------------------
Episode 401
Episode ended after 24 steps
Accumulated reward in this episode 24.0
Mean reward so far 21.40
Maximal reward so far 148.0
-------------------------------------------------
Episode 402
Episode ended afte

-------------------------------------------------
Episode 445
Episode ended after 12 steps
Accumulated reward in this episode 12.0
Mean reward so far 21.60
Maximal reward so far 148.0
-------------------------------------------------
Episode 446
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.59
Maximal reward so far 148.0
-------------------------------------------------
Episode 447
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.57
Maximal reward so far 148.0
-------------------------------------------------
Episode 448
Episode ended after 23 steps
Accumulated reward in this episode 23.0
Mean reward so far 21.57
Maximal reward so far 148.0
-------------------------------------------------
Episode 449
Episode ended after 21 steps
Accumulated reward in this episode 21.0
Mean reward so far 21.57
Maximal reward so far 148.0
-------------------------------------------------
Episode 450
Episode ended afte

-------------------------------------------------
Episode 492
Episode ended after 16 steps
Accumulated reward in this episode 16.0
Mean reward so far 21.69
Maximal reward so far 148.0
-------------------------------------------------
Episode 493
Episode ended after 13 steps
Accumulated reward in this episode 13.0
Mean reward so far 21.68
Maximal reward so far 148.0
-------------------------------------------------
Episode 494
Episode ended after 33 steps
Accumulated reward in this episode 33.0
Mean reward so far 21.70
Maximal reward so far 148.0
-------------------------------------------------
Episode 495
Episode ended after 22 steps
Accumulated reward in this episode 22.0
Mean reward so far 21.70
Maximal reward so far 148.0
-------------------------------------------------
Episode 496
Episode ended after 32 steps
Accumulated reward in this episode 32.0
Mean reward so far 21.72
Maximal reward so far 148.0
-------------------------------------------------
Episode 497
Episode ended afte

-------------------------------------------------
Episode 541
Episode ended after 37 steps
Accumulated reward in this episode 37.0
Mean reward so far 21.67
Maximal reward so far 148.0
-------------------------------------------------
Episode 542
Episode ended after 11 steps
Accumulated reward in this episode 11.0
Mean reward so far 21.65
Maximal reward so far 148.0
-------------------------------------------------
Episode 543
Episode ended after 17 steps
Accumulated reward in this episode 17.0
Mean reward so far 21.64
Maximal reward so far 148.0
-------------------------------------------------
Episode 544
Episode ended after 13 steps
Accumulated reward in this episode 13.0
Mean reward so far 21.62
Maximal reward so far 148.0
-------------------------------------------------
Episode 545
Episode ended after 13 steps
Accumulated reward in this episode 13.0
Mean reward so far 21.61
Maximal reward so far 148.0
-------------------------------------------------
Episode 546
Episode ended afte

-------------------------------------------------
Episode 591
Episode ended after 17 steps
Accumulated reward in this episode 17.0
Mean reward so far 21.58
Maximal reward so far 148.0
-------------------------------------------------
Episode 592
Episode ended after 16 steps
Accumulated reward in this episode 16.0
Mean reward so far 21.57
Maximal reward so far 148.0
-------------------------------------------------
Episode 593
Episode ended after 11 steps
Accumulated reward in this episode 11.0
Mean reward so far 21.55
Maximal reward so far 148.0
-------------------------------------------------
Episode 594
Episode ended after 18 steps
Accumulated reward in this episode 18.0
Mean reward so far 21.55
Maximal reward so far 148.0
-------------------------------------------------
Episode 595
Episode ended after 27 steps
Accumulated reward in this episode 27.0
Mean reward so far 21.56
Maximal reward so far 148.0
-------------------------------------------------
Episode 596
Episode ended afte

-------------------------------------------------
Episode 643
Episode ended after 33 steps
Accumulated reward in this episode 33.0
Mean reward so far 21.46
Maximal reward so far 148.0
-------------------------------------------------
Episode 644
Episode ended after 38 steps
Accumulated reward in this episode 38.0
Mean reward so far 21.49
Maximal reward so far 148.0
-------------------------------------------------
Episode 645
Episode ended after 21 steps
Accumulated reward in this episode 21.0
Mean reward so far 21.49
Maximal reward so far 148.0
-------------------------------------------------
Episode 646
Episode ended after 17 steps
Accumulated reward in this episode 17.0
Mean reward so far 21.48
Maximal reward so far 148.0
-------------------------------------------------
Episode 647
Episode ended after 17 steps
Accumulated reward in this episode 17.0
Mean reward so far 21.47
Maximal reward so far 148.0
-------------------------------------------------
Episode 648
Episode ended afte

-------------------------------------------------
Episode 691
Episode ended after 31 steps
Accumulated reward in this episode 31.0
Mean reward so far 21.47
Maximal reward so far 148.0
-------------------------------------------------
Episode 692
Episode ended after 12 steps
Accumulated reward in this episode 12.0
Mean reward so far 21.46
Maximal reward so far 148.0
-------------------------------------------------
Episode 693
Episode ended after 38 steps
Accumulated reward in this episode 38.0
Mean reward so far 21.48
Maximal reward so far 148.0
-------------------------------------------------
Episode 694
Episode ended after 16 steps
Accumulated reward in this episode 16.0
Mean reward so far 21.48
Maximal reward so far 148.0
-------------------------------------------------
Episode 695
Episode ended after 54 steps
Accumulated reward in this episode 54.0
Mean reward so far 21.52
Maximal reward so far 148.0
-------------------------------------------------
Episode 696
Episode ended afte

-------------------------------------------------
Episode 736
Episode ended after 29 steps
Accumulated reward in this episode 29.0
Mean reward so far 21.61
Maximal reward so far 148.0
-------------------------------------------------
Episode 737
Episode ended after 24 steps
Accumulated reward in this episode 24.0
Mean reward so far 21.61
Maximal reward so far 148.0
-------------------------------------------------
Episode 738
Episode ended after 15 steps
Accumulated reward in this episode 15.0
Mean reward so far 21.60
Maximal reward so far 148.0
-------------------------------------------------
Episode 739
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.59
Maximal reward so far 148.0
-------------------------------------------------
Episode 740
Episode ended after 43 steps
Accumulated reward in this episode 43.0
Mean reward so far 21.62
Maximal reward so far 148.0
-------------------------------------------------
Episode 741
Episode ended afte

-------------------------------------------------
Episode 796
Episode ended after 15 steps
Accumulated reward in this episode 15.0
Mean reward so far 21.76
Maximal reward so far 148.0
-------------------------------------------------
Episode 797
Episode ended after 15 steps
Accumulated reward in this episode 15.0
Mean reward so far 21.75
Maximal reward so far 148.0
-------------------------------------------------
Episode 798
Episode ended after 15 steps
Accumulated reward in this episode 15.0
Mean reward so far 21.74
Maximal reward so far 148.0
-------------------------------------------------
Episode 799
Episode ended after 93 steps
Accumulated reward in this episode 93.0
Mean reward so far 21.83
Maximal reward so far 148.0
-------------------------------------------------
Episode 800
Episode ended after 11 steps
Accumulated reward in this episode 11.0
Mean reward so far 21.82
Maximal reward so far 148.0
-------------------------------------------------
Episode 801
Episode ended afte

Mean reward so far 21.72
Maximal reward so far 148.0
-------------------------------------------------
Episode 846
Episode ended after 27 steps
Accumulated reward in this episode 27.0
Mean reward so far 21.72
Maximal reward so far 148.0
-------------------------------------------------
Episode 847
Episode ended after 18 steps
Accumulated reward in this episode 18.0
Mean reward so far 21.72
Maximal reward so far 148.0
-------------------------------------------------
Episode 848
Episode ended after 13 steps
Accumulated reward in this episode 13.0
Mean reward so far 21.71
Maximal reward so far 148.0
-------------------------------------------------
Episode 849
Episode ended after 28 steps
Accumulated reward in this episode 28.0
Mean reward so far 21.72
Maximal reward so far 148.0
-------------------------------------------------
Episode 850
Episode ended after 21 steps
Accumulated reward in this episode 21.0
Mean reward so far 21.72
Maximal reward so far 148.0
---------------------------

-------------------------------------------------
Episode 892
Episode ended after 18 steps
Accumulated reward in this episode 18.0
Mean reward so far 21.79
Maximal reward so far 148.0
-------------------------------------------------
Episode 893
Episode ended after 26 steps
Accumulated reward in this episode 26.0
Mean reward so far 21.79
Maximal reward so far 148.0
-------------------------------------------------
Episode 894
Episode ended after 61 steps
Accumulated reward in this episode 61.0
Mean reward so far 21.84
Maximal reward so far 148.0
-------------------------------------------------
Episode 895
Episode ended after 10 steps
Accumulated reward in this episode 10.0
Mean reward so far 21.82
Maximal reward so far 148.0
-------------------------------------------------
Episode 896
Episode ended after 16 steps
Accumulated reward in this episode 16.0
Mean reward so far 21.82
Maximal reward so far 148.0
-------------------------------------------------
Episode 897
Episode ended afte

Episode 939
Episode ended after 9 steps
Accumulated reward in this episode 9.0
Mean reward so far 21.87
Maximal reward so far 148.0
-------------------------------------------------
Episode 940
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.86
Maximal reward so far 148.0
-------------------------------------------------
Episode 941
Episode ended after 13 steps
Accumulated reward in this episode 13.0
Mean reward so far 21.85
Maximal reward so far 148.0
-------------------------------------------------
Episode 942
Episode ended after 40 steps
Accumulated reward in this episode 40.0
Mean reward so far 21.87
Maximal reward so far 148.0
-------------------------------------------------
Episode 943
Episode ended after 19 steps
Accumulated reward in this episode 19.0
Mean reward so far 21.87
Maximal reward so far 148.0
-------------------------------------------------
Episode 944
Episode ended after 17 steps
Accumulated reward in this episode 17.0
M

-------------------------------------------------
Episode 990
Episode ended after 18 steps
Accumulated reward in this episode 18.0
Mean reward so far 21.88
Maximal reward so far 148.0
-------------------------------------------------
Episode 991
Episode ended after 10 steps
Accumulated reward in this episode 10.0
Mean reward so far 21.87
Maximal reward so far 148.0
-------------------------------------------------
Episode 992
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.87
Maximal reward so far 148.0
-------------------------------------------------
Episode 993
Episode ended after 14 steps
Accumulated reward in this episode 14.0
Mean reward so far 21.86
Maximal reward so far 148.0
-------------------------------------------------
Episode 994
Episode ended after 17 steps
Accumulated reward in this episode 17.0
Mean reward so far 21.85
Maximal reward so far 148.0
-------------------------------------------------
Episode 995
Episode ended afte

## Watch the agent play an episode

In [6]:
with tf.Session() as sess:
    saver.restore(sess, "models/model.ckpt") # load model
    obs = env.reset() # Reset env and save observation
    t = 0
    while True:
        env.render() # Render game
        # Use our model to create a probability distribution of actions based on observation
        apd = np.squeeze(sess.run(action_prob_dist, feed_dict={input_ : obs.reshape((1,4))}))
        # Choose an action out of the PDF and take action
        action = np.random.choice(np.arange(num_actions), p = apd)
        obs, reward, done, info = env.step(action)
        t = t+1
        if done:
            print("Game ended after {} steps".format(t+1))
            break
env.close()

INFO:tensorflow:Restoring parameters from models/model.ckpt
Game ended after 19 steps
