# Model-Based RL - Tuned
In this exercise you will implement a policy and model network which work in tandem to solve the CartPole reinforcement learning problem.

This is a bonus task where this Model Policy Network is tuned.

### Loading libraries and starting CartPole environment

In [1]:
from __future__ import print_function
import numpy as np
try:
    import cPickle as pickle
except:
    import pickle
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import math

In [2]:
import sys
if sys.version_info.major > 2:
    xrange = range
del sys

In [3]:
import gym
env = gym.make('CartPole-v0')

  result = entry_point.load(False)


### Setting Hyper-parameters

In [4]:
# hyperparameters
H = 16 # number of hidden layer neurons
learning_rate = 1e-2
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?

model_bs = 3 # Batch size when learning from model
real_bs = 3 # Batch size when learning from real environment

# model initialization
D = 4 # input dimensionality

### Policy Network

In [5]:
tf.reset_default_graph()
observations = tf.placeholder(tf.float32, [None,4] , name="input_x")
W1 = tf.get_variable("W1", shape=[4, H],
           initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations,W1))
W2 = tf.get_variable("W2", shape=[H, 1],
           initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1,W2)
probability = tf.nn.sigmoid(score)

tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
advantages = tf.placeholder(tf.float32,name="reward_signal")
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
W1Grad = tf.placeholder(tf.float32,name="batch_grad1")
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
batchGrad = [W1Grad,W2Grad]

################################################################################
# TODO: Implement the loss function.                                           #
# This sends the weights in the direction of making actions that gave good     #
# advantage (reward overtime) more likely, and actions that didn't less likely.#
################################################################################
loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
loss = -tf.reduce_mean(loglik * advantages) 
################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

newGrads = tf.gradients(loss,tvars)
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))

### Model Network
Here we implement a multi-layer neural network that predicts the next observation, reward, and done state from a current state and action.

In [6]:
mH = 512 # model layer size

input_data = tf.placeholder(tf.float32, [None, 5])
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [mH, 50])
    softmax_b = tf.get_variable("softmax_b", [50])

previous_state = tf.placeholder(tf.float32, [None,5] , name="previous_state")
W1M = tf.get_variable("W1M", shape=[5, mH],
           initializer=tf.contrib.layers.xavier_initializer())
B1M = tf.Variable(tf.zeros([mH]),name="B1M")
layer1M = tf.nn.relu(tf.matmul(previous_state,W1M) + B1M)
W2M = tf.get_variable("W2M", shape=[mH, mH],
           initializer=tf.contrib.layers.xavier_initializer())
B2M = tf.Variable(tf.zeros([mH]),name="B2M")
layer2M = tf.nn.relu(tf.matmul(layer1M,W2M) + B2M)
wO = tf.get_variable("wO", shape=[mH, 4],
           initializer=tf.contrib.layers.xavier_initializer())
wR = tf.get_variable("wR", shape=[mH, 1],
           initializer=tf.contrib.layers.xavier_initializer())
wD = tf.get_variable("wD", shape=[mH, 1],
           initializer=tf.contrib.layers.xavier_initializer())

bO = tf.Variable(tf.zeros([4]),name="bO")
bR = tf.Variable(tf.zeros([1]),name="bR")
bD = tf.Variable(tf.ones([1]),name="bD")


predicted_observation = tf.matmul(layer2M,wO,name="predicted_observation") + bO
predicted_reward = tf.matmul(layer2M,wR,name="predicted_reward") + bR
predicted_done = tf.sigmoid(tf.matmul(layer2M,wD,name="predicted_done") + bD)

true_observation = tf.placeholder(tf.float32,[None,4],name="true_observation")
true_reward = tf.placeholder(tf.float32,[None,1],name="true_reward")
true_done = tf.placeholder(tf.float32,[None,1],name="true_done")


predicted_state = tf.concat([predicted_observation,predicted_reward,predicted_done],1)

observation_loss = tf.square(true_observation - predicted_observation)

reward_loss = tf.square(true_reward - predicted_reward)

done_loss = tf.multiply(predicted_done, true_done) + tf.multiply(1-predicted_done, 1-true_done)
done_loss = -tf.log(done_loss)

model_loss = tf.reduce_mean(observation_loss + done_loss + reward_loss)

modelAdam = tf.train.AdamOptimizer(learning_rate=learning_rate)
updateModel = modelAdam.minimize(model_loss)

### Helper-functions

In [7]:
def resetGradBuffer(gradBuffer):
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    return gradBuffer

def discount_rewards(r):
    ################################################################################
    # TODO: Implement the discounted rewards function                              #
    # Return discounted rewards weighed by gamma. Each reward will be replaced     #
    # with a weight reward that involves itself and all the other rewards occuring #
    # after it. The later the reward after it happens, the less effect it has on   #
    # the current rewards's discounted reward                                      #
    # Hint: [r0, r1, r2, ..., r_N] will look someting like:                        #
    #       [(r0 + r1*gamma^1 + ... r_N*gamma^N), (r1 + r2*gamma^1 + ...), ...]    #
    ################################################################################
    rnew = np.copy(r)
    for i in range(1, len(rnew)):
        rnew[:len(r)-i] += gamma**i * r[i:]
    return rnew        
    ################################################################################
    #                                 END OF YOUR CODE                             #
    ################################################################################

# This function uses our model to produce a new state when given a previous state and action
def stepModel(sess, xs, action):
    toFeed = np.reshape(np.hstack([xs[-1][0],np.array(action)]),[1,5])
    myPredict = sess.run([predicted_state],feed_dict={previous_state: toFeed})
    reward = myPredict[0][:,4]
    observation = myPredict[0][:,0:4]
    observation[:,0] = np.clip(observation[:,0],-2.4,2.4)
    observation[:,2] = np.clip(observation[:,2],-0.4,0.4)
    doneP = np.clip(myPredict[0][:,5],0,1)
    if doneP > 0.1 or len(xs)>= 300:
        done = True
    else:
        done = False
    return observation, reward, done

## Training the Policy and Model

In [8]:
xs,drs,ys,ds = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 1
real_episodes = 1
init = tf.global_variables_initializer()
batch_size = real_bs

drawFromModel = False # When set to True, will use model for observations
trainTheModel = True # Whether to train the model
trainThePolicy = False # Whether to train the policy
switch_point = 1

# Launch the graph
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()
    x = observation
    gradBuffer = sess.run(tvars)
    gradBuffer = resetGradBuffer(gradBuffer)
    
    while episode_number <= 5000:
        # Start displaying environment once performance is acceptably high.
        if (reward_sum/batch_size > 150 and drawFromModel == False) or rendering == True : 
#             env.render()
            rendering = True
            
        x = np.reshape(observation,[1,4])

        tfprob = sess.run(probability,feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0

        # record various intermediates (needed later for backprop)
        xs.append(x) 
        y = 1 if action == 0 else 0 
        ys.append(y)
        
        # step the  model or real environment and get new measurements
        if drawFromModel == False:
            observation, reward, done, info = env.step(action)
        else:
            observation, reward, done = stepModel(sess,xs,action)
                
        reward_sum += reward
        
        ds.append(done*1)
        drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

        if done: 
            
            if drawFromModel == False: 
                real_episodes += 1
            episode_number += 1

            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            epd = np.vstack(ds)
            xs,drs,ys,ds = [],[],[],[] # reset array memory
            
            if trainTheModel == True:
                
                ################################################################################
                # TODO: Run the model network and compute predicted_state                      #
                # Output: 'pState'                                                             #
                ################################################################################
                feed_dict = {
                    previous_state: np.hstack([epx[:-1], epy[:-1]]),
                    true_observation: epx[1:],
                    true_reward: epr[1:],
                    true_done: epd[1:]
                }
                tState = np.hstack([epx[1:], epr[1:], epd[1:]])
                _, pState = sess.run([updateModel, predicted_state], feed_dict=feed_dict)
                ################################################################################
                #                                 END OF YOUR CODE                             #
                ################################################################################
                

            if trainThePolicy == True:
                
                ################################################################################
                # TODO: Run the policy network and compute newGrads                            #
                # Output: 'tGrad'                                                              #
                ################################################################################
                discounted_epr = discount_rewards(epr)
                # size the rewards to be unit normal (helps control the gradient estimator variance)
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr //= np.std(discounted_epr)
                tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
                ################################################################################
                #                                 END OF YOUR CODE                             #
                ################################################################################
                
                # If gradients becom too large, end training process
                if np.sum(tGrad[0] == tGrad[0]) == 0:
                    break
                for ix,grad in enumerate(tGrad):
                    gradBuffer[ix] += grad
                
            if switch_point + batch_size == episode_number: 
                switch_point = episode_number
                if trainThePolicy == True:
                    
                    ################################################################################
                    # TODO:                                                                        #
                    # (1) Run the policy network and update gradients                              #
                    # (2) Reset gradBuffer to 0                                                    #
                    ################################################################################
                    sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                    # gradBuffer reset is already done at the beginning of episode
                    ################################################################################
                    #                                 END OF YOUR CODE                             #
                    ################################################################################

                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                if drawFromModel == False:
                    print('World Perf: Episode %f. Reward %f. action: %f. mean reward %f.' % (real_episodes,reward_sum/real_bs,action, running_reward/real_bs))
                    if reward_sum/batch_size > 200:
                        break
                reward_sum = 0

                # Once the model has been trained on 100 episodes
                if episode_number > 1000:
                    
                    ################################################################################
                    # TODO: Alternating between training the policy from the model and training    #
                    # the model from the real environment.                                         #
                    ################################################################################
                    drawFromModel = not drawFromModel
                    trainTheModel = not trainTheModel
                    trainThePolicy = not trainThePolicy
                    ################################################################################
                    #                                 END OF YOUR CODE                             #
                    ################################################################################
            
            if drawFromModel == True:
                observation = np.random.uniform(-0.1,0.1,[4]) # Generate reasonable starting point
                batch_size = model_bs
            else:
                observation = env.reset()
                batch_size = real_bs
                
print(real_episodes)

World Perf: Episode 4.000000. Reward 17.333333. action: 0.000000. mean reward 17.333333.
World Perf: Episode 7.000000. Reward 26.333333. action: 1.000000. mean reward 17.423333.
World Perf: Episode 10.000000. Reward 20.333333. action: 1.000000. mean reward 17.452433.
World Perf: Episode 13.000000. Reward 39.000000. action: 1.000000. mean reward 17.667909.
World Perf: Episode 16.000000. Reward 19.000000. action: 1.000000. mean reward 17.681230.
World Perf: Episode 19.000000. Reward 24.000000. action: 1.000000. mean reward 17.744418.
World Perf: Episode 22.000000. Reward 31.000000. action: 0.000000. mean reward 17.876973.
World Perf: Episode 25.000000. Reward 29.333333. action: 0.000000. mean reward 17.991537.
World Perf: Episode 28.000000. Reward 21.000000. action: 1.000000. mean reward 18.021622.
World Perf: Episode 31.000000. Reward 15.333333. action: 1.000000. mean reward 17.994739.
World Perf: Episode 34.000000. Reward 17.000000. action: 0.000000. mean reward 17.984791.
World Perf: 

World Perf: Episode 289.000000. Reward 26.333333. action: 1.000000. mean reward 19.799648.
World Perf: Episode 292.000000. Reward 26.333333. action: 0.000000. mean reward 19.864984.
World Perf: Episode 295.000000. Reward 16.666667. action: 0.000000. mean reward 19.833001.
World Perf: Episode 298.000000. Reward 20.666667. action: 1.000000. mean reward 19.841338.
World Perf: Episode 301.000000. Reward 15.666667. action: 0.000000. mean reward 19.799591.
World Perf: Episode 304.000000. Reward 21.000000. action: 1.000000. mean reward 19.811595.
World Perf: Episode 307.000000. Reward 18.000000. action: 1.000000. mean reward 19.793479.
World Perf: Episode 310.000000. Reward 28.000000. action: 0.000000. mean reward 19.875544.
World Perf: Episode 313.000000. Reward 19.000000. action: 1.000000. mean reward 19.866789.
World Perf: Episode 316.000000. Reward 33.666667. action: 1.000000. mean reward 20.004788.
World Perf: Episode 319.000000. Reward 17.000000. action: 1.000000. mean reward 19.974740.

World Perf: Episode 574.000000. Reward 24.333333. action: 1.000000. mean reward 20.570298.
World Perf: Episode 577.000000. Reward 23.000000. action: 1.000000. mean reward 20.594595.
World Perf: Episode 580.000000. Reward 13.333333. action: 1.000000. mean reward 20.521983.
World Perf: Episode 583.000000. Reward 18.333333. action: 0.000000. mean reward 20.500096.
World Perf: Episode 586.000000. Reward 24.333333. action: 0.000000. mean reward 20.538429.
World Perf: Episode 589.000000. Reward 15.333333. action: 0.000000. mean reward 20.486378.
World Perf: Episode 592.000000. Reward 15.333333. action: 0.000000. mean reward 20.434847.
World Perf: Episode 595.000000. Reward 12.666667. action: 1.000000. mean reward 20.357166.
World Perf: Episode 598.000000. Reward 20.000000. action: 1.000000. mean reward 20.353594.
World Perf: Episode 601.000000. Reward 14.666667. action: 1.000000. mean reward 20.296725.
World Perf: Episode 604.000000. Reward 22.333333. action: 0.000000. mean reward 20.317091.

World Perf: Episode 859.000000. Reward 21.333333. action: 1.000000. mean reward 20.700685.
World Perf: Episode 862.000000. Reward 20.666667. action: 1.000000. mean reward 20.700345.
World Perf: Episode 865.000000. Reward 13.000000. action: 1.000000. mean reward 20.623341.
World Perf: Episode 868.000000. Reward 19.000000. action: 1.000000. mean reward 20.607108.
World Perf: Episode 871.000000. Reward 14.666667. action: 0.000000. mean reward 20.547704.
World Perf: Episode 874.000000. Reward 23.333333. action: 0.000000. mean reward 20.575560.
World Perf: Episode 877.000000. Reward 14.666667. action: 1.000000. mean reward 20.516471.
World Perf: Episode 880.000000. Reward 18.000000. action: 1.000000. mean reward 20.491306.
World Perf: Episode 883.000000. Reward 17.333333. action: 1.000000. mean reward 20.459727.
World Perf: Episode 886.000000. Reward 23.333333. action: 1.000000. mean reward 20.488463.
World Perf: Episode 889.000000. Reward 33.333333. action: 0.000000. mean reward 20.616911.

World Perf: Episode 1129.000000. Reward 9.666667. action: 1.000000. mean reward 25.506653.
World Perf: Episode 1132.000000. Reward 14.333333. action: 0.000000. mean reward 25.213549.
World Perf: Episode 1135.000000. Reward 11.666667. action: 0.000000. mean reward 25.889967.
World Perf: Episode 1138.000000. Reward 12.000000. action: 1.000000. mean reward 25.579987.
World Perf: Episode 1141.000000. Reward 11.333333. action: 1.000000. mean reward 25.239510.
World Perf: Episode 1144.000000. Reward 10.666667. action: 0.000000. mean reward 27.097336.
World Perf: Episode 1147.000000. Reward 10.333333. action: 1.000000. mean reward 26.712702.
World Perf: Episode 1150.000000. Reward 10.333333. action: 0.000000. mean reward 26.429365.
World Perf: Episode 1153.000000. Reward 10.333333. action: 0.000000. mean reward 29.033224.
World Perf: Episode 1156.000000. Reward 11.333333. action: 1.000000. mean reward 28.667185.
World Perf: Episode 1159.000000. Reward 12.000000. action: 0.000000. mean reward 

World Perf: Episode 1399.000000. Reward 9.333333. action: 0.000000. mean reward 111.314629.
World Perf: Episode 1402.000000. Reward 10.000000. action: 1.000000. mean reward 112.220573.
World Perf: Episode 1405.000000. Reward 9.666667. action: 1.000000. mean reward 110.166115.
World Perf: Episode 1408.000000. Reward 9.666667. action: 1.000000. mean reward 111.064995.
World Perf: Episode 1411.000000. Reward 9.666667. action: 0.000000. mean reward 111.997375.
World Perf: Episode 1414.000000. Reward 9.666667. action: 0.000000. mean reward 112.765900.
World Perf: Episode 1417.000000. Reward 9.666667. action: 1.000000. mean reward 113.564217.
World Perf: Episode 1420.000000. Reward 9.333333. action: 0.000000. mean reward 114.432198.
World Perf: Episode 1423.000000. Reward 9.333333. action: 0.000000. mean reward 115.220406.
World Perf: Episode 1426.000000. Reward 9.666667. action: 0.000000. mean reward 115.972069.
World Perf: Episode 1429.000000. Reward 9.000000. action: 1.000000. mean reward

World Perf: Episode 1666.000000. Reward 9.333333. action: 1.000000. mean reward 146.447632.
World Perf: Episode 1669.000000. Reward 9.333333. action: 0.000000. mean reward 146.496490.
World Perf: Episode 1672.000000. Reward 9.000000. action: 0.000000. mean reward 146.609726.
World Perf: Episode 1675.000000. Reward 9.000000. action: 1.000000. mean reward 146.882812.
World Perf: Episode 1678.000000. Reward 8.666667. action: 1.000000. mean reward 147.030289.
World Perf: Episode 1681.000000. Reward 9.333333. action: 1.000000. mean reward 147.116638.
World Perf: Episode 1684.000000. Reward 9.666667. action: 0.000000. mean reward 147.180832.
World Perf: Episode 1687.000000. Reward 9.333333. action: 0.000000. mean reward 147.363434.
World Perf: Episode 1690.000000. Reward 9.666667. action: 1.000000. mean reward 147.580887.
World Perf: Episode 1693.000000. Reward 8.666667. action: 1.000000. mean reward 147.615158.
World Perf: Episode 1696.000000. Reward 9.000000. action: 0.000000. mean reward 

World Perf: Episode 1936.000000. Reward 9.333333. action: 1.000000. mean reward 150.283218.
World Perf: Episode 1939.000000. Reward 9.666667. action: 1.000000. mean reward 150.466751.
World Perf: Episode 1942.000000. Reward 8.333333. action: 0.000000. mean reward 150.505249.
World Perf: Episode 1945.000000. Reward 9.333333. action: 1.000000. mean reward 150.487198.
World Perf: Episode 1948.000000. Reward 9.333333. action: 1.000000. mean reward 150.645157.
World Perf: Episode 1951.000000. Reward 9.333333. action: 1.000000. mean reward 150.754013.
World Perf: Episode 1954.000000. Reward 9.666667. action: 1.000000. mean reward 150.776016.
World Perf: Episode 1957.000000. Reward 8.666667. action: 1.000000. mean reward 150.769608.
World Perf: Episode 1960.000000. Reward 9.666667. action: 1.000000. mean reward 150.880447.
World Perf: Episode 1963.000000. Reward 8.666667. action: 0.000000. mean reward 150.983749.
World Perf: Episode 1966.000000. Reward 9.333333. action: 1.000000. mean reward 

World Perf: Episode 2203.000000. Reward 9.333333. action: 0.000000. mean reward 154.386612.
World Perf: Episode 2206.000000. Reward 9.333333. action: 0.000000. mean reward 154.324188.
World Perf: Episode 2209.000000. Reward 9.000000. action: 0.000000. mean reward 154.510574.
World Perf: Episode 2212.000000. Reward 8.333333. action: 0.000000. mean reward 154.418777.
World Perf: Episode 2215.000000. Reward 8.666667. action: 0.000000. mean reward 154.313751.
World Perf: Episode 2218.000000. Reward 9.666667. action: 1.000000. mean reward 154.399857.
World Perf: Episode 2221.000000. Reward 9.333333. action: 1.000000. mean reward 154.393234.
World Perf: Episode 2224.000000. Reward 8.666667. action: 1.000000. mean reward 154.345749.
World Perf: Episode 2227.000000. Reward 8.666667. action: 0.000000. mean reward 154.285568.
World Perf: Episode 2230.000000. Reward 8.666667. action: 0.000000. mean reward 154.397278.
World Perf: Episode 2233.000000. Reward 8.666667. action: 1.000000. mean reward 

World Perf: Episode 2473.000000. Reward 9.666667. action: 1.000000. mean reward 154.623947.
World Perf: Episode 2476.000000. Reward 9.000000. action: 1.000000. mean reward 154.648514.
World Perf: Episode 2479.000000. Reward 9.000000. action: 1.000000. mean reward 154.601135.
World Perf: Episode 2482.000000. Reward 9.000000. action: 0.000000. mean reward 154.559402.
World Perf: Episode 2485.000000. Reward 9.666667. action: 1.000000. mean reward 154.588943.
World Perf: Episode 2488.000000. Reward 9.333333. action: 0.000000. mean reward 154.647156.
World Perf: Episode 2491.000000. Reward 9.333333. action: 1.000000. mean reward 154.645126.
World Perf: Episode 2494.000000. Reward 9.000000. action: 1.000000. mean reward 154.661148.
World Perf: Episode 2497.000000. Reward 9.333333. action: 0.000000. mean reward 154.653915.
World Perf: Episode 2500.000000. Reward 9.333333. action: 0.000000. mean reward 154.709671.
World Perf: Episode 2503.000000. Reward 8.000000. action: 0.000000. mean reward 

World Perf: Episode 2743.000000. Reward 8.666667. action: 0.000000. mean reward 155.086029.
World Perf: Episode 2746.000000. Reward 9.000000. action: 1.000000. mean reward 155.015259.
World Perf: Episode 2749.000000. Reward 9.000000. action: 1.000000. mean reward 155.064697.
World Perf: Episode 2752.000000. Reward 9.666667. action: 1.000000. mean reward 155.049820.
World Perf: Episode 2755.000000. Reward 8.666667. action: 0.000000. mean reward 154.959274.
World Perf: Episode 2758.000000. Reward 8.666667. action: 0.000000. mean reward 155.054581.
World Perf: Episode 2761.000000. Reward 9.000000. action: 1.000000. mean reward 155.125137.
World Perf: Episode 2764.000000. Reward 9.333333. action: 1.000000. mean reward 155.109970.
World Perf: Episode 2767.000000. Reward 9.666667. action: 1.000000. mean reward 155.060776.
World Perf: Episode 2770.000000. Reward 9.000000. action: 1.000000. mean reward 155.189590.
World Perf: Episode 2773.000000. Reward 9.333333. action: 1.000000. mean reward 

Originally at Epsiode 2551, mean reward is about 138

Now at Epsiode 2551, mean reward is 155 ==> 17 points more (+12% increase)