In [1]:
import numpy as np
import _pickle as pickle
import tensorflow as tf
import matplotlib.pyplot as plt
import math

%matplotlib inline

from tensorflow.python.framework import dtypes, ops
from tensorflow.python.ops import array_ops, control_flow_ops, embedding_ops, math_ops, nn_ops, variable_scope

  from ._conv import register_converters as _register_converters


In [2]:
import gym
# Build the CartPole-v0 environment once; `env` is the module-level handle that
# the later cells call reset() / render() / step() on.
env = gym.make('CartPole-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
# Hyperparameters
H = 8 # number of hidden layer neurons in the policy network
learning_rate = 1e-2 # Step size given to the Adam optimizers of both the policy and the model
gamma = 0.99 # Discount factor for reward (read by discount_rewards)
# NOTE(review): the optimizers visible in this notebook are Adam, and no visible
# cell reads decay_rate -- confirm this RMSProp setting is still needed.
decay_rate = 0.99 # Decay factor for RMSProp leaky sum of grad^2
# NOTE(review): no visible cell reads `resume` -- confirm it is still used.
resume = False # Resume from previous checkpoint

policy_bs = 3 # Batch size when training policy
model_bs = 3 # Batch size when training model

# Model initialization
D = 4 # Input dimensionality: observations are reshaped to [1, 4] before being fed to the policy

# Policy Network

In [4]:
tf.reset_default_graph()

# Policy network: observation (D values) -> H ReLU units -> one logit -> sigmoid.
# NOTE: the variable names ("W1", "W2") and the order in which variables and
# optimizers are created determine the names stored in / restored from the
# checkpoint by tf.train.Saver, so do not rename or reorder them.
observations = tf.placeholder(tf.float32, [None, D], name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
                     initializer=tf.contrib.layers.xavier_initializer())
a1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(a1, W2)    # Pre-sigmoid logit, shape [batch, 1]
proba = tf.nn.sigmoid(score) # Probability of taking action "1"

# Captured here, before the model network is built, so it holds only W1 and W2.
tvars = tf.trainable_variables()
# The action that was performed, as a float
# NOTE(review): the loss below rewards log(1 - proba) when input_y == 1, so it is
# only a correct policy-gradient loss if input_y is 1 when action "0" was taken
# (an inverted label) -- confirm against what the training loop feeds.
input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
# How much reward offset by a baseline did we get after taking `this` action?
advantages = tf.placeholder(tf.float32, name="reward_signal")
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Externally accumulated gradients, one placeholder per entry of `tvars`.
W1grad = tf.placeholder(tf.float32, name="batch_grad1")
W2grad = tf.placeholder(tf.float32, name="batch_grad2")
batch_grad = [W1grad, W2grad]

# Define loss function.
# For binary input_y this equals the former
#     tf.log(input_y * (input_y - proba) + (1 - input_y) * (input_y + proba))
# i.e. log(1 - proba) where input_y == 1 and log(proba) where input_y == 0.
# It is computed from the logit instead of from `proba` so that a saturated
# sigmoid can no longer yield log(0) = -inf and NaN gradients.
loglik = -tf.nn.sigmoid_cross_entropy_with_logits(labels=1 - input_y, logits=score)
loss = -tf.reduce_mean(loglik * advantages)

# Compute the gradient of the loss w.r.t the parameters
new_grads = tf.gradients(loss, tvars)

# Don't apply the gradients right away, in case we want to manipulate them in our training routine
update_grads = adam.apply_gradients(zip(batch_grad, tvars))

Instructions for updating:
Use the retry module or similar alternatives.


# Model Network

In [5]:
mH = 256 # model layer size

# NOTE: variable names and creation order below are part of what tf.train.Saver
# stores in / restores from the checkpoint; do not rename or reorder them.

# Model "encoder": takes the previous observation + action and tries to find a rich embedding
# Input row layout: D observation values followed by the action taken.
prev_state = tf.placeholder(tf.float32, [None, D + 1], name="prev_state")
W1_model = tf.get_variable("W1_model", shape=[D + 1, mH],
                           initializer=tf.contrib.layers.xavier_initializer())
B1_model = tf.Variable(tf.zeros([mH]), name="B1_model")
A1_model = tf.nn.relu(tf.matmul(prev_state, W1_model) + B1_model)
W2_model = tf.get_variable("W2_model", shape=[mH, mH],
                           initializer=tf.contrib.layers.xavier_initializer())
B2_model = tf.Variable(tf.zeros([mH]), name="B2_model")
A2_model = tf.nn.relu(tf.matmul(A1_model, W2_model) + B2_model)

# Final layer to predict output
wO = tf.get_variable("wO", shape=[mH, D],
                     initializer=tf.contrib.layers.xavier_initializer())
bO = tf.Variable(tf.zeros([D]), name="bO")

# Final layer to predict reward
wR = tf.get_variable("wR", shape=[mH, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
bR = tf.Variable(tf.zeros([1]), name="bR")

# Final layer to predict if done
wD = tf.get_variable("wD", shape=[mH, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
bD = tf.Variable(tf.ones([1]), name="bD")

pred_obs = tf.add(tf.matmul(A2_model, wO), bO, name="predicted_obs")
pred_reward = tf.add(tf.matmul(A2_model, wR), bR, name="predicted_reward")
# Keep the pre-sigmoid logit around so the done-loss can be computed stably.
done_logits = tf.matmul(A2_model, wD) + bD
pred_done = tf.sigmoid(done_logits, name="predicted_done")

true_obs = tf.placeholder(tf.float32, [None, D], name="true_obs")
true_reward = tf.placeholder(tf.float32, [None, 1], name="true_reward")
true_done = tf.placeholder(tf.float32, [None, 1], name="true_done")

# Columns: [0:D] next observation, [D] reward, [D + 1] done probability.
predicted_state = tf.concat([pred_obs, pred_reward, pred_done], axis=1)

# Define losses for Model Network
obs_loss = tf.square(true_obs - pred_obs)
reward_loss = tf.square(true_reward - pred_reward)
# Binary cross-entropy on the done flag. For binary true_done this equals the
# former -tf.log(pred_done * true_done + (1 - pred_done) * (1 - true_done)),
# but it is computed from the logit so a saturated sigmoid cannot produce
# log(0) = -inf and NaN gradients.
done_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=true_done, logits=done_logits)
# obs_loss is [batch, D]; the two [batch, 1] losses broadcast across its columns.
model_loss = tf.reduce_mean(obs_loss + done_loss + reward_loss)

adam_model = tf.train.AdamOptimizer(learning_rate=learning_rate)
update_model = adam_model.minimize(model_loss)

# Helper Functions

In [6]:
def reset_grad_buffer(buffer):
    """Zero every gradient array held in ``buffer``, in place.

    Args:
        buffer: mutable sequence (e.g. a list) of array-like gradients.

    Returns:
        The same ``buffer`` object, each entry replaced by zeros of the same
        shape and dtype.
    """
    for ix, grad in enumerate(buffer):
        # zeros_like rather than `grad * 0`: multiplying leaves NaN/inf entries
        # as NaN, so a buffer that ever saw a bad gradient would never reset.
        buffer[ix] = np.zeros_like(grad)
    return buffer

def discount_rewards(rewards, discount=None):
    """Compute discounted returns for one episode.

    Element ``t`` of the result is
    ``rewards[t] + discount * rewards[t+1] + discount**2 * rewards[t+2] + ...``.

    Args:
        rewards: per-step rewards, indexed by time step along the first axis
            (a 1-D array, an ``(n, 1)`` column, or a plain list).
        discount: discount factor; when omitted, the module-level ``gamma``
            is used.

    Returns:
        Floating-point array with the same shape as ``rewards``.
    """
    if discount is None:
        discount = gamma
    rewards = np.asarray(rewards)
    # Integer (or bool) rewards would make zeros_like allocate an integer
    # buffer and silently truncate every discounted value.
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)
    discounted_r = np.zeros_like(rewards)
    running_add = 0
    # Walk backwards so each step reuses the return accumulated after it.
    for t in reversed(range(rewards.size)):
        running_add = running_add * discount + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

def step_model(sess, history, action):
    """Advance the learned model by one step instead of the real environment.

    Args:
        sess: session used to evaluate ``predicted_state``.
        history: sequence of past observations; only ``history[-1][0]`` (the
            most recent observation's first row) and ``len(history)`` are read.
        action: action to apply to that observation.

    Returns:
        ``(obs, reward, done)``: the predicted next observation as a
        ``(1, 4)`` array, the predicted reward, and a bool that is True when
        the predicted done value exceeds 0.1 or the history has reached
        300 entries.
    """
    latest_obs = history[-1][0]
    model_input = np.hstack([latest_obs, np.array(action)]).reshape(1, 5)
    (state,) = sess.run([predicted_state], feed_dict={prev_state: model_input})

    # Columns 0-3 are the observation; columns 0 and 2 are clamped
    # (presumably cart position and pole angle -- TODO confirm).
    obs = state[:, 0:4]
    obs[:, 0] = np.clip(obs[:, 0], -2.4, 2.4)
    obs[:, 2] = np.clip(obs[:, 2], -0.4, 0.4)

    # Column 4 is the reward, column 5 the done prediction.
    reward = state[:, 4][0]
    done_pred = np.clip(state[:, 5], 0, 1)[0]

    done = bool(done_pred > 0.1 or len(history) >= 300)
    return obs, reward, done

# Training the Policy and Model

In [7]:
# Restore the saved weights and let the policy play rendered episodes until the
# user answers the '>>> ' prompt with something starting with 'n'.
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, './cartpole-model/cartpole-model')
    obs = env.reset()

    try:
        while True:
            obs = np.reshape(obs, [1, 4])
            env.render()
            action_proba = sess.run(proba, feed_dict={observations: obs})
            # action_proba has shape (1, 1); compare against its scalar entry
            # rather than relying on the truth value of a one-element array.
            action = 1 if np.random.uniform() < action_proba[0, 0] else 0

            obs, reward, done, info = env.step(action)

            if done:
                # Episode over: any answer not starting with 'n' plays another one.
                cont = input('>>> ')
                if len(cont) > 0 and cont[0].lower() == 'n':
                    break
                obs = env.reset()
    finally:
        # Release the render window even if the loop is interrupted.
        env.close()

INFO:tensorflow:Restoring parameters from ./cartpole-model/cartpole-model
>>> 
>>> 
>>> 
>>> 
>>> 
>>> 
>>> 
>>> n
