In [21]:
import numpy as np
import pickle
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import math

In [22]:
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import rnn
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import variable_scope

In [23]:
import gym

In [24]:
env = gym.make('CartPole-v0')

[2017-02-01 00:33:21,586] Making new env: CartPole-v0


### Setting Hyper-parameters

In [25]:
H = 8 # hidden layer 
learning_rate = 1e-2
gamma = 0.99
decay_rate = 0.99 
resume = False

model_bs = 3 # batch size
real_bs = 3

D = 4 # input dimensionality

### Policy Network

#### 1 hidden layer for policy network
- input : observation(shape=[4])
- output : probability(shape=[1])

In [26]:
tf.reset_default_graph()
observations = tf.placeholder(tf.float32, [None, 4], name="input_x")
W1 = tf.get_variable("W1", shape=[4, H],
                    initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H, 1], 
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, W2)
probability = tf.nn.sigmoid(score)

In [27]:
tvars = tf.trainable_variables() # W1, W2
input_y = tf.placeholder(tf.float32, [None,1], name="input_y")
advantages = tf.placeholder(tf.float32, name="reward_signal")
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
W1Grad = tf.placeholder(tf.float32, name="batch_grad1")
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
batchGrad = [W1Grad, W2Grad]

#### loss & gradients

In [28]:
loglik = tf.log(input_y * (input_y - probability) + (1 - input_y) * (input_y + probability))
loss = -tf.reduce_mean(loglik * advantages)
newGrads = tf.gradients(loss,tvars)
updateGrads = adam.apply_gradients(zip(batchGrad,tvars)) # list of [gradient, vars]

### Model Network

#### 2 hidden layer for model

In [29]:
mH = 256 # model layer size

input_data = tf.placeholder(tf.float32, [None, 5]) # 4+ 1?
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [mH, 50])
    softmax_b = tf.get_variable("softmax_b", [50])

previous_state = tf.placeholder(tf.float32, [None, 5], name= "previous_state")
W1M = tf.get_variable("W1M", shape=[5, mH], initializer=tf.contrib.layers.xavier_initializer())
B1M = tf.Variable(tf.zeros([mH]), name="B1M")
layer1M = tf.nn.relu(tf.matmul(previous_state, W1M) + B1M)
W2M = tf.get_variable("W2M", shape=[mH, mH], initializer=tf.contrib.layers.xavier_initializer())
B2M = tf.Variable(tf.zeros([mH]), name="B2M")
layer2M = tf.nn.relu(tf.matmul(layer1M, W2M) + B2M)

#### output layer for model(observation, reward, done)

In [30]:
wO = tf.get_variable("wO", shape=[mH, 4], initializer=tf.contrib.layers.xavier_initializer())
wR = tf.get_variable("wR", shape=[mH, 1], initializer=tf.contrib.layers.xavier_initializer())
wD = tf.get_variable("wD", shape=[mH, 1], initializer=tf.contrib.layers.xavier_initializer())
bO = tf.Variable(tf.zeros([4]), name="bO")
bR = tf.Variable(tf.zeros([1]), name="bR")
bD = tf.Variable(tf.zeros([1]), name="bD")

In [31]:
predicted_observation = tf.matmul(layer2M, wO, name="predicted_observation") + bO
predicted_reward = tf.matmul(layer2M, wR, name="predicted_reward") + bR
predicted_done = tf.sigmoid(tf.matmul(layer2M, wD, name="predicted_done") + bD)

true_observation = tf.placeholder(tf.float32, [None, 4], name="true_observation")
true_reward = tf.placeholder(tf.float32, [None, 1], name="true_reward")
true_done = tf.placeholder(tf.float32, [None, 1], name="true_done")

In [32]:
predicted_state = tf.concat(1, [predicted_observation, predicted_reward, predicted_done])

#### loss calculating from true & predicted values

In [33]:
observation_loss = tf.square(true_observation - predicted_observation)
reward_loss = tf.square(true_reward - predicted_reward)
done_loss = tf.mul(predicted_done, true_done) + tf.mul(1-predicted_done, 1-true_done) #?
done_loss = -tf.log(done_loss)

model_loss = tf.reduce_mean(observation_loss + done_loss + reward_loss)

#### optimizer

In [34]:
modelAdam = tf.train.AdamOptimizer(learning_rate=learning_rate)

In [35]:
updateModel = modelAdam.minimize(model_loss)

### Helper-functions

In [41]:
def resetGradBuffer(gradBuffer):
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    return gradBuffer

# import sys

# if sys.version_info >= (3, 0):
#     def xrange(*args, **kwargs):
#         return iter(range(*args, **kwargs))
    
def discount_rewards(r):
    discounted_r = np.zeros_like(r) # same shape
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

def stepModel(sess, xs, action):
    toFeed = np.reshape(np.hstack([xs[-1][0],np.array(action)]), [1,5]) # -1 for last one , 0 for erase first of (1 X 4)
    myPredict = sess.run([predicted_state], feed_dict={previous_state: toFeed})
    reward = myPredict[0][:,4]
    observation = myPredict[0][:, 0:4]
    observation[:, 0] = np.clip(observation[:,0],-2.4, 2.4)
    observation[:, 2] = np.clip(observation[:,2], -0.4,0.4)
    doneP = np.clip(myPredict[0][:, 5], 0, 1)
    if doneP > 0.1 or len(xs) >= 300:
        done = True
    else:
        done = False
    return observation, reward, done
    

### Training the policy and model

In [42]:
xs, drs, ys, ds = [], [], [], []

In [43]:
running_reward = None
reward_sum = 0
episode_number = 1
real_episodes = 1
init = tf.global_variables_initializer()
batch_size = real_bs

In [44]:
drawFromModel = False # if true, will use model for observations
trainTheModel = True # to train the model
trainThePolicy = False # to train the policy
switch_point = 1

#### launch the graph

In [45]:
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()
    x = observation
    gradBuffer = sess.run(tvars)
    gradBuffer = resetGradBuffer(gradBuffer)
    
    while episode_number <= 5000:
        if (reward_sum/batch_size > 150 and drawFromModel == False) or rendering== True:
            env.render()
            rendering = True
        
        x = np.reshape(observation, [1,4]) # 1 X 4 array
        tfprob = sess.run(probability, feed_dict = {observations: x})
        action = 1 if np.random.uniform() < tfprob else 0
        
        # record various intermediates (needed later for backprop)
        xs.append(x)
        y = 1 if action == 0 else 0
        ys.append(y)
        
        # step the model or real environment and get new measurements
        if drawFromModel == False:
            observation, reward, done, info = env.step(action)
        else: # observe from the learned model
            observation, reward, done = stepModel(sess, xs, action)
        
        reward_sum += reward
        
        ds.append(done*1)
        drs.append(reward) # record reward to call later 
        
        if done:
            if drawFromModel == False:
                real_episodes += 1
            episode_number += 1
            
            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            epd = np.vstack(ds)
            xs, drs, ys, ds = [], [], [], []
            
            if trainTheModel == True:
                actions = np.array([np.abs(y-1) for y in epy][:-1])
                state_prevs = epx[:-1, :]
                state_prevs = np.hstack([state_prevs,actions])
                state_nexts = epx[1:, :]
                rewards = np.array(epr[1:, :])
                dones = np.array(epd[1:,:])
                state_nextsAll = np.hstack([state_nexts, rewards, dones])
                
                feed_dict = {previous_state: state_prevs, true_observation: state_nexts, true_done:dones, true_reward:rewards}
                loss,pState,_ = sess.run([model_loss, predicted_state, updateModel], feed_dict)
            if trainThePolicy == True:
                discounted_epr = discount_rewards(epr).astype('float32')
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)
                tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages:discounted_epr})
                
                if np.sum(tGrad[0] == tGrad[0]) == 0:
                    break
                for ix, grad in enumerate(tGrad):
                    gradBuffer[ix] += grad
                
            if switch_point + batch_size == episode_number:
                switch_point = episode_number
                if trainThePolicy == True:
                    sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                    gradBuffer = resetGradBuffer(gradBuffer)
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.1
                if drawFromModel == False:
                    # print(real_episodes,', ', reward_sum/real_bs,', ', action,', ', running_reward/real_bs, '\n')
                    print ("World Pref: Episode %f. Reward %f. action : %f. mean reward %f." % (real_episodes, reward_sum/real_bs, action, running_reward/real_bs))
                    if reward_sum/batch_size > 200:
                        break
                reward_sum = 0
                
                if episode_number > 100:
                    drawFromModel = not drawFromModel
                    trainTheModel = not trainTheModel
                    trainThePolicy = not trainThePolicy
                
            
            if drawFromModel == True:
                observation = np.random.uniform(-0.1, 0.1, [4])
                batch_size = model_bs
            else:
                observation = env.reset()
                batch_size = real_bs
print(real_episodes)

World Pref: Episode 4.000000. Reward 25.000000. action : 0.000000. mean reward 25.000000.
World Pref: Episode 7.000000. Reward 24.666667. action : 1.000000. mean reward 27.216667.
World Pref: Episode 10.000000. Reward 14.666667. action : 1.000000. mean reward 28.411167.
World Pref: Episode 13.000000. Reward 23.666667. action : 1.000000. mean reward 30.493722.
World Pref: Episode 16.000000. Reward 15.000000. action : 1.000000. mean reward 31.688784.
World Pref: Episode 19.000000. Reward 19.000000. action : 1.000000. mean reward 33.271897.
World Pref: Episode 22.000000. Reward 26.333333. action : 0.000000. mean reward 35.572511.
World Pref: Episode 25.000000. Reward 20.000000. action : 1.000000. mean reward 37.216786.
World Pref: Episode 28.000000. Reward 23.333333. action : 0.000000. mean reward 39.177951.
World Pref: Episode 31.000000. Reward 20.666667. action : 0.000000. mean reward 40.852838.
World Pref: Episode 34.000000. Reward 22.333333. action : 0.000000. mean reward 42.677643.
W

TypeError: argument to reversed() must be a sequence