In [1]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import gym

In [13]:
def get_output(params, obs):
  """Logistic policy: probability of choosing move 0 for each observation.

  params: the first four entries are the per-feature weights, the last
    entry is the bias.
  obs: a single observation or a batch of them; it is reshaped into rows
    of four features.

  Returns one probability per observation row.
  """
  feature_weights = tf.slice(params, [0], [4])
  obs_rows = tf.reshape(obs, [-1, 4])
  bias = params[-1]
  # Row-wise dot product of each observation with the weights.
  weighted = tf.mul(obs_rows, feature_weights)
  logits = tf.reduce_sum(weighted, 1) + bias
  return tf.sigmoid(logits)  # probability of choosing move 0

In [3]:
def test_get_output(sess):
  """Check get_output against hand-computed sigmoids.

  Covers a single observation and a batch of two observations.

  sess: session used to evaluate the graph built by get_output.
  """
  from math import exp
  def sigmoid(x): return 1. / (1. + exp(-x))

  params = [1., 2., 3., 4., 5.]

  # Single observation: activation = dot(weights, obs) + bias.
  obs = [0.4, 6., 8., 5.]
  activation = 0.4 + 12 + 24 + 20 + 5
  expected = [sigmoid(activation)]
  output = sess.run(get_output(params, obs))
  print(output, expected)
  assert np.allclose(output, expected, rtol=0., atol=1e-6)

  # Batch of two observations.
  obs = [[0.4, 6., -3., 5.], [0.8, -0.5, 0.6, -0.01]]
  activation = [0.4 + 12 - 9 + 20 + 5, 0.8 - 1. + 1.8 - 0.04 + 5]
  # A list comprehension rather than map(): on Python 3 map() yields an
  # iterator, which cannot be subtracted from / compared with an array.
  expected = [sigmoid(a) for a in activation]
  output = sess.run(get_output(params, obs))
  # Compare element-wise. Summing signed differences would let errors of
  # opposite sign cancel, and the tolerance has to allow float32 rounding.
  assert np.allclose(output, expected, rtol=0., atol=1e-6)

In [4]:
def get_action(output):
  """Sample one move (0 or 1) for every entry of `output`.

  output: tensor holding the probability of choosing move 0.

  Returns the sampled move ids flattened to one dimension.
  """
  # For a 1-D `output` this gives one row per entry: [P(move 0), P(move 1)].
  move_probs = tf.transpose(tf.pack([output, 1. - output]))
  # tf.multinomial expects log-probabilities; draw a single sample per row.
  samples = tf.multinomial(tf.log(move_probs), 1)
  return tf.reshape(samples, [-1])

In [5]:
def test_get_action(sess):
  """Check that get_action picks the overwhelmingly likely move per entry.

  sess: session used to evaluate the sampling op.
  """
  # Probabilities of choosing move 0. Sampling is random, so the values are
  # kept extreme enough that a "wrong" draw is vanishingly rare (with
  # 0.99 / 0.001 roughly one run in 80 failed by pure chance), yet away
  # from exactly 0 and 1 so that get_action never evaluates log(0).
  output = [0.999999, 0.000001, 0.000001]
  actions = sess.run(get_action(tf.convert_to_tensor(output)))
  print(actions)
  assert (actions == [0, 1, 1]).all()

In [6]:
def get_loss(output, actions, reward, average_reward, num_moves):
  """Policy-gradient objective for one episode.

  output: probability of choosing move 0 at each step (tensor or array-like).
  actions: the move actually taken at each step (0 or 1).
  reward: total reward collected in the episode.
  average_reward: baseline subtracted from `reward`.
  num_moves: number of steps in the episode.

  Returns the scalar sum over steps of advantage * log-probability of the
  move taken, where step t has advantage (reward - average_reward) - t.
  """
  eps = 1e-16
  # Accept plain Python lists as well as tensors.
  output = tf.convert_to_tensor(output)
  took_move_0 = tf.equal(actions, 0)
  move_ids = tf.cast(tf.range(num_moves), tf.float32)

  log_prob_move_0 = tf.log(output + eps)
  # eps has to be added after the subtraction: `1. + eps` rounds back to
  # exactly 1., which would leave log(0) = -inf once `output` saturates at 1.
  log_prob_move_1 = tf.log((1. - output) + eps)
  select = tf.select(took_move_0, log_prob_move_0, log_prob_move_1)

  advantage = (reward - average_reward) - move_ids
  return tf.reduce_sum(tf.mul(advantage, select))

In [7]:
def test_get_loss(sess):
  """Check get_loss against a hand-computed value for a three-move episode.

  sess: session used to evaluate the loss tensor.
  """
  from math import log
  outputs = [0.55, 0.7, 0.1]  # probabilities of choosing 0
  actions = [0, 1, 0]
  # Log-probability of the move actually taken at each step.
  log_probs = [log(0.55), log(0.3), log(0.1)]
  reward = 4.
  average_reward = 1.
  num_moves = len(outputs)

  # get_loss weights step t by (reward - average_reward) - t.
  advantages = [(reward - average_reward) - t for t in range(num_moves)]
  expected = sum(a * lp for a, lp in zip(advantages, log_probs))

  # get_loss takes five arguments; the probabilities are passed as a tensor
  # so that arithmetic on them happens in the graph.
  loss = get_loss(tf.convert_to_tensor(outputs), actions, reward,
                  average_reward, num_moves)
  my_loss = sess.run(loss)
  print(my_loss, expected)
  # The graph computes in float32, so allow for single-precision rounding.
  assert abs(my_loss - expected) < 1e-4

In [8]:
def run_model(params, output, obs_place, action, render=False):
  """Play one episode in the global `env`, choosing moves via the global `sess`.

  params, output: accepted but not used by this function.
  obs_place: placeholder fed with the current observation.
  action: tensor evaluated to obtain the next move.
  render: when true, draw the environment before each sampled move.

  The episode opens with one random move that is not recorded. It ends when
  the environment reports done or the accumulated reward reaches 1000.

  Returns (total reward, observations stacked row-wise, list of moves taken).
  """
  env.reset()
  assert env.action_space.n == 2
  # Random opening move: only its reward and resulting observation are kept.
  obs, total_reward, done, _ = env.step(env.action_space.sample())
  seen_obs, moves = [], []
  while not (done or total_reward >= 1000):
    if render:
      env.render()
    move = sess.run(action, feed_dict={obs_place: obs})[0]
    seen_obs.append(obs)
    moves.append(move)

    obs, step_reward, done, _ = env.step(move)
    total_reward += step_reward
  env.close()
  return (total_reward, np.vstack(seen_obs), moves)

In [9]:
def get_session():
  """Create a tf.Session configured with operation_timeout_in_ms=5000."""
  return tf.Session(config=tf.ConfigProto(operation_timeout_in_ms=5000))

In [14]:
import shutil
env = gym.make("CartPole-v1")
rounds=5000

tf.reset_default_graph()

with get_session() as sess:
  # Policy parameters: four observation weights followed by a bias.
  theta = tf.Variable([0.] * 5, dtype=tf.float32, name="theta")
  observation_place = tf.placeholder(tf.float32, shape=None)
  action_place = tf.placeholder(tf.float32, shape=None)
  reward_place = tf.placeholder(tf.float32, shape=None)
  num_moves_place = tf.placeholder(tf.int32, shape=None)
  average_place = tf.placeholder(tf.float32, shape=None)

  output = get_output(theta, observation_place)
  action = get_action(output)

  loss = get_loss(output, action_place, reward_place, average_place, num_moves_place)
  loss_grad = tf.gradients(loss, [theta])[0]
  learning_rate = 0.0001
  gradient_place = tf.placeholder(tf.float32, shape=None)

  # The scaled gradient is added to theta, so each update moves theta in the
  # direction that increases `loss`.
  params_update_op = tf.assign_add(theta, gradient_place)

  sess.run(tf.initialize_all_variables())

  sum_rewards = 0.
  #test_get_action(sess)
  #test_get_output(sess)
  for i in range(rounds):
    rew, observations, actions = run_model(theta, output, observation_place, action)
    sum_rewards += rew
    # Baseline: running mean of the episode rewards so far, capped at 100.
    baseline = min(100., (sum_rewards / (i+1)))
    loss_ev, loss_grad_ev = sess.run([loss, loss_grad],
                                     feed_dict={observation_place: observations, action_place: actions,
                                                reward_place: rew, average_place: baseline,
                                                num_moves_place: observations.shape[0]})
    sess.run(params_update_op, feed_dict={gradient_place: loss_grad_ev * learning_rate})
    print(i, rew)
  # Start from an empty log directory. ignore_errors keeps the first run, when
  # the directory does not exist yet, from raising.
  shutil.rmtree("/tmp/pg1", ignore_errors=True)
  writer = tf.train.SummaryWriter("/tmp/pg1", sess.graph)
  # Close the writer so the graph event file is flushed to disk.
  writer.close()

[2016-10-26 11:09:16,299] Making new env: CartPole-v1


AttributeError: 'Tensor' object has no attribute 'shape'