In [3]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import gym

In [4]:
def get_output(params, obs):
  """Logistic policy: probability of choosing move 0 for each observation.

  Args:
    params: five values; the first four are the weights applied to the
      observation features and the last one is the bias.
    obs: one observation or a batch of them; it is reshaped to rows of
      four features, so both forms are handled the same way.

  Returns:
    A 1-D tensor with one sigmoid probability per observation row.
  """
  feature_weights = tf.slice(params, [0], [4])
  obs_rows = tf.reshape(obs, [-1, 4])
  offset = params[-1]
  # Row-wise dot product written as multiply + sum over the feature axis.
  weighted = tf.mul(obs_rows, feature_weights)
  logit = tf.reduce_sum(weighted, 1) + offset
  return tf.sigmoid(logit)

In [5]:
def test_get_output(sess):
  from math import exp
  def sigmoid(x): return 1./(1. + exp(-x))
  
  params = [1., 2., 3., 4., 5.]
  obs = [0.4, 6., 8., 5.]
  activation = 0.4 + 12 + 24 + 20 + 5
  output = sess.run(get_output(params, obs))
  print(output, [sigmoid(activation)])
  assert abs(output - [sigmoid(activation)]) < 1e-6
  
  obs = [[0.4, 6., -3., 5.], [0.8, -0.5, 0.6, -0.01]]
  activation = [0.4 + 12 - 9 + 20 + 5, 0.8 - 1. + 1.8 - 0.04 +5]
  after_activ = map(sigmoid, activation)
  output = sess.run(get_output(params, obs))
  assert abs(sum(output - after_activ)) < 1e-7

In [6]:
def get_action(output):
  """Sample a move (0 or 1) for each probability in `output`.

  Args:
    output: 1-D tensor holding the probability of choosing move 0.

  Returns:
    A 1-D tensor with one sampled move index per entry of `output`.
  """
  # One row per entry: [P(move 0), P(move 1)].
  move_probs = tf.transpose(tf.pack([output, 1. - output]))
  # tf.multinomial takes log-probabilities and draws one sample per row.
  draws = tf.multinomial(tf.log(move_probs), 1)
  return tf.reshape(draws, [-1])

In [7]:
def test_get_action(sess):
  """Sample moves for near-certain probabilities and check the likely outcome.

  The expected moves are only the overwhelmingly likely ones (probabilities
  0.99 / 0.001 of choosing move 0), so this check is stochastic.

  Args:
    sess: session used to evaluate the sampled actions.
  """
  prob_of_zero = tf.convert_to_tensor([0.99, 0.001, 0.001])
  sampled = sess.run(get_action(prob_of_zero))
  print(sampled)
  assert (sampled == [0, 1, 1]).all()

In [8]:
def get_loss(output, actions, reward, average_reward, num_moves):
  """Advantage-weighted sum of the log-probabilities of the moves taken.

  Args:
    output: probability of choosing move 0 at each step.
    actions: the move taken at each step (0 or 1).
    reward: total reward of the episode.
    average_reward: baseline subtracted from `reward`.
    num_moves: number of steps in the episode.

  Returns:
    A scalar: sum over steps of
    ((reward - average_reward) - step_index) * log P(move taken).
  """
  tiny = 1e-16  # keeps the logarithms away from log(0)
  took_zero = tf.equal(actions, 0)
  step_index = tf.cast(tf.range(num_moves), tf.float32)

  log_p_zero = tf.log(output + tiny)
  log_p_one = tf.log(1. + tiny - output)
  log_prob_taken = tf.select(took_zero, log_p_zero, log_p_one)

  # Later moves get less credit: the step index is subtracted from the
  # baselined episode reward.
  baselined = reward - average_reward
  advantage = baselined - step_index
  return tf.reduce_sum(tf.mul(advantage, log_prob_taken))

In [9]:
def test_get_loss(sess):
  from math import log
  outputs = [0.55, 0.7, 0.1]  # probabilities of choosing 0
  actions = [0, 1, 0]
  log_probs = [log(0.55), log(0.3), log(0.1)]
  reward = 4.
  num_moves = len(outputs)
  
  my_loss = sess.run(get_loss(outputs, actions, reward, num_moves))
  print(my_loss, reward * np.sum(log_probs))
  assert abs(my_loss - reward * np.sum(log_probs)) < 1e-6

In [10]:
def run_model(params, output, obs_place, action, render=False):
  """Play one episode with the current policy and record the trajectory.

  Uses the notebook-level globals `env` (the gym environment) and `sess`
  (the TensorFlow session).

  Args:
    params: not used by this function; kept so existing callers keep working.
    output: not used by this function; kept so existing callers keep working.
    obs_place: placeholder that is fed the current observation.
    action: tensor evaluated to sample the next move.
    render: when True, render the environment before every policy move.

  Returns:
    A tuple (reward, observations, actions):
      reward: total reward collected, including the initial random move.
      observations: 2-D array with one row per policy move; it has zero rows
        when the episode ends on the initial random move.
      actions: list with the move sampled at each recorded observation.
  """
  env.reset()
  assert env.action_space.n == 2
  # The episode is opened with a random move; that move and the observation
  # returned by reset() are not part of the recorded trajectory.
  obs, rew, done, _ = env.step(env.action_space.sample())
  reward = rew
  observations = []
  actions = []
  # Stop when the environment reports termination or the reward cap is hit.
  while not done and reward < 1000:
    if render:
      env.render()
    action_eval = sess.run(action, feed_dict={obs_place: obs})[0]
    observations.append(obs)
    actions.append(action_eval)

    obs, rew, done, _ = env.step(action_eval)
    reward += rew
  env.close()
  if observations:
    stacked = np.vstack(observations)
  else:
    # np.vstack rejects an empty list; return an empty batch with the right
    # number of feature columns instead of crashing.
    stacked = np.empty((0, np.size(obs)))
  return (reward, stacked, actions)

In [11]:
def get_session():
  """Create a tf.Session configured with operation_timeout_in_ms=5000."""
  return tf.Session(config=tf.ConfigProto(operation_timeout_in_ms=5000))

In [None]:
import shutil
env = gym.make("CartPole-v1")
rounds=5000

tf.reset_default_graph()

with get_session() as sess:
  # Policy parameters: four observation weights followed by one bias.
  theta = tf.Variable([0.] * 5, dtype=tf.float32, name="theta")
  observation_place = tf.placeholder(tf.float32, shape=None)
  action_place = tf.placeholder(tf.float32, shape=None)
  reward_place = tf.placeholder(tf.float32, shape=None)
  num_moves_place = tf.placeholder(tf.int32, shape=None)
  average_place = tf.placeholder(tf.float32, shape=None)

  output = get_output(theta, observation_place)
  action = get_action(output)

  loss = get_loss(output, action_place, reward_place, average_place, num_moves_place)
  loss_grad = tf.gradients(loss, [theta])[0]
  learning_rate = 0.0001
  gradient_place = tf.placeholder(tf.float32, shape=None)

  # The scaled gradient is *added* to theta, i.e. the value returned by
  # get_loss is treated as an objective to increase.
  params_update_op = tf.assign_add(theta, gradient_place)

  sess.run(tf.initialize_all_variables())

  sum_rewards = 0.
  # Uncomment to sanity-check the graph helpers before training:
  #test_get_action(sess)
  #test_get_output(sess)
  for i in range(rounds):
    rew, observations, actions = run_model(theta, output, observation_place, action)
    sum_rewards += rew
    # Baseline: running mean of the episode rewards so far, capped at 100.
    loss_ev, loss_grad_ev = sess.run([loss, loss_grad],
                                     feed_dict={observation_place: observations, action_place: actions,
                                                reward_place: rew, average_place: min(100., (sum_rewards / (i+1))),
                                                num_moves_place: observations.shape[0]})
    sess.run(params_update_op, feed_dict={gradient_place: loss_grad_ev * learning_rate})
    print(i, rew)
  # Start from a clean log directory; on a first run the directory does not
  # exist yet, which must not abort the cell after training has finished.
  shutil.rmtree("/tmp/pg1", ignore_errors=True)
  writer = tf.train.SummaryWriter("/tmp/pg1", sess.graph)

[2016-10-26 11:31:10,518] Making new env: CartPole-v1


0 9.0
1 34.0
2 15.0
3 38.0
4 11.0
5 14.0
6 19.0
7 18.0
8 10.0
9 24.0
10 18.0
11 20.0
12 21.0
13 40.0
14 40.0
15 15.0
16 41.0
17 23.0
18 9.0
19 23.0
20 39.0
21 18.0
22 22.0
23 21.0
24 15.0
25 13.0
26 19.0
27 10.0
28 32.0
29 17.0
30 18.0
31 17.0
32 14.0
33 19.0
34 12.0
35 12.0
36 11.0
37 28.0
38 20.0
39 26.0
40 24.0
41 24.0
42 21.0
43 42.0
44 20.0
45 36.0
46 39.0
47 15.0
48 19.0
49 29.0
50 32.0
51 23.0
52 19.0
53 16.0
54 33.0
55 27.0
56 12.0
57 15.0
58 26.0
59 23.0
60 18.0
61 9.0
62 21.0
63 28.0
64 10.0
65 20.0
66 24.0
67 15.0
68 23.0
69 31.0
70 20.0
71 19.0
72 12.0
73 15.0
74 22.0
75 17.0
76 19.0
77 18.0
78 46.0
79 15.0
80 14.0
81 15.0
82 12.0
83 16.0
84 12.0
85 38.0
86 18.0
87 20.0
88 28.0
89 15.0
90 18.0
91 15.0
92 17.0
93 22.0
94 20.0
95 12.0
96 16.0
97 9.0
98 17.0
99 67.0
100 23.0
101 12.0
102 31.0
103 13.0
104 12.0
105 12.0
106 29.0
107 60.0
108 15.0
109 18.0
110 92.0
111 14.0
112 29.0
113 15.0
114 24.0
115 23.0
116 28.0
117 48.0
118 25.0
119 12.0
120 10.0
121 19.0
122 52.0
123 16.