#!/usr/bin/env python
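"""Likelihood-ratio policy gradient (REINFORCE) agent for the bullet cartpole env.

Example invocations (a sketch; the script filename is illustrative, but all flags
shown are defined below or added by util.add_opts / bullet_cartpole.add_opts):

  # train for (at least) an hour, checkpointing as we go
  python lrpg_cartpole.py --max-run-time=3600 --ckpt-dir=/tmp/lrpg_ckpts

  # run 10 evaluation episodes against the latest checkpoint
  python lrpg_cartpole.py --num-eval=10 --ckpt-dir=/tmp/lrpg_ckpts

At runtime, SIGUSR1 toggles verbose debugging and SIGUSR2 dumps network weights.
"""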
import argparse
import bullet_cartpole
import collections
import datetime
import gym
import json
import numpy as np
import signal
import sys
import tensorflow as tf
from tensorflow.python.ops import init_ops
import time
import util
np.set_printoptions(precision=5, threshold=10000, suppress=True, linewidth=10000)
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-eval', type=int, default=0,
help="if >0 just run this many episodes with no training")
parser.add_argument('--max-num-actions', type=int, default=0,
help="train for (at least) this number of actions (always finish current episode)"
" ignore if <=0")
parser.add_argument('--max-run-time', type=int, default=0,
help="train for (at least) this number of seconds (always finish current episode)"
" ignore if <=0")
parser.add_argument('--ckpt-dir', type=str, default=None, help="if set save ckpts to this dir")
parser.add_argument('--ckpt-freq', type=int, default=3600, help="freq (sec) to save ckpts")
parser.add_argument('--hidden-layers', type=str, default="100,50", help="hidden layer sizes")
parser.add_argument('--learning-rate', type=float, default=0.0001, help="learning rate")
parser.add_argument('--num-train-batches', type=int, default=10,
help="number of training batches to run")
parser.add_argument('--rollouts-per-batch', type=int, default=10,
help="number of rollouts to run for each training batch")
parser.add_argument('--eval-action-noise', action='store_true', help="whether to use noise during eval")
util.add_opts(parser)
bullet_cartpole.add_opts(parser)
opts = parser.parse_args()
sys.stderr.write("%s\n" % opts)
assert not opts.use_raw_pixels, "TODO: add convnet from ddpg here"
# TODO: if we import slim _before_ building cartpole env we can't start bullet with GL gui o_O
env = bullet_cartpole.BulletCartpole(opts=opts, discrete_actions=True)
import base_network
import tensorflow.contrib.slim as slim
VERBOSE_DEBUG = False
def toggle_verbose_debug(signal, frame):
global VERBOSE_DEBUG
VERBOSE_DEBUG = not VERBOSE_DEBUG
signal.signal(signal.SIGUSR1, toggle_verbose_debug)
DUMP_WEIGHTS = False
def set_dump_weights(signal, frame):
global DUMP_WEIGHTS
DUMP_WEIGHTS = True
signal.signal(signal.SIGUSR2, set_dump_weights)
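# e.g. from a shell, assuming this process's pid is 12345 (illustrative):
#   kill -USR1 12345   # toggle verbose debug output
#   kill -USR2 12345   # dump weights to /tmp on the next training iteration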
class LikelihoodRatioPolicyGradientAgent(base_network.Network):
def __init__(self, env):
self.env = env
num_actions = self.env.action_space.n
    # we have three placeholders we'll use...
    # observations; used either during rollout to sample some actions, or
    # during training when combined with the actions taken and advantages.
shape_with_batch = [None] + list(self.env.observation_space.shape)
self.observations = tf.placeholder(shape=shape_with_batch,
dtype=tf.float32)
# the actions we took during rollout
self.actions = tf.placeholder(tf.int32, name='actions')
    # the advantages we got from taking 'actions' in 'observations'
self.advantages = tf.placeholder(tf.float32, name='advantages')
# our model is a very simple MLP
with tf.variable_scope("model"):
# stack of hidden layers on flattened input; (batch,2,2,7) -> (batch,28)
flat_input_state = slim.flatten(self.observations, scope='flat')
final_hidden = self.hidden_layers_starting_at(flat_input_state,
opts.hidden_layers)
logits = slim.fully_connected(inputs=final_hidden,
num_outputs=num_actions,
activation_fn=None)
      # an argmax op for deterministic action selection (note: eval below still
      # samples from the policy rather than using this)
      self.action_argmax = tf.argmax(logits, 1)
# for rollouts we need an op that samples actions from this
# model to give a stochastic action.
sample_action = tf.multinomial(logits, num_samples=1)
self.sampled_action_op = tf.reshape(sample_action, shape=[])
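      # shape walk-through (illustrative): during rollout we feed a single
      # observation, so logits has shape (1, num_actions), tf.multinomial returns
      # a (1, 1) tensor of sampled indices, and the reshape to [] leaves a single
      # scalar action id.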
# we are trying to maximise the product of two components...
# 1) the log_p of "good" actions.
# 2) the advantage term based on the rewards from actions.
      # first we need the log_p values for each observation for the actions we
      # specifically took by sampling... we run a log_softmax over the action
      # logits to get log probabilities.
log_softmax = tf.nn.log_softmax(logits)
self.debug_softmax = tf.exp(log_softmax)
      # we then use a mask to select only the elements of the softmax that correspond
      # to the actions we actually took. we could also do this with indexing and a
      # gather, but a mask feels more natural. the "cost" of dealing with the mostly
      # zero one-hot, as opposed to a gather on sparse indexes, isn't a big deal when
      # the number of observations is >> the number of actions.
action_mask = tf.one_hot(indices=self.actions, depth=num_actions)
action_log_prob = tf.reduce_sum(log_softmax * action_mask, reduction_indices=1)
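      # worked example (illustrative numbers, depth=3 for brevity):
      #   actions     = [0, 2]
      #   action_mask = [[1, 0, 0],
      #                  [0, 0, 1]]
      #   log_softmax = [[-0.36, -1.61, -2.30],   # i.e. probs ~ [0.7, 0.2, 0.1]
      #                  [-1.39, -1.39, -0.69]]   # i.e. probs ~ [0.25, 0.25, 0.5]
      #   => action_log_prob = [-0.36, -0.69]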
      # the (element wise) product of these action log_ps with the total reward of
      # the episode is the quantity we want to maximise. we standardise the advantage
      # values so they are roughly half positive / half negative, as a variance control.
action_mul_advantages = tf.mul(action_log_prob,
util.standardise(self.advantages))
self.loss = -tf.reduce_sum(action_mul_advantages) # recall: we are maximising.
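      # i.e. the REINFORCE / likelihood-ratio estimator: minimising
      #   loss = -sum_i log pi(a_i | s_i) * A_i
      # follows, in expectation, the gradient of the expected total reward.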
with tf.variable_scope("optimiser"):
# dynamically create optimiser based on opts
optimiser = util.construct_optimiser(opts)
# calc gradients
gradients = optimiser.compute_gradients(self.loss)
# potentially clip and wrap with debugging tf.Print
gradients = util.clip_and_debug_gradients(gradients, opts)
# apply
self.train_op = optimiser.apply_gradients(gradients)
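      # the concrete optimiser type and its arguments come from command line flags
      # (presumably those registered by util.add_opts; see util.construct_optimiser).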
def sample_action_given(self, observation, doing_eval=False):
""" sample one action given observation"""
if doing_eval:
sao, sm = tf.get_default_session().run([self.sampled_action_op, self.debug_softmax],
feed_dict={self.observations: [observation]})
print "EVAL sm ", sm, "action", sao
return sao
    # epsilon greedy "noise" will do for this simple case...
if np.random.random() < 0.1:
return self.env.action_space.sample()
# sample from logits
return tf.get_default_session().run(self.sampled_action_op,
feed_dict={self.observations: [observation]})
def rollout(self, doing_eval=False):
""" run one episode collecting observations, actions and advantages"""
observations, actions, rewards = [], [], []
observation = self.env.reset()
done = False
while not done:
observations.append(observation)
action = self.sample_action_given(observation, doing_eval)
      # guard against an out-of-range sample (the hardcoded 5 is presumably this
      # env's num_actions; early tf.multinomial could rarely sample out of range)
      assert action != 5, "FAIL! (multinomial logits sampling bug?)"
observation, reward, done, _ = self.env.step(action)
actions.append(action)
rewards.append(reward)
if VERBOSE_DEBUG:
print "rollout: actions=%s" % (actions)
return observations, actions, rewards
  def train(self, observations, actions, advantages):
    """ take one training step given observations, actions and subsequent advantages"""
    if VERBOSE_DEBUG:
      print "TRAIN"
      print "observations", np.stack(observations)
      print "actions", actions
      print "advantages", advantages
    _, loss = tf.get_default_session().run([self.train_op, self.loss],
                                           feed_dict={self.observations: observations,
                                                      self.actions: actions,
                                                      self.advantages: advantages})
    return float(loss)
def post_var_init_setup(self):
pass
def run_training(self, max_num_actions, max_run_time, rollouts_per_batch,
saver_util):
# log start time, in case we are limiting by time...
start_time = time.time()
# run for some max number of actions
num_actions_taken = 0
n = 0
while True:
total_rewards = []
losses = []
# perform a number of rollouts
batch_observations, batch_actions, batch_advantages = [], [], []
for _ in xrange(rollouts_per_batch):
observations, actions, rewards = self.rollout()
batch_observations += observations
batch_actions += actions
# train with advantages, not per observation/action rewards.
# _every_ observation/action in this rollout gets assigned
# the _total_ reward of the episode. (crazy that this works!)
batch_advantages += [sum(rewards)] * len(rewards)
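        # e.g. (illustrative) a 3-step episode with rewards [1, 1, 1] contributes
        # three (observation, action) pairs, each with advantage 3.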
# keep total rewards just for debugging / stats
total_rewards.append(sum(rewards))
      if min(total_rewards) == max(total_rewards):
        # every rollout got the same total reward, so the advantages have zero
        # variance and standardisation would divide by zero; skip this train step.
        sys.stderr.write("converged? standardisation of advantages will barf here....\n")
        loss = 0
else:
loss = self.train(batch_observations, batch_actions, batch_advantages)
losses.append(loss)
# dump some stats and progress info
stats = collections.OrderedDict()
stats["time"] = time.time()
stats["n"] = n
stats["mean_losses"] = float(np.mean(losses))
stats["total_reward"] = np.sum(total_rewards)
stats["episode_len"] = len(rewards)
print "STATS %s\t%s" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
json.dumps(stats))
sys.stdout.flush()
n += 1
# save if required
if saver_util is not None:
saver_util.save_if_required()
# emit occasional eval
if VERBOSE_DEBUG or n % 10 == 0:
self.run_eval(1)
# dump weights once if requested
global DUMP_WEIGHTS
if DUMP_WEIGHTS:
self.debug_dump_network_weights()
DUMP_WEIGHTS = False
# exit when finished
num_actions_taken += len(rewards)
if max_num_actions > 0 and num_actions_taken > max_num_actions:
break
if max_run_time > 0 and time.time() > start_time + max_run_time:
break
  def run_eval(self, num_episodes, add_noise=False):
    # note: add_noise is currently unused; eval rollouts always sample from the policy.
    for _ in xrange(num_episodes):
      _, _, rewards = self.rollout(doing_eval=True)
      print sum(rewards)
def debug_dump_network_weights(self):
fn = "/tmp/weights.%s" % time.time()
with open(fn, "w") as f:
f.write("DUMP time %s\n" % time.time())
for var in tf.all_variables():
f.write("VAR %s %s\n" % (var.name, var.get_shape()))
f.write("%s\n" % var.eval())
print "weights written to", fn
def main():
config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# config.log_device_placement = True
with tf.Session(config=config) as sess:
agent = LikelihoodRatioPolicyGradientAgent(env)
# setup saver util and either load latest ckpt or init variables
saver_util = None
if opts.ckpt_dir is not None:
saver_util = util.SaverUtil(sess, opts.ckpt_dir, opts.ckpt_freq)
else:
sess.run(tf.initialize_all_variables())
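      # note: tf.initialize_all_variables / tf.all_variables are the pre-1.0 names;
      # later TF releases renamed them tf.global_variables_initializer /
      # tf.global_variables.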
for v in tf.all_variables():
print >>sys.stderr, v.name, util.shape_and_product_of(v)
# now that we've either init'd from scratch, or loaded up a checkpoint,
# we can do any required post init work.
agent.post_var_init_setup()
# run either eval or training
if opts.num_eval > 0:
agent.run_eval(opts.num_eval, opts.eval_action_noise)
else:
agent.run_training(opts.max_num_actions, opts.max_run_time,
opts.rollouts_per_batch,
saver_util)
if saver_util is not None:
saver_util.force_save()
env.reset() # just to flush logging, clumsy :/
if __name__ == "__main__":
main()