second pass at ddpg; is learning now
matpalm committed Aug 27, 2016
1 parent fb75262 commit 8372eac
Showing 7 changed files with 389 additions and 183 deletions.
315 changes: 187 additions & 128 deletions ddpg_cartpole.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions make_plots.sh
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
mkdir -p /tmp/plots
R --vanilla < plots.R

11 changes: 0 additions & 11 deletions parse_foo.sh

This file was deleted.

104 changes: 104 additions & 0 deletions parse_out_ddpg.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python
import argparse, sys, re, json
import numpy as np
from collections import Counter

f_actions = open("/tmp/actions", "w")
f_actions.write("time type x y\n")
f_q_values = open("/tmp/q_values", "w")
f_q_values.write("time net_type q_value\n")
f_episode_len = open("/tmp/episode_stats", "w")
f_episode_len.write("episode len total_reward\n")
f_eval = open("/tmp/eval", "w")
f_eval.write("time steps total_reward\n")
f_batch_num_terminal = open("/tmp/batch_num_terminal", "w")
f_batch_num_terminal.write("time batch_num_terminals\n")
f_gradient_l2_norms = open("/tmp/gradient_l2_norms", "w")
f_gradient_l2_norms.write("time source l2_norm\n")
f_q_loss = open("/tmp/q_loss", "w")
f_q_loss.write("time q_loss\n")

freq = Counter()
emit_freq = {"EVAL": 1, "ACTOR_L2_NORM": 20, "CRITIC_L2_NORM": 20, "Q LOSS": 10}
def should_emit(tag):
  freq[tag] += 1
  return freq[tag] % (emit_freq[tag] if tag in emit_freq else 100) == 0

n_parse_errors = 0

time = None
for line in sys.stdin:
  if line.startswith("TIME"):
    time = line.strip().replace("TIME ", "")
    continue
  if time is None:
    continue

  line = line.strip()
  if line.startswith("STATS"):
    cols = line.split("\t")
    assert len(cols) == 2
    try:
      d = json.loads(cols[1])
      if should_emit("EPISODE_LEN"):
        episode = d["episode"]
        total_reward = d["total_reward"]
        episode_len = d["episode_len"] if "episode_len" in d else total_reward
        f_episode_len.write("%s %s %s\n" % (episode, episode_len, total_reward))
    except ValueError:
      # interleaving output :/
      n_parse_errors += 1

  elif "actor gradient l2_norm" in line and should_emit("ACTOR_L2_NORM"):
    norm = re.sub(".*\[", "", line).replace("]", "")
    f_gradient_l2_norms.write("%s actor %s\n" % (time, norm))

  elif "critic gradient l2_norm" in line and should_emit("CRITIC_L2_NORM"):
    norm = re.sub(".*\[", "", line).replace("]", "")
    f_gradient_l2_norms.write("%s critic %s\n" % (time, norm))

  elif line.startswith("ACTIONS") and should_emit("ACTIONS"):
    m = re.match("ACTIONS\t\[(.*), (.*)\]\t\[(.*), (.*)\]", line)
    if m:
      pre_x, pre_y, post_x, post_y = m.groups()
      f_actions.write("%s pre %s %s\n" % (time, pre_x, pre_y))
      f_actions.write("%s post %s %s\n" % (time, post_x, post_y))

  elif line.startswith("EXPECTED_Q_VALUES") and should_emit("EXPECTED_Q_VALUES"):
    cols = line.split(" ")
    assert len(cols) == 3
    assert cols[0] == "EXPECTED_Q_VALUES"
    f_q_values.write("%s main %f\n" % (time, float(cols[1])))
    f_q_values.write("%s target %f\n" % (time, float(cols[2])))

  elif line.startswith("EVAL") and should_emit("EVAL"):
    cols = line.split(" ")
    if len(cols) == 2:  # OLD FORMAT; only steps logged, reward == steps
      tag, steps = cols
      assert tag == "EVAL"
      total_reward = steps
    elif len(cols) == 3:
      tag, steps, total_reward = cols
      assert tag == "EVAL"
    else:
      assert False
    assert float(steps) >= 0
    assert float(total_reward) >= 0
    f_eval.write("%s %s %s\n" % (time, steps, total_reward))

  elif line.startswith("NUM_TERMINALS_IN_BATCH") and should_emit("NUM_TERMINALS_IN_BATCH"):
    cols = line.split(" ")
    assert len(cols) == 2
    assert cols[0] == "NUM_TERMINALS_IN_BATCH"
    f_batch_num_terminal.write("%s %f\n" % (time, float(cols[1])))

  elif line.startswith("Q LOSS") and should_emit("Q LOSS"):
    cols = line.split(" ")
    assert len(cols) == 3
    assert cols[0] == "Q"
    assert cols[1] == "LOSS"  # o_O
    f_q_loss.write("%s %f\n" % (time, float(cols[2])))

print "n_parse_errors", n_parse_errors
print freq
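
For reference, the script above keys off a handful of line prefixes in the training log. The following is only a sketch of the kind of lines it expects, inferred from the regexes and split() calls above; it is not part of the commit, and every value is invented.

# sketch: example log lines that parse_out_ddpg.py would pick up (invented values)
example_log_lines = [
  "TIME 120",
  'STATS\t{"episode": 5, "episode_len": 42, "total_reward": 42}',
  "actor gradient l2_norm [0.37]",
  "critic gradient l2_norm [1.24]",
  "ACTIONS\t[0.11, -0.42]\t[0.25, -0.51]",
  "EXPECTED_Q_VALUES 3.1 2.9",
  "EVAL 100 100",
  "NUM_TERMINALS_IN_BATCH 7",
  "Q LOSS 0.021",
]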

84 changes: 48 additions & 36 deletions plots.R
@@ -1,51 +1,63 @@
library(ggplot2)
library(grid)
library(gridExtra)
df = read.delim("/tmp/actions", h=T, sep=" ")
png("/tmp/plots/00a_pre_noise_x_y_scatter.png", width=300, height=300)
ggplot(df[df$type=='pre',], aes(x, y)) + geom_bin2d() + labs(title="x pre noise")
dev.off()
png("/tmp/plots/00b_post_noise_x_y_scatter.png", width=300, height=300)
ggplot(df[df$type=='post',], aes(x, y)) + geom_bin2d() + labs(title="x post noise")
dev.off()
png("/tmp/plots/00c_x_over_time.png", width=640, height=400)
ggplot(df, aes(time, x)) + geom_point(alpha=0.1) + geom_smooth() + facet_grid(type~.) + labs(title="x over time")
dev.off()
png("/tmp/plots/00d_y_over_time.png", width=640, height=400)
ggplot(df, aes(time, y)) + geom_point(alpha=0.1) + geom_smooth() + facet_grid(type~.) + labs(title="y over time")
dev.off()

# run parse_out_ddpg.py first
df = read.delim("/tmp/q_values", h=T, sep=" ")
png("/tmp/plots/05a_action_q_values.png", width=640, height=320)
ggplot(df, aes(time, q_value)) + geom_point(alpha=0.2, aes(color=net_type)) + geom_smooth(aes(color=net_type)) + labs(title="q values over time")
dev.off()

df = read.delim("/tmp/actions_pre_noise", h=F, sep=" ", col.names=c("x", "y"))
png("/tmp/plots/01_x_y_scatter.png")
ggplot(df, aes(x, y)) + geom_point()
df = read.delim("/tmp/episode_stats", h=T, sep=" ")
png("/tmp/plots/06a_episode_len.png", width=640, height=320)
ggplot(df, aes(episode, len)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="episode len")
dev.off()
df$n = 1:nrow(df)
png("/tmp/plots/02_x_pre_noise.png")
ggplot(df, aes(n, x)) + geom_point() + labs(title="x pre noise")
png("/tmp/plots/06b_episode_rewards.png", width=640, height=320)
ggplot(df, aes(episode, total_reward)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="episode total reward")
dev.off()
png("/tmp/plots/03_y_pre_noise.png")
ggplot(df, aes(n, y)) + geom_point() + labs(title="y pre noise")
png("/tmp/plots/06c_episode_stats.png", width=320, height=320)
ggplot(df, aes(len, total_reward)) + geom_point(alpha=0.2) + labs(title="episode step vs reward")
dev.off()

df = read.delim("/tmp/actions_post_noise", h=F, sep=" ", col.names=c("x", "y"))
png("/tmp/plots/03_x_y_scatter.png")
ggplot(df, aes(x, y)) + geom_point()
df = read.delim("/tmp/eval", h=T, sep=" ")
png("/tmp/plots/07a_eval_episode_len.png", width=640, height=320)
ggplot(df, aes(time, steps)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="eval episode len")
dev.off()
df$n = 1:nrow(df)
png("/tmp/plots/03_x_post_noise.png")
ggplot(df, aes(n, x)) + geom_point() + labs(title="x post noise")
dev.off()
png("/tmp/plots/03_y_post_noise.png")
ggplot(df, aes(n, y)) + geom_point() + labs(title="y post noise")

png("/tmp/plots/07b_eval_total_reward.png", width=640, height=320)
ggplot(df, aes(time, total_reward)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="eval total reward")
dev.off()

df = read.delim("/tmp/q_loss", h=F)
df$n = 1:nrow(df)
png("/tmp/plots/04_q_loss.png")
ggplot(df, aes(n, V1)) + geom_point() +
geom_smooth() + labs(title="q loss")
df = read.delim("/tmp/batch_num_terminal", h=T, sep=" ")
png("/tmp/plots/08_batch_num_terminal.png", width=640, height=320)
ggplot(df, aes(time, batch_num_terminals)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="batch num terminal")
dev.off()

df = read.delim("/tmp/action_q_values", h=F)
summary(df)
df$n = 1:nrow(df)
png("/tmp/plots/05_action_q_values.png")
ggplot(df, aes(n, V1)) + geom_point() +
geom_smooth() + labs(title="q values over time")
df = read.delim("/tmp/gradient_l2_norms", h=T, sep=" ")
png("/tmp/plots/09a_actor_l2_norms.png", width=640, height=320)
ggplot(df[df$source=="actor",], aes(time, l2_norm)) + geom_point(alpha=0.1) + geom_smooth() + labs(title="actor gradient l2 norms")
dev.off()
png("/tmp/plots/09b_critic_l2_norms.png", width=640, height=320)
ggplot(df[df$source=="critic",], aes(time, l2_norm)) + geom_point(alpha=0.1) + geom_smooth() + labs(title="critic gradient l2 norms")
dev.off()

df = read.delim("/tmp/episode_len", h=F)
df$n = 1:nrow(df)
png("/tmp/plots/06_episode_len.png")
ggplot(df, aes(n, V1)) + geom_point() +
geom_smooth() + labs(title="episode len")
df = read.delim("/tmp/q_loss", h=T, sep=" ")
png("/tmp/plots/10_q_loss.png", width=640, height=320)
ggplot(df, aes(time, q_loss)) + geom_point(alpha=0.1) + geom_smooth() + labs(title="critic training q loss")
dev.off()

# df = read.delim("/tmp/replay_memory_size", h=F)
# df$n = 1:nrow(df)
# png("/tmp/plots/09_replay_memory_size.png", width=640, height=320)
# ggplot(df, aes(n, V1)) + geom_point() + labs(title="replay memory size")
# dev.off()
18 changes: 13 additions & 5 deletions replay_memory.py
@@ -1,7 +1,7 @@
import numpy as np
import sys

class RingBuffer(object):
  # 2d fill only ring buffer over 2d ndarray
  def __init__(self, buffer_size, depth=1):
    self.buffer_size = buffer_size
    self.depth = depth
@@ -24,11 +24,11 @@ def random_indexes(self, n=1):
    else:
      return np.random.randint(0, self.insert, n)

  def size(self): # number of entries
  def size(self):
    return self.buffer_size if self.full else self.insert

  def debug_dump(self):
    for r in xrange(len(self.memory)):
    for r in xrange(self.buffer_size):
      print " ", r, self.memory[r]
    print "insert=%s full=%s" % (self.insert, self.full)

@@ -37,12 +37,17 @@ def __init__(self, buffer_size, state_dim, action_dim):
    self.state_1 = RingBuffer(buffer_size, state_dim)
    self.action = RingBuffer(buffer_size, action_dim)
    self.reward = RingBuffer(buffer_size, 1)
    self.terminal_mask = RingBuffer(buffer_size, 1)
    self.state_2 = RingBuffer(buffer_size, state_dim)
    # TODO: write to disk as part of ckpting

  def add(self, s1, a, r, s2):
  def add(self, s1, a, r, t, s2):
    self.state_1.add(s1)
    self.action.add(a)
    self.reward.add(r)
    # note: if state is terminal (i.e. True) we record a 0 to
    # be used as mask during training
    self.terminal_mask.add(0 if t else 1)
    self.state_2.add(s2)

  def size(self):
@@ -55,13 +60,16 @@ def dump(self):
    self.action.debug_dump()
    print "---- reward"
    self.reward.debug_dump()
    print "---- terminal"
    self.terminal_mask.debug_dump()
    print "---- state2"
    self.state_2.debug_dump()

  def batch(self, batch_size):
  def random_batch(self, batch_size):
    idxs = self.state_1.random_indexes(batch_size)
    return (self.state_1.memory[idxs],
            self.action.memory[idxs],
            self.reward.memory[idxs],
            self.terminal_mask.memory[idxs],
            self.state_2.memory[idxs])
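
The terminal_mask column added above is what lets the Bellman bootstrap be switched off at the end of an episode. The actual update lives in ddpg_cartpole.py (whose diff isn't rendered above); the following is only a small numpy sketch of the idea, with invented values and placeholder names (discount, q_target_s2).

import numpy as np

# invented batch of 4 transitions; q_target_s2 stands in for the target
# critic's estimate of Q(s2, pi(s2))
r = np.array([[1.0], [1.0], [1.0], [1.0]])
terminal_mask = np.array([[1.0], [1.0], [0.0], [1.0]])  # 0 => transition ended the episode
q_target_s2 = np.array([[9.5], [8.7], [7.0], [9.9]])
discount = 0.99

# the mask zeroes the bootstrap term for terminal transitions
target_q = r + discount * terminal_mask * q_target_s2
print target_q  # third entry is just the reward, 1.0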

36 changes: 33 additions & 3 deletions util.py
@@ -1,9 +1,14 @@
#!/usr/bin/env python
import datetime, os, time, yaml, sys
import numpy as np
import tensorflow as tf

def l2_norm(tensor):
  """l2 norm of a tensor (over all elements)"""
  return tf.sqrt(tf.reduce_sum(tf.pow(tensor, 2)))

def standardise(tensor):
  """ standardise a tensor. """
  """standardise a tensor"""
  # is std_dev not an op in tensorflow?!? i must be taking crazy pills...
  mean = tf.reduce_mean(tensor)
  variance = tf.reduce_mean(tf.square(tensor - mean))
@@ -31,14 +36,14 @@ def load_latest_ckpt_or_init_if_none(self):
      assert 'model_checkpoint_path' in info
      most_recent_ckpt = "%s/%s" % (self.ckpt_dir, info['model_checkpoint_path'])
      sys.stderr.write("loading ckpt %s\n" % most_recent_ckpt)
      self.saver.restore(self.sess, most_recent_ckpt)
      self.next_scheduled_save_time = time.time() + self.save_freq
    else:
      # no latest ckpts, init and force a save now
      sys.stderr.write("no latest ckpt in %s, just initing vars...\n" % self.ckpt_dir)
      self.sess.run(tf.initialize_all_variables())
      self.force_save()

  def force_save(self):
    """force a save now."""
    dts = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
@@ -51,3 +56,28 @@ def save_if_required(self):
    """check if save is required based on time and if so, save."""
    if time.time() >= self.next_scheduled_save_time:
      self.force_save()

class OrnsteinUhlenbeckNoise(object):
  """generate time correlated noise for action exploration"""

  def __init__(self, dim, theta=0.01, sigma=0.2, max_magnitude=1.5):
    # dim: dimensionality of returned noise
    # theta: how quickly the value moves; near zero => slow, near one => fast.
    #  0.01 gives, very roughly, 2-3 peaks/troughs over ~1000 samples
    # sigma: maximum range of values; 0.2 gives approximately the range (-1.5, 1.5),
    #  which is useful for shifting the output of a tanh, which is (-1, 1)
    # max_magnitude: max +ve / -ve value to clip at; default 1.5 (again chosen for
    #  adding to the output of a tanh). we do this since sigma gives no guarantees
    #  regarding min/max values.
    self.dim = dim
    self.theta = theta
    self.sigma = sigma
    self.max_magnitude = max_magnitude
    self.state = np.zeros(self.dim)

  def sample(self):
    self.state += self.theta * -self.state
    self.state += self.sigma * np.random.randn(self.dim)
    # np.clip(a, a_min, a_max): clip state into [-max_magnitude, max_magnitude]
    self.state = np.clip(self.state, -self.max_magnitude, self.max_magnitude)
    return np.copy(self.state)
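
To close the loop on how this noise is used for exploration, here is a minimal sketch (not part of the commit), assuming the actor's tanh output is perturbed by a sample and, as one option, clipped back into the valid action range; actor_action is a stand-in for the policy output.

import numpy as np
from util import OrnsteinUhlenbeckNoise  # the class defined above

noise = OrnsteinUhlenbeckNoise(dim=2)
actor_action = np.array([0.3, -0.7])  # stand-in for the actor's "pre noise" output
for _ in xrange(5):
  noisy = actor_action + noise.sample()  # "post noise" exploration action
  noisy = np.clip(noisy, -1.0, 1.0)      # optional: clip back into the tanh range
  print noisy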
