second pass at ddpg; is learning now
matpalm committed Aug 27, 2016
1 parent fb75262 commit 8372eac
Showing 7 changed files with 389 additions and 183 deletions.
315 changes: 187 additions & 128 deletions ddpg_cartpole.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions make_plots.sh
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
mkdir -p /tmp/plots
R --vanilla < plots.R

11 changes: 0 additions & 11 deletions parse_foo.sh

This file was deleted.

104 changes: 104 additions & 0 deletions parse_out_ddpg.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python
import argparse, sys, re, json
import numpy as np
from collections import Counter

f_actions = open("/tmp/actions", "w")
f_actions.write("time type x y\n")
f_q_values = open("/tmp/q_values", "w")
f_q_values.write("time net_type q_value\n")
f_episode_len = open("/tmp/episode_stats", "w")
f_episode_len.write("episode len total_reward\n")
f_eval = open("/tmp/eval", "w")
f_eval.write("time steps total_reward\n")
f_batch_num_terminal = open("/tmp/batch_num_terminal", "w")
f_batch_num_terminal.write("time batch_num_terminals\n")
f_gradient_l2_norms = open("/tmp/gradient_l2_norms", "w")
f_gradient_l2_norms.write("time source l2_norm\n")
f_q_loss = open("/tmp/q_loss", "w")
f_q_loss.write("time q_loss\n")

freq = Counter()
emit_freq = {"EVAL": 1, "ACTOR_L2_NORM": 20, "CRITIC_L2_NORM": 20, "Q LOSS": 10}
def should_emit(tag):
  freq[tag] += 1
  return freq[tag] % (emit_freq[tag] if tag in emit_freq else 100) == 0

n_parse_errors = 0

time = None
for line in sys.stdin:
  if line.startswith("TIME"):
    time = line.strip().replace("TIME ", "")
    continue
  if time is None:
    continue

  line = line.strip()
  if line.startswith("STATS"):
    cols = line.split("\t")
    assert len(cols) == 2
    try:
      d = json.loads(cols[1])
      if should_emit("EPISODE_LEN"):
        episode = d["episode"]
        total_reward = d["total_reward"]
        episode_len = d["episode_len"] if "episode_len" in d else total_reward
        f_episode_len.write("%s %s %s\n" % (episode, episode_len, total_reward))
    except ValueError:
      # interleaving output :/
      n_parse_errors += 1

  elif "actor gradient l2_norm" in line and should_emit("ACTOR_L2_NORM"):
    norm = re.sub(".*\[", "", line).replace("]", "")
    f_gradient_l2_norms.write("%s actor %s\n" % (time, norm))

  elif "critic gradient l2_norm" in line and should_emit("CRITIC_L2_NORM"):
    norm = re.sub(".*\[", "", line).replace("]", "")
    f_gradient_l2_norms.write("%s critic %s\n" % (time, norm))

  elif line.startswith("ACTIONS") and should_emit("ACTIONS"):
    m = re.match("ACTIONS\t\[(.*), (.*)\]\t\[(.*), (.*)\]", line)
    if m:
      pre_x, pre_y, post_x, post_y = m.groups()
      f_actions.write("%s pre %s %s\n" % (time, pre_x, pre_y))
      f_actions.write("%s post %s %s\n" % (time, post_x, post_y))

  elif line.startswith("EXPECTED_Q_VALUES") and should_emit("EXPECTED_Q_VALUES"):
    cols = line.split(" ")
    assert len(cols) == 3
    assert cols[0] == "EXPECTED_Q_VALUES"
    f_q_values.write("%s main %f\n" % (time, float(cols[1])))
    f_q_values.write("%s target %f\n" % (time, float(cols[2])))

  elif line.startswith("EVAL") and should_emit("EVAL"):
    cols = line.split(" ")
    if len(cols) == 2:  # OLD FORMAT; only steps logged, reward == steps
      tag, steps = cols
      assert tag == "EVAL"
      total_reward = steps
    elif len(cols) == 3:
      tag, steps, total_reward = cols
      assert tag == "EVAL"
    else:
      assert False
    assert float(steps) >= 0
    assert float(total_reward) >= 0
    f_eval.write("%s %s %s\n" % (time, steps, total_reward))

  elif line.startswith("NUM_TERMINALS_IN_BATCH") and should_emit("NUM_TERMINALS_IN_BATCH"):
    cols = line.split(" ")
    assert len(cols) == 2
    assert cols[0] == "NUM_TERMINALS_IN_BATCH"
    f_batch_num_terminal.write("%s %f\n" % (time, float(cols[1])))

  elif line.startswith("Q LOSS") and should_emit("Q LOSS"):
    cols = line.split(" ")
    assert len(cols) == 3
    assert cols[0] == "Q"
    assert cols[1] == "LOSS"  # o_O
    f_q_loss.write("%s %f\n" % (time, float(cols[2])))

print "n_parse_errors", n_parse_errors
print freq
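
For reference, the script above keys off a handful of line prefixes in the training log. The following is only a sketch of the kind of lines it expects, inferred from the regexes and split() calls above; it is not part of the commit, and every value is invented.

# sketch: example log lines that parse_out_ddpg.py would pick up (invented values)
example_log_lines = [
  "TIME 120",
  'STATS\t{"episode": 5, "episode_len": 42, "total_reward": 42}',
  "actor gradient l2_norm [0.37]",
  "critic gradient l2_norm [1.24]",
  "ACTIONS\t[0.11, -0.42]\t[0.25, -0.51]",
  "EXPECTED_Q_VALUES 3.1 2.9",
  "EVAL 100 100",
  "NUM_TERMINALS_IN_BATCH 7",
  "Q LOSS 0.021",
]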

84 changes: 48 additions & 36 deletions plots.R
@@ -1,51 +1,63 @@
library(ggplot2)
library(grid)
library(gridExtra)
df = read.delim("/tmp/actions", h=T, sep=" ")
png("/tmp/plots/00a_pre_noise_x_y_scatter.png", width=300, height=300)
ggplot(df[df$type=='pre',], aes(x, y)) + geom_bin2d() + labs(title="x pre noise")
dev.off()
png("/tmp/plots/00b_post_noise_x_y_scatter.png", width=300, height=300)
ggplot(df[df$type=='post',], aes(x, y)) + geom_bin2d() + labs(title="x post noise")
dev.off()
png("/tmp/plots/00c_x_over_time.png", width=640, height=400)
ggplot(df, aes(time, x)) + geom_point(alpha=0.1) + geom_smooth() + facet_grid(type~.) + labs(title="x over time")
dev.off()
png("/tmp/plots/00d_y_over_time.png", width=640, height=400)
ggplot(df, aes(time, y)) + geom_point(alpha=0.1) + geom_smooth() + facet_grid(type~.) + labs(title="y over time")
dev.off()

# run parse_out_ddpg.py first
df = read.delim("/tmp/q_values", h=T, sep=" ")
png("/tmp/plots/05a_action_q_values.png", width=640, height=320)
ggplot(df, aes(time, q_value)) + geom_point(alpha=0.2, aes(color=net_type)) + geom_smooth(aes(color=net_type)) + labs(title="q values over time")
dev.off()

df = read.delim("/tmp/actions_pre_noise", h=F, sep=" ", col.names=c("x", "y"))
png("/tmp/plots/01_x_y_scatter.png")
ggplot(df, aes(x, y)) + geom_point()
df = read.delim("/tmp/episode_stats", h=T, sep=" ")
png("/tmp/plots/06a_episode_len.png", width=640, height=320)
ggplot(df, aes(episode, len)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="episode len")
dev.off()
df$n = 1:nrow(df)
png("/tmp/plots/02_x_pre_noise.png")
ggplot(df, aes(n, x)) + geom_point() + labs(title="x pre noise")
png("/tmp/plots/06b_episode_rewards.png", width=640, height=320)
ggplot(df, aes(episode, total_reward)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="episode total reward")
dev.off()
png("/tmp/plots/03_y_pre_noise.png")
ggplot(df, aes(n, y)) + geom_point() + labs(title="y pre noise")
png("/tmp/plots/06c_episode_stats.png", width=320, height=320)
ggplot(df, aes(len, total_reward)) + geom_point(alpha=0.2) + labs(title="episode step vs reward")
dev.off()

df = read.delim("/tmp/actions_post_noise", h=F, sep=" ", col.names=c("x", "y"))
png("/tmp/plots/03_x_y_scatter.png")
ggplot(df, aes(x, y)) + geom_point()
df = read.delim("/tmp/eval", h=T, sep=" ")
png("/tmp/plots/07a_eval_episode_len.png", width=640, height=320)
ggplot(df, aes(time, steps)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="eval episode len")
dev.off()
df$n = 1:nrow(df)
png("/tmp/plots/03_x_post_noise.png")
ggplot(df, aes(n, x)) + geom_point() + labs(title="x post noise")
dev.off()
png("/tmp/plots/03_y_post_noise.png")
ggplot(df, aes(n, y)) + geom_point() + labs(title="y post noise")

png("/tmp/plots/07b_eval_total_reward.png", width=640, height=320)
ggplot(df, aes(time, total_reward)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="eval total reward")
dev.off()

df = read.delim("/tmp/q_loss", h=F)
df$n = 1:nrow(df)
png("/tmp/plots/04_q_loss.png")
ggplot(df, aes(n, V1)) + geom_point() +
geom_smooth() + labs(title="q loss")
df = read.delim("/tmp/batch_num_terminal", h=T, sep=" ")
png("/tmp/plots/08_batch_num_terminal.png", width=640, height=320)
ggplot(df, aes(time, batch_num_terminals)) + geom_point(alpha=0.2) + geom_smooth() + labs(title="batch num terminal")
dev.off()

df = read.delim("/tmp/action_q_values", h=F)
summary(df)
df$n = 1:nrow(df)
png("/tmp/plots/05_action_q_values.png")
ggplot(df, aes(n, V1)) + geom_point() +
geom_smooth() + labs(title="q values over time")
df = read.delim("/tmp/gradient_l2_norms", h=T, sep=" ")
png("/tmp/plots/09a_actor_l2_norms.png", width=640, height=320)
ggplot(df[df$source=="actor",], aes(time, l2_norm)) + geom_point(alpha=0.1) + geom_smooth() + labs(title="actor gradient l2 norms")
dev.off()
png("/tmp/plots/09b_critic_l2_norms.png", width=640, height=320)
ggplot(df[df$source=="critic",], aes(time, l2_norm)) + geom_point(alpha=0.1) + geom_smooth() + labs(title="critic gradient l2 norms")
dev.off()

df = read.delim("/tmp/episode_len", h=F)
df$n = 1:nrow(df)
png("/tmp/plots/06_episode_len.png")
ggplot(df, aes(n, V1)) + geom_point() +
geom_smooth() + labs(title="episode len")
df = read.delim("/tmp/q_loss", h=T, sep=" ")
png("/tmp/plots/10_q_loss.png", width=640, height=320)
ggplot(df, aes(time, q_loss)) + geom_point(alpha=0.1) + geom_smooth() + labs(title="critic training q loss")
dev.off()

# df = read.delim("/tmp/replay_memory_size", h=F)
# df$n = 1:nrow(df)
# png("/tmp/plots/09_replay_memory_size.png", width=640, height=320)
# ggplot(df, aes(n, V1)) + geom_point() + labs(title="replay memory size")
# dev.off()
18 changes: 13 additions & 5 deletions replay_memory.py
@@ -1,7 +1,7 @@
import numpy as np
import sys

class RingBuffer(object):
  # 2d fill only ring buffer over 2d ndarray
  def __init__(self, buffer_size, depth=1):
    self.buffer_size = buffer_size
    self.depth = depth
@@ -24,11 +24,11 @@ def random_indexes(self, n=1):
    else:
      return np.random.randint(0, self.insert, n)

  def size(self): # number of entries
  def size(self):
    return self.buffer_size if self.full else self.insert

  def debug_dump(self):
    for r in xrange(len(self.memory)):
    for r in xrange(self.buffer_size):
      print " ", r, self.memory[r]
    print "insert=%s full=%s" % (self.insert, self.full)

@@ -37,12 +37,17 @@ def __init__(self, buffer_size, state_dim, action_dim):
    self.state_1 = RingBuffer(buffer_size, state_dim)
    self.action = RingBuffer(buffer_size, action_dim)
    self.reward = RingBuffer(buffer_size, 1)
    self.terminal_mask = RingBuffer(buffer_size, 1)
    self.state_2 = RingBuffer(buffer_size, state_dim)
    # TODO: write to disk as part of ckpting

  def add(self, s1, a, r, s2):
  def add(self, s1, a, r, t, s2):
    self.state_1.add(s1)
    self.action.add(a)
    self.reward.add(r)
    # note: if state is terminal (i.e. True) we record a 0 to
    # be used as mask during training
    self.terminal_mask.add(0 if t else 1)
    self.state_2.add(s2)

  def size(self):
@@ -55,13 +60,16 @@ def dump(self):
    self.action.debug_dump()
    print "---- reward"
    self.reward.debug_dump()
    print "---- terminal"
    self.terminal_mask.debug_dump()
    print "---- state2"
    self.state_2.debug_dump()

  def batch(self, batch_size):
  def random_batch(self, batch_size):
    idxs = self.state_1.random_indexes(batch_size)
    return (self.state_1.memory[idxs],
            self.action.memory[idxs],
            self.reward.memory[idxs],
            self.terminal_mask.memory[idxs],
            self.state_2.memory[idxs])
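
The terminal_mask column added above is what lets the Bellman bootstrap be switched off at the end of an episode. The actual update lives in ddpg_cartpole.py (whose diff isn't rendered above); the following is only a small numpy sketch of the idea, with invented values and placeholder names (discount, q_target_s2).

import numpy as np

# invented batch of 4 transitions; q_target_s2 stands in for the target
# critic's estimate of Q(s2, pi(s2))
r = np.array([[1.0], [1.0], [1.0], [1.0]])
terminal_mask = np.array([[1.0], [1.0], [0.0], [1.0]])  # 0 => transition ended the episode
q_target_s2 = np.array([[9.5], [8.7], [7.0], [9.9]])
discount = 0.99

# the mask zeroes the bootstrap term for terminal transitions
target_q = r + discount * terminal_mask * q_target_s2
print target_q  # third entry is just the reward, 1.0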

36 changes: 33 additions & 3 deletions util.py
@@ -1,9 +1,14 @@
#!/usr/bin/env python
import datetime, os, time, yaml, sys
import numpy as np
import tensorflow as tf

def l2_norm(tensor):
  """l2 norm of a tensor (over all elements)"""
  return tf.sqrt(tf.reduce_sum(tf.pow(tensor, 2)))

def standardise(tensor):
  """ standardise a tensor. """
  """standardise a tensor"""
  # is std_dev not an op in tensorflow?!? i must be taking crazy pills...
  mean = tf.reduce_mean(tensor)
  variance = tf.reduce_mean(tf.square(tensor - mean))
@@ -31,14 +36,14 @@ def load_latest_ckpt_or_init_if_none(self):
      assert 'model_checkpoint_path' in info
      most_recent_ckpt = "%s/%s" % (self.ckpt_dir, info['model_checkpoint_path'])
      sys.stderr.write("loading ckpt %s\n" % most_recent_ckpt)
      self.saver.restore(self.sess, most_recent_ckpt)
      self.next_scheduled_save_time = time.time() + self.save_freq
    else:
      # no latest ckpts, init and force a save now
      sys.stderr.write("no latest ckpt in %s, just initing vars...\n" % self.ckpt_dir)
      self.sess.run(tf.initialize_all_variables())
      self.force_save()

  def force_save(self):
    """force a save now."""
    dts = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
@@ -51,3 +56,28 @@ def save_if_required(self):
    """check if save is required based on time and if so, save."""
    if time.time() >= self.next_scheduled_save_time:
      self.force_save()

class OrnsteinUhlenbeckNoise(object):
  """generate time correlated noise for action exploration"""

  def __init__(self, dim, theta=0.01, sigma=0.2, max_magnitude=1.5):
    # dim: dimensionality of returned noise
    # theta: how quickly the value moves; near zero => slow, near one => fast.
    #  0.01 gives, very roughly, 2-3 peaks/troughs over ~1000 samples
    # sigma: maximum range of values; 0.2 gives approximately the range (-1.5, 1.5),
    #  which is useful for shifting the output of a tanh, which is (-1, 1)
    # max_magnitude: max +ve / -ve value to clip at; default 1.5 (again chosen for
    #  adding to the output of a tanh). we do this since sigma gives no guarantees
    #  regarding min/max values.
    self.dim = dim
    self.theta = theta
    self.sigma = sigma
    self.max_magnitude = max_magnitude
    self.state = np.zeros(self.dim)

  def sample(self):
    self.state += self.theta * -self.state
    self.state += self.sigma * np.random.randn(self.dim)
    # np.clip(a, a_min, a_max): clip state into [-max_magnitude, max_magnitude]
    self.state = np.clip(self.state, -self.max_magnitude, self.max_magnitude)
    return np.copy(self.state)
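
To close the loop on how this noise is used for exploration, here is a minimal sketch (not part of the commit), assuming the actor's tanh output is perturbed by a sample and, as one option, clipped back into the valid action range; actor_action is a stand-in for the policy output.

import numpy as np
from util import OrnsteinUhlenbeckNoise  # the class defined above

noise = OrnsteinUhlenbeckNoise(dim=2)
actor_action = np.array([0.3, -0.7])  # stand-in for the actor's "pre noise" output
for _ in xrange(5):
  noisy = actor_action + noise.sample()  # "post noise" exploration action
  noisy = np.clip(noisy, -1.0, 1.0)      # optional: clip back into the tanh range
  print noisy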
