In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
import tensorflow as tf
import numpy as np
import gym
import gym.spaces
from pylab import *
import scipy.misc
import time
from collections import namedtuple, deque
import time
import os.path
import os

from train_ops import create_train_ops
from utils import *
import utils

In [15]:
# Discount factor used when accumulating returns backwards through a rollout.
G = 0.99
# Number of discrete actions the policy head outputs.
N_ACTIONS = 3
# Action ids passed to env.step(): 1..N_ACTIONS (shifted back to 0-based indices for training).
ACTIONS = np.arange(N_ACTIONS) + 1
# Number of consecutive frames kept in each worker's frame stack.
N_FRAMES_STACKED = 4
# Inclusive upper bound on the random number of no-op steps taken after an env reset.
N_MAX_NOOPS = 30

## Network setup

In [4]:
# Handles to the placeholders, outputs, losses and summary ops of one network copy.
Network = namedtuple(
    'Network',
    ['s', 'a', 'r', 'a_softmax', 'graph_v',
     'policy_loss', 'value_loss',
     'summaries_train', 'summaries_test'])

In [5]:
def create_network(scope):
    """Build a shared-trunk policy/value network inside variable scope `scope`.

    The graph is three ReLU conv layers and a 512-unit ReLU dense layer,
    followed by two linear heads: N_ACTIONS policy logits (softmaxed) and a
    scalar state value.

    Args:
        scope: Name of the tf.variable_scope all variables are created under.

    Returns:
        A Network namedtuple with the state/action/return placeholders, the
        action probabilities, the value estimate, the policy and value losses
        and the merged train/test loss summaries.
    """
    with tf.variable_scope(scope):
        # One input channel per stacked frame (was a hard-coded 4).
        graph_s = tf.placeholder(tf.float32, [None, 80, 80, N_FRAMES_STACKED])
        graph_action = tf.placeholder(tf.int64, [None])
        graph_r = tf.placeholder(tf.float32, [None])

        # Layers are created in the same order as before so the automatically
        # generated variable names (conv2d, conv2d_1, ...) stay unchanged.
        x = graph_s
        for filters, kernel_size, strides in [(32, 8, 4), (64, 4, 2), (64, 3, 1)]:
            x = tf.layers.conv2d(
                    inputs=x,
                    filters=filters,
                    kernel_size=kernel_size,
                    strides=strides,
                    activation=tf.nn.relu)

        # Flatten everything except the batch dimension.
        flat_dim = int(np.prod(x.shape.as_list()[1:]))
        x = tf.reshape(x, [-1, flat_dim])

        x = tf.layers.dense(
                inputs=x,
                units=512,
                activation=tf.nn.relu)

        a_logits = tf.layers.dense(
                inputs=x,
                units=N_ACTIONS,
                activation=None)

        a_softmax = tf.nn.softmax(a_logits)

        graph_v = tf.layers.dense(
            inputs=x,
            units=1,
            activation=None)
        graph_v = graph_v[:, 0]

        # Probability assigned to the action that was actually taken.
        action_mask = tf.one_hot(graph_action, depth=N_ACTIONS, dtype=tf.float32)
        p = tf.reduce_sum(action_mask * a_softmax, axis=1)
        # Log probability: higher is better for actions we want to encourage
        # Negative log probability: lower is better for actions we want to encourage
        # 1e-7: prevent log(0)
        nlp = -1 * tf.log(p + 1e-7)
        policy_loss = tf.reduce_mean(nlp * graph_r)

        value_loss = tf.reduce_mean((graph_r - graph_v) ** 2)

        # The same two loss tensors are exposed under separate train/test tags.
        s1 = tf.summary.scalar('policy_loss_train', policy_loss)
        s2 = tf.summary.scalar('value_loss_train', value_loss)
        s3 = tf.summary.scalar('policy_loss_test', policy_loss)
        s4 = tf.summary.scalar('value_loss_test', value_loss)
        summaries_train = tf.summary.merge([s1, s2])
        summaries_test = tf.summary.merge([s3, s4])

        network = Network(
            s=graph_s,
            a=graph_action,
            r=graph_r,
            a_softmax=a_softmax,
            graph_v=graph_v,
            policy_loss=policy_loss,
            value_loss=value_loss,
            summaries_train=summaries_train,
            summaries_test=summaries_test)

        return network

In [6]:
def list_set(l, i, val):
    """Append `val` to `l`, asserting that it lands at index `i`.

    Acts as a checked `l[i] = val` for lists that are filled strictly in order.
    """
    next_index = len(l)
    assert next_index == i
    l.append(val)

In [33]:
class Worker:
    """A single actor-learner with its own environment and local network copy.

    Each call to run_step() syncs the local network from the 'global' scope,
    collects a rollout, accumulates policy/value gradients step by step and
    applies them to the 'global' scope.

    NOTE: methods that execute graph ops use the module-level `sess`.
    `EnvWrapper`, `prepro2`, `copy_network` and `create_train_ops` are not
    defined in this notebook (presumably supplied by `utils` / `train_ops`).
    """

    def __init__(self, worker_n, env_name, summary_writer):
        """Create the environment, the local network and all training ops.

        Args:
            worker_n: Integer index; the network lives in scope "worker_<n>".
            env_name: Gym environment id passed to gym.make.
            summary_writer: Writer that receives the reward summaries.
        """
        self.env = EnvWrapper(gym.make(env_name), prepro2=prepro2, frameskip=4)

        worker_scope = "worker_%d" % worker_n
        self.network = create_network(worker_scope)
        self.summary_writer = summary_writer
        self.scope = worker_scope

        self.reward_var = tf.Variable(0.0)
        self.smoothed_reward = None
        self.reward_summary = tf.summary.scalar('reward', self.reward_var)
        # Build the assignment op once; creating tf.assign(...) on every
        # log_rewards() call would add a new op to the graph per episode.
        self.reward_ph = tf.placeholder(tf.float32, shape=[])
        self.assign_reward = tf.assign(self.reward_var, self.reward_ph)

        # TODO: do these need to be separate?
        policy_optimizer = tf.train.AdamOptimizer(learning_rate=0.0005)
        value_optimizer = tf.train.AdamOptimizer(learning_rate=0.0005)

        # Direct (local) policy update; also used by the manual test cells.
        self.train_op = policy_optimizer.minimize(self.network.policy_loss)

        self.update_policy_gradients, self.apply_policy_gradients, self.zero_policy_gradients, self.grad_bufs_policy = \
            create_train_ops(self.network.policy_loss,
                             policy_optimizer,
                             update_scope=worker_scope,
                             apply_scope='global')

        self.update_value_gradients, self.apply_value_gradients, self.zero_value_gradients, self.grad_bufs_value = \
            create_train_ops(self.network.value_loss,
                             value_optimizer,
                             update_scope=worker_scope,
                             apply_scope='global')

        self.frame_stack = deque(maxlen=N_FRAMES_STACKED)
        self.reset_env()

        self.t_max = 10000
        self.steps = 0
        self.lifetime_experience = []
        self.episode_rewards = []

    def reset_env(self):
        """Reset the environment and refill the frame stack with no-op frames.

        Takes a random number (0..N_MAX_NOOPS) of no-op steps, then keeps
        stepping with action 0 until the stack holds N_FRAMES_STACKED frames.
        """
        self.env.reset()
        # Drop frames left over from the previous episode; otherwise a small
        # number of no-ops would leave stale frames in the first state.
        self.frame_stack.clear()
        n_noops = np.random.randint(low=0, high=N_MAX_NOOPS+1)
        print("%d no-ops..." % n_noops)
        for i in range(n_noops):
            o, _, _, _ = self.env.step(0)
            self.frame_stack.append(o)
        while len(self.frame_stack) < N_FRAMES_STACKED:
            print("One more...")
            o, _, _, _ = self.env.step(0)
            self.frame_stack.append(o)
        print("No-ops done")

    def append_to_lifetime_experience(self, feed_dict):
        """Store the (states, actions, returns) of `feed_dict` as one tuple."""
        states = feed_dict[self.network.s]
        actions = feed_dict[self.network.a]
        r = feed_dict[self.network.r]
        self.lifetime_experience.append((states, actions, r))

    def lifetime_experience_to_feed_dict(self):
        """Concatenate all stored experience into a single feed dict."""
        states = []
        actions = []
        r = []
        for tup_s, tup_a, tup_r in self.lifetime_experience:
            states.extend(tup_s)
            actions.extend(tup_a)
            r.extend(tup_r)
        feed_dict = {self.network.s: states,
                     self.network.a: actions,
                     self.network.r: r}
        return feed_dict

    def run_summaries(self, feed_dict):
        """Evaluate the training loss summaries on `feed_dict` and log them."""
        # Network has no `summaries` field; use the train-tagged merge.
        summaries = sess.run(self.network.summaries_train, feed_dict)
        self.summary_writer.add_summary(summaries, self.steps)

    def log_rewards(self):
        """Print and log the (exponentially smoothed) episode reward sum."""
        reward_sum = sum(self.episode_rewards)
        print("Reward sum was", reward_sum)
        if self.smoothed_reward is None:
            self.smoothed_reward = reward_sum
        else:
            self.smoothed_reward = self.smoothed_reward * 0.99 + reward_sum * 0.01
        print("Smoothed reward sum is %.1f" % self.smoothed_reward)
        sess.run(self.assign_reward, {self.reward_ph: self.smoothed_reward})
        summ = sess.run(self.reward_summary)
        self.summary_writer.add_summary(summ, self.steps)

    def sync_network(self):
        """Copy the 'global' scope parameters into this worker's scope."""
        copy_network(sess,
                     from_scope='global',
                     to_scope=self.scope)

    def run_step(self):
        """Collect one rollout, accumulate its gradients and apply them.

        The rollout ends when the episode finishes or after t_max steps.

        Returns:
            True if the episode ended during this rollout, else False.
        """
        states = []
        actions = []
        rewards = []
        i = 0

        sess.run([self.zero_policy_gradients,
                  self.zero_value_gradients])
        self.sync_network()

        # Store a snapshot, not the live deque: the deque is mutated by every
        # later append, which would silently rewrite the first state.
        list_set(states, i, np.copy(self.frame_stack))

        done = False
        while not done and i < self.t_max:
            feed_dict = {self.network.s: [np.moveaxis(self.frame_stack, source=0, destination=-1)]}
            a_p = sess.run(self.network.a_softmax, feed_dict=feed_dict)[0]
            a = np.random.choice(ACTIONS, p=a_p)
            list_set(actions, i, a)

            o, r, done, _ = self.env.step(a)
            if r != 0:
                print("Got reward", r)
            self.frame_stack.append(o)
            self.episode_rewards.append(r)
            list_set(rewards, i, r)
            list_set(states, i + 1, np.copy(self.frame_stack))

            i += 1

        if done:
            print("Episode done!")
            r = 0
        else:
            # We're not at the end of an episode, so we have to estimate
            # the value of the current state using the value network
            feed_dict = {self.network.s: [np.moveaxis(states[i], source=0, destination=-1)]} # the last state
            r = sess.run(self.network.graph_v, feed_dict=feed_dict)[0]

        # Walk backwards from step i - 1 to 0 (the final state has no action).
        for j in reversed(range(i)):
            # A non-zero reward restarts the return; otherwise discount the
            # running return by G.
            if rewards[j] != 0:
                r = rewards[j]
            else:
                r = rewards[j] + G * r
            feed_dict = {self.network.s: [np.moveaxis(states[j], source=0, destination=-1)],
                         self.network.a: [actions[j] - 1], # map from possible actions (1, 2, 3) -> (0, 1, 2)
                         self.network.r: [r]}
            # NOTE(review): this also updates the local policy directly before
            # the gradients are accumulated -- confirm that this is intended.
            sess.run(self.train_op, feed_dict)
            sess.run([self.update_policy_gradients,
                      self.update_value_gradients],
                      feed_dict)
        sess.run([self.apply_policy_gradients,
                  self.apply_value_gradients])
        sess.run([self.zero_policy_gradients,
                  self.zero_value_gradients])

        # Per-rollout loss summaries are currently not written.

        if done:
            self.log_rewards()
            self.episode_rewards = []

        self.steps += 1

        return done

In [34]:
# Start from an empty default graph so re-running this cell does not stack up old ops.
tf.reset_default_graph()
# Module-level session; the Worker methods use this global `sess` directly.
sess = tf.Session()

In [35]:
# Network under the 'global' scope: workers copy their parameters from it and apply gradients to it.
global_network = create_network('global')

In [36]:
# One summary directory per run, named after the current Unix timestamp (whole seconds).
dirname = 'summaries/' + str(int(time.time()))
os.makedirs(dirname)
summary_writer = tf.summary.FileWriter(dirname, flush_secs=1)

In [37]:
# Only a single worker is created for now; all workers share the same summary writer.
workers = []
for i in range(1):
    workers.append(Worker(i, 'PongNoFrameskip-v4', summary_writer))

[2017-08-16 13:04:11,745] Making new env: PongNoFrameskip-v4


10 no-ops...
No-ops done


In [38]:
# Initialise every variable created so far; must run after all workers have been constructed.
sess.run(tf.global_variables_initializer())

In [39]:
# Train forever: keep running rollouts until the episode ends, then reset
# the environment and start the next episode. Stop by interrupting the kernel.
while True:
    episode_over = False
    while not episode_over:
        episode_over = workers[0].run_step()
    workers[0].reset_env()

Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Episode done!
Reward sum was -21.0
Smoothed reward sum is -21.0
6 no-ops...
No-ops done
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Got reward -1.0
Episode done!


KeyboardInterrupt: 

### Manual action encouragement test

In [146]:
# Reset the first worker's environment and refill its frame stack with no-op frames.
workers[0].reset_env()

In [147]:
# Take one step with action 0 through utils.get_o; only the first of its three return values is kept.
# NOTE(review): presumably `o` is an already stacked observation matching the network input -- confirm in utils.
o, _, _ = utils.get_o(workers[0].env, 0)

In [182]:
# Apply one direct policy update that encourages action index 1 (return of 1) for observation `o`,
# then show the resulting action probabilities; re-run the cell to watch them shift.
sess.run(workers[0].train_op, {workers[0].network.s: [o], workers[0].network.a: [1], workers[0].network.r: [1]})
sess.run(workers[0].network.a_softmax, {workers[0].network.s: [o]})

## Action probability selection test

In [None]:
# Intent: feed the same state once per action and check that nlp equals -log of the softmax row.
# NOTE(review): a_softmax, graph_s, graph_action and nlp are not defined at module level in the
# visible notebook (they are locals of create_network), and the 84x84 input does not match the
# 80x80 placeholder built there -- this cell presumably predates create_network.
sess.run(tf.global_variables_initializer())
s = np.random.random((1, 84, 84, 4))
s = np.vstack([s] * N_ACTIONS)
expected = -1 * np.log(sess.run(a_softmax, feed_dict={graph_s: s})[0])
actual = sess.run(nlp, feed_dict={graph_s: s, graph_action: np.arange(N_ACTIONS)})
np.testing.assert_allclose(expected, actual)

## Basic policy network training test

In [None]:
# Intent: three updates with positive advantage should raise P(action 0); after re-initialising,
# three updates with negative advantage should lower it.
# NOTE(review): a_softmax, graph_s, graph_action, graph_advantage and the train op `t` are not
# defined at module level in the visible notebook -- this cell presumably depends on deleted cells.
sess.run(tf.global_variables_initializer())
n = 5
s = np.random.random((1, 84, 84, 4))
s = np.vstack([s] * n)

# encourage action 0
print(sess.run(a_softmax, feed_dict={graph_s: s})[0])
a = [0] * n
ad = [1] * n
for i in range(3):
    sess.run(t, feed_dict={graph_s: s, graph_action: a, graph_advantage: ad})
    print(sess.run(a_softmax, feed_dict={graph_s: s})[0])
    
# now the probability of action[0] will be very close to 1,
# so the gradients will be too small to work
# so reset
sess.run(tf.global_variables_initializer())
print()

# discourage action 0
print(sess.run(a_softmax, feed_dict={graph_s: s})[0])
a = [0] * n
ad = [-1] * n
for i in range(3):
    sess.run(t, feed_dict={graph_s: s, graph_action: a, graph_advantage: ad})
    print(sess.run(a_softmax, feed_dict={graph_s: s})[0])

In [None]:
# Re-initialise, set the learning-rate variable to 1e-3 and print the starting action probabilities.
# NOTE(review): graph_lr, a_softmax and graph_s are not defined in the visible notebook; `s` comes from the previous cell.
sess.run(tf.global_variables_initializer())
sess.run(tf.assign(graph_lr, 1e-3))
print(sess.run(a_softmax, feed_dict={graph_s: s})[0])

In [None]:
# One update discouraging action 0, followed by the new action probabilities.
# NOTE(review): `t1`, graph_action and graph_advantage are undefined here, and `s` holds several
# rows while the action/advantage lists hold one entry each -- confirm the intended batch size.
sess.run(t1, feed_dict={graph_s: s, graph_action: [0], graph_advantage: [-1]})
print(sess.run(a_softmax, feed_dict={graph_s: s})[0])

## Basic value network training test

In [None]:
# Five random states with target returns 0.2 .. 1.0; shows the untrained value predictions.
# NOTE(review): graph_v and graph_s are not defined at module level in the visible notebook.
s = np.random.normal(size=(5, 84, 84, 4))
r = np.array([1, 2, 3, 4, 5]) / 5.0
print(r)
sess.run(tf.global_variables_initializer())
sess.run(graph_v, feed_dict={graph_s: s})

In [None]:
# One value-network update, then the loss and predictions; re-run the cell to watch the loss fall.
# NOTE(review): `t2`, `loss2`, graph_v, graph_s and graph_r are not defined in the visible notebook.
sess.run(t2, feed_dict={graph_s: s, graph_r: r})
sess.run([loss2, graph_v], feed_dict={graph_s: s, graph_r: r})

## Dummy environment

In [None]:
class DummyEnv:
    """Toy environment that rewards following one of a few random action sequences.

    An episode lasts N_STATES steps. A step yields reward 1 while the actions
    taken so far are a prefix of at least one rewarded sequence, else 0.
    Observations are fixed random arrays: one per rewarded prefix, plus a
    shared default for every other history and a fixed initial observation.

    reset() must be called before the first step().
    """
    # NOTE(review): Discrete(2) does not match N_ACTIONS; left as is because
    # nothing in this notebook reads action_space -- confirm the intended size.
    action_space = gym.spaces.discrete.Discrete(2)
    N_STATES = 3
    N_REWARDED_SEQUENCES = int(N_ACTIONS ** N_STATES / 10)

    def __init__(self):
        """Draw the rewarded sequences and the observation for each prefix."""
        samples = np.random.normal(
            loc=int(N_ACTIONS / 2),
            size=(self.N_REWARDED_SEQUENCES, self.N_STATES)).astype(int)
        # Valid action ids are 0 .. N_ACTIONS - 1, so clip to that range.
        self.rewarded_sequences = np.unique(
            np.clip(samples, 0, N_ACTIONS - 1), axis=0)
        self.o = {}
        # look at all unique sequence prefixes
        for i in range(1, self.N_STATES + 1):
            # Use this instance's sequences, not those of a global `env`.
            prefixes = np.unique(self.rewarded_sequences[:, :i], axis=0)
            for p in prefixes:
                self.o[tuple(p)] = np.random.normal(size=(84, 84, 4))
        self.default_o = np.random.normal(size=(84, 84, 4))
        self.init_o = np.random.normal(size=(84, 84, 4))

    def reset(self):
        """Start a new episode and return the fixed initial observation."""
        self.i = 0
        self.history = []
        self.possible_sequences = np.copy(self.rewarded_sequences)
        return self.init_o

    def step(self, a):
        """Take action `a`.

        Returns:
            (observation, reward, done, None), where done becomes True on the
            N_STATES-th step of the episode.
        """
        self.history.append(a)

        # Keep only the rewarded sequences that still match the history.
        l = len(self.history)
        self.possible_sequences = [
            seq for seq in self.possible_sequences
            if np.array_equal(seq[:l], self.history)]
        reward = 1 if len(self.possible_sequences) > 0 else 0

        o = self.o.get(tuple(self.history), self.default_o)

        if self.i != self.N_STATES - 1:
            done = False
            self.i += 1
        else:
            done = True

        return o, reward, done, None

In [None]:
# Module-level dummy environment used by the check cells, test_seq() and run() below.
env = DummyEnv()

In [None]:
# For each possible first action, print how many rewarded sequences still match after one step.
for i in range(N_ACTIONS):
    env.reset()
    env.step(i)
    print(len(env.possible_sequences))

In [None]:
# Initial state same?
o = env.reset()
print(np.sum(o))
o = env.reset()
print(np.sum(o))

In [None]:
# Default state the same?
env.reset()
o, _, _, _ = env.step(0)
print(np.sum(o))
env.reset()
o, _, _, _ = env.step(8)
print(np.sum(o))

In [None]:
# Always the same state returned for a given sequence?
env.reset()
o, _, _, _ = env.step(5)
print(np.sum(o))
env.reset()
o, _, _, _ = env.step(5)
print(np.sum(o))

env.reset()
env.step(5)
o, _, _, _ = env.step(5)
print(np.sum(o))
env.reset()
env.step(5)
o, _, _, _ = env.step(5)
print(np.sum(o))

In [None]:
def test_seq(seq):
    for a in seq:
        print("Took action", a)
        _, reward, done, _ = env.step(a)
        print("Got reward ", reward)
        print("Done:", done)

# First an arbitrary sequence, then one known rewarded sequence (expected reward 1 on every step).
env.reset()
test_seq([0, 0, 0])
print()
env.reset()
test_seq(env.rewarded_sequences[0])

## Testing proper

### Manual testing

In [None]:
# NOTE(review): `t`, graph_s, graph_action, graph_advantage and a_softmax are not defined in the
# visible notebook. Also, DummyEnv.o is a dict keyed by action-prefix tuples, so `env.o[2]` and
# feeding `env.o` itself look stale -- presumably `env.o` was a list of observations at the time.
sess.run(tf.global_variables_initializer())
sess.run(t, feed_dict={graph_s: [env.o[2]], graph_action: [1], graph_advantage: [1]})
sess.run(a_softmax, feed_dict={graph_s: env.o})

### Testing with environment

In [None]:
# State shared with run(): rpe holds the smoothed reward per episode, rpe2 the raw
# non-zero rewards of each episode, steps counts the training updates.
rpe = []
rpe2 = []
sess.run(tf.global_variables_initializer())
steps = 0

In [None]:
def run(n_episodes=1000):
    """Play `n_episodes` episodes on the global `env`, training after every non-zero reward.

    Appends each episode's non-zero rewards to `rpe2` and an exponentially
    smoothed (factor 0.01) episode reward sum to `rpe`; increments the global
    `steps` once per training update.

    NOTE(review): relies on module-level names that are not defined in the
    visible notebook (`a_softmax`, `graph_s`, `graph_action`, `graph_r`,
    `summaries`, `t`, `t2`, `rewards_to_returns`), and calls `get_o` with one
    argument although another cell calls `utils.get_o(env, action)` -- confirm
    before reusing. The discount here is a hard-coded 0.9, not the constant G.
    """
    global steps
    for i in range(n_episodes):
        if i % 100 == 0:
            print(i)
        env.reset()
        o, _, _ = get_o(0) # do nothing
        done = False
        episode_rewards = []
        rewards = []
        actions = []
        states = []
        while not done:
            states.append(o)
            a_p = sess.run(a_softmax, feed_dict={graph_s: [o]})[0]
            a = np.random.choice(ACTIONS, p=a_p)
            actions.append(a)

            o, r, done = get_o(a)
            rewards.append(r)

            if r != 0:
                # A non-zero reward closes a segment: train on it, then start a new one.
                episode_rewards.append(r)
                returns = rewards_to_returns(rewards, G=0.9)
                actions = np.array(actions) - 1 # map from possible actions (1, 2, 3) -> (0, 1, 2)
                summ, _, _ = sess.run([summaries, t, t2], feed_dict={graph_s: states, graph_action: actions, graph_r: returns})
                summary_writer.add_summary(summ, steps)
                rewards = []
                actions = []
                states = []
                steps += 1

        rpe2.append(episode_rewards)
        # `len(rpe) is 0` compared identity with an int literal; test emptiness instead.
        if not rpe:
            rpe.append(np.sum(episode_rewards))
        else:
            rpe.append(rpe[-1] + 0.01 * (np.sum(episode_rewards) - rpe[-1]))

In [None]:
# NOTE(review): RENDER is not read anywhere in the visible notebook -- presumably consumed by get_o.
RENDER = True
run(n_episodes=1)

In [None]:
# Rebuild the smoothed reward curve from the raw per-episode rewards, using the same 0.01 factor as run().
# Requires at least one recorded episode in rpe2.
rpe = [np.sum(rpe2[0])]
for s in rpe2[1:]:
    rpe.append(rpe[-1] + 0.01 * (np.sum(s) - rpe[-1]))

In [None]:
# Plot the smoothed reward per episode (`plt` comes from the `from pylab import *` star import).
plt.plot(rpe)

## Old code

In [None]:
# Early sketch of the rollout / bootstrapped-return loop, kept for reference only.
# NOTE(review): not runnable as written:
#   - the `while True:` loop has no exit; the `break` below sits outside it and leaves the `for` loop,
#   - `r` and `g` are never defined (the arrays are named r1 / G elsewhere),
#   - ap_op, s_op, v_op, big_r_op, t1 and t2 are not defined in the visible notebook,
#   - r2 is not updated inside the return loop, and the inner loop reuses the outer index `i`.
t = 0
t_max = 100

s = np.zeros(10)
r1 = np.zeros(10)
a = np.zeros(10)
r2_list = []

for i in range(10):
    t_start = t
    s[t] = env.reset()
    done = False
    
    while True:
        a_p = sess.run(ap_op, feed_dict={s_op: s})
        a[t] = np.random.choice(ACTIONS, p=a_p)
        s[t+1], r[t], done, _ = env.step(a[t])
        t += 1        
    if done or t - t_start == t_max:
        break
    
    if done:
        r2 = 0
    else:
        r2 = sess.run(v_op, feed_dict={s_op: s[t]})
    
    for i in range(t-1, t_start-1, -1):
        r2_list.append(r[i] + g * r2)
    
    
    sess.run(t1, feed_dict={s_op: s, big_r_op: r})
    sess.run(t2, feed_dict={s_op: s, big_r_op: r})

In [None]:
# Old preprocessing sketch: max over two consecutive frames, grayscale by channel mean,
# resize to 84x84, and stack four such frames along the last axis.
# NOTE(review): the name `os` shadows the imported `os` module for the rest of the kernel session
# (os.makedirs above stops working after this cell runs); `observation` is not defined in the
# visible notebook; scipy.misc.imresize has been removed from recent SciPy releases.
observation.shape
action = env.action_space.sample()
os = []
for i in range(4):
    o1, _, _, _ = env.step(action)
    o2, _, _, _ = env.step(action)
    o = np.maximum(o1, o2)
    o = np.mean(o, axis=2)
    o = scipy.misc.imresize(o, (84, 84))
    os.append(o)
os = np.stack(os, axis=-1)

In [None]:
# Add a batch dimension to the stacked frames and evaluate two old network outputs on them.
# NOTE(review): `os` here is the frame array from the previous cell, not the os module;
# `q` and `qi` are not defined in the visible notebook, and `s` is used as a feed key although
# other cells bind it to a numpy array -- this cell presumably predates the rest.
os = np.resize(os, (1, 84, 84, 4))
print(os.shape)
print(sess.run(q['network1'], feed_dict={s: os}))
print(sess.run(qi['network1'], feed_dict={s: os}))