In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import numpy as np
import gym
import gym.spaces
from pylab import *
import scipy.misc
import time
from collections import namedtuple
import time
import os.path
import os

from train_ops import create_train_ops
from utils import *
import utils

In [3]:
%matplotlib notebook

In [4]:
# Discount factor applied to future rewards when returns are computed.
G = 0.99
# Number of units in the policy head, i.e. number of selectable actions.
N_ACTIONS = 3
# Action ids sent to the environment: 1..N_ACTIONS. The network indexes
# actions from 0, so environment ids are shifted by one before being fed back.
ACTIONS = np.arange(N_ACTIONS) + 1

## Network setup

In [5]:
# Handles into the graph built by create_network():
#   s, a, r                         -- placeholders for states, action indices and returns
#   a_softmax                       -- action probabilities from the policy head
#   graph_v                         -- state-value estimate, one scalar per input state
#   policy_loss, value_loss         -- scalar losses
#   summaries_train, summaries_test -- merged scalar summaries of both losses
Network = namedtuple('Network', 's a r a_softmax graph_v policy_loss value_loss summaries_train summaries_test')

In [90]:
def create_network(scope):
    """Build the actor-critic graph inside variable scope `scope`.

    A shared trunk (three ReLU conv layers followed by a 512-unit ReLU dense
    layer) feeds two linear heads: action logits with N_ACTIONS units, and a
    single-unit state-value estimate.

    Returns:
        A `Network` namedtuple with the three input placeholders, the softmax
        action probabilities, the value output, the policy and value losses,
        and the merged train/test loss summaries.
    """
    # (filters, kernel_size, strides) per conv layer. Layers are created in
    # this order so the default variable names stay conv2d, conv2d_1, conv2d_2.
    conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]

    with tf.variable_scope(scope):
        states_ph = tf.placeholder(tf.float32, [None, 84, 84, 4])
        actions_ph = tf.placeholder(tf.int64, [None])
        returns_ph = tf.placeholder(tf.float32, [None])

        hidden = states_ph
        for n_filters, kernel, stride in conv_specs:
            hidden = tf.layers.conv2d(
                inputs=hidden,
                filters=n_filters,
                kernel_size=kernel,
                strides=stride,
                activation=tf.nn.relu)

        # Flatten the conv feature map to one vector per example.
        width, height, channels = hidden.shape[1:]
        hidden = tf.reshape(hidden, [-1, int(width * height * channels)])

        hidden = tf.layers.dense(inputs=hidden, units=512, activation=tf.nn.relu)

        # Policy head: unnormalised scores, then probabilities.
        logits = tf.layers.dense(inputs=hidden, units=N_ACTIONS, activation=None)
        action_probs = tf.nn.softmax(logits)

        # Value head: drop the trailing unit axis so there is one scalar per example.
        value = tf.layers.dense(inputs=hidden, units=1, activation=None)
        value = value[:, 0]

        # Probability the policy assigned to the action that was fed in:
        # a 0/1 mask per action index times that action's probability column.
        chosen_prob = sum(
            tf.cast(tf.equal(actions_ph, idx), tf.float32) * action_probs[:, idx]
            for idx in range(N_ACTIONS))
        # Negative log probability; the 1e-7 offset prevents log(0).
        neg_log_prob = -1 * tf.log(chosen_prob + 1e-7)
        policy_loss = tf.reduce_mean(neg_log_prob * returns_ph)

        value_loss = tf.reduce_mean((returns_ph - value) ** 2)

        # The same two losses are logged under separate train/test tags.
        policy_train = tf.summary.scalar('policy_loss_train', policy_loss)
        value_train = tf.summary.scalar('value_loss_train', value_loss)
        policy_test = tf.summary.scalar('policy_loss_test', policy_loss)
        value_test = tf.summary.scalar('value_loss_test', value_loss)
        merged_train = tf.summary.merge([policy_train, value_train])
        merged_test = tf.summary.merge([policy_test, value_test])

        return Network(
            s=states_ph,
            a=actions_ph,
            r=returns_ph,
            a_softmax=action_probs,
            graph_v=value,
            policy_loss=policy_loss,
            value_loss=value_loss,
            summaries_train=merged_train,
            summaries_test=merged_test)

In [91]:
def list_set(l, i, val):
    """Append `val` to `l`, asserting that it lands at index `i`.

    Guards against the parallel state/action/reward lists drifting out of
    step: the call fails unless `i` is exactly the next free index.
    """
    next_index = len(l)
    assert next_index == i
    l.append(val)

In [92]:
class Worker:
    """One actor-learner with its own environment and its own network copy.

    Each `run_step` call copies the weights from the 'global' scope into the
    worker's scope, plays the environment until the episode ends, accumulates
    policy and value gradients over the collected transitions and applies
    them through the ops returned by `create_train_ops`.

    Every method that touches TensorFlow uses the notebook-level session
    `sess`, which must exist before the method is called.
    """

    def __init__(self, worker_n, env_name, summary_writer):
        """Create the environment, the worker-scoped network and training ops.

        Args:
            worker_n: integer index; the network lives in scope "worker_<n>".
            env_name: environment id passed to `gym.make`.
            summary_writer: object providing `add_summary(summary, step)`.
        """
        self.env = gym.make(env_name)

        worker_scope = "worker_%d" % worker_n
        self.network = create_network(worker_scope)
        self.summary_writer = summary_writer
        self.scope = worker_scope

        self.reward_var = tf.Variable(0.0)
        self.smoothed_reward = None
        self.reward_summary = tf.summary.scalar('reward', self.reward_var)
        # Build the assignment op once. Creating a fresh tf.assign on every
        # log_rewards() call would add a new node to the graph per episode.
        self._reward_ph = tf.placeholder(tf.float32, [])
        self._assign_reward = tf.assign(self.reward_var, self._reward_ph)

        # TODO: do these need to be separate?
        policy_optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
        value_optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)

        # create_train_ops is used here as returning
        # (accumulate op, apply op, zero op, dict of gradient buffers).
        self.update_policy_gradients, self.apply_policy_gradients, self.zero_policy_gradients, self.grad_bufs_policy = \
            create_train_ops(self.network.policy_loss,
                             policy_optimizer,
                             update_scope=worker_scope,
                             apply_scope='global')

        self.update_value_gradients, self.apply_value_gradients, self.zero_value_gradients, self.grad_bufs_value = \
            create_train_ops(self.network.value_loss,
                             value_optimizer,
                             update_scope=worker_scope,
                             apply_scope='global')

        self.env.reset()

        self.t_max = 4  # currently unused: the step cap in run_step is commented out
        self.steps = 0
        self.lifetime_experience = []
        self.episode_rewards = []

    def reset_env(self):
        """Reset the worker's environment."""
        self.env.reset()

    def append_to_lifetime_experience(self, feed_dict):
        """Record the (states, actions, returns) batch held in `feed_dict`.

        `feed_dict` must be keyed by this worker's `s`, `a` and `r`
        placeholders.
        """
        states = feed_dict[self.network.s]
        actions = feed_dict[self.network.a]
        r = feed_dict[self.network.r]
        self.lifetime_experience.append((states, actions, r))

    def lifetime_experience_to_feed_dict(self):
        """Concatenate all recorded batches into a single feed dict.

        Returns:
            dict mapping the `s`, `a` and `r` placeholders to flat lists, in
            the order the batches were recorded.
        """
        states = []
        actions = []
        r = []
        for tup_s, tup_a, tup_r in self.lifetime_experience:
            states.extend(tup_s)
            actions.extend(tup_a)
            r.extend(tup_r)
        return {self.network.s: states,
                self.network.a: actions,
                self.network.r: r}

    def run_summaries(self, feed_dict, test=False):
        """Evaluate the merged loss summaries and write them at `self.steps`.

        Args:
            feed_dict: values for the network placeholders.
            test: when true, use the `*_test` summary tags instead of the
                `*_train` ones.
        """
        # The Network tuple has no plain `summaries` field; choose explicitly.
        if test:
            summary_op = self.network.summaries_test
        else:
            summary_op = self.network.summaries_train
        summaries = sess.run(summary_op, feed_dict)
        self.summary_writer.add_summary(summaries, self.steps)

    def log_rewards(self):
        """Print the episode's reward sum and log its smoothed value.

        The smoothed value is an exponential moving average with weight 0.01
        on the newest episode; the first episode initialises it.
        """
        reward_sum = sum(self.episode_rewards)
        print("Reward sum was", reward_sum)
        if self.smoothed_reward is None:
            self.smoothed_reward = reward_sum
        else:
            self.smoothed_reward = self.smoothed_reward * 0.99 + reward_sum * 0.01
        print("Smoothed reward sum is %.1f" % self.smoothed_reward)
        sess.run(self._assign_reward,
                 feed_dict={self._reward_ph: self.smoothed_reward})
        print(sess.run(self.reward_var))
        summ = sess.run(self.reward_summary)
        self.summary_writer.add_summary(summ, self.steps)

    def sync_network(self):
        """Copy the variables of the 'global' scope into this worker's scope."""
        copy_network(sess,
                     from_scope='global',
                     to_scope=self.scope)

    def run_step(self):
        """Play until the episode ends, then do one gradient update.

        Returns:
            The `done` flag from the last environment step.
        """
        states = []
        actions = []
        rewards = []
        i = 0

        sess.run([self.zero_policy_gradients,
                  self.zero_value_gradients])
        self.sync_network()

        o, _, _ = utils.get_o(self.env, 0) # 0 = do nothing (in Pong, at least)
        list_set(states, i, o)

        done = False
        while not done: #and i < self.t_max:
            feed_dict = {self.network.s: [o]}
            a_p = sess.run(self.network.a_softmax, feed_dict=feed_dict)[0]
            a = np.random.choice(ACTIONS, p=a_p)
            list_set(actions, i, a)

            o, r, done = utils.get_o(self.env, a)
            self.episode_rewards.append(r)
            list_set(rewards, i, r)
            list_set(states, i + 1, o)

            i += 1

        if done:
            print("Episode done!")
            r = 0
        else:
            # Only reachable if the step cap above is re-enabled: we are not
            # at the end of an episode, so bootstrap from the value network's
            # estimate of the last state.
            feed_dict = {self.network.s: [states[i]]} # the last state
            r = sess.run(self.network.graph_v, feed_dict=feed_dict)[0]

        # Walk the transitions backwards from i - 1 to 0; states[i] is the
        # state after the final action and has no action of its own.
        for j in reversed(range(i)):
            # A non-zero reward restarts the return rather than being added
            # to the discounted future return.
            if rewards[j] != 0:
                r = rewards[j]
            else:
                r = rewards[j] + G * r
            feed_dict = {self.network.s: [states[j]],
                         self.network.a: [actions[j] - 1], # map from possible actions (1, 2, 3) -> (0, 1, 2)
                         self.network.r: [r]}
            self.append_to_lifetime_experience(feed_dict)
            sess.run([self.update_policy_gradients,
                      self.update_value_gradients],
                      feed_dict)
        sess.run([self.apply_policy_gradients,
                  self.apply_value_gradients])
        sess.run([self.zero_policy_gradients,
                  self.zero_value_gradients])

        # Loss summaries are not written here; call run_summaries() with an
        # appropriate feed dict if they are wanted.

        if done:
            self.log_rewards()
            self.episode_rewards = []

        self.steps += 1

        return done

In [93]:
# Clear the default graph before rebuilding, then open the session that the
# Worker methods reach through the global name `sess`.
tf.reset_default_graph()
sess = tf.Session()

In [94]:
# Network under the 'global' scope. Workers pass this scope name as
# apply_scope to create_train_ops and as from_scope to copy_network.
global_network = create_network('global')

In [95]:
# One summary directory per run, named after the current Unix time in seconds.
dirname = 'summaries/' + str(int(time.time()))
os.makedirs(dirname)
summary_writer = tf.summary.FileWriter(dirname, flush_secs=1)

In [96]:
# Build the workers; range(1) means a single worker. All of them share one
# summary writer.
workers = []
for i in range(1):
    workers.append(Worker(i, 'Pong-v0', summary_writer))

[2017-07-30 09:29:55,097] Making new env: Pong-v0


In [97]:
# Initialise every variable created so far; run this only after the global
# network and all workers have been constructed.
sess.run(tf.global_variables_initializer())

In [98]:
# Run one episode: call run_step() until it reports done, then reset the
# environment for the next episode.
for i in range(1):
    done = workers[0].run_step()
    while not done:
        done = workers[0].run_step()
    workers[0].reset_env()

Episode done!
Reward sum was -21.0
Smoothed reward sum is -21.0
-21.0


In [99]:
# Everything the worker has recorded so far, as one batch, plus the step
# counter used to index summaries in the loop below.
feed_dict = workers[0].lifetime_experience_to_feed_dict()
step = 0

In [100]:
# Debugging loop: 100 full-batch updates on the recorded experience. Each
# iteration zeroes the gradient buffers, accumulates value and policy
# gradients on the whole batch, prints both losses and the sum of every
# gradient buffer (value buffers first, then policy buffers), applies the
# gradients, re-syncs the worker from the global scope and writes the
# *_test loss summaries.
for i in range(100):
    print(step)
    sess.run(workers[0].zero_value_gradients)
    sess.run(workers[0].zero_policy_gradients)
    sess.run(workers[0].update_value_gradients, feed_dict)
    sess.run(workers[0].update_policy_gradients, feed_dict)
    print("Losses:")
    print(sess.run(workers[0].network.value_loss, feed_dict))
    print(sess.run(workers[0].network.policy_loss, feed_dict))
    print("Gradients:")
    for key, value in workers[0].grad_bufs_value.items():
        print("%s:" % key, np.sum(sess.run(value)))
    for key, value in workers[0].grad_bufs_policy.items():
        print("%s:" % key, np.sum(sess.run(value)))
    sess.run(workers[0].apply_value_gradients)
    sess.run(workers[0].apply_policy_gradients)
    # The losses above are evaluated on the worker's network, so refresh it
    # after the update has been applied.
    workers[0].sync_network()
    summ = sess.run(workers[0].network.summaries_test, feed_dict)
    summary_writer.add_summary(summ, step)
    step += 1

0
Losses:
1.01278
-1.07163
Gradients:
dense/kernel: 19.887
conv2d_2/kernel: -26.4727
conv2d_2/bias: -0.956003
dense_2/kernel: 30.9552
dense_2/bias: 2.01238
conv2d_1/kernel: -13.7214
conv2d/kernel: 34.5478
conv2d/bias: 0.452606
dense/bias: 0.243876
conv2d_1/bias: -0.242694
dense/kernel: 0.418629
conv2d_2/kernel: -0.316022
conv2d_2/bias: -0.0173784
dense/bias: 0.00516315
conv2d_1/kernel: 0.39238
dense_1/bias: -3.72529e-09
conv2d/bias: -0.00237977
conv2d/kernel: -0.249885
conv2d_1/bias: 0.0050676
dense_1/kernel: -6.63567e-08
1
Losses:
0.521596
-1.08927
Gradients:
dense/kernel: -258.674
conv2d_2/kernel: -159.776
conv2d_2/bias: -5.88009
dense_2/kernel: 23.7609
dense_2/bias: 1.44393
conv2d_1/kernel: -105.726
conv2d/kernel: -156.93
conv2d/bias: -1.54087
dense/bias: -2.93334
conv2d_1/bias: -2.74105
dense/kernel: -22.8507
conv2d_2/kernel: -9.36716
conv2d_2/bias: -0.357423
dense/bias: -0.259236
conv2d_1/kernel: -6.30035
dense_1/bias: 1.49012e-08
conv2d/bias: -0.122627
conv2d/kernel: -12.3619
con

16
Losses:
0.401211
-2.56273
Gradients:
dense/kernel: 811.804
conv2d_2/kernel: 526.531
conv2d_2/bias: 12.6867
dense_2/kernel: -77.2999
dense_2/bias: -1.26629
conv2d_1/kernel: 509.949
conv2d/kernel: 644.451
conv2d/bias: 6.78631
dense/bias: 3.63697
conv2d_1/bias: 9.3058
dense/kernel: -1325.61
conv2d_2/kernel: -577.72
conv2d_2/bias: -13.8942
dense/bias: -5.93881
conv2d_1/kernel: -588.621
dense_1/bias: -1.19209e-07
conv2d/bias: -7.87363
conv2d/kernel: -758.789
conv2d_1/bias: -10.6602
dense_1/kernel: 2.23517e-07
17
Losses:
0.485373
-2.94944
Gradients:
dense/kernel: 911.148
conv2d_2/kernel: 586.718
conv2d_2/bias: 13.514
dense_2/kernel: -95.61
dense_2/bias: -1.3929
conv2d_1/kernel: 599.562
conv2d/kernel: 757.02
conv2d/bias: 7.99312
dense/bias: 3.76895
conv2d_1/bias: 10.5662
dense/kernel: -1507.42
conv2d_2/kernel: -680.243
conv2d_2/bias: -15.5985
dense/bias: -6.23541
conv2d_1/kernel: -725.505
dense_1/bias: 5.96046e-08
conv2d/bias: -9.5749
conv2d/kernel: -915.35
conv2d_1/bias: -12.7206
dense_1/

32
Losses:
0.233944
-10.6846
Gradients:
dense/kernel: -464.643
conv2d_2/kernel: -49.3041
conv2d_2/bias: -0.63429
dense_2/kernel: 342.706
dense_2/bias: 0.966627
conv2d_1/kernel: -37.0883
conv2d/kernel: -88.5648
conv2d/bias: -0.63047
dense/bias: -0.675525
conv2d_1/bias: -0.16799
dense/kernel: -0.413933
conv2d_2/kernel: -0.250237
conv2d_2/bias: -0.00290758
dense/bias: -0.000601774
conv2d_1/kernel: -0.309336
dense_1/bias: -7.27596e-12
conv2d/bias: -0.00347851
conv2d/kernel: -0.324432
conv2d_1/bias: -0.00347127
dense_1/kernel: -5.69344e-10
33
Losses:
0.345801
-10.6846
Gradients:
dense/kernel: -579.277
conv2d_2/kernel: -37.9372
conv2d_2/bias: -0.502629
dense_2/kernel: 444.77
dense_2/bias: 1.1755
conv2d_1/kernel: -15.7615
conv2d/kernel: -75.4835
conv2d/bias: -0.411813
dense/bias: -0.807276
conv2d_1/bias: 0.128899
dense/kernel: -0.073328
conv2d_2/kernel: -0.0449373
conv2d_2/bias: -0.000506974
dense/bias: -0.000102184
conv2d_1/kernel: -0.0557869
dense_1/bias: 9.09495e-13
conv2d/bias: -0.0006229

47
Losses:
0.10252
-10.6846
Gradients:
dense/kernel: -600.036
conv2d_2/kernel: -61.4261
conv2d_2/bias: -0.608965
dense_2/kernel: 373.915
dense_2/bias: 0.639273
conv2d_1/kernel: -53.1755
conv2d/kernel: -84.3929
conv2d/bias: -0.709141
dense/bias: -0.633319
conv2d_1/bias: -0.414487
dense/kernel: -1.67335e-08
conv2d_2/kernel: -1.11529e-08
conv2d_2/bias: -1.03813e-10
dense/bias: -1.76612e-11
conv2d_1/kernel: -1.39651e-08
dense_1/bias: 0.0
conv2d/bias: -1.51532e-10
conv2d/kernel: -1.40955e-08
conv2d_1/bias: -1.37101e-10
dense_1/kernel: -3.29597e-17
48
Losses:
0.0693944
-10.6846
Gradients:
dense/kernel: -497.648
conv2d_2/kernel: -55.7148
conv2d_2/bias: -0.546202
dense_2/kernel: 311.289
dense_2/bias: 0.525519
conv2d_1/kernel: -49.9112
conv2d/kernel: -76.2155
conv2d/bias: -0.657265
dense/bias: -0.521197
conv2d_1/bias: -0.402649
dense/kernel: -9.60172e-09
conv2d_2/kernel: -6.41787e-09
conv2d_2/bias: -5.94186e-11
dense/bias: -1.00558e-11
conv2d_1/kernel: -8.04011e-09
dense_1/bias: 5.42101e-20
con

62
Losses:
0.00152902
-10.6846
Gradients:
dense/kernel: 74.5363
conv2d_2/kernel: 10.9608
conv2d_2/bias: 0.101842
dense_2/kernel: -44.3892
dense_2/bias: -0.0687949
conv2d_1/kernel: 11.1176
conv2d/kernel: 14.5942
conv2d/bias: 0.138
dense/bias: 0.074222
conv2d_1/bias: 0.096896
dense/kernel: -1.75691e-10
conv2d_2/kernel: -1.21041e-10
conv2d_2/bias: -1.08272e-12
dense/bias: -1.74913e-13
conv2d_1/kernel: -1.52366e-10
dense_1/bias: -1.69407e-21
conv2d/bias: -1.62961e-12
conv2d/kernel: -1.5158e-10
conv2d_1/bias: -1.45879e-12
dense_1/kernel: -3.33731e-19
63
Losses:
0.0108654
-10.6846
Gradients:
dense/kernel: 212.009
conv2d_2/kernel: 34.5929
conv2d_2/bias: 0.32048
dense_2/kernel: -132.726
dense_2/bias: -0.20513
conv2d_1/kernel: 36.2696
conv2d/kernel: 46.6351
conv2d/bias: 0.442705
dense/bias: 0.210765
conv2d_1/bias: 0.317486
dense/kernel: -1.55009e-10
conv2d_2/kernel: -1.06224e-10
conv2d_2/bias: -9.49137e-13
dense/bias: -1.54086e-13
conv2d_1/kernel: -1.3368e-10
dense_1/bias: 1.69407e-21
conv2d/bi

77
Losses:
0.00180733
-10.6846
Gradients:
dense/kernel: 77.05
conv2d_2/kernel: 11.8066
conv2d_2/bias: 0.109313
dense_2/kernel: -50.2201
dense_2/bias: -0.0764149
conv2d_1/kernel: 12.4065
conv2d/kernel: 16.2707
conv2d/bias: 0.152207
dense/bias: 0.076013
conv2d_1/bias: 0.10692
dense/kernel: -7.08535e-11
conv2d_2/kernel: -4.8676e-11
conv2d_2/bias: -4.32871e-13
dense/bias: -6.98859e-14
conv2d_1/kernel: -6.13137e-11
dense_1/bias: 0.0
conv2d/bias: -6.53065e-13
conv2d/kernel: -6.07285e-11
conv2d_1/bias: -5.84723e-13
dense_1/kernel: -1.55007e-19
78
Losses:
0.000352058
-10.6846
Gradients:
dense/kernel: -4.07013
conv2d_2/kernel: -0.593576
conv2d_2/bias: -0.00550827
dense_2/kernel: 2.65596
dense_2/bias: 0.00402765
conv2d_1/kernel: -0.626861
conv2d/kernel: -0.834636
conv2d/bias: -0.00782127
dense/bias: -0.00400222
conv2d_1/bias: -0.00542407
dense/kernel: -6.97957e-11
conv2d_2/kernel: -4.7958e-11
conv2d_2/bias: -4.26501e-13
dense/bias: -6.88386e-14
conv2d_1/kernel: -6.04103e-11
dense_1/bias: -4.2351

92
Losses:
0.00102136
-10.6846
Gradients:
dense/kernel: -52.2893
conv2d_2/kernel: -7.40697
conv2d_2/bias: -0.0688054
dense_2/kernel: 34.2164
dense_2/bias: 0.0518889
conv2d_1/kernel: -7.71316
conv2d/kernel: -10.3443
conv2d/bias: -0.0957128
dense/bias: -0.0515158
conv2d_1/bias: -0.0657843
dense/kernel: -6.07111e-11
conv2d_2/kernel: -4.17731e-11
conv2d_2/bias: -3.71355e-13
dense/bias: -5.9825e-14
conv2d_1/kernel: -5.26347e-11
dense_1/bias: 4.23516e-22
conv2d/bias: -5.5992e-13
conv2d/kernel: -5.20676e-11
conv2d_1/bias: -5.01757e-13
dense_1/kernel: -1.16679e-19
93
Losses:
0.000420557
-10.6846
Gradients:
dense/kernel: -17.1802
conv2d_2/kernel: -2.48285
conv2d_2/bias: -0.0230409
dense_2/kernel: 11.2378
dense_2/bias: 0.0170318
conv2d_1/kernel: -2.60647
conv2d/kernel: -3.47023
conv2d/bias: -0.032267
dense/bias: -0.0169167
conv2d_1/bias: -0.0223433
dense/kernel: -6.03099e-11
conv2d_2/kernel: -4.1499e-11
conv2d_2/bias: -3.68898e-13
dense/bias: -5.94259e-14
conv2d_1/kernel: -5.22883e-11
dense_1/bi

## Action probability selection test

In [None]:
# NOTE(review): a_softmax, graph_s, graph_action and nlp are not defined at
# top level anywhere in the visible notebook (they are locals of
# create_network), so this cell presumably fails on a fresh kernel -- confirm.
#
# Feeds one random state replicated N_ACTIONS times with actions
# 0..N_ACTIONS-1 and checks that nlp for action i matches -log(softmax[i]).
sess.run(tf.global_variables_initializer())
s = np.random.random((1, 84, 84, 4))
s = np.vstack([s] * N_ACTIONS)
expected = -1 * np.log(sess.run(a_softmax, feed_dict={graph_s: s})[0])
actual = sess.run(nlp, feed_dict={graph_s: s, graph_action: np.arange(N_ACTIONS)})
np.testing.assert_allclose(expected, actual)

## Basic policy network training test

In [None]:
# NOTE(review): a_softmax, graph_s, graph_action, graph_advantage and the
# train op `t` are not defined at top level in the visible notebook -- this
# cell presumably belongs to an earlier version; confirm before running.
#
# Prints the action probabilities for one replicated random state before and
# after three training steps, first with advantage +1 for action 0 and then,
# after re-initialising the variables, with advantage -1.
sess.run(tf.global_variables_initializer())
n = 5
s = np.random.random((1, 84, 84, 4))
s = np.vstack([s] * n)

# encourage action 0
print(sess.run(a_softmax, feed_dict={graph_s: s})[0])
a = [0] * n
ad = [1] * n
for i in range(3):
    sess.run(t, feed_dict={graph_s: s, graph_action: a, graph_advantage: ad})
    print(sess.run(a_softmax, feed_dict={graph_s: s})[0])

# now the probability of action[0] will be very close to 1,
# so the gradients will be too small to work
# so reset
sess.run(tf.global_variables_initializer())
print()

# discourage action 0
print(sess.run(a_softmax, feed_dict={graph_s: s})[0])
a = [0] * n
ad = [-1] * n
for i in range(3):
    sess.run(t, feed_dict={graph_s: s, graph_action: a, graph_advantage: ad})
    print(sess.run(a_softmax, feed_dict={graph_s: s})[0])

In [None]:
# NOTE(review): graph_lr, a_softmax and graph_s are not defined at top level
# in the visible notebook; `s` comes from an earlier cell.
# Re-initialises the variables, sets the learning-rate variable to 1e-3 and
# prints the initial action probabilities.
sess.run(tf.global_variables_initializer())
sess.run(tf.assign(graph_lr, 1e-3))
print(sess.run(a_softmax, feed_dict={graph_s: s})[0])

In [None]:
# NOTE(review): t1, graph_s, graph_action, graph_advantage and a_softmax are
# not defined at top level in the visible notebook.
# One training step with advantage -1 for action 0, then print the action
# probabilities. A single action/advantage is fed, so `s` presumably has to
# hold a single state here -- confirm.
sess.run(t1, feed_dict={graph_s: s, graph_action: [0], graph_advantage: [-1]})
print(sess.run(a_softmax, feed_dict={graph_s: s})[0])

## Basic value network training test

In [None]:
# Five random states with target returns 0.2, 0.4, ..., 1.0; shows the value
# output before any training.
# NOTE(review): graph_v and graph_s are not defined at top level in the
# visible notebook.
s = np.random.normal(size=(5, 84, 84, 4))
r = np.array([1, 2, 3, 4, 5]) / 5.0
print(r)
sess.run(tf.global_variables_initializer())
sess.run(graph_v, feed_dict={graph_s: s})

In [None]:
# One training step with op `t2`, then show the loss and the value output.
# Re-run this cell to take further steps.
# NOTE(review): t2, loss2, graph_v, graph_s and graph_r are not defined at
# top level in the visible notebook.
sess.run(t2, feed_dict={graph_s: s, graph_r: r})
sess.run([loss2, graph_v], feed_dict={graph_s: s, graph_r: r})

## Dummy environment

In [None]:
class DummyEnv:
    """Tiny gym-like environment with fixed random observations.

    An episode lasts N_STATES steps. A handful of action sequences are drawn
    at construction time; a step returns reward 1 while the actions taken so
    far are still a prefix of at least one of those sequences, and 0
    otherwise. Every rewarded prefix has its own fixed random observation;
    any other history yields a shared default observation.
    """
    # NOTE(review): Discrete(2) does not match N_ACTIONS or the action ids
    # used elsewhere in the notebook -- confirm which is intended.
    action_space = gym.spaces.discrete.Discrete(2)
    N_STATES = 3
    N_REWARDED_SEQUENCES = int(N_ACTIONS ** N_STATES / 10)

    def __init__(self):
        """Draw the rewarded sequences and the observation for each prefix."""
        # Integer action ids drawn around N_ACTIONS / 2, clipped to [0, N_ACTIONS].
        raw_sequences = np.random.normal(
            loc=int(N_ACTIONS / 2),
            size=(self.N_REWARDED_SEQUENCES, self.N_STATES))
        # Builtin int replaces the np.int alias, which newer NumPy no longer has.
        self.rewarded_sequences = np.clip(raw_sequences.astype(int), 0, N_ACTIONS)
        self.rewarded_sequences = np.unique(self.rewarded_sequences, axis=0)
        self.o = {}
        # look at all unique sequence prefixes
        for i in range(1, self.N_STATES + 1):
            # Use this instance's sequences; the global `env` does not exist
            # yet while the first instance is being built.
            prefixes = np.unique(self.rewarded_sequences[:, :i], axis=0)
            for p in prefixes:
                self.o[tuple(p)] = np.random.normal(size=(84, 84, 4))
        self.default_o = np.random.normal(size=(84, 84, 4))
        self.init_o = np.random.normal(size=(84, 84, 4))
        # Start in a valid state so step() works even before an explicit reset().
        self.reset()

    def reset(self):
        """Start a new episode and return the fixed initial observation."""
        self.i = 0
        self.history = []
        self.possible_sequences = np.copy(self.rewarded_sequences)
        return self.init_o

    def step(self, a):
        """Take action `a`.

        Returns:
            (observation, reward, done, None), unpacked like a gym step result.
        """
        self.history.append(a)

        # Keep only the rewarded sequences that still match the history.
        taken = len(self.history)
        self.possible_sequences = [
            seq for seq in self.possible_sequences
            if np.array_equal(seq[:taken], self.history)]
        reward = 1 if len(self.possible_sequences) > 0 else 0

        o = self.o.get(tuple(self.history), self.default_o)

        # The episode ends on step number N_STATES.
        if self.i != self.N_STATES - 1:
            done = False
            self.i += 1
        else:
            done = True

        return o, reward, done, None

In [None]:
# Environment instance used by the check cells below through the global `env`.
env = DummyEnv()

In [None]:
# For each first action 0..N_ACTIONS-1, print how many rewarded sequences
# are still possible after that single step.
for i in range(N_ACTIONS):
    env.reset()
    env.step(i)
    print(len(env.possible_sequences))

In [None]:
# Initial state same?
# The two printed sums should be identical because reset() returns the same
# stored observation each time.
o = env.reset()
print(np.sum(o))
o = env.reset()
print(np.sum(o))

In [None]:
# Default state the same?
# The sums match only when neither (0,) nor (8,) is a rewarded prefix, so
# that both steps fall back to the default observation.
# NOTE(review): action 8 is outside the clipped range [0, N_ACTIONS], so it
# can never be a rewarded prefix; action 0 can be -- confirm the intent.
env.reset()
o, _, _, _ = env.step(0)
print(np.sum(o))
env.reset()
o, _, _, _ = env.step(8)
print(np.sum(o))

In [None]:
# Always the same state returned for a given sequence?
# Replays the histories (5,) and (5, 5) twice each; the printed sums should
# match within each pair.
# NOTE(review): with N_ACTIONS = 3 action 5 lies outside the clipped range,
# so every observation here is presumably the default one -- confirm.
env.reset()
o, _, _, _ = env.step(5)
print(np.sum(o))
env.reset()
o, _, _, _ = env.step(5)
print(np.sum(o))

env.reset()
env.step(5)
o, _, _, _ = env.step(5)
print(np.sum(o))
env.reset()
env.step(5)
o, _, _, _ = env.step(5)
print(np.sum(o))

In [None]:
def test_seq(seq):
    for a in seq:
        print("Took action", a)
        _, reward, done, _ = env.step(a)
        print("Got reward ", reward)
        print("Done:", done)

# Play the all-zeros sequence, then the first rewarded sequence; the second
# run should print reward 1 at every step.
env.reset()
test_seq([0, 0, 0])
print()
env.reset()
test_seq(env.rewarded_sequences[0])

## Testing proper

### Manual testing

In [None]:
# NOTE(review): stale cell. `t`, graph_s, graph_action, graph_advantage and
# a_softmax are not defined at top level in the visible notebook. Also, in
# the DummyEnv defined above, env.o is a dict keyed by tuples, so env.o[2]
# and feeding env.o itself as a batch presumably target an older DummyEnv.
sess.run(tf.global_variables_initializer())
sess.run(t, feed_dict={graph_s: [env.o[2]], graph_action: [1], graph_advantage: [1]})
sess.run(a_softmax, feed_dict={graph_s: env.o})

### Testing with environment

In [None]:
# State shared with run():
#   rpe   -- exponentially smoothed reward per episode
#   rpe2  -- raw list of non-zero rewards for each episode
#   steps -- number of training updates, used as the summary step
rpe = []
rpe2 = []
sess.run(tf.global_variables_initializer())
steps = 0

In [None]:
def run(n_episodes=1000):
    """Play `n_episodes` episodes on the global `env`, training after each reward.

    Transitions are buffered until a non-zero reward arrives; the buffer is
    then turned into discounted returns, used for one policy and one value
    update, and cleared. Per-episode rewards are appended to the global
    `rpe2`, and their exponentially smoothed sum to the global `rpe`.

    NOTE(review): a_softmax, graph_s, graph_action, graph_r, summaries, t and
    t2 are not defined at top level in the visible notebook, and get_o is
    called with a single argument here while Worker calls
    utils.get_o(env, action) -- confirm which signature is current.
    """
    global steps
    for i in range(n_episodes):
        if i % 100 == 0:
            print(i)
        env.reset()
        o, _, _ = get_o(0) # do nothing
        done = False
        episode_rewards = []
        rewards = []
        actions = []
        states = []
        while not done:
            states.append(o)
            a_p = sess.run(a_softmax, feed_dict={graph_s: [o]})[0]
            a = np.random.choice(ACTIONS, p=a_p)
            actions.append(a)

            o, r, done = get_o(a)
            rewards.append(r)

            if r != 0:
                episode_rewards.append(r)
                returns = rewards_to_returns(rewards, G=0.9)
                actions = np.array(actions) - 1 # map from possible actions (1, 2, 3) -> (0, 1, 2)
                summ, _, _ = sess.run([summaries, t, t2], feed_dict={graph_s: states, graph_action: actions, graph_r: returns})
                summary_writer.add_summary(summ, steps)
                # Start a fresh buffer for the next rewarded segment.
                rewards = []
                actions = []
                states = []
                steps += 1

        rpe2.append(episode_rewards)
        # Emptiness check instead of `len(rpe) is 0`, which compared object
        # identity with an int literal.
        if not rpe:
            rpe.append(np.sum(episode_rewards))
        else:
            rpe.append(rpe[-1] + 0.01 * (np.sum(episode_rewards) - rpe[-1]))

In [None]:
# Play a single episode.
# NOTE(review): RENDER is not read anywhere in the visible notebook;
# presumably a helper in utils consults it -- confirm.
RENDER = True
run(n_episodes=1)

In [None]:
# Rebuild the smoothed reward curve from the raw per-episode rewards using
# the same update as run(): move 1% of the way towards each new episode sum.
# NOTE(review): the loop variable `s` overwrites the global `s` that other
# cells use for state batches.
rpe = [np.sum(rpe2[0])]
for s in rpe2[1:]:
    rpe.append(rpe[-1] + 0.01 * (np.sum(s) - rpe[-1]))

In [None]:
# Plot the smoothed reward per episode.
plt.plot(rpe)

## Old code

In [None]:
# Archived sketch of an n-step update loop, kept for reference.
# NOTE(review): ap_op, s_op, v_op, big_r_op, t1, t2 and g are not defined in
# the visible notebook; `r` is only defined by an earlier test cell while
# `r1` is created here and never used; the 10-element buffers are smaller
# than t_max. Running this cell also overwrites the globals t, s and a.
t = 0
t_max = 100

s = np.zeros(10)
r1 = np.zeros(10)
a = np.zeros(10)
r2_list = []

for i in range(10):
    t_start = t
    s[t] = env.reset()
    done = False

    while True:
        a_p = sess.run(ap_op, feed_dict={s_op: s})
        a[t] = np.random.choice(ACTIONS, p=a_p)
        s[t+1], r[t], done, _ = env.step(a[t])
        t += 1
        # The exit test belongs inside the rollout loop; it used to sit after
        # it, so the `while True` could never terminate.
        if done or t - t_start == t_max:
            break

    if done:
        r2 = 0
    else:
        r2 = sess.run(v_op, feed_dict={s_op: s[t]})

    # NOTE(review): r2 is not updated inside this loop, so each entry uses
    # the same bootstrap value -- confirm whether accumulation was intended.
    for i in range(t-1, t_start-1, -1):
        r2_list.append(r[i] + g * r2)


    sess.run(t1, feed_dict={s_op: s, big_r_op: r})
    sess.run(t2, feed_dict={s_op: s, big_r_op: r})

In [None]:
# Build one 84x84x4 input: four times over, take the same action twice,
# combine the two frames with an element-wise maximum, average over the
# colour axis, resize to 84x84, then stack the four results on a new last
# axis.
# NOTE(review): `observation` is not defined in the visible notebook.
# NOTE(review): binding `os` here shadows the `os` module imported at the top
# of the notebook; the following cell reads this `os`, so rename both
# together.
# NOTE(review): scipy.misc.imresize is not available in recent SciPy releases.
observation.shape
action = env.action_space.sample()
os = []
for i in range(4):
    o1, _, _, _ = env.step(action)
    o2, _, _, _ = env.step(action)
    o = np.maximum(o1, o2)
    o = np.mean(o, axis=2)
    o = scipy.misc.imresize(o, (84, 84))
    os.append(o)
os = np.stack(os, axis=-1)

In [None]:
# Add a leading batch axis to the frame stack and evaluate two ops on it.
# NOTE(review): `os` here is the array from the previous cell, not the os
# module. `q` and `qi` are not defined in the visible notebook, and `s` is
# used as a feed key although other cells bind it to NumPy arrays.
os = np.resize(os, (1, 84, 84, 4))
print(os.shape)
print(sess.run(q['network1'], feed_dict={s: os}))
print(sess.run(qi['network1'], feed_dict={s: os}))