In [40]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time


# Base name of this experiment; the evaluation cell at the end of the notebook
# appends it to the working directory to build the checkpoint path it restores.
name = "actor-critic-base"

In [42]:
def huber_loss(x):
    """Element-wise Huber loss with the quadratic/linear switch at |x| = 1.

    Yields 0.5 * x**2 where |x| < 1 and |x| - 0.5 everywhere else.
    """
    within_unit_band = tf.abs(x) < 1.0
    quadratic_part = 0.5 * tf.square(x)
    linear_part = tf.abs(x) - 0.5
    return tf.where(within_unit_band, quadratic_part, linear_part)

def log_shift_loss(x, alfa=0.1):
    """Negative log of `x` after blending it towards 1 by a factor `alfa`.

    The log argument is x + alfa * (1 - x): it equals `alfa` at x = 0 and 1 at
    x = 1, so the result is -log(alfa) at x = 0 instead of diverging, and 0 at
    x = 1.
    """
    shifted = x + alfa * (1.0 - x)
    return -tf.log(shifted)

class Inputs(object):
    """Holds every placeholder that the networks and the training loop feed.

    Each placeholder is created inside the variable scope `scope` and exposed
    as an attribute of the instance.
    """

    def __init__(self, n_states, scope):
        # (attribute, dtype, shape, placeholder name), listed in creation order.
        placeholder_specs = (
            # Scalar logged once per episode.
            ('episode_length', tf.int64, [], 'episode_length'),
            # Batch of transitions: states, actions, rewards, next-state
            # values and done flags (the latter fed as floats).
            ('s', tf.float32, [None, n_states], 's'),
            ('a', tf.int32, [None], 'a'),
            ('r', tf.float32, [None], 'r'),
            ('v1', tf.float32, [None], 'V1'),
            ('done', tf.float32, [None], 'done'),
            # Scalar hyper-parameters supplied on every run.
            ('learning_rate', tf.float32, [], 'learning_rate'),
            ('keep_prob', tf.float32, [], 'keep_prob'),
            ('training', tf.bool, [], 'training'),
            # NOTE(review): no visible code in this notebook feeds or reads `pi`.
            ('pi', tf.float32, [], 'pi'),
        )

        with tf.variable_scope(scope):
            for attribute, dtype, shape, tensor_name in placeholder_specs:
                setattr(self, attribute, tf.placeholder(dtype, shape, name=tensor_name))
            

class Critic(object):
    """State-value estimator trained on a one-step bootstrapped target.

    Inside the variable scope `scope` this builds a 64-unit ReLU layer over
    `inputs.s`, the value output `V`, the regression `target`, the `error`
    (target - V), its `loss`, an Adam `update` op and TensorBoard `summaries`.

    Args:
        base_model: owning model; not used by this constructor.
        inputs: an `Inputs` instance supplying the placeholders.
        n_actions: number of discrete actions (width of the output layer and
            of the one-hot action histogram).
        n_states: state dimensionality; not used by this constructor.
        y: discount factor applied to the fed next-state value `inputs.v1`.
        scope: name of the variable scope that owns this network.
    """

    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope) as var_scope:

            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)

            # NOTE(review): the head has n_actions units but only column 0 is
            # used as the value. A single unit would suffice, but changing the
            # shape would make existing checkpoints unrestorable, so it is kept.
            self.V = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]

            # soft_if comes from tfinterface; presumably it selects `r` for
            # terminal transitions and `r + y * v1` otherwise -- TODO confirm.
            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            # NOTE(review): tf.nn.l2_loss already reduces to the scalar
            # sum(error ** 2) / 2, so the trailing reduce_mean does not average
            # over the batch. Left as is to keep the gradient scale unchanged.
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            # Filter on the scope's full name plus "/" so the lookup still works
            # when this network is built inside an outer variable scope, and so
            # a sibling scope sharing the same prefix is not picked up.
            # Collected before the optimizer is created, hence without any
            # optimizer-owned variables.
            self.variables = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope.name + "/"
            )

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            # tf.nn.moments returns (mean, variance); take the square root so
            # the 'std_error' summary really is a standard deviation.
            avg_error, var_error = tf.nn.moments(self.error, [0])
            std_error = tf.sqrt(var_error)
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), variable)
                for i, variable in enumerate(self.variables)
            ])
            
class Actor(object):
    """Softmax policy network trained with a critic-weighted log-probability loss.

    Inside the variable scope `scope` this builds a 64-unit ReLU layer with
    dropout over `inputs.s`, the action distribution `P`, the probability `Pa`
    of the fed actions, the `loss`, an Adam `update` op and TensorBoard
    `summaries`.

    Args:
        base_model: owning model; not used by this constructor.
        inputs: an `Inputs` instance supplying the placeholders.
        target_critic: critic whose `error` tensor weights the policy loss.
        n_actions: number of discrete actions (width of the softmax output).
        n_states: state dimensionality; not used by this constructor.
        y: discount factor; not used by this constructor.
        scope: name of the variable scope that owns this network.
    """

    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope) as var_scope:
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)

            self.P = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)

            # select_columns comes from tfinterface; presumably it picks, for
            # each row, the probability of the action in `inputs.a` -- TODO confirm.
            self.Pa = select_columns(self.P, inputs.a)

            # Shifted negative log-probability of the taken action, weighted by
            # the critic's error. Only this network's variables are optimized
            # below, so the critic is not updated through this loss.
            self.loss = log_shift_loss(self.Pa) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            # Filter on the scope's full name plus "/" so the lookup still works
            # when this network is built inside an outer variable scope, and so
            # a sibling scope sharing the same prefix is not picked up.
            # Collected before the optimizer is created, hence without any
            # optimizer-owned variables.
            self.variables = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope.name + "/"
            )

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), variable)
                for i, variable in enumerate(self.variables)
            ])
            


In [43]:
class LunarLander(ModelBase):
    """Actor-critic agent with a replay buffer and a slowly tracking target critic.

    The attributes `graph`, `sess`, `writer`, `global_step`, `model_path` and
    the method `save` are used here but not defined in this notebook; they are
    presumably provided by `ModelBase` -- TODO confirm.
    """

    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        """Create the replay buffer and build the TensorFlow graph.

        Args:
            n_actions: number of discrete actions.
            n_states: dimensionality of the state vector fed to the networks.
            y: discount factor handed to the critics and the actor.
            buffer_length: maximum number of transitions kept in the buffer.
            pi: fraction of the (critic - target) difference added to every
                target-critic variable each time `update_target` runs.
        """
        self.global_max = float('-inf')

        # 5 is presumably the number of fields per stored transition
        # (s, a, r, s1, done) -- TODO confirm against ExperienceReplay.
        self.replay_buffer = ExperienceReplay(5, max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            # One training step updates the online critic and the actor; the
            # target critic only changes through `update_target` below.
            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            # Soft update: target <- target + pi * (critic - target).
            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])


    def predict_feed(self, S):
        """Return the feed dict for inference on the batch of states `S`.

        Dropout is disabled (keep_prob 1.0) and `training` is fed as False.
        """
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }

    def predict(self, state, e = 0.0):
        """Choose an action for a single `state`.

        With probability `e` a uniformly random action is returned; otherwise
        the action is sampled from the actor's output distribution.
        """
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)

    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        """Return the feed dict for one training step on a batch of transitions."""
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }


    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        """Interact with `env` and train on replayed batches after every step.

        Args:
            env: environment exposing `reset()` and `step(action)`.
            keep_prob: dropout keep probability fed during training steps.
            e: probability of a uniformly random action; either a number or a
                callable taking the global step.
            learning_rate: optimizer learning rate; either a number or a
                callable taking the global step.
            print_step: report the average reward and save every this many
                episodes.
            update_target_step: run the target-critic update whenever the
                global step is a multiple of this value.
            episodes: number of episodes to run.
            max_episode_length: maximum number of steps taken per episode.
            batch_size: number of transitions sampled from the replay buffer
                for each training step.
        """
        r_total = 0.
        # Episodes accumulated in r_total since the last report, so the average
        # is exact even for the first window (episodes 0..print_step inclusive).
        episodes_in_window = 0

        # Defined up front so the periodic report cannot hit an undefined name
        # when no step has been taken yet (e.g. max_episode_length == 0).
        fit_feed = None
        _learning_rate = None
        _e = None

        for episode in range(episodes):
            done = False
            s = env.reset()
            episode_length = 0
            ep_reward = 0.

            # Strict '<' so that at most max_episode_length steps are taken.
            while not done and episode_length < max_episode_length:
                self.global_step += 1
                episode_length += 1


                # Schedules may be given as callables of the global step.
                _learning_rate = learning_rate(self.global_step) if callable(learning_rate) else learning_rate
                _e = e(self.global_step) if callable(e) else e


                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r


                self.replay_buffer.append(s, a, r, s1, float(done))


                # The next-state values come from the target critic and are fed
                # back in as constants through the `v1` placeholder.
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)


                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)


                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)


                s = s1

            episodes_in_window += 1

            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            # Keep a separate checkpoint for every new best episode reward.
            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / episodes_in_window
                # Loss on the most recent training batch; NaN if no step ran.
                if fit_feed is not None:
                    actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                else:
                    actor_loss = float('nan')
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.
                episodes_in_window = 0


In [44]:
# Identifier of this run (from tfinterface's get_run); it keys the monitor,
# checkpoint and log directories below.
run = get_run()

# Monitor records the episodes; ExpandedStateEnv(env, 3) presumably stacks 3
# observations into one state, which is why n_states is multiplied by 3
# below -- TODO confirm.
env = ExpandedStateEnv(
    wrappers.Monitor(gym.make("LunarLander-v2"), "monitor/{run}".format(run = run)),
    3
)
n_actions = env.action_space.n
n_states = 3 * env.observation_space.shape[0]

# Checkpoints and TensorBoard logs live under the current working directory.
model_path =  "{path}/models/{run}".format(run = run, path = os.getcwd())
logs_path = "{path}/logs/{run}".format(run = run, path = os.getcwd())


# Fresh model (restore = False) with a near-undiscounted return and a slow
# target-critic update rate.
model = LunarLander(
    n_actions,
    n_states,
    restore = False,
    model_path = model_path,
    logs_path = logs_path,
    buffer_length=500000,
    y=0.9999,
    pi = 0.005
)

[2017-03-22 15:12:15,341] Making new env: LunarLander-v2
[2017-03-22 15:12:15,344] Creating monitor directory monitor/7


False


In [None]:
# Only used by the alternative learning-rate schedule kept in the comment below.
k = 40000.

# Exploration rate: linear from 0.4 at step 0 to 0.05 at step 300000, and 0.05
# for any step outside that range (fill_value with bounds_error disabled).
exploration_schedule = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False)

model.fit(
    env,
    keep_prob = 0.5,
    e = exploration_schedule,
    learning_rate = 0.01, # lambda t: 0.05 * k / (k + t)
    batch_size=32,
    episodes=100000,
    max_episode_length=10000,
    print_step=10,
    # Run the target-critic soft update after every environment step.
    update_target_step = 1
)

[2017-03-22 15:12:16,471] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000000.mp4
[2017-03-22 15:12:20,274] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000001.mp4


[MAX] Episode: 0, Length: 109, Reward: -318.429336077, buffer_len: 109
[MAX] Episode: 4, Length: 177, Reward: -216.753287611, buffer_len: 771
[MAX] Episode: 5, Length: 100, Reward: -190.671057424, buffer_len: 871


[2017-03-22 15:12:23,842] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000008.mp4


[MAX] Episode: 8, Length: 1000, Reward: 76.0189345362, buffer_len: 2148
[NOR] Episode: 10, Length: 1000, Avg Reward: -232.798901981, e: 0.396213, Learning Rate: 0.01, buffer_len: 3247
Loss: -1.47408568859
[NOR] Episode: 20, Length: 206, Avg Reward: -148.964016264, e: 0.3915965, Learning Rate: 0.01, buffer_len: 7204
Loss: 0.354218035936


[2017-03-22 15:12:56,504] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000027.mp4


[NOR] Episode: 30, Length: 1000, Avg Reward: -117.087402762, e: 0.3858355, Learning Rate: 0.01, buffer_len: 12142
Loss: -2.96539258957
[NOR] Episode: 40, Length: 108, Avg Reward: -64.1520784142, e: 0.379908833333, Learning Rate: 0.01, buffer_len: 17222
Loss: -4.14856290817
[NOR] Episode: 50, Length: 1000, Avg Reward: -90.685861405, e: 0.374478, Learning Rate: 0.01, buffer_len: 21877
Loss: -2.08427762985
[NOR] Episode: 60, Length: 1000, Avg Reward: -54.4080828296, e: 0.366923833333, Learning Rate: 0.01, buffer_len: 28352
Loss: -1.95832943916


[2017-03-22 15:14:07,974] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000064.mp4


[NOR] Episode: 70, Length: 567, Avg Reward: -103.496134523, e: 0.357620833333, Learning Rate: 0.01, buffer_len: 36326
Loss: -1.71053934097
[NOR] Episode: 80, Length: 1000, Avg Reward: -64.2729695028, e: 0.347690166667, Learning Rate: 0.01, buffer_len: 44838
Loss: -2.20174002647
[MAX] Episode: 83, Length: 1000, Reward: 133.438890577, buffer_len: 47072
[NOR] Episode: 90, Length: 986, Avg Reward: -109.357696541, e: 0.337399, Learning Rate: 0.01, buffer_len: 53659
Loss: -1.58219742775
[NOR] Episode: 100, Length: 1000, Avg Reward: -82.8978474786, e: 0.32748, Learning Rate: 0.01, buffer_len: 62161
Loss: -2.0480029583
[NOR] Episode: 110, Length: 720, Avg Reward: -87.1380772214, e: 0.317111833333, Learning Rate: 0.01, buffer_len: 71048
Loss: -1.94791269302
[MAX] Episode: 116, Length: 797, Reward: 145.268310061, buffer_len: 76223
[NOR] Episode: 120, Length: 571, Avg Reward: -21.9145216567, e: 0.306908166667, Learning Rate: 0.01, buffer_len: 79794
Loss: -0.897647678852


[2017-03-22 15:17:15,799] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000125.mp4


[NOR] Episode: 130, Length: 1000, Avg Reward: -40.5721590257, e: 0.295634666667, Learning Rate: 0.01, buffer_len: 89457
Loss: -0.368763923645
[NOR] Episode: 140, Length: 1000, Avg Reward: -13.0639492027, e: 0.283968, Learning Rate: 0.01, buffer_len: 99457
Loss: -0.0119026750326
[NOR] Episode: 150, Length: 1000, Avg Reward: 9.74369956057, e: 0.272429666667, Learning Rate: 0.01, buffer_len: 109347
Loss: 0.0777480006218
[NOR] Episode: 160, Length: 814, Avg Reward: -13.5181620202, e: 0.263451, Learning Rate: 0.01, buffer_len: 117043
Loss: -0.711898505688
[NOR] Episode: 170, Length: 1000, Avg Reward: 39.0539369102, e: 0.252217166667, Learning Rate: 0.01, buffer_len: 126672
Loss: -0.419445723295
[NOR] Episode: 180, Length: 1000, Avg Reward: -13.1604950326, e: 0.241542166667, Learning Rate: 0.01, buffer_len: 135822
Loss: -2.09382724762
[NOR] Episode: 190, Length: 746, Avg Reward: -125.965594051, e: 0.233800166667, Learning Rate: 0.01, buffer_len: 142458
Loss: -1.66712462902
[NOR] Episode: 200

[2017-03-22 15:21:32,963] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000216.mp4


[NOR] Episode: 220, Length: 1000, Avg Reward: -173.463043827, e: 0.217360666667, Learning Rate: 0.01, buffer_len: 156549
Loss: -0.529508709908
[NOR] Episode: 230, Length: 324, Avg Reward: -137.258533299, e: 0.213205, Learning Rate: 0.01, buffer_len: 160111
Loss: -1.46555745602
[NOR] Episode: 240, Length: 1000, Avg Reward: 17.5666416485, e: 0.201663166667, Learning Rate: 0.01, buffer_len: 170004
Loss: -1.59270620346
[MAX] Episode: 248, Length: 845, Reward: 162.794422787, buffer_len: 177480
[NOR] Episode: 250, Length: 894, Avg Reward: 60.9229818556, e: 0.1907315, Learning Rate: 0.01, buffer_len: 179374
Loss: -0.202610164881
[MAX] Episode: 257, Length: 951, Reward: 169.172103244, buffer_len: 185626
[NOR] Episode: 260, Length: 987, Avg Reward: 127.671032673, e: 0.180222166667, Learning Rate: 0.01, buffer_len: 188382
Loss: -0.856965184212
[NOR] Episode: 270, Length: 1000, Avg Reward: 44.5520708448, e: 0.169226333333, Learning Rate: 0.01, buffer_len: 197807
Loss: -0.903644561768
[NOR] Episod

[2017-03-22 15:26:53,968] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000343.mp4


[NOR] Episode: 350, Length: 559, Avg Reward: 194.501994733, e: 0.109630666667, Learning Rate: 0.01, buffer_len: 248889
Loss: -0.175923198462
[NOR] Episode: 360, Length: 591, Avg Reward: 185.109710196, e: 0.103286333333, Learning Rate: 0.01, buffer_len: 254327
Loss: -0.92608755827
[NOR] Episode: 370, Length: 539, Avg Reward: 140.991074319, e: 0.0964263333333, Learning Rate: 0.01, buffer_len: 260207
Loss: -1.10566604137
[NOR] Episode: 380, Length: 592, Avg Reward: 68.2815435668, e: 0.0900936666667, Learning Rate: 0.01, buffer_len: 265635
Loss: 3.11441707611
[NOR] Episode: 390, Length: 511, Avg Reward: 58.0360370808, e: 0.083635, Learning Rate: 0.01, buffer_len: 271171
Loss: -1.58483552933
[NOR] Episode: 400, Length: 459, Avg Reward: 106.748304779, e: 0.0776663333333, Learning Rate: 0.01, buffer_len: 276287
Loss: -1.49990367889
[NOR] Episode: 410, Length: 456, Avg Reward: 108.181435172, e: 0.0725831666667, Learning Rate: 0.01, buffer_len: 280644
Loss: 4.1914396286
[NOR] Episode: 420, Leng

[2017-03-22 15:31:01,322] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000512.mp4


[MAX] Episode: 511, Length: 373, Reward: 253.561745797, buffer_len: 317701
[NOR] Episode: 520, Length: 343, Avg Reward: 142.824874493, e: 0.05, Learning Rate: 0.01, buffer_len: 321380
Loss: 0.422359526157
[NOR] Episode: 530, Length: 460, Avg Reward: 182.351584194, e: 0.05, Learning Rate: 0.01, buffer_len: 326350
Loss: -0.370447099209
[NOR] Episode: 540, Length: 460, Avg Reward: 167.625931962, e: 0.05, Learning Rate: 0.01, buffer_len: 330816
Loss: -1.01438212395
[NOR] Episode: 550, Length: 394, Avg Reward: 181.437120115, e: 0.05, Learning Rate: 0.01, buffer_len: 335666
Loss: -2.12517642975
[NOR] Episode: 560, Length: 361, Avg Reward: 191.897795193, e: 0.05, Learning Rate: 0.01, buffer_len: 339703
Loss: -1.30273115635
[NOR] Episode: 570, Length: 605, Avg Reward: 181.608282816, e: 0.05, Learning Rate: 0.01, buffer_len: 344636
Loss: -0.948386609554
[NOR] Episode: 580, Length: 783, Avg Reward: 111.896373702, e: 0.05, Learning Rate: 0.01, buffer_len: 351557
Loss: 0.0853780210018
[NOR] Episod

[2017-03-22 15:38:32,637] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video000729.mp4


[NOR] Episode: 730, Length: 555, Avg Reward: 161.426295429, e: 0.05, Learning Rate: 0.01, buffer_len: 450515
Loss: -0.237186670303
[NOR] Episode: 740, Length: 783, Avg Reward: 152.330989936, e: 0.05, Learning Rate: 0.01, buffer_len: 456963
Loss: -2.06156086922
[NOR] Episode: 750, Length: 904, Avg Reward: 37.3427565808, e: 0.05, Learning Rate: 0.01, buffer_len: 464657
Loss: -2.73502039909
[NOR] Episode: 760, Length: 1000, Avg Reward: -144.363292696, e: 0.05, Learning Rate: 0.01, buffer_len: 473875
Loss: 0.782838702202
[NOR] Episode: 770, Length: 1000, Avg Reward: -84.9819201673, e: 0.05, Learning Rate: 0.01, buffer_len: 483474
Loss: 1.16471982002
[NOR] Episode: 780, Length: 1000, Avg Reward: -69.1119216475, e: 0.05, Learning Rate: 0.01, buffer_len: 491523
Loss: -1.04193735123
[NOR] Episode: 790, Length: 1000, Avg Reward: -159.156890078, e: 0.05, Learning Rate: 0.01, buffer_len: 499752
Loss: -0.559261023998
[NOR] Episode: 800, Length: 391, Avg Reward: -150.465009987, e: 0.05, Learning Ra

[2017-03-22 15:54:58,653] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video001000.mp4


[NOR] Episode: 1000, Length: 266, Avg Reward: 143.190275908, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.617822527885
[MAX] Episode: 1004, Length: 245, Reward: 257.522494348, buffer_len: 500000
[NOR] Episode: 1010, Length: 714, Avg Reward: 163.446232046, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.3885178566
[NOR] Episode: 1020, Length: 1000, Avg Reward: 146.041013098, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.567297339439
[NOR] Episode: 1030, Length: 408, Avg Reward: 194.895014696, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.416999101639
[NOR] Episode: 1040, Length: 356, Avg Reward: 193.280877004, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.778429687023
[NOR] Episode: 1050, Length: 579, Avg Reward: 166.29930155, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.901616692543
[NOR] Episode: 1060, Length: 316, Avg Reward: 166.963150891, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.23896956444
[NO

[2017-03-22 16:14:50,053] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video002000.mp4


[NOR] Episode: 2000, Length: 300, Avg Reward: 230.183577443, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.192189276218
[NOR] Episode: 2010, Length: 330, Avg Reward: 228.289770428, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.432618290186
[NOR] Episode: 2020, Length: 306, Avg Reward: 204.984662087, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.18077266216
[NOR] Episode: 2030, Length: 289, Avg Reward: 211.848256091, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.418677479029
[NOR] Episode: 2040, Length: 255, Avg Reward: 219.293199463, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.06175768375
[NOR] Episode: 2050, Length: 241, Avg Reward: 200.815657979, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.255984544754
[NOR] Episode: 2060, Length: 315, Avg Reward: 201.486674205, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.75194478035
[NOR] Episode: 2070, Length: 320, Avg Reward: 164.183953937, e: 0.05, Learning Ra

[2017-03-22 16:40:28,560] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video003000.mp4


[NOR] Episode: 3000, Length: 310, Avg Reward: 53.0720940026, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.95336699486
[NOR] Episode: 3010, Length: 1000, Avg Reward: 38.3216798409, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.185500323772
[NOR] Episode: 3020, Length: 394, Avg Reward: 33.7205699425, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.8150562048
[NOR] Episode: 3030, Length: 645, Avg Reward: 64.147372986, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.886025846004
[NOR] Episode: 3040, Length: 282, Avg Reward: -43.8851980431, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.72516644001
[NOR] Episode: 3050, Length: 574, Avg Reward: 54.2311153287, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0152168869972
[NOR] Episode: 3060, Length: 525, Avg Reward: 63.4877249238, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.02010464668
[NOR] Episode: 3070, Length: 732, Avg Reward: 85.7999526864, e: 0.05, Learning Rate

[2017-03-22 17:09:23,197] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video004000.mp4


[NOR] Episode: 4000, Length: 360, Avg Reward: 190.759433923, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.67170095444
[NOR] Episode: 4010, Length: 352, Avg Reward: 214.130036044, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -3.52731513977
[NOR] Episode: 4020, Length: 265, Avg Reward: 223.393083975, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.90722250938
[NOR] Episode: 4030, Length: 209, Avg Reward: 175.647997137, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -7.16994380951
[NOR] Episode: 4040, Length: 313, Avg Reward: 210.945277612, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.147361889482
[NOR] Episode: 4050, Length: 272, Avg Reward: 227.014246075, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.36188364029
[NOR] Episode: 4060, Length: 347, Avg Reward: 226.262504382, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.33333659172
[NOR] Episode: 4070, Length: 348, Avg Reward: 202.788979434, e: 0.05, Learning Rate

[2017-03-22 17:39:22,848] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video005000.mp4


[NOR] Episode: 5000, Length: 338, Avg Reward: 68.4973127746, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.96251916885
[NOR] Episode: 5010, Length: 443, Avg Reward: -3.08570734343, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.47281098366
[NOR] Episode: 5020, Length: 507, Avg Reward: 18.8133085654, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.586030244827
[NOR] Episode: 5030, Length: 338, Avg Reward: -28.706196188, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.71179032326
[NOR] Episode: 5040, Length: 955, Avg Reward: 18.9494191399, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.93115854263
[NOR] Episode: 5050, Length: 910, Avg Reward: -151.885562131, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.617282509804
[NOR] Episode: 5060, Length: 709, Avg Reward: -254.514655186, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.20165538788
[NOR] Episode: 5070, Length: 518, Avg Reward: -294.675207416, e: 0.05, Learnin

[2017-03-22 18:11:26,994] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video006000.mp4


[NOR] Episode: 6000, Length: 356, Avg Reward: 163.253858747, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.121156938374
[NOR] Episode: 6010, Length: 779, Avg Reward: 166.721411405, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.46970653534
[NOR] Episode: 6020, Length: 612, Avg Reward: 160.904827696, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.680952012539
[NOR] Episode: 6030, Length: 649, Avg Reward: 63.649142126, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.09636831284
[NOR] Episode: 6040, Length: 1000, Avg Reward: 80.8720473089, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.33526682854
[NOR] Episode: 6050, Length: 916, Avg Reward: 127.943612326, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.14234089851
[NOR] Episode: 6060, Length: 719, Avg Reward: 106.16471152, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.03257608414
[NOR] Episode: 6070, Length: 1000, Avg Reward: -42.9373936299, e: 0.05, Learning Rate

[2017-03-22 18:48:57,793] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video007000.mp4


[NOR] Episode: 7000, Length: 136, Avg Reward: -38.5197143772, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5.56186485291
[NOR] Episode: 7010, Length: 100, Avg Reward: -62.1346302559, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.86078429222
[NOR] Episode: 7020, Length: 138, Avg Reward: -57.1282662654, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.17297291756
[NOR] Episode: 7030, Length: 684, Avg Reward: -76.6906029278, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -6.05068778992
[NOR] Episode: 7040, Length: 95, Avg Reward: -49.6346982769, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -5.04913377762
[NOR] Episode: 7050, Length: 106, Avg Reward: -102.657341885, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.0972244441509
[NOR] Episode: 7060, Length: 99, Avg Reward: -100.958811372, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -5.00362014771
[NOR] Episode: 7070, Length: 115, Avg Reward: -125.543605912, e: 0.05, Learning

[2017-03-22 18:56:06,849] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/7/openaigym.video.9.937.video008000.mp4


[NOR] Episode: 8000, Length: 165, Avg Reward: -25.2481756866, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.641100525856
[NOR] Episode: 8010, Length: 532, Avg Reward: -1.74041249118, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.39502716064
[NOR] Episode: 8020, Length: 543, Avg Reward: 37.6396880134, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.34110212326
[NOR] Episode: 8030, Length: 816, Avg Reward: -80.3668358782, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -3.37681078911
[NOR] Episode: 8040, Length: 1000, Avg Reward: -179.533008986, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.982254981995
[NOR] Episode: 8050, Length: 1000, Avg Reward: -173.585373943, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -4.50900316238
[NOR] Episode: 8060, Length: 362, Avg Reward: -131.736192664, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.61961531639
[NOR] Episode: 8070, Length: 244, Avg Reward: -85.4132801276, e: 0.05, Lear

In [6]:

# Evaluation: restore a saved model and watch it play without random exploration.
env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/{name}".format(path = os.getcwd(), name = name)
# NOTE(review): logs_path is computed but not passed to the model below.
logs_path = "{path}/logs/".format(path = os.getcwd())


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

try:
    for i in range(100):
        s = env.reset()
        done = False
        total = 0.
        ep = 0
        # Cap every rendered episode at 700 steps.
        while not done and ep < 700:
            ep += 1
            # e = 0.0: never take the uniformly random action.
            a = model_run.predict(s, 0.0)
            s, r, done, info = env.step(a)
            total += r
            env.render()
            time.sleep(0.01)

        print(total)
finally:
    # Close the render window even if an episode raises or the cell is
    # interrupted; previously it was only closed after all episodes finished.
    env.render(close=True)


[2017-03-17 18:20:33,289] Making new env: LunarLander-v2


False
212.63485096
222.209029125
137.585769854
180.875673014
206.109792056
244.175226865
219.223174017
226.254220443
189.530083255
180.200875147
226.533856294
215.507410864
257.758179742


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type