In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time


name = "actor-critic-base"

In [22]:
def huber_loss(x):
  return tf.where(
      tf.abs(x) < 1.0,
      0.5 * tf.square(x), 
      tf.abs(x) - 0.5
  )

def log_shift_loss(x, alfa=0.1):
    return - tf.log(x + alfa * (1.0 - x))

class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.bool, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)        
            
            self.V = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]

            self.target = tf.where(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, huber_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)

            self.P = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)

            self.Pa = select_columns(self.P, inputs.a)

            self.loss = log_shift_loss(self.Pa) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


In [23]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')

        self.replay_buffer = ExperienceReplay(5, max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                self.replay_buffer.append(s, a, r, s1, done)
                
                
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [24]:
run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{run}".format(run = run))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{run}".format(path = os.getcwd(), run = run)
logs_path = "{path}/logs/{run}".format(path = os.getcwd(), run = run)


model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.005
)

[2017-03-22 13:07:18,851] Making new env: LunarLander-v2
[2017-03-22 13:07:18,854] Creating monitor directory monitor/6


False


In [None]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.01, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1
)

[2017-03-22 13:07:19,710] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000000.mp4
[2017-03-22 13:07:21,137] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000001.mp4


[MAX] Episode: 0, Length: 108, Reward: -196.308543204, buffer_len: 108
[MAX] Episode: 3, Length: 75, Reward: -171.05515843, buffer_len: 434


[2017-03-22 13:07:24,021] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000008.mp4


[NOR] Episode: 10, Length: 100, Avg Reward: -491.007652458, e: 0.3986735, Learning Rate: 0.01, buffer_len: 1138
Loss: -6.82157850266
[MAX] Episode: 14, Length: 71, Reward: -165.841762608, buffer_len: 1541
[NOR] Episode: 20, Length: 301, Avg Reward: -379.311416035, e: 0.397149833333, Learning Rate: 0.01, buffer_len: 2444
Loss: -3.08399653435
[MAX] Episode: 22, Length: 110, Reward: -162.204765591, buffer_len: 2696


[2017-03-22 13:07:31,000] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000027.mp4


[NOR] Episode: 30, Length: 144, Avg Reward: -287.528036212, e: 0.39559, Learning Rate: 0.01, buffer_len: 3781
Loss: -5.04049777985
[MAX] Episode: 39, Length: 225, Reward: -108.638214541, buffer_len: 5025
[MAX] Episode: 40, Length: 164, Reward: -97.2635942363, buffer_len: 5189
[NOR] Episode: 40, Length: 164, Avg Reward: -372.356807576, e: 0.393947333333, Learning Rate: 0.01, buffer_len: 5189
Loss: -4.96359682083
[MAX] Episode: 41, Length: 187, Reward: -69.0735479099, buffer_len: 5376
[MAX] Episode: 44, Length: 148, Reward: -55.9345753492, buffer_len: 5777
[NOR] Episode: 50, Length: 154, Avg Reward: -145.618273628, e: 0.391626833333, Learning Rate: 0.01, buffer_len: 7178
Loss: -6.851749897
[MAX] Episode: 51, Length: 1000, Reward: 35.25862875, buffer_len: 8178
[NOR] Episode: 60, Length: 105, Avg Reward: -68.0893705985, e: 0.383643333333, Learning Rate: 0.01, buffer_len: 14021
Loss: -4.6923494339


[2017-03-22 13:08:12,429] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000064.mp4


[NOR] Episode: 70, Length: 957, Avg Reward: -100.132055667, e: 0.377647833333, Learning Rate: 0.01, buffer_len: 19160
Loss: -28.3281593323
[NOR] Episode: 80, Length: 582, Avg Reward: -159.666887909, e: 0.37298, Learning Rate: 0.01, buffer_len: 23161
Loss: -12.7679824829
[NOR] Episode: 90, Length: 271, Avg Reward: -76.4724177923, e: 0.370754, Learning Rate: 0.01, buffer_len: 25069
Loss: -9.28185081482
[NOR] Episode: 100, Length: 188, Avg Reward: -64.3392857376, e: 0.365779333333, Learning Rate: 0.01, buffer_len: 29333
Loss: -37.8681640625
[NOR] Episode: 110, Length: 100, Avg Reward: -90.0110278166, e: 0.361768333333, Learning Rate: 0.01, buffer_len: 32771
Loss: -4.2365732193
[NOR] Episode: 120, Length: 77, Avg Reward: -99.0231593152, e: 0.355506833333, Learning Rate: 0.01, buffer_len: 38138
Loss: 1.12094211578


[2017-03-22 13:09:27,300] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000125.mp4


[NOR] Episode: 130, Length: 55, Avg Reward: -87.2693248896, e: 0.354427666667, Learning Rate: 0.01, buffer_len: 39063
Loss: -58.9785995483
[NOR] Episode: 140, Length: 102, Avg Reward: -114.271074276, e: 0.3532715, Learning Rate: 0.01, buffer_len: 40054
Loss: -15.6449642181
[NOR] Episode: 150, Length: 101, Avg Reward: -116.581148093, e: 0.352099, Learning Rate: 0.01, buffer_len: 41059
Loss: -26.7576007843
[NOR] Episode: 160, Length: 100, Avg Reward: -93.0344345455, e: 0.351050166667, Learning Rate: 0.01, buffer_len: 41958
Loss: 12.1305427551
[NOR] Episode: 170, Length: 94, Avg Reward: -83.4374821888, e: 0.349840333333, Learning Rate: 0.01, buffer_len: 42995
Loss: -35.2024841309
[NOR] Episode: 180, Length: 89, Avg Reward: -95.0029986405, e: 0.348674833333, Learning Rate: 0.01, buffer_len: 43994
Loss: 2.19860076904
[NOR] Episode: 190, Length: 118, Avg Reward: -121.024087106, e: 0.347504666667, Learning Rate: 0.01, buffer_len: 44997
Loss: -28.507144928
[NOR] Episode: 200, Length: 68, Avg R

[2017-03-22 13:09:48,769] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000216.mp4


[NOR] Episode: 220, Length: 95, Avg Reward: -109.536464873, e: 0.343906666667, Learning Rate: 0.01, buffer_len: 48081
Loss: -10.1137199402
[NOR] Episode: 230, Length: 92, Avg Reward: -167.225044019, e: 0.342875333333, Learning Rate: 0.01, buffer_len: 48965
Loss: -80.3557357788
[NOR] Episode: 240, Length: 66, Avg Reward: -196.052700114, e: 0.342094833333, Learning Rate: 0.01, buffer_len: 49634
Loss: -23.5874385834
[NOR] Episode: 250, Length: 56, Avg Reward: -170.788452075, e: 0.341208166667, Learning Rate: 0.01, buffer_len: 50394
Loss: 15.7550849915
[NOR] Episode: 260, Length: 110, Avg Reward: -179.521303801, e: 0.3402795, Learning Rate: 0.01, buffer_len: 51190
Loss: -17.8323135376
[NOR] Episode: 270, Length: 68, Avg Reward: -209.050362444, e: 0.339413833333, Learning Rate: 0.01, buffer_len: 51932
Loss: -9.69290542603
[NOR] Episode: 280, Length: 90, Avg Reward: -216.157026175, e: 0.338456, Learning Rate: 0.01, buffer_len: 52753
Loss: 1508.15283203
[NOR] Episode: 290, Length: 60, Avg Rew

[2017-03-22 13:10:13,931] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000343.mp4


[NOR] Episode: 350, Length: 69, Avg Reward: -222.917041467, e: 0.332129166667, Learning Rate: 0.01, buffer_len: 58176
Loss: 25.941116333
[NOR] Episode: 360, Length: 51, Avg Reward: -203.442354139, e: 0.3312705, Learning Rate: 0.01, buffer_len: 58912
Loss: -237.503387451
[NOR] Episode: 370, Length: 59, Avg Reward: -215.328445026, e: 0.330446833333, Learning Rate: 0.01, buffer_len: 59618
Loss: -100.968048096
[NOR] Episode: 380, Length: 62, Avg Reward: -250.887169596, e: 0.329505333333, Learning Rate: 0.01, buffer_len: 60425
Loss: -66.4619216919
[NOR] Episode: 390, Length: 67, Avg Reward: -261.649650852, e: 0.328707333333, Learning Rate: 0.01, buffer_len: 61109
Loss: -207.020202637
[NOR] Episode: 400, Length: 62, Avg Reward: -229.138201505, e: 0.327877833333, Learning Rate: 0.01, buffer_len: 61820
Loss: -48.8520927429
[NOR] Episode: 410, Length: 86, Avg Reward: -209.239463958, e: 0.3269515, Learning Rate: 0.01, buffer_len: 62614
Loss: 299.426269531
[NOR] Episode: 420, Length: 76, Avg Rewa

[2017-03-22 13:10:42,756] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000512.mp4


[NOR] Episode: 510, Length: 59, Avg Reward: -342.791219692, e: 0.318108166667, Learning Rate: 0.01, buffer_len: 70194
Loss: -48.6509170532
[NOR] Episode: 520, Length: 66, Avg Reward: -352.103374261, e: 0.317324166667, Learning Rate: 0.01, buffer_len: 70866
Loss: -160.255050659
[NOR] Episode: 530, Length: 69, Avg Reward: -398.999279352, e: 0.316470166667, Learning Rate: 0.01, buffer_len: 71598
Loss: 579.366210938
[NOR] Episode: 540, Length: 83, Avg Reward: -377.660286018, e: 0.315692, Learning Rate: 0.01, buffer_len: 72265
Loss: -1076.52294922
[NOR] Episode: 550, Length: 86, Avg Reward: -415.99544185, e: 0.314867166667, Learning Rate: 0.01, buffer_len: 72972
Loss: -64.7070159912
[NOR] Episode: 560, Length: 57, Avg Reward: -464.578748144, e: 0.3140365, Learning Rate: 0.01, buffer_len: 73684
Loss: -1148.11547852
[NOR] Episode: 570, Length: 85, Avg Reward: -505.904513601, e: 0.313219833333, Learning Rate: 0.01, buffer_len: 74384
Loss: -10.0903244019
[NOR] Episode: 580, Length: 55, Avg Rewa

[2017-03-22 13:11:20,941] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video000729.mp4


[NOR] Episode: 730, Length: 69, Avg Reward: -263.982357752, e: 0.299520833333, Learning Rate: 0.01, buffer_len: 86126
Loss: -63.8265037537
[NOR] Episode: 740, Length: 76, Avg Reward: -282.192682087, e: 0.298596833333, Learning Rate: 0.01, buffer_len: 86918
Loss: -34.7510910034
[NOR] Episode: 750, Length: 81, Avg Reward: -258.096122088, e: 0.297695, Learning Rate: 0.01, buffer_len: 87691
Loss: -41.7135238647
[NOR] Episode: 760, Length: 82, Avg Reward: -173.82750323, e: 0.296769833333, Learning Rate: 0.01, buffer_len: 88484
Loss: -124.118644714
[NOR] Episode: 770, Length: 68, Avg Reward: -205.742279886, e: 0.295889, Learning Rate: 0.01, buffer_len: 89239
Loss: 496.949554443
[NOR] Episode: 780, Length: 85, Avg Reward: -208.27215563, e: 0.294967333333, Learning Rate: 0.01, buffer_len: 90029
Loss: 51.1315803528
[NOR] Episode: 790, Length: 85, Avg Reward: -180.725131879, e: 0.294126166667, Learning Rate: 0.01, buffer_len: 90750
Loss: -81.8230743408
[NOR] Episode: 800, Length: 104, Avg Reward

[2017-03-22 13:12:07,535] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video001000.mp4


[NOR] Episode: 1000, Length: 72, Avg Reward: -161.44619205, e: 0.276009, Learning Rate: 0.01, buffer_len: 106279
Loss: -35.7788085938
[NOR] Episode: 1010, Length: 56, Avg Reward: -169.162630608, e: 0.275017333333, Learning Rate: 0.01, buffer_len: 107129
Loss: -547.008117676
[NOR] Episode: 1020, Length: 53, Avg Reward: -162.882783088, e: 0.274190166667, Learning Rate: 0.01, buffer_len: 107838
Loss: -218.333786011
[NOR] Episode: 1030, Length: 84, Avg Reward: -167.783694899, e: 0.273357166667, Learning Rate: 0.01, buffer_len: 108552
Loss: -370.313171387
[NOR] Episode: 1040, Length: 53, Avg Reward: -165.320050119, e: 0.272457666667, Learning Rate: 0.01, buffer_len: 109323
Loss: -9.41236877441
[NOR] Episode: 1050, Length: 86, Avg Reward: -174.912140362, e: 0.271548833333, Learning Rate: 0.01, buffer_len: 110102
Loss: -107.216766357
[NOR] Episode: 1060, Length: 84, Avg Reward: -183.246285431, e: 0.270631833333, Learning Rate: 0.01, buffer_len: 110888
Loss: -98.8639984131
[NOR] Episode: 1070,

[2017-03-22 13:14:58,592] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video002000.mp4


[NOR] Episode: 2000, Length: 68, Avg Reward: -528.241058317, e: 0.191793166667, Learning Rate: 0.01, buffer_len: 178464
Loss: -3744.43505859
[NOR] Episode: 2010, Length: 91, Avg Reward: -514.276686288, e: 0.190952, Learning Rate: 0.01, buffer_len: 179185
Loss: -24125.0097656
[NOR] Episode: 2020, Length: 59, Avg Reward: -483.747993253, e: 0.190166833333, Learning Rate: 0.01, buffer_len: 179858
Loss: -20480.5214844
[NOR] Episode: 2030, Length: 67, Avg Reward: -525.922251376, e: 0.189308166667, Learning Rate: 0.01, buffer_len: 180594
Loss: 39822.65625
[NOR] Episode: 2040, Length: 67, Avg Reward: -485.802445153, e: 0.188574333333, Learning Rate: 0.01, buffer_len: 181223
Loss: 22136.7324219
[NOR] Episode: 2050, Length: 84, Avg Reward: -555.60761104, e: 0.187771666667, Learning Rate: 0.01, buffer_len: 181911
Loss: -13757.5
[NOR] Episode: 2060, Length: 77, Avg Reward: -508.139055684, e: 0.1869865, Learning Rate: 0.01, buffer_len: 182584
Loss: -18776.40625
[NOR] Episode: 2070, Length: 58, Avg 

[2017-03-22 13:18:33,519] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video003000.mp4


[NOR] Episode: 3000, Length: 66, Avg Reward: -572.070894454, e: 0.089151, Learning Rate: 0.01, buffer_len: 266443
Loss: 80581.328125
[NOR] Episode: 3010, Length: 66, Avg Reward: -591.14228349, e: 0.0883693333333, Learning Rate: 0.01, buffer_len: 267113
Loss: 250591.171875
[NOR] Episode: 3020, Length: 58, Avg Reward: -611.825893064, e: 0.0875235, Learning Rate: 0.01, buffer_len: 267838
Loss: -102688.882812
[NOR] Episode: 3030, Length: 86, Avg Reward: -597.67791386, e: 0.086736, Learning Rate: 0.01, buffer_len: 268513
Loss: -245645.546875
[NOR] Episode: 3040, Length: 55, Avg Reward: -580.377613276, e: 0.085938, Learning Rate: 0.01, buffer_len: 269197
Loss: 320939.96875
[NOR] Episode: 3050, Length: 68, Avg Reward: -570.476247308, e: 0.085168, Learning Rate: 0.01, buffer_len: 269857
Loss: 1130934.75
[NOR] Episode: 3060, Length: 80, Avg Reward: -573.027185894, e: 0.0844213333333, Learning Rate: 0.01, buffer_len: 270497
Loss: 152539.15625
[NOR] Episode: 3070, Length: 71, Avg Reward: -562.333

[2017-03-22 13:21:15,725] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video004000.mp4


[NOR] Episode: 4000, Length: 60, Avg Reward: -545.021820139, e: 0.05, Learning Rate: 0.01, buffer_len: 335104
Loss: -711583.25
[NOR] Episode: 4010, Length: 67, Avg Reward: -590.08659064, e: 0.05, Learning Rate: 0.01, buffer_len: 335775
Loss: -471449.46875
[NOR] Episode: 4020, Length: 53, Avg Reward: -554.880705275, e: 0.05, Learning Rate: 0.01, buffer_len: 336451
Loss: -341482.5
[NOR] Episode: 4030, Length: 49, Avg Reward: -507.378056611, e: 0.05, Learning Rate: 0.01, buffer_len: 337060
Loss: -791228.4375
[NOR] Episode: 4040, Length: 79, Avg Reward: -710.880392121, e: 0.05, Learning Rate: 0.01, buffer_len: 337826
Loss: -152734.359375
[NOR] Episode: 4050, Length: 58, Avg Reward: -581.467568607, e: 0.05, Learning Rate: 0.01, buffer_len: 338532
Loss: -582746.5
[NOR] Episode: 4060, Length: 56, Avg Reward: -605.971448079, e: 0.05, Learning Rate: 0.01, buffer_len: 339249
Loss: -98726.65625
[NOR] Episode: 4070, Length: 81, Avg Reward: -623.500206087, e: 0.05, Learning Rate: 0.01, buffer_len: 

[2017-03-22 13:24:00,226] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video005000.mp4


[NOR] Episode: 5000, Length: 62, Avg Reward: -636.984997299, e: 0.05, Learning Rate: 0.01, buffer_len: 403118
Loss: -10726646.0
[NOR] Episode: 5010, Length: 81, Avg Reward: -565.094160647, e: 0.05, Learning Rate: 0.01, buffer_len: 403748
Loss: -2310159.5
[NOR] Episode: 5020, Length: 76, Avg Reward: -587.740121738, e: 0.05, Learning Rate: 0.01, buffer_len: 404439
Loss: -3558940.25
[NOR] Episode: 5030, Length: 74, Avg Reward: -687.028617827, e: 0.05, Learning Rate: 0.01, buffer_len: 405170
Loss: -6964301.0
[NOR] Episode: 5040, Length: 58, Avg Reward: -572.858305141, e: 0.05, Learning Rate: 0.01, buffer_len: 405854
Loss: -3481519.25
[NOR] Episode: 5050, Length: 79, Avg Reward: -636.323119963, e: 0.05, Learning Rate: 0.01, buffer_len: 406558
Loss: -18894520.0
[NOR] Episode: 5060, Length: 78, Avg Reward: -691.131284354, e: 0.05, Learning Rate: 0.01, buffer_len: 407277
Loss: -7949091.0
[NOR] Episode: 5070, Length: 75, Avg Reward: -615.369876593, e: 0.05, Learning Rate: 0.01, buffer_len: 4079

[2017-03-22 13:26:46,227] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video006000.mp4


[NOR] Episode: 6000, Length: 70, Avg Reward: -610.788036659, e: 0.05, Learning Rate: 0.01, buffer_len: 471141
Loss: -47475116.0
[NOR] Episode: 6010, Length: 74, Avg Reward: -613.327186594, e: 0.05, Learning Rate: 0.01, buffer_len: 471798
Loss: -10969987.0
[NOR] Episode: 6020, Length: 84, Avg Reward: -606.938769813, e: 0.05, Learning Rate: 0.01, buffer_len: 472491
Loss: -29560090.0
[NOR] Episode: 6030, Length: 55, Avg Reward: -517.452868959, e: 0.05, Learning Rate: 0.01, buffer_len: 473082
Loss: -10856105.0
[NOR] Episode: 6040, Length: 57, Avg Reward: -635.063879683, e: 0.05, Learning Rate: 0.01, buffer_len: 473732
Loss: -11820040.0
[NOR] Episode: 6050, Length: 68, Avg Reward: -601.673747184, e: 0.05, Learning Rate: 0.01, buffer_len: 474415
Loss: -3273534.0
[NOR] Episode: 6060, Length: 62, Avg Reward: -646.958303655, e: 0.05, Learning Rate: 0.01, buffer_len: 475080
Loss: -11395856.0
[NOR] Episode: 6070, Length: 59, Avg Reward: -595.101949841, e: 0.05, Learning Rate: 0.01, buffer_len: 47

[2017-03-22 13:29:59,800] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video007000.mp4


[NOR] Episode: 7000, Length: 63, Avg Reward: -682.603551927, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -4620871.0
[NOR] Episode: 7010, Length: 62, Avg Reward: -655.699999172, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 757824.25
[NOR] Episode: 7020, Length: 62, Avg Reward: -703.458896357, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -12602346.0
[NOR] Episode: 7030, Length: 57, Avg Reward: -537.384509882, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -11056428.0
[NOR] Episode: 7040, Length: 77, Avg Reward: -635.272245546, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -10688305.0
[NOR] Episode: 7050, Length: 69, Avg Reward: -607.979694956, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -14241268.0
[NOR] Episode: 7060, Length: 51, Avg Reward: -624.947308418, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5863436.5
[NOR] Episode: 7070, Length: 52, Avg Reward: -524.00521493, e: 0.05, Learning Rate: 0.01, buffer_len: 500000


[2017-03-22 13:33:32,159] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video008000.mp4


[NOR] Episode: 8000, Length: 76, Avg Reward: -584.86753721, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -20374960.0
[NOR] Episode: 8010, Length: 55, Avg Reward: -585.140082658, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2925648.5
[NOR] Episode: 8020, Length: 55, Avg Reward: -570.607793353, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -6638801.5
[NOR] Episode: 8030, Length: 60, Avg Reward: -559.878288456, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -16124996.0
[NOR] Episode: 8040, Length: 73, Avg Reward: -616.636807954, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -5231338.0
[NOR] Episode: 8050, Length: 64, Avg Reward: -685.148016522, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -55194000.0
[NOR] Episode: 8060, Length: 70, Avg Reward: -658.72514325, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -10460678.0
[NOR] Episode: 8070, Length: 80, Avg Reward: -632.478221972, e: 0.05, Learning Rate: 0.01, buffer_len: 500000

[2017-03-22 13:37:02,686] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video009000.mp4


[NOR] Episode: 9000, Length: 82, Avg Reward: -639.658777398, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -23368010.0
[NOR] Episode: 9010, Length: 81, Avg Reward: -559.872786598, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -26900110.0
[NOR] Episode: 9020, Length: 74, Avg Reward: -600.282423023, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -24460768.0
[NOR] Episode: 9030, Length: 64, Avg Reward: -603.216525038, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -22727516.0
[NOR] Episode: 9040, Length: 83, Avg Reward: -608.944525415, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -52110296.0
[NOR] Episode: 9050, Length: 59, Avg Reward: -592.960676809, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -21970202.0
[NOR] Episode: 9060, Length: 64, Avg Reward: -657.570586591, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -42396224.0
[NOR] Episode: 9070, Length: 60, Avg Reward: -514.010684156, e: 0.05, Learning Rate: 0.01, buffer_len: 5

[2017-03-22 13:40:32,260] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video010000.mp4


[NOR] Episode: 10000, Length: 50, Avg Reward: -516.928127651, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -94573296.0
[NOR] Episode: 10010, Length: 83, Avg Reward: -623.406001165, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -29031582.0
[NOR] Episode: 10020, Length: 52, Avg Reward: -565.384295833, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -11589307.0
[NOR] Episode: 10030, Length: 50, Avg Reward: -523.962718918, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -36356392.0
[NOR] Episode: 10040, Length: 79, Avg Reward: -655.264223426, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -56709344.0
[NOR] Episode: 10050, Length: 66, Avg Reward: -590.27281686, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -58698764.0
[NOR] Episode: 10060, Length: 82, Avg Reward: -604.823475411, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -47848680.0
[NOR] Episode: 10070, Length: 77, Avg Reward: -688.755488093, e: 0.05, Learning Rate: 0.01, buffer

[2017-03-22 13:44:03,531] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video011000.mp4


[NOR] Episode: 11000, Length: 76, Avg Reward: -631.969642959, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2659612.5
[NOR] Episode: 11010, Length: 62, Avg Reward: -580.66242186, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5179799.0
[NOR] Episode: 11020, Length: 53, Avg Reward: -511.138782458, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7184337.5
[NOR] Episode: 11030, Length: 67, Avg Reward: -655.002935477, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -9487497.0
[NOR] Episode: 11040, Length: 58, Avg Reward: -693.589205958, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7089736.0
[NOR] Episode: 11050, Length: 50, Avg Reward: -610.565674764, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 11060, Length: 55, Avg Reward: -631.954413097, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5888384.5
[NOR] Episode: 11070, Length: 80, Avg Reward: -645.947775028, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 

[2017-03-22 13:47:38,806] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video012000.mp4


[NOR] Episode: 12000, Length: 81, Avg Reward: -642.564063651, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -4405103.0
[NOR] Episode: 12010, Length: 87, Avg Reward: -592.788090911, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1544509.625
[NOR] Episode: 12020, Length: 82, Avg Reward: -737.348933445, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 12030, Length: 54, Avg Reward: -561.448854354, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3233253.25
[NOR] Episode: 12040, Length: 78, Avg Reward: -641.689580519, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5276300.0
[NOR] Episode: 12050, Length: 81, Avg Reward: -638.260309774, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4240961.0
[NOR] Episode: 12060, Length: 60, Avg Reward: -662.794446164, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9679976.0
[NOR] Episode: 12070, Length: 72, Avg Reward: -573.387472176, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Lo

[2017-03-22 13:51:12,405] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video013000.mp4


[NOR] Episode: 13000, Length: 78, Avg Reward: -742.235399761, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -176304.34375
[NOR] Episode: 13010, Length: 56, Avg Reward: -608.571868258, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 13020, Length: 80, Avg Reward: -706.39113199, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5230561.5
[NOR] Episode: 13030, Length: 54, Avg Reward: -617.671007385, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3950347.25
[NOR] Episode: 13040, Length: 74, Avg Reward: -570.847626337, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7012993.0
[NOR] Episode: 13050, Length: 73, Avg Reward: -627.065799064, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8151621.0
[NOR] Episode: 13060, Length: 58, Avg Reward: -585.44352674, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1439698.25
[NOR] Episode: 13070, Length: 55, Avg Reward: -635.479117421, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Lo

[2017-03-22 13:54:44,547] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video014000.mp4


[NOR] Episode: 14000, Length: 53, Avg Reward: -546.393497046, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -326038528.0
[NOR] Episode: 14010, Length: 89, Avg Reward: -720.650770305, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -251007264.0
[NOR] Episode: 14020, Length: 59, Avg Reward: -601.82824916, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6637708.0
[NOR] Episode: 14030, Length: 83, Avg Reward: -629.560778548, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12087006.0
[NOR] Episode: 14040, Length: 65, Avg Reward: -653.571239833, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2686794.5
[NOR] Episode: 14050, Length: 78, Avg Reward: -738.746249495, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 14060, Length: 76, Avg Reward: -541.248691737, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2982709.0
[NOR] Episode: 14070, Length: 78, Avg Reward: -585.478550962, e: 0.05, Learning Rate: 0.01, buffer_len: 500000


[2017-03-22 13:58:16,487] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video015000.mp4


[NOR] Episode: 15000, Length: 61, Avg Reward: -572.56721416, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 10527954.0
[NOR] Episode: 15010, Length: 63, Avg Reward: -578.615913454, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9875014.0
[NOR] Episode: 15020, Length: 64, Avg Reward: -618.870288563, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -96769272.0
[NOR] Episode: 15030, Length: 74, Avg Reward: -631.168165199, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9076910.0
[NOR] Episode: 15040, Length: 70, Avg Reward: -594.611556591, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13736780.0
[NOR] Episode: 15050, Length: 75, Avg Reward: -633.905622819, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12063428.0
[NOR] Episode: 15060, Length: 70, Avg Reward: -715.850183985, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3046661.0
[NOR] Episode: 15070, Length: 51, Avg Reward: -536.876785259, e: 0.05, Learning Rate: 0.01, buffer_len: 500

[2017-03-22 14:01:50,941] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video016000.mp4


[NOR] Episode: 16000, Length: 52, Avg Reward: -741.586883187, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6490158.5
[NOR] Episode: 16010, Length: 69, Avg Reward: -561.500551902, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12126362.0
[NOR] Episode: 16020, Length: 58, Avg Reward: -623.103530517, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2275506.75
[NOR] Episode: 16030, Length: 52, Avg Reward: -569.657348029, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4979773.0
[NOR] Episode: 16040, Length: 61, Avg Reward: -631.193926603, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7859994.0
[NOR] Episode: 16050, Length: 60, Avg Reward: -543.851504361, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6117803.0
[NOR] Episode: 16060, Length: 79, Avg Reward: -617.568516566, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 16070, Length: 52, Avg Reward: -621.929788549, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Los

[2017-03-22 14:05:21,800] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video017000.mp4


[NOR] Episode: 17000, Length: 64, Avg Reward: -667.585906349, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6583846.0
[NOR] Episode: 17010, Length: 79, Avg Reward: -552.199551395, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 10551219.0
[NOR] Episode: 17020, Length: 64, Avg Reward: -587.090798324, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 17030, Length: 64, Avg Reward: -617.119533143, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 10260973.0
[NOR] Episode: 17040, Length: 78, Avg Reward: -637.50584133, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 17688716.0
[NOR] Episode: 17050, Length: 50, Avg Reward: -577.333816545, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 17060, Length: 83, Avg Reward: -583.37095072, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13035551.0
[NOR] Episode: 17070, Length: 84, Avg Reward: -616.954907007, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -295

[2017-03-22 14:08:55,076] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video018000.mp4


[NOR] Episode: 18000, Length: 82, Avg Reward: -676.022354537, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -664881280.0
[NOR] Episode: 18010, Length: 54, Avg Reward: -554.98256218, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 27260656.0
[NOR] Episode: 18020, Length: 62, Avg Reward: -621.213583862, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 18030, Length: 52, Avg Reward: -644.296623094, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4895544.5
[NOR] Episode: 18040, Length: 78, Avg Reward: -646.588283234, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3183904.25
[NOR] Episode: 18050, Length: 73, Avg Reward: -599.087277403, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4885091.0
[NOR] Episode: 18060, Length: 52, Avg Reward: -558.672633527, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -143607120.0
[NOR] Episode: 18070, Length: 57, Avg Reward: -666.434986887, e: 0.05, Learning Rate: 0.01, buffer_len: 500000

[2017-03-22 14:12:29,270] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video019000.mp4


[NOR] Episode: 19000, Length: 54, Avg Reward: -669.308117481, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 14402983.0
[NOR] Episode: 19010, Length: 76, Avg Reward: -548.683352852, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6221198.0
[NOR] Episode: 19020, Length: 85, Avg Reward: -607.243880881, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -597250688.0
[NOR] Episode: 19030, Length: 55, Avg Reward: -502.797785859, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2604518.5
[NOR] Episode: 19040, Length: 81, Avg Reward: -612.645471766, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 19050, Length: 51, Avg Reward: -648.116133772, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -81530720.0
[NOR] Episode: 19060, Length: 55, Avg Reward: -484.088389494, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 17276342.0
[NOR] Episode: 19070, Length: 62, Avg Reward: -635.461841416, e: 0.05, Learning Rate: 0.01, buffer_len: 50000

[2017-03-22 14:16:01,118] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video020000.mp4


[NOR] Episode: 20000, Length: 78, Avg Reward: -611.309587173, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 18012478.0
[NOR] Episode: 20010, Length: 82, Avg Reward: -645.178898624, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4956517.0
[NOR] Episode: 20020, Length: 85, Avg Reward: -627.228021967, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9715730.0
[NOR] Episode: 20030, Length: 58, Avg Reward: -619.703047661, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 20040, Length: 72, Avg Reward: -607.543871414, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 34064040.0
[NOR] Episode: 20050, Length: 52, Avg Reward: -588.47474403, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12045946.0
[NOR] Episode: 20060, Length: 80, Avg Reward: -650.802293449, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 11780320.0
[NOR] Episode: 20070, Length: 81, Avg Reward: -642.43494441, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss

[2017-03-22 14:19:33,708] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video021000.mp4


[NOR] Episode: 21000, Length: 52, Avg Reward: -620.299606562, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 21010, Length: 60, Avg Reward: -587.220007298, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13140945.0
[NOR] Episode: 21020, Length: 55, Avg Reward: -513.899454932, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8012517.0
[NOR] Episode: 21030, Length: 84, Avg Reward: -659.582796104, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 16453094.0
[NOR] Episode: 21040, Length: 84, Avg Reward: -587.291783737, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 21050, Length: 64, Avg Reward: -681.584383065, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 21060, Length: 80, Avg Reward: -660.381921535, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13661588.0
[NOR] Episode: 21070, Length: 69, Avg Reward: -570.032690407, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR]

[2017-03-22 14:23:08,770] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video022000.mp4


[NOR] Episode: 22000, Length: 77, Avg Reward: -642.916039699, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 22010, Length: 81, Avg Reward: -590.558321001, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 29638396.0
[NOR] Episode: 22020, Length: 51, Avg Reward: -564.711378252, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 30132956.0
[NOR] Episode: 22030, Length: 65, Avg Reward: -498.014199609, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 38069504.0
[NOR] Episode: 22040, Length: 60, Avg Reward: -617.435506616, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 15052533.0
[NOR] Episode: 22050, Length: 60, Avg Reward: -629.310685403, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 22060, Length: 79, Avg Reward: -668.722716756, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 24641788.0
[NOR] Episode: 22070, Length: 83, Avg Reward: -582.447154866, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6

[2017-03-22 14:26:41,054] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video023000.mp4


[NOR] Episode: 23000, Length: 86, Avg Reward: -581.510709362, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 30593656.0
[NOR] Episode: 23010, Length: 81, Avg Reward: -645.632046005, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 23020, Length: 57, Avg Reward: -542.916204495, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 29157138.0
[NOR] Episode: 23030, Length: 54, Avg Reward: -508.309986715, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 32044100.0
[NOR] Episode: 23040, Length: 52, Avg Reward: -546.150504216, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 39097380.0
[NOR] Episode: 23050, Length: 79, Avg Reward: -576.615768521, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13283006.0
[NOR] Episode: 23060, Length: 67, Avg Reward: -633.092820295, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 32191944.0
[NOR] Episode: 23070, Length: 66, Avg Reward: -641.830027114, e: 0.05, Learning Rate: 0.01, buffer_len: 500000


[2017-03-22 14:30:14,960] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video024000.mp4


[NOR] Episode: 24000, Length: 66, Avg Reward: -640.450643924, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 14843532.0
[NOR] Episode: 24010, Length: 85, Avg Reward: -685.878559681, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 24020, Length: 61, Avg Reward: -570.524244427, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13669140.0
[NOR] Episode: 24030, Length: 55, Avg Reward: -657.732826727, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7790529.5
[NOR] Episode: 24040, Length: 55, Avg Reward: -575.040651451, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 25564092.0
[NOR] Episode: 24050, Length: 77, Avg Reward: -659.117505193, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 30061336.0
[NOR] Episode: 24060, Length: 69, Avg Reward: -618.331850063, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 24070, Length: 77, Avg Reward: -620.345161345, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 48

[2017-03-22 14:33:49,729] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/6/openaigym.video.6.937.video025000.mp4


[NOR] Episode: 25000, Length: 62, Avg Reward: -624.645140291, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 33600536.0
[NOR] Episode: 25010, Length: 81, Avg Reward: -648.979815622, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 36512848.0
[NOR] Episode: 25020, Length: 82, Avg Reward: -650.871924767, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 25030, Length: 58, Avg Reward: -584.577132715, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 40495584.0
[NOR] Episode: 25040, Length: 63, Avg Reward: -598.968435381, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.0
[NOR] Episode: 25050, Length: 53, Avg Reward: -529.045973151, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 20023648.0
[NOR] Episode: 25060, Length: 57, Avg Reward: -634.029734374, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7603210.0
[NOR] Episode: 25070, Length: 63, Avg Reward: -596.373828501, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 14

In [6]:

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd(), name = name)


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    
    print(total)
    
env.render(close=True)


[2017-03-17 18:20:33,289] Making new env: LunarLander-v2


False
212.63485096
222.209029125
137.585769854
180.875673014
206.109792056
244.175226865
219.223174017
226.254220443
189.530083255
180.200875147
226.533856294
215.507410864
257.758179742


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type