In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local packages are picked up.
%load_ext autoreload
%autoreload 2

In [6]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time


# Label for this experiment.
# NOTE(review): no code visible in this notebook reads `name` - confirm before removing.
name = "actor-critic-reward-task"

In [60]:
class Inputs(object):
    """Holder for every placeholder that gets fed into the graph.

    The placeholders are created inside ``tf.variable_scope(scope)`` and are
    exposed as attributes so the other model components can share them.
    """

    def __init__(self, n_states, scope):
        """Create the placeholders.

        n_states: size of the second axis of the state placeholder ``s``.
        scope: name of the variable scope the placeholders are created under.
        """
        def scalar(dtype, name):
            # Rank-0 placeholder (shape []).
            return tf.placeholder(dtype, [], name=name)

        def per_sample(dtype, name):
            # Rank-1 placeholder whose length is left unspecified (shape [None]).
            return tf.placeholder(dtype, [None], name=name)

        with tf.variable_scope(scope):
            self.episode_length = scalar(tf.int64, 'episode_length')

            # Per-sample batch inputs.
            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = per_sample(tf.int32, 'a')
            self.r = per_sample(tf.float32, 'r')
            self.v1 = per_sample(tf.float32, 'V1')
            self.done = per_sample(tf.float32, 'done')

            # Scalar controls fed alongside each batch.
            self.learning_rate = scalar(tf.float32, 'learning_rate')
            self.keep_prob = scalar(tf.float32, 'keep_prob')
            self.training = scalar(tf.bool, 'training')

            self.pi = scalar(tf.float32, 'pi')
            

class Critic(object):
    """Value network with an auxiliary reward-prediction head.

    Built under ``tf.variable_scope(scope)``. Exposes:
      * ``V``         - column 0 of a dense head on top of a 64-unit ReLU layer,
      * ``r_loss``    - loss of the reward head against ``inputs.r``,
      * ``target``    - ``soft_if(done, r, r + y * v1)``,
      * ``error``     - ``target - V``,
      * ``loss``      - value loss + ``reward_loss_proportion * r_loss``,
      * ``variables`` - global variables created under this scope,
      * ``update``    - Adam step on ``loss`` restricted to ``variables``,
      * ``summaries`` - merged TensorBoard summaries.
    """

    def __init__(self, base_model, inputs, n_actions, n_states, y, reward_loss_proportion, scope):
        """Build the critic sub-graph.

        base_model: accepted for signature compatibility; not used here.
        inputs: object exposing the placeholders ``s``, ``a``, ``r``, ``v1``,
            ``done`` and ``learning_rate``.
        n_actions: width of the ``V`` head and depth of the action one-hot
            used in the ``avg_action`` histogram.
        n_states: accepted for signature compatibility; not used here.
        y: factor multiplying ``inputs.v1`` in the bootstrapped target.
        reward_loss_proportion: weight of the reward-prediction loss.
        scope: variable scope name for everything created here.
        """
        with tf.variable_scope(scope) as var_scope:

            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)

            # The head has n_actions units although only column 0 is read as the
            # value; the width is left unchanged so variable shapes stay
            # compatible with checkpoints written by earlier runs.
            self.V = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]

            # Auxiliary head predicting the immediate reward from the shared layer.
            r_net = net
            r = tf.layers.dense(r_net, 1, name='r', **ops)[:, 0]

            # tf.nn.l2_loss already reduces to the scalar sum(x ** 2) / 2, so the
            # reduce_mean is an identity and the loss grows with the batch size.
            self.r_loss = tf.reduce_mean(tf.nn.l2_loss(r - inputs.r))

            # soft_if comes from tfinterface.utils (not shown here); presumably it
            # picks the first branch where done is 1 and the second where it is 0.
            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean) + reward_loss_proportion * self.r_loss

            # Filter on the full name of the scope that was actually entered, with
            # a trailing slash: get_collection treats `scope` as a regex prefix, so
            # the bare string would miss variables when this scope is nested and
            # would also match sibling scopes that merely share the prefix.
            # Collected before the optimizer exists, so Adam's slots are excluded.
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope.name + "/")

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            # tf.nn.moments returns (mean, variance); take the square root so the
            # 'std_error' summary really is a standard deviation.
            avg_error, var_error = tf.nn.moments(self.error, [0])
            std_error = tf.sqrt(var_error)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), v) for i, v in enumerate(self.variables)
            ])
            
class Actor(object):
    """Softmax policy network with an auxiliary reward-prediction head.

    Built under ``tf.variable_scope(scope)``. Exposes:
      * ``P``           - softmax output of width ``n_actions``,
      * ``Pa``          - ``select_columns(P, inputs.a)``,
      * ``r_loss``      - loss of the reward head against ``inputs.r``,
      * ``action_loss`` - ``-log(clip(Pa)) * target_critic.error`` per sample,
      * ``loss``        - mean of ``action_loss + reward_loss_proportion * r_loss``,
      * ``variables``   - global variables created under this scope,
      * ``update``      - Adam step on ``loss`` restricted to ``variables``,
      * ``summaries``   - merged TensorBoard summaries.
    """

    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, reward_loss_proportion, scope):
        """Build the actor sub-graph.

        base_model: accepted for signature compatibility; not used here.
        inputs: object exposing the placeholders ``s``, ``a``, ``r``,
            ``keep_prob`` and ``learning_rate``.
        target_critic: object whose ``error`` tensor weights the log-probability
            of the taken action.
        n_actions: number of outputs of the policy head.
        n_states: accepted for signature compatibility; not used here.
        y: accepted for signature compatibility; not used here.
        reward_loss_proportion: weight of the reward-prediction loss.
        scope: variable scope name for everything created here.
        """
        with tf.variable_scope(scope) as var_scope:
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 128, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)

            self.P = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
            # select_columns comes from tfinterface.utils (not shown here);
            # presumably it picks, per row, the probability of the action in inputs.a.
            self.Pa = select_columns(self.P, inputs.a)

            # Auxiliary head predicting the immediate reward from the (dropped-out)
            # hidden layer.
            r_net = net
            r = tf.layers.dense(r_net, 1, name='r', **ops)[:, 0]

            # tf.nn.l2_loss already reduces to the scalar sum(x ** 2) / 2, so the
            # reduce_mean is an identity and the loss grows with the batch size.
            self.r_loss = tf.reduce_mean(tf.nn.l2_loss(r - inputs.r))

            # Clipping keeps the log finite when the chosen action's probability
            # approaches zero.
            self.action_loss = - tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error

            # r_loss is a scalar, so it is broadcast onto every per-sample
            # action loss before the mean is taken.
            self.loss = self.action_loss + reward_loss_proportion * self.r_loss
            self.loss = tf.reduce_mean(self.loss)

            # Filter on the full name of the scope that was actually entered, with
            # a trailing slash: get_collection treats `scope` as a regex prefix, so
            # the bare string would miss variables when this scope is nested and
            # would also match sibling scopes that merely share the prefix.
            # Collected before the optimizer exists, so Adam's slots are excluded.
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=var_scope.name + "/")

            # var_list restricts the step to the actor's own variables.
            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), v) for i, v in enumerate(self.variables)
            ])
            


In [61]:
class LunarLander(ModelBase):
    """Actor-critic agent trained from an experience-replay buffer.

    Relies on attributes that are not defined in this class and are presumably
    provided by ``ModelBase``: ``graph``, ``sess``, ``writer``, ``global_step``,
    ``model_path`` and ``save``.
    """

    def define_model(self, n_actions, n_states, reward_loss_proportion=0.1, y=0.98, buffer_length=50000, pi=0.1):
        """Create the replay buffer and build the graph.

        n_actions: number of discrete actions (width of the policy output).
        n_states: width of the state vector fed to the networks.
        reward_loss_proportion: weight of the auxiliary reward loss, forwarded
            to the critics and the actor.
        y: factor applied to the bootstrap value in the critic target.
        buffer_length: ``max_length`` of the experience-replay buffer.
        pi: step size of the target update ``t += pi * (a - t)``.
        """
        # Best single-episode reward seen so far; drives the "[MAX]" checkpoints.
        self.global_max = float('-inf')

        self.replay_buffer = ExperienceReplay(max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, reward_loss_proportion, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, reward_loss_proportion, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, reward_loss_proportion, "actor")

            # One training step updates the online critic and the actor; the
            # target critic only changes through `update_target`.
            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            # Move every target-critic variable a fraction `pi` of the way
            # towards the matching online-critic variable.
            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])


    def predict_feed(self, S):
        """Return the feed dict for inference on states ``S`` (no dropout)."""
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }

    def predict(self, state, e = 0.0):
        """Pick an action for a single state.

        With probability ``e`` a uniformly random action index is returned;
        otherwise the action is sampled from the actor's output distribution.
        """
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)

    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        """Return the feed dict for one training step on a batch of transitions."""
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }


    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        """Interact with ``env`` and train after every environment step.

        env: environment exposing ``reset()`` and ``step(action)`` returning
            ``(state, reward, done, info)``.
        keep_prob: dropout keep probability fed during training steps.
        e: exploration rate, either a number or a callable of the global step.
        learning_rate: learning rate, either a number or a callable of the
            global step.
        print_step: report and checkpoint every ``print_step`` episodes.
        update_target_step: run the target update whenever the global step is
            a multiple of this value.
        episodes: number of episodes to run.
        max_episode_length: maximum number of steps taken in one episode.
        batch_size: number of transitions sampled from the replay buffer per
            training step.
        """
        reward_since_report = 0.
        episodes_since_report = 0

        # Defined up front so the periodic report cannot hit an undefined name
        # when no environment step has been taken yet.
        fit_feed = None
        _learning_rate = None
        _e = None

        for episode in range(episodes):
            done = False
            s = env.reset()
            episode_length = 0
            ep_reward = 0.

            # Strict comparison: at most `max_episode_length` steps per episode.
            while not done and episode_length < max_episode_length:
                self.global_step += 1
                episode_length += 1

                # Both hyper-parameters may be schedules over the global step.
                _learning_rate = learning_rate(self.global_step) if callable(learning_rate) else learning_rate
                _e = e(self.global_step) if callable(e) else e

                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                reward_since_report += r
                ep_reward += r

                self.replay_buffer.append((s, a, r, s1, float(done)))

                # Train on a random batch; the bootstrap value of the next
                # states comes from the target critic.
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)

                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)

                s = s1

            episodes_since_report += 1

            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            # Checkpoint under a score-suffixed path whenever the best episode
            # reward is matched or beaten.
            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                # Average over the episodes actually accumulated since the last
                # report (the first window also contains episode 0).
                avg_r = reward_since_report / episodes_since_report
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                if fit_feed is not None:
                    # Actor loss on the most recent training batch.
                    actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                    print("Loss: {}".format(actor_loss))
                self.save()
                reward_since_report = 0.
                episodes_since_report = 0


In [62]:
# Second argument of ExpandedStateEnv; the same factor scales the network
# input width below, so it is defined once.
# NOTE(review): presumably the number of observations the wrapper concatenates
# into one state - confirm against tfinterface.
STATE_EXPANSION = 3

run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{run}".format(run = run))
env = ExpandedStateEnv(env, STATE_EXPANSION)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * STATE_EXPANSION

# Checkpoints and TensorBoard logs go under the current working directory,
# in per-run sub-directories.
base_path = os.getcwd()
model_path =  "{path}/models/{run}".format(path = base_path, run = run)
logs_path = "{path}/logs/{run}".format(path = base_path, run = run)


model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.005,
    reward_loss_proportion = 0.05
)

[2017-03-20 01:16:54,713] Making new env: LunarLander-v2
[2017-03-20 01:16:54,716] Creating monitor directory monitor/21


In [63]:
# Constant used only by the alternative learning-rate schedule noted below.
k = 40000.

# Exploration rate: linear from 0.4 at step 0 to 0.05 at step 300000,
# and 0.05 for any step outside that range.
exploration_schedule = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False)

model.fit(
    env,
    keep_prob=0.5,
    e=exploration_schedule,
    learning_rate=0.01,  # alternative: lambda t: 0.05 * k / (k + t)
    print_step=10,
    update_target_step=1,
    episodes=int(1e5),
    max_episode_length=10000,
    batch_size=32
)

[2017-03-20 01:16:55,743] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000000.mp4
[2017-03-20 01:16:57,072] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000001.mp4


[MAX] Episode: 0, Length: 84, Reward: -465.086305932, buffer_len: 84
[MAX] Episode: 2, Length: 71, Reward: -381.337579648, buffer_len: 227


[2017-03-20 01:17:00,102] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000008.mp4


[NOR] Episode: 10, Length: 71, Avg Reward: -568.647667717, e: 0.398990833333, Learning Rate: 0.01, buffer_len: 866
Loss: 15.0958900452
[MAX] Episode: 16, Length: 62, Reward: -364.095665353, buffer_len: 1322
[MAX] Episode: 18, Length: 56, Reward: -324.584533853, buffer_len: 1464
[NOR] Episode: 20, Length: 53, Avg Reward: -487.224757581, e: 0.398164833333, Learning Rate: 0.01, buffer_len: 1574
Loss: 232.254516602


[2017-03-20 01:17:07,243] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000027.mp4


[NOR] Episode: 30, Length: 82, Avg Reward: -520.282610233, e: 0.397362166667, Learning Rate: 0.01, buffer_len: 2262
Loss: 78.5168762207
[NOR] Episode: 40, Length: 58, Avg Reward: -461.892474617, e: 0.396550166667, Learning Rate: 0.01, buffer_len: 2958
Loss: 38.4052963257
[MAX] Episode: 49, Length: 57, Reward: -232.955786218, buffer_len: 3585
[NOR] Episode: 50, Length: 95, Avg Reward: -485.125928511, e: 0.395707833333, Learning Rate: 0.01, buffer_len: 3680
Loss: 108.508590698
[NOR] Episode: 60, Length: 63, Avg Reward: -526.870545775, e: 0.394821166667, Learning Rate: 0.01, buffer_len: 4440
Loss: 113.263641357


[2017-03-20 01:17:16,452] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000064.mp4


[NOR] Episode: 70, Length: 92, Avg Reward: -549.157585035, e: 0.3939415, Learning Rate: 0.01, buffer_len: 5194
Loss: 3.13351631165
[NOR] Episode: 80, Length: 71, Avg Reward: -535.567064727, e: 0.393072333333, Learning Rate: 0.01, buffer_len: 5939
Loss: -28.2442169189
[NOR] Episode: 90, Length: 92, Avg Reward: -505.477107744, e: 0.391618666667, Learning Rate: 0.01, buffer_len: 7185
Loss: 32.0144271851
[NOR] Episode: 100, Length: 57, Avg Reward: -495.664452294, e: 0.390522, Learning Rate: 0.01, buffer_len: 8125
Loss: -32.346950531
[NOR] Episode: 110, Length: 81, Avg Reward: -563.22241162, e: 0.389661, Learning Rate: 0.01, buffer_len: 8863
Loss: -36.9103050232
[NOR] Episode: 120, Length: 76, Avg Reward: -518.033899858, e: 0.388739333333, Learning Rate: 0.01, buffer_len: 9653
Loss: -30.2687244415


[2017-03-20 01:17:34,830] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000125.mp4


[MAX] Episode: 130, Length: 82, Reward: -102.677411209, buffer_len: 10481
[NOR] Episode: 130, Length: 82, Avg Reward: -485.715502477, e: 0.387773333333, Learning Rate: 0.01, buffer_len: 10481
Loss: -26.5765953064
[NOR] Episode: 140, Length: 82, Avg Reward: -558.200017038, e: 0.386771166667, Learning Rate: 0.01, buffer_len: 11340
Loss: 349.133239746
[NOR] Episode: 150, Length: 83, Avg Reward: -345.684749484, e: 0.385841333333, Learning Rate: 0.01, buffer_len: 12137
Loss: 183.941894531
[MAX] Episode: 159, Length: 65, Reward: -34.9710501268, buffer_len: 12869
[NOR] Episode: 160, Length: 92, Avg Reward: -266.988489895, e: 0.38488, Learning Rate: 0.01, buffer_len: 12961
Loss: -16.1831111908
[NOR] Episode: 170, Length: 131, Avg Reward: -344.447739031, e: 0.383767, Learning Rate: 0.01, buffer_len: 13915
Loss: -30.5140113831
[NOR] Episode: 180, Length: 112, Avg Reward: -394.277760334, e: 0.382616666667, Learning Rate: 0.01, buffer_len: 14901
Loss: -25.5084228516
[NOR] Episode: 190, Length: 78,

[2017-03-20 01:18:07,241] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000216.mp4


[NOR] Episode: 220, Length: 60, Avg Reward: -366.532786685, e: 0.377698, Learning Rate: 0.01, buffer_len: 19117
Loss: 245.767990112
[MAX] Episode: 221, Length: 104, Reward: -1.99339983876, buffer_len: 19221
[NOR] Episode: 230, Length: 74, Avg Reward: -271.041101988, e: 0.3765955, Learning Rate: 0.01, buffer_len: 20062
Loss: 109.348594666
[NOR] Episode: 240, Length: 115, Avg Reward: -292.443406455, e: 0.375385666667, Learning Rate: 0.01, buffer_len: 21099
Loss: 183.239929199
[NOR] Episode: 250, Length: 100, Avg Reward: -230.282844587, e: 0.3743975, Learning Rate: 0.01, buffer_len: 21946
Loss: 440.177032471
[NOR] Episode: 260, Length: 75, Avg Reward: -233.146248467, e: 0.373386, Learning Rate: 0.01, buffer_len: 22813
Loss: 1.23110103607
[NOR] Episode: 270, Length: 133, Avg Reward: -206.480927764, e: 0.372284666667, Learning Rate: 0.01, buffer_len: 23757
Loss: 183.366714478
[NOR] Episode: 280, Length: 103, Avg Reward: -205.241125579, e: 0.371238166667, Learning Rate: 0.01, buffer_len: 246

[2017-03-20 01:18:46,214] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000343.mp4


[NOR] Episode: 350, Length: 110, Avg Reward: -181.224244614, e: 0.3643, Learning Rate: 0.01, buffer_len: 30601
Loss: 13.6606054306
[NOR] Episode: 360, Length: 122, Avg Reward: -138.992321782, e: 0.3632745, Learning Rate: 0.01, buffer_len: 31480
Loss: 27.2608184814
[NOR] Episode: 370, Length: 88, Avg Reward: -151.274239669, e: 0.36227, Learning Rate: 0.01, buffer_len: 32341
Loss: -12.0646209717
[NOR] Episode: 380, Length: 83, Avg Reward: -145.666712812, e: 0.3612375, Learning Rate: 0.01, buffer_len: 33226
Loss: 82.095413208
[NOR] Episode: 390, Length: 104, Avg Reward: -144.645298222, e: 0.360305333333, Learning Rate: 0.01, buffer_len: 34025
Loss: -5.29364299774
[NOR] Episode: 400, Length: 100, Avg Reward: -147.361993644, e: 0.359227333333, Learning Rate: 0.01, buffer_len: 34949
Loss: -0.340466737747
[NOR] Episode: 410, Length: 86, Avg Reward: -155.013129596, e: 0.358225166667, Learning Rate: 0.01, buffer_len: 35808
Loss: 246.041412354
[NOR] Episode: 420, Length: 61, Avg Reward: -147.280

[2017-03-20 01:19:37,739] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000512.mp4


[NOR] Episode: 520, Length: 74, Avg Reward: -139.083742178, e: 0.347211833333, Learning Rate: 0.01, buffer_len: 45248
Loss: -4.19596147537
[NOR] Episode: 530, Length: 82, Avg Reward: -141.28004416, e: 0.346258666667, Learning Rate: 0.01, buffer_len: 46065
Loss: -13.232670784
[NOR] Episode: 540, Length: 65, Avg Reward: -140.537190509, e: 0.3452215, Learning Rate: 0.01, buffer_len: 46954
Loss: -4.53147315979
[NOR] Episode: 550, Length: 84, Avg Reward: -151.566395804, e: 0.344264833333, Learning Rate: 0.01, buffer_len: 47774
Loss: 12.9827108383
[NOR] Episode: 560, Length: 105, Avg Reward: -156.634969816, e: 0.343125, Learning Rate: 0.01, buffer_len: 48751
Loss: 33.3791618347
[NOR] Episode: 570, Length: 80, Avg Reward: -150.043270383, e: 0.3421135, Learning Rate: 0.01, buffer_len: 49618
Loss: 1.03576886654
[NOR] Episode: 580, Length: 84, Avg Reward: -140.424880361, e: 0.3411125, Learning Rate: 0.01, buffer_len: 50476
Loss: -4.42746019363
[MAX] Episode: 584, Length: 75, Reward: -1.022844580

[2017-03-20 01:20:48,371] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video000729.mp4


[NOR] Episode: 730, Length: 108, Avg Reward: -141.077259991, e: 0.324897, Learning Rate: 0.01, buffer_len: 64375
Loss: -41.9917869568
[NOR] Episode: 740, Length: 94, Avg Reward: -145.798781335, e: 0.323835333333, Learning Rate: 0.01, buffer_len: 65285
Loss: 85.9658584595
[NOR] Episode: 750, Length: 116, Avg Reward: -131.065645827, e: 0.321629166667, Learning Rate: 0.01, buffer_len: 67176
Loss: -0.947748541832
[NOR] Episode: 760, Length: 101, Avg Reward: -122.703221643, e: 0.3205535, Learning Rate: 0.01, buffer_len: 68098
Loss: 2.0998404026
[NOR] Episode: 770, Length: 103, Avg Reward: -102.287777208, e: 0.319473166667, Learning Rate: 0.01, buffer_len: 69024
Loss: -4.74233055115
[NOR] Episode: 780, Length: 85, Avg Reward: -137.624951873, e: 0.318437166667, Learning Rate: 0.01, buffer_len: 69912
Loss: 1.06264305115
[NOR] Episode: 790, Length: 103, Avg Reward: -120.897634905, e: 0.317474666667, Learning Rate: 0.01, buffer_len: 70737
Loss: -1.36531257629
[NOR] Episode: 800, Length: 90, Avg 

[2017-03-20 01:22:14,731] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video001000.mp4


[NOR] Episode: 1000, Length: 110, Avg Reward: -141.521088862, e: 0.296867833333, Learning Rate: 0.01, buffer_len: 88400
Loss: 32.9282608032
[NOR] Episode: 1010, Length: 89, Avg Reward: -126.019514508, e: 0.295779333333, Learning Rate: 0.01, buffer_len: 89333
Loss: -0.288255572319
[NOR] Episode: 1020, Length: 126, Avg Reward: -129.392373177, e: 0.294601, Learning Rate: 0.01, buffer_len: 90343
Loss: 4.70752477646
[NOR] Episode: 1030, Length: 121, Avg Reward: -155.868059043, e: 0.293355, Learning Rate: 0.01, buffer_len: 91411
Loss: -16.7612743378
[NOR] Episode: 1040, Length: 90, Avg Reward: -154.468383948, e: 0.292187166667, Learning Rate: 0.01, buffer_len: 92412
Loss: 54.5832824707
[NOR] Episode: 1050, Length: 98, Avg Reward: -116.876803395, e: 0.290970333333, Learning Rate: 0.01, buffer_len: 93455
Loss: 22.5468120575
[NOR] Episode: 1060, Length: 82, Avg Reward: -120.874952371, e: 0.289863166667, Learning Rate: 0.01, buffer_len: 94404
Loss: 398.600708008
[NOR] Episode: 1070, Length: 94, 

[2017-03-20 01:28:48,886] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video002000.mp4


[NOR] Episode: 2000, Length: 197, Avg Reward: -112.836800558, e: 0.15311, Learning Rate: 0.01, buffer_len: 211621
Loss: 0.234939098358
[NOR] Episode: 2010, Length: 159, Avg Reward: -115.722907391, e: 0.151397333333, Learning Rate: 0.01, buffer_len: 213089
Loss: -2.44032526016
[NOR] Episode: 2020, Length: 122, Avg Reward: -170.180717637, e: 0.149838666667, Learning Rate: 0.01, buffer_len: 214425
Loss: 70.0360717773
[NOR] Episode: 2030, Length: 103, Avg Reward: -134.570330067, e: 0.147857666667, Learning Rate: 0.01, buffer_len: 216123
Loss: -8.46421527863
[NOR] Episode: 2040, Length: 176, Avg Reward: -107.569839786, e: 0.146019, Learning Rate: 0.01, buffer_len: 217699
Loss: 22.7128372192
[NOR] Episode: 2050, Length: 191, Avg Reward: -87.3195128075, e: 0.144388, Learning Rate: 0.01, buffer_len: 219097
Loss: 54.0369186401
[NOR] Episode: 2060, Length: 118, Avg Reward: -145.445248933, e: 0.142621666667, Learning Rate: 0.01, buffer_len: 220611
Loss: -13.1034641266
[NOR] Episode: 2070, Length:

[2017-03-20 01:37:09,998] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video003000.mp4


[NOR] Episode: 3000, Length: 179, Avg Reward: -76.514339351, e: 0.05, Learning Rate: 0.01, buffer_len: 372834
Loss: 44.7933883667
[NOR] Episode: 3010, Length: 95, Avg Reward: -94.655375831, e: 0.05, Learning Rate: 0.01, buffer_len: 374549
Loss: 33.2605018616
[NOR] Episode: 3020, Length: 108, Avg Reward: -85.1086048062, e: 0.05, Learning Rate: 0.01, buffer_len: 376218
Loss: 38.0312919617
[NOR] Episode: 3030, Length: 228, Avg Reward: -35.9945651176, e: 0.05, Learning Rate: 0.01, buffer_len: 378499
Loss: -3.90676641464
[NOR] Episode: 3040, Length: 324, Avg Reward: -54.039189141, e: 0.05, Learning Rate: 0.01, buffer_len: 380613
Loss: 97.0826263428
[NOR] Episode: 3050, Length: 172, Avg Reward: -109.289001641, e: 0.05, Learning Rate: 0.01, buffer_len: 381970
Loss: 252.605163574
[NOR] Episode: 3060, Length: 140, Avg Reward: -97.6435901454, e: 0.05, Learning Rate: 0.01, buffer_len: 383516
Loss: 24.87591362
[NOR] Episode: 3070, Length: 85, Avg Reward: -113.735369555, e: 0.05, Learning Rate: 0.0

[2017-03-20 01:58:06,255] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video004000.mp4


[NOR] Episode: 4000, Length: 354, Avg Reward: 34.8354789684, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 10.5269603729
[NOR] Episode: 4010, Length: 584, Avg Reward: 144.176711748, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 14.5749435425
[NOR] Episode: 4020, Length: 158, Avg Reward: 29.0559845897, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.396255254745
[NOR] Episode: 4030, Length: 1000, Avg Reward: 32.3354739618, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.95072841644
[NOR] Episode: 4040, Length: 1000, Avg Reward: 45.7770173819, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.765479207039
[MAX] Episode: 4046, Length: 459, Reward: 251.67485821, buffer_len: 500000
[NOR] Episode: 4050, Length: 356, Avg Reward: 95.0302586711, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 250.539978027
[NOR] Episode: 4060, Length: 293, Avg Reward: 131.969412482, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12.1859617233
[NOR] Episo

[2017-03-20 02:18:48,853] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video005000.mp4


[NOR] Episode: 5000, Length: 276, Avg Reward: 186.637388962, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.263635873795
[NOR] Episode: 5010, Length: 263, Avg Reward: 164.670708126, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7.86114168167
[NOR] Episode: 5020, Length: 260, Avg Reward: 192.041873727, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.168060556054
[NOR] Episode: 5030, Length: 179, Avg Reward: 142.305056885, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.33479547501
[MAX] Episode: 5035, Length: 272, Reward: 272.313376154, buffer_len: 500000
[NOR] Episode: 5040, Length: 164, Avg Reward: 152.350507441, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 14.6625919342
[NOR] Episode: 5050, Length: 199, Avg Reward: 200.374845384, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.09260511398
[NOR] Episode: 5060, Length: 347, Avg Reward: 162.967224168, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7.52090072632
[NOR] Epis

[2017-03-20 02:51:50,506] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video006000.mp4


[NOR] Episode: 6000, Length: 404, Avg Reward: 141.458835911, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.37481451035
[NOR] Episode: 6010, Length: 259, Avg Reward: 178.278681712, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13.2872142792
[NOR] Episode: 6020, Length: 262, Avg Reward: 219.188850307, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.57602977753
[NOR] Episode: 6030, Length: 368, Avg Reward: 195.906281212, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.63027715683
[NOR] Episode: 6040, Length: 437, Avg Reward: 131.445349963, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.906276643276
[NOR] Episode: 6050, Length: 401, Avg Reward: 128.622338099, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.152612090111
[NOR] Episode: 6060, Length: 318, Avg Reward: 179.052564037, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.04090619087
[NOR] Episode: 6070, Length: 365, Avg Reward: 148.592077315, e: 0.05, Learning Rate: 0.0

[2017-03-20 03:20:50,991] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video007000.mp4


[NOR] Episode: 7000, Length: 614, Avg Reward: 163.712977436, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.16698074341
[NOR] Episode: 7010, Length: 525, Avg Reward: 92.5157296155, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7.11277580261
[NOR] Episode: 7020, Length: 369, Avg Reward: 176.913693831, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 118.501083374
[NOR] Episode: 7030, Length: 341, Avg Reward: 137.964282944, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4.39794445038
[NOR] Episode: 7040, Length: 339, Avg Reward: 161.868529432, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5.05223846436
[NOR] Episode: 7050, Length: 500, Avg Reward: 158.144279745, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9.75638961792
[NOR] Episode: 7060, Length: 508, Avg Reward: 107.488955296, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.50390458107
[NOR] Episode: 7070, Length: 684, Avg Reward: 29.0875185194, e: 0.05, Learning Rate: 0.01

[2017-03-20 04:00:26,010] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video008000.mp4


[NOR] Episode: 8000, Length: 75, Avg Reward: -148.893921962, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.87208688259
[NOR] Episode: 8010, Length: 57, Avg Reward: -150.331803647, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.426493763924
[NOR] Episode: 8020, Length: 66, Avg Reward: -172.298576514, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -8.79467582703
[NOR] Episode: 8030, Length: 161, Avg Reward: -181.49937127, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.43273162842
[NOR] Episode: 8040, Length: 205, Avg Reward: -131.099696753, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.40911912918
[NOR] Episode: 8050, Length: 172, Avg Reward: -134.981910076, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.30971503258
[NOR] Episode: 8060, Length: 105, Avg Reward: -149.109276344, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.25017976761
[NOR] Episode: 8070, Length: 145, Avg Reward: -130.773580008, e: 0.05, Learning Rat

[2017-03-20 04:08:44,771] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video009000.mp4


[NOR] Episode: 9000, Length: 121, Avg Reward: -104.058864091, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.72410917282
[NOR] Episode: 9010, Length: 179, Avg Reward: -145.385684244, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.505770087242
[NOR] Episode: 9020, Length: 147, Avg Reward: -158.295038119, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 10.6981868744
[NOR] Episode: 9030, Length: 84, Avg Reward: -166.812078675, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.61475133896
[NOR] Episode: 9040, Length: 160, Avg Reward: -119.432769409, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9.9980096817
[NOR] Episode: 9050, Length: 119, Avg Reward: -159.771080501, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.22212696075
[NOR] Episode: 9060, Length: 154, Avg Reward: -83.135790131, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4.1989402771
[NOR] Episode: 9070, Length: 125, Avg Reward: -149.18856122, e: 0.05, Learning Rate: 

[2017-03-20 04:24:31,644] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video010000.mp4


[NOR] Episode: 10000, Length: 80, Avg Reward: -91.1117474553, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4.8566236496
[NOR] Episode: 10010, Length: 100, Avg Reward: -103.489104915, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.208065271378
[NOR] Episode: 10020, Length: 76, Avg Reward: -95.3220536984, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 428.243469238
[NOR] Episode: 10030, Length: 86, Avg Reward: -72.9324111892, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -8.78845214844
[NOR] Episode: 10040, Length: 110, Avg Reward: -73.936028299, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5.47647380829
[NOR] Episode: 10050, Length: 98, Avg Reward: -54.6643591228, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8.4853515625
[NOR] Episode: 10060, Length: 127, Avg Reward: -48.4353401342, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.807794332504
[NOR] Episode: 10070, Length: 87, Avg Reward: -90.1245910157, e: 0.05, Learning

[2017-03-20 04:33:02,867] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video011000.mp4


[NOR] Episode: 11000, Length: 126, Avg Reward: -30.4837881249, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.64176571369
[NOR] Episode: 11010, Length: 135, Avg Reward: -56.2182954742, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 256.802246094
[NOR] Episode: 11020, Length: 152, Avg Reward: -66.5297731242, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 10.7146186829
[NOR] Episode: 11030, Length: 123, Avg Reward: -66.0994642878, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 36.2937927246
[NOR] Episode: 11040, Length: 160, Avg Reward: -64.3764595632, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 451.261810303
[NOR] Episode: 11050, Length: 183, Avg Reward: -77.334538514, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.01342797279
[NOR] Episode: 11060, Length: 127, Avg Reward: -56.9695715041, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9.10449314117
[NOR] Episode: 11070, Length: 135, Avg Reward: -92.1308036943, e: 0.05, Lea

[2017-03-20 05:04:04,325] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video012000.mp4


[NOR] Episode: 12000, Length: 178, Avg Reward: 71.2067680089, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.461243331432
[NOR] Episode: 12010, Length: 355, Avg Reward: 48.6042117273, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.96863365173
[NOR] Episode: 12020, Length: 334, Avg Reward: 53.6741748782, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.51088178158
[NOR] Episode: 12030, Length: 442, Avg Reward: 63.4174714897, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.75840353966
[NOR] Episode: 12040, Length: 485, Avg Reward: 50.3301336574, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.51859402657
[NOR] Episode: 12050, Length: 560, Avg Reward: 43.6976344017, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.02654409409
[NOR] Episode: 12060, Length: 1000, Avg Reward: -7.6061976676, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 14.1010808945
[NOR] Episode: 12070, Length: 514, Avg Reward: 126.74896199, e: 0.05, Learning R

[2017-03-20 05:39:12,931] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video013000.mp4


[NOR] Episode: 13000, Length: 102, Avg Reward: -351.289258453, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.76350831985
[NOR] Episode: 13010, Length: 226, Avg Reward: -237.589084261, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 10.0078706741
[NOR] Episode: 13020, Length: 536, Avg Reward: -169.912796816, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.7649769783
[NOR] Episode: 13030, Length: 193, Avg Reward: -108.738520687, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.76503944397
[NOR] Episode: 13040, Length: 516, Avg Reward: 68.7660534386, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5.35069274902
[NOR] Episode: 13050, Length: 217, Avg Reward: 79.6407458651, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -4.67413520813
[NOR] Episode: 13060, Length: 226, Avg Reward: 19.7544040272, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9.31791782379
[NOR] Episode: 13070, Length: 153, Avg Reward: -136.205555481, e: 0.05, Learn

[2017-03-20 05:49:46,562] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video014000.mp4


[NOR] Episode: 14000, Length: 113, Avg Reward: -69.24545915, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.7040708065
[NOR] Episode: 14010, Length: 74, Avg Reward: -121.375436357, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.25177836418
[NOR] Episode: 14020, Length: 70, Avg Reward: -122.445965987, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.419631958008
[NOR] Episode: 14030, Length: 54, Avg Reward: -153.613832515, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -5.37948608398
[NOR] Episode: 14040, Length: 80, Avg Reward: -114.876332458, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12.264799118
[NOR] Episode: 14050, Length: 89, Avg Reward: -128.030393949, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 11.453458786
[NOR] Episode: 14060, Length: 82, Avg Reward: -147.958320455, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.72286748886
[NOR] Episode: 14070, Length: 92, Avg Reward: -115.558520053, e: 0.05, Learning Rate

[2017-03-20 06:05:22,054] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video015000.mp4


[NOR] Episode: 15000, Length: 447, Avg Reward: -68.0079142443, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8.10780143738
[NOR] Episode: 15010, Length: 361, Avg Reward: -60.5065246693, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 156.744171143
[NOR] Episode: 15020, Length: 120, Avg Reward: -37.5456468884, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 56.0564422607
[NOR] Episode: 15030, Length: 695, Avg Reward: 11.2766972318, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12.2840423584
[NOR] Episode: 15040, Length: 182, Avg Reward: 85.4643101561, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.99030399323
[NOR] Episode: 15050, Length: 195, Avg Reward: 10.5436326813, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -16.2266578674
[NOR] Episode: 15060, Length: 101, Avg Reward: -85.6546729092, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 15.9728879929
[NOR] Episode: 15070, Length: 111, Avg Reward: 16.6990327808, e: 0.05, Learnin

[2017-03-20 06:28:50,461] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video016000.mp4


[NOR] Episode: 16000, Length: 236, Avg Reward: 138.279622371, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.27499294281
[NOR] Episode: 16010, Length: 509, Avg Reward: 107.974856102, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 14.2123794556
[NOR] Episode: 16020, Length: 475, Avg Reward: -20.8171567115, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 15.9823894501
[NOR] Episode: 16030, Length: 807, Avg Reward: 107.684101645, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 21.0249233246
[NOR] Episode: 16040, Length: 412, Avg Reward: 74.3010346959, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.294859409332
[NOR] Episode: 16050, Length: 1000, Avg Reward: 57.955131158, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 261.179077148
[NOR] Episode: 16060, Length: 867, Avg Reward: -19.6408703695, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.28025579453
[NOR] Episode: 16070, Length: 530, Avg Reward: 18.3704740041, e: 0.05, Learning

[2017-03-20 07:14:45,307] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video017000.mp4


[NOR] Episode: 17000, Length: 202, Avg Reward: -6.93980991803, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5.04566860199
[NOR] Episode: 17010, Length: 452, Avg Reward: -52.2958655985, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.49793624878
[NOR] Episode: 17020, Length: 375, Avg Reward: -52.1877883086, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.150255143642
[NOR] Episode: 17030, Length: 257, Avg Reward: -83.7499921031, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 5.42313671112
[NOR] Episode: 17040, Length: 761, Avg Reward: -60.6798753579, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12.8206119537
[NOR] Episode: 17050, Length: 1000, Avg Reward: -7.87656131469, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.531370937824
[NOR] Episode: 17060, Length: 1000, Avg Reward: -87.073302014, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4.91306066513
[NOR] Episode: 17070, Length: 1000, Avg Reward: -63.396964729, e: 0.05, 

[2017-03-20 07:56:20,240] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video018000.mp4


[NOR] Episode: 18000, Length: 88, Avg Reward: -34.5678378565, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.21974086761
[NOR] Episode: 18010, Length: 395, Avg Reward: -77.6564137714, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.92551612854
[NOR] Episode: 18020, Length: 188, Avg Reward: -41.0689769164, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.87977600098
[NOR] Episode: 18030, Length: 164, Avg Reward: 10.3884148772, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9.04654693604
[NOR] Episode: 18040, Length: 385, Avg Reward: 116.759271383, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 177.099243164
[NOR] Episode: 18050, Length: 461, Avg Reward: 76.9115276604, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.65790605545
[NOR] Episode: 18060, Length: 340, Avg Reward: 131.06370318, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.0454082489
[NOR] Episode: 18070, Length: 517, Avg Reward: 165.497307194, e: 0.05, Learning Ra

[2017-03-20 08:25:13,894] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video019000.mp4


[NOR] Episode: 19000, Length: 124, Avg Reward: -195.869054824, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -18.1347846985
[NOR] Episode: 19010, Length: 148, Avg Reward: -231.741754073, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -10.7635269165
[NOR] Episode: 19020, Length: 168, Avg Reward: -254.279331777, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -13.1024637222
[NOR] Episode: 19030, Length: 87, Avg Reward: -237.943811065, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -71.9389190674
[NOR] Episode: 19040, Length: 151, Avg Reward: -253.455171794, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -5.72317218781
[NOR] Episode: 19050, Length: 239, Avg Reward: -225.247301633, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.378988742828
[NOR] Episode: 19060, Length: 572, Avg Reward: -129.081639319, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.95998334885
[NOR] Episode: 19070, Length: 552, Avg Reward: -65.365335031, e: 0.05, 

[2017-03-20 08:35:09,639] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video020000.mp4


[NOR] Episode: 20000, Length: 250, Avg Reward: -342.316351553, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -11.2268648148
[NOR] Episode: 20010, Length: 173, Avg Reward: -362.701255621, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 434.938110352
[NOR] Episode: 20020, Length: 158, Avg Reward: -340.884399798, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 30.1310310364
[NOR] Episode: 20030, Length: 129, Avg Reward: -389.272831052, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 38.6866226196
[NOR] Episode: 20040, Length: 158, Avg Reward: -408.531975409, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.4561829567
[NOR] Episode: 20050, Length: 132, Avg Reward: -379.74794836, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -13.6854457855
[NOR] Episode: 20060, Length: 97, Avg Reward: -415.202742878, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -10.3174962997
[NOR] Episode: 20070, Length: 204, Avg Reward: -357.674582593, e: 0.05, Lea

[2017-03-20 08:42:36,456] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video021000.mp4


[NOR] Episode: 21000, Length: 213, Avg Reward: -358.061485808, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -15.6661624908
[NOR] Episode: 21010, Length: 283, Avg Reward: -337.774905588, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 177.033477783
[NOR] Episode: 21020, Length: 137, Avg Reward: -374.745019859, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 23.632768631
[NOR] Episode: 21030, Length: 138, Avg Reward: -281.523529416, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 354.779785156
[NOR] Episode: 21040, Length: 221, Avg Reward: -327.830600287, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -4.66844081879
[NOR] Episode: 21050, Length: 173, Avg Reward: -357.020274682, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -36.9437599182
[NOR] Episode: 21060, Length: 226, Avg Reward: -324.827763337, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -15.4343738556
[NOR] Episode: 21070, Length: 183, Avg Reward: -273.940141896, e: 0.05, L

[2017-03-20 08:49:55,026] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video022000.mp4


[NOR] Episode: 22000, Length: 81, Avg Reward: -178.792387346, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 25.3762607574
[NOR] Episode: 22010, Length: 100, Avg Reward: -197.878988606, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -34.1102561951
[NOR] Episode: 22020, Length: 78, Avg Reward: -221.681707943, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -26.0618057251
[NOR] Episode: 22030, Length: 116, Avg Reward: -281.55545436, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.104706287384
[NOR] Episode: 22040, Length: 96, Avg Reward: -265.06301879, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.11439800262
[NOR] Episode: 22050, Length: 113, Avg Reward: -255.653339285, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.18973731995
[NOR] Episode: 22060, Length: 71, Avg Reward: -283.010512224, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -16.6288948059
[NOR] Episode: 22070, Length: 77, Avg Reward: -193.29971561, e: 0.05, Learni

[2017-03-20 08:54:38,392] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video023000.mp4


[NOR] Episode: 23000, Length: 66, Avg Reward: -267.969567046, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -11.4741535187
[NOR] Episode: 23010, Length: 76, Avg Reward: -281.127775172, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -9.05968761444
[NOR] Episode: 23020, Length: 78, Avg Reward: -264.454824083, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -17.2330703735
[NOR] Episode: 23030, Length: 94, Avg Reward: -247.280326021, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.09115314484
[NOR] Episode: 23040, Length: 56, Avg Reward: -295.735256838, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -40.1912574768
[NOR] Episode: 23050, Length: 62, Avg Reward: -256.408563456, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -8.4739074707
[NOR] Episode: 23060, Length: 74, Avg Reward: -232.743391965, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.87731480598
[NOR] Episode: 23070, Length: 75, Avg Reward: -272.451416817, e: 0.05, Learnin

[2017-03-20 08:58:57,365] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video024000.mp4


[NOR] Episode: 24000, Length: 77, Avg Reward: -251.454301299, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 39.9619903564
[NOR] Episode: 24010, Length: 91, Avg Reward: -201.2048424, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.95321023464
[NOR] Episode: 24020, Length: 58, Avg Reward: -197.217921385, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 48.3752822876
[NOR] Episode: 24030, Length: 64, Avg Reward: -279.616947772, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8.63410186768
[NOR] Episode: 24040, Length: 57, Avg Reward: -278.670330393, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -6.00252723694
[NOR] Episode: 24050, Length: 86, Avg Reward: -220.611105672, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 53.146686554
[NOR] Episode: 24060, Length: 85, Avg Reward: -264.758807317, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.44446611404
[NOR] Episode: 24070, Length: 63, Avg Reward: -276.638100399, e: 0.05, Learning Rate

[2017-03-20 09:03:29,022] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video025000.mp4


[NOR] Episode: 25000, Length: 55, Avg Reward: -164.922832028, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -4.01035165787
[NOR] Episode: 25010, Length: 68, Avg Reward: -172.925706388, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 21.0309638977
[NOR] Episode: 25020, Length: 72, Avg Reward: -174.383060997, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 21.3339576721
[NOR] Episode: 25030, Length: 53, Avg Reward: -163.251365443, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 25.9325084686
[NOR] Episode: 25040, Length: 59, Avg Reward: -177.388164504, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.66248595715
[NOR] Episode: 25050, Length: 68, Avg Reward: -170.995113156, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 381.576416016
[NOR] Episode: 25060, Length: 58, Avg Reward: -168.807815929, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -10.707572937
[NOR] Episode: 25070, Length: 63, Avg Reward: -150.475527179, e: 0.05, Learning Ra

[2017-03-20 09:07:58,845] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video026000.mp4


[NOR] Episode: 26000, Length: 73, Avg Reward: -166.878456384, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.92252159119
[NOR] Episode: 26010, Length: 74, Avg Reward: -171.605379983, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 314.056304932
[NOR] Episode: 26020, Length: 95, Avg Reward: -163.247315247, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 10.833814621
[NOR] Episode: 26030, Length: 59, Avg Reward: -156.946941742, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 286.426727295
[NOR] Episode: 26040, Length: 57, Avg Reward: -157.905298813, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.720085382462
[NOR] Episode: 26050, Length: 99, Avg Reward: -159.724534802, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.6959104538
[NOR] Episode: 26060, Length: 128, Avg Reward: -157.048842766, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.39549815655
[NOR] Episode: 26070, Length: 94, Avg Reward: -167.755156716, e: 0.05, Learning 

[2017-03-20 09:12:30,091] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video027000.mp4


[NOR] Episode: 27000, Length: 91, Avg Reward: -144.176330923, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 57.833732605
[NOR] Episode: 27010, Length: 89, Avg Reward: -131.176471397, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.530030012131
[NOR] Episode: 27020, Length: 63, Avg Reward: -158.267646514, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.55554389954
[NOR] Episode: 27030, Length: 95, Avg Reward: -141.094341866, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 21.1758861542
[NOR] Episode: 27040, Length: 74, Avg Reward: -156.101382089, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 181.891448975
[NOR] Episode: 27050, Length: 57, Avg Reward: -151.045179674, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 319.23449707
[NOR] Episode: 27060, Length: 75, Avg Reward: -151.286927903, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 263.116699219
[NOR] Episode: 27070, Length: 59, Avg Reward: -134.452537335, e: 0.05, Learning Rat

[2017-03-20 09:17:10,537] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video028000.mp4


[NOR] Episode: 28000, Length: 72, Avg Reward: -156.445692118, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.32931315899
[NOR] Episode: 28010, Length: 71, Avg Reward: -158.925769564, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 81.9434280396
[NOR] Episode: 28020, Length: 87, Avg Reward: -149.697796542, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 71.8801727295
[NOR] Episode: 28030, Length: 85, Avg Reward: -147.50077033, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.103390932083
[NOR] Episode: 28040, Length: 91, Avg Reward: -144.33356712, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.8439399004
[NOR] Episode: 28050, Length: 98, Avg Reward: -149.807322758, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.36744356155
[NOR] Episode: 28060, Length: 85, Avg Reward: -138.773067055, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -7.85751628876
[NOR] Episode: 28070, Length: 68, Avg Reward: -140.238057063, e: 0.05, Learning 

[2017-03-20 09:21:55,563] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video029000.mp4


[NOR] Episode: 29000, Length: 81, Avg Reward: -170.017973854, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -7.06403923035
[NOR] Episode: 29010, Length: 63, Avg Reward: -199.643544572, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.22237920761
[NOR] Episode: 29020, Length: 83, Avg Reward: -196.796398769, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 61.6628952026
[NOR] Episode: 29030, Length: 97, Avg Reward: -216.246254023, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -7.22938632965
[NOR] Episode: 29040, Length: 91, Avg Reward: -197.414400903, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 12.6381263733
[NOR] Episode: 29050, Length: 75, Avg Reward: -198.174876583, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 15.6428804398
[NOR] Episode: 29060, Length: 86, Avg Reward: -174.686879328, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8.8603477478
[NOR] Episode: 29070, Length: 100, Avg Reward: -183.778235111, e: 0.05, Learning 

[2017-03-20 09:27:10,083] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video030000.mp4


[NOR] Episode: 30000, Length: 120, Avg Reward: -144.25223124, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.669044971466
[NOR] Episode: 30010, Length: 97, Avg Reward: -141.227283623, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 23.1975669861
[NOR] Episode: 30020, Length: 65, Avg Reward: -174.719849622, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 60.0115966797
[NOR] Episode: 30030, Length: 71, Avg Reward: -172.448510895, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8.91933822632
[NOR] Episode: 30040, Length: 89, Avg Reward: -199.631773822, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8.4771900177
[NOR] Episode: 30050, Length: 81, Avg Reward: -138.425839543, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 22.076084137
[NOR] Episode: 30060, Length: 100, Avg Reward: -118.750522821, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 59.2250404358
[NOR] Episode: 30070, Length: 63, Avg Reward: -143.156156533, e: 0.05, Learning Rat

[2017-03-20 09:35:33,783] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video031000.mp4


[NOR] Episode: 31000, Length: 69, Avg Reward: -133.346754536, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -20.3285007477
[NOR] Episode: 31010, Length: 75, Avg Reward: -117.719143423, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 293.667114258
[NOR] Episode: 31020, Length: 75, Avg Reward: -101.237148012, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13.4867258072
[NOR] Episode: 31030, Length: 68, Avg Reward: -119.426517426, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 127.026145935
[NOR] Episode: 31040, Length: 82, Avg Reward: -108.371811108, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 120.159545898
[NOR] Episode: 31050, Length: 68, Avg Reward: -100.967480647, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -30.7338542938
[NOR] Episode: 31060, Length: 80, Avg Reward: -126.368000926, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -29.669757843
[NOR] Episode: 31070, Length: 73, Avg Reward: -125.563975178, e: 0.05, Learning R

[2017-03-20 09:41:21,566] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video032000.mp4


[NOR] Episode: 32000, Length: 216, Avg Reward: -128.237475412, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.20303583145
[NOR] Episode: 32010, Length: 72, Avg Reward: -144.959607426, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.245734781027
[NOR] Episode: 32020, Length: 64, Avg Reward: -143.522464868, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.40028202534
[NOR] Episode: 32030, Length: 89, Avg Reward: -111.671469281, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 51.50522995
[NOR] Episode: 32040, Length: 136, Avg Reward: -89.8293620834, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.822877287865
[NOR] Episode: 32050, Length: 76, Avg Reward: -97.6693660712, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.4312915802
[NOR] Episode: 32060, Length: 99, Avg Reward: -157.610728374, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 48.9243545532
[NOR] Episode: 32070, Length: 69, Avg Reward: -129.144579187, e: 0.05, Learning

[2017-03-20 09:48:43,820] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video033000.mp4


[NOR] Episode: 33000, Length: 107, Avg Reward: -86.2781395876, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.48206686974
[NOR] Episode: 33010, Length: 116, Avg Reward: -79.2499906262, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.573325157166
[NOR] Episode: 33020, Length: 86, Avg Reward: -82.397937899, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9.67536830902
[NOR] Episode: 33030, Length: 356, Avg Reward: -87.8579888415, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 428.9402771
[NOR] Episode: 33040, Length: 210, Avg Reward: -101.646204573, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 1.14794254303
[NOR] Episode: 33050, Length: 129, Avg Reward: -110.689325951, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 89.4447631836
[NOR] Episode: 33060, Length: 584, Avg Reward: -73.2571984054, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.35595464706
[NOR] Episode: 33070, Length: 78, Avg Reward: -109.618202556, e: 0.05, Learnin

[2017-03-20 10:01:58,767] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video034000.mp4


[NOR] Episode: 34000, Length: 64, Avg Reward: -135.374107981, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.73990869522
[NOR] Episode: 34010, Length: 89, Avg Reward: -132.974560729, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 263.405303955
[NOR] Episode: 34020, Length: 57, Avg Reward: -154.349486746, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 6.27914142609
[NOR] Episode: 34030, Length: 89, Avg Reward: -144.817773667, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 19.0900230408
[NOR] Episode: 34040, Length: 71, Avg Reward: -148.95073755, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.178569197655
[NOR] Episode: 34050, Length: 85, Avg Reward: -135.511916208, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 17.4436092377
[NOR] Episode: 34060, Length: 63, Avg Reward: -142.121816017, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 118.453346252
[NOR] Episode: 34070, Length: 91, Avg Reward: -135.804308566, e: 0.05, Learning R

[2017-03-20 10:08:25,640] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video035000.mp4


[NOR] Episode: 35000, Length: 132, Avg Reward: -88.7955336749, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 7.37106800079
[NOR] Episode: 35010, Length: 275, Avg Reward: -114.089726342, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 176.16784668
[NOR] Episode: 35020, Length: 123, Avg Reward: -111.307636352, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.04912829399
[NOR] Episode: 35030, Length: 207, Avg Reward: -89.649277904, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 2.23860836029
[NOR] Episode: 35040, Length: 118, Avg Reward: -98.7779294848, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 3.34817767143
[NOR] Episode: 35050, Length: 216, Avg Reward: -94.9367386668, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 11.0819587708
[NOR] Episode: 35060, Length: 205, Avg Reward: -78.1378979216, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -3.94248795509
[NOR] Episode: 35070, Length: 130, Avg Reward: -60.8689523812, e: 0.05, Lear

[2017-03-20 10:26:48,526] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/reward-task/monitor/21/openaigym.video.21.20184.video036000.mp4


[NOR] Episode: 36000, Length: 130, Avg Reward: -99.1670860506, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 9.17866325378
[NOR] Episode: 36010, Length: 176, Avg Reward: -25.3429787403, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4.23073768616
[NOR] Episode: 36020, Length: 324, Avg Reward: 14.228374527, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4.90224552155
[NOR] Episode: 36030, Length: 173, Avg Reward: -0.0574611458138, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 148.171020508
[NOR] Episode: 36040, Length: 363, Avg Reward: 36.2952381732, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 8.69233036041
[NOR] Episode: 36050, Length: 235, Avg Reward: -63.2415639042, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 13.37870121
[NOR] Episode: 36060, Length: 1000, Avg Reward: -26.4817633636, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 15.0110015869
[NOR] Episode: 36070, Length: 299, Avg Reward: 55.4505006636, e: 0.05, Learnin

KeyboardInterrupt: 

In [66]:

# Evaluation cell: restore a saved checkpoint and render the policy playing
# LunarLander-v2, printing the total (undiscounted) reward of every episode.

# Tunable settings, named once so the wrapper size and the model input size
# cannot drift apart.
N_STACKED_STATES = 3       # second argument of ExpandedStateEnv; also scales n_states
N_EVAL_EPISODES = 100      # how many episodes to play and render
MAX_EPISODE_STEPS = 700    # hard cap on steps per episode, even if `done` is never set
RENDER_DELAY_SECS = 0.01   # pause after each rendered frame

env = gym.make("LunarLander-v2")
# NOTE(review): presumably ExpandedStateEnv concatenates the last
# N_STACKED_STATES observations into one state vector — confirm in tfinterface.
env = ExpandedStateEnv(env, N_STACKED_STATES)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * N_STACKED_STATES

# Checkpoint to restore and log directory, both relative to the working directory.
# (The templates only use `{path}`, so no other format argument is passed.)
model_path = "{path}/models/21.272.691146069".format(path=os.getcwd())
logs_path = "{path}/logs/".format(path=os.getcwd())

model_run = LunarLander(
    n_actions, n_states,
    model_path=model_path,
    flush_secs=3.0,
    restore=True
)

try:
    for i in range(N_EVAL_EPISODES):
        s = env.reset()
        done = False
        total = 0.
        ep = 0  # number of steps taken in the current episode
        while not done and ep < MAX_EPISODE_STEPS:
            ep += 1
            # NOTE(review): the second argument is presumably the exploration
            # rate (0.0 = always act greedily) — confirm against LunarLander.predict.
            a = model_run.predict(s, 0.0)
            s, r, done, info = env.step(a)
            total += r
            env.render()
            time.sleep(RENDER_DELAY_SECS)

        print(total)
finally:
    # Always close the render window, including when the cell is interrupted
    # (KeyboardInterrupt) or an episode raises; otherwise the viewer is leaked.
    env.render(close=True)


[2017-03-20 10:41:45,516] Making new env: LunarLander-v2


243.797470956
-32.8488014114
219.353111093
30.568509689
176.156562556
-38.8324403062
114.753748804
218.049416314
239.012542657
-21.1754624372
204.824257629
221.1510406
233.165809066
190.769837155
226.111052744
232.375507618
246.158753518
201.513484877
260.02294592
202.344976616
225.880992423
-9.64755026126
-6.03692961905
234.001375633
209.467160545
87.5170965186
-28.0624531548
251.780547564
6.11093998915
234.962878443
243.457889913
217.791354451
231.393275445
242.334538746
233.551304121
201.214691061
184.507148375
176.875103386
203.221645247
206.039553326
228.494334658
183.623170261
233.926858399
233.831669087
183.859952167
220.885013382
229.316361555
247.971327615
225.934850965
250.040910851
111.452411384
214.949280811
114.053585001
255.138792085
130.970860274
223.51669256
-12.0294004791
259.521591465
245.417253086
95.523710089
201.21619177
195.542195408
261.715082604
237.888645878
216.372327063
154.560204699
234.574622526
226.822058493
230.557572324
210.545263243
-19.5257988617
245.6

In [64]:
# Display the current value of `run` as the cell output.
# NOTE(review): presumably the numeric id of the training run (it matches the
# "21" seen in the monitor/model paths) — confirm where `run` is assigned.
run

21