In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time


name = "actor-critic-base"

In [53]:
def shifted_log_loss(x, alfa=0.05):
    return - tf.log(x + alfa * (1.0 - x))

class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)        
            
            self.V = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]

            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base_model, inputs, critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 128, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)
            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer2", use_bias=True, **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)

            
            self.logits = tf.layers.dense(net, n_actions, name='P', use_bias=False, **ops)
            self.P = tf.nn.softmax(self.logits)
            
            self.Pa = select_columns(self.P, inputs.a)

#             self.loss = tf.nn.softmax_cross_entropy_with_logits(
#                 logits = self.logits, 
#                 labels = tf.one_hot(inputs.a, n_actions)
#             )
#             print self.loss
#             self.loss = select_columns(self.loss, inputs.a)
            self.loss = shifted_log_loss(self.Pa) * critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


In [70]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')

        self.replay_buffer = ExperienceReplay(max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                self.replay_buffer.append((s, a, r, s1, float(done)))
                
                
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [75]:
run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{run}".format(run = run))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{run}".format(path = os.getcwd(), run = run)
logs_path = "{path}/logs/{run}".format(path = os.getcwd(), run = run)


model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.005
)

[2017-03-23 01:13:00,377] Making new env: LunarLander-v2
[2017-03-23 01:13:00,380] Creating monitor directory monitor/24


In [76]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.002, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1
)

[2017-03-23 01:13:01,454] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000000.mp4
[2017-03-23 01:13:08,466] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000001.mp4


[MAX] Episode: 0, Length: 276, Reward: -110.734014186, buffer_len: 276


[2017-03-23 01:13:13,542] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000008.mp4


[NOR] Episode: 10, Length: 102, Avg Reward: -485.148865628, e: 0.397859166667, Learning Rate: 0.002, buffer_len: 1836
Loss: -6.88629245758
[MAX] Episode: 14, Length: 1000, Reward: -42.7222007203, buffer_len: 3177
[MAX] Episode: 16, Length: 1000, Reward: 61.3215828042, buffer_len: 4293
[NOR] Episode: 20, Length: 287, Avg Reward: -144.908075334, e: 0.3928215, Learning Rate: 0.002, buffer_len: 6154
Loss: -4.51028299332


[2017-03-23 01:13:41,989] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000027.mp4


[NOR] Episode: 30, Length: 1000, Avg Reward: -93.2477725664, e: 0.387566833333, Learning Rate: 0.002, buffer_len: 10658
Loss: -5.41270160675
[NOR] Episode: 40, Length: 1000, Avg Reward: -117.776512161, e: 0.383902333333, Learning Rate: 0.002, buffer_len: 13799
Loss: -1.59926497936
[NOR] Episode: 50, Length: 263, Avg Reward: -106.539652408, e: 0.380281, Learning Rate: 0.002, buffer_len: 16903
Loss: -3.91002941132
[NOR] Episode: 60, Length: 141, Avg Reward: -41.2669486018, e: 0.371097, Learning Rate: 0.002, buffer_len: 24775
Loss: -0.949034571648


[2017-03-23 01:14:56,724] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000064.mp4


[MAX] Episode: 69, Length: 1000, Reward: 70.6373755308, buffer_len: 29620
[NOR] Episode: 70, Length: 332, Avg Reward: -101.721203357, e: 0.365057166667, Learning Rate: 0.002, buffer_len: 29952
Loss: 0.498363405466
[MAX] Episode: 79, Length: 1000, Reward: 73.8736000134, buffer_len: 33553
[NOR] Episode: 80, Length: 1000, Avg Reward: -49.1995556742, e: 0.359689333333, Learning Rate: 0.002, buffer_len: 34553
Loss: -1.63813722134
[MAX] Episode: 85, Length: 1000, Reward: 105.662860026, buffer_len: 37160
[NOR] Episode: 90, Length: 282, Avg Reward: -64.4683891573, e: 0.355449666667, Learning Rate: 0.002, buffer_len: 38187
Loss: -8.08340740204
[NOR] Episode: 100, Length: 338, Avg Reward: -71.0201901791, e: 0.350092333333, Learning Rate: 0.002, buffer_len: 42779
Loss: -2.569617033
[NOR] Episode: 110, Length: 1000, Avg Reward: -30.4682065516, e: 0.342908, Learning Rate: 0.002, buffer_len: 48937
Loss: -1.32207274437
[NOR] Episode: 120, Length: 1000, Avg Reward: -21.9435946196, e: 0.336949833333, L

[2017-03-23 01:16:54,061] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000125.mp4


[NOR] Episode: 130, Length: 384, Avg Reward: -14.5336454289, e: 0.327503333333, Learning Rate: 0.002, buffer_len: 62141
Loss: 4.77951622009
[NOR] Episode: 140, Length: 1000, Avg Reward: -21.9957870161, e: 0.317149166667, Learning Rate: 0.002, buffer_len: 71016
Loss: -0.397850781679
[NOR] Episode: 150, Length: 1000, Avg Reward: -13.3092830504, e: 0.307553333333, Learning Rate: 0.002, buffer_len: 79241
Loss: -2.29038476944
[MAX] Episode: 153, Length: 990, Reward: 117.899109947, buffer_len: 82118
[NOR] Episode: 160, Length: 1000, Avg Reward: 22.5284431271, e: 0.296477, Learning Rate: 0.002, buffer_len: 88735
Loss: 0.0906465649605
[NOR] Episode: 170, Length: 1000, Avg Reward: -2.01178325789, e: 0.287490166667, Learning Rate: 0.002, buffer_len: 96438
Loss: -2.67467069626
[NOR] Episode: 180, Length: 98, Avg Reward: 2.15837839238, e: 0.2779095, Learning Rate: 0.002, buffer_len: 104650
Loss: -2.7666156292
[MAX] Episode: 184, Length: 534, Reward: 206.82874657, buffer_len: 108184
[NOR] Episode: 

[2017-03-23 01:22:09,335] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000216.mp4


[NOR] Episode: 220, Length: 935, Avg Reward: 61.9113441283, e: 0.239548333333, Learning Rate: 0.002, buffer_len: 137531
Loss: -0.793254256248
[MAX] Episode: 222, Length: 549, Reward: 235.530647789, buffer_len: 139080
[NOR] Episode: 230, Length: 490, Avg Reward: 96.9677895858, e: 0.2292175, Learning Rate: 0.002, buffer_len: 146386
Loss: -1.93373680115
[NOR] Episode: 240, Length: 501, Avg Reward: 107.340951331, e: 0.218791, Learning Rate: 0.002, buffer_len: 155323
Loss: -3.35356998444
[NOR] Episode: 250, Length: 105, Avg Reward: 77.6931485819, e: 0.210458666667, Learning Rate: 0.002, buffer_len: 162465
Loss: -0.959419071674
[NOR] Episode: 260, Length: 919, Avg Reward: 27.6069264486, e: 0.205005666667, Learning Rate: 0.002, buffer_len: 167139
Loss: -1.6795232296
[NOR] Episode: 270, Length: 504, Avg Reward: 10.1686769874, e: 0.201751833333, Learning Rate: 0.002, buffer_len: 169928
Loss: -2.14257383347
[NOR] Episode: 280, Length: 110, Avg Reward: 98.3108000542, e: 0.196375833333, Learning R

[2017-03-23 01:26:23,396] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000343.mp4


[NOR] Episode: 350, Length: 555, Avg Reward: 69.3352999502, e: 0.1593645, Learning Rate: 0.002, buffer_len: 206260
Loss: -0.802798271179
[NOR] Episode: 360, Length: 316, Avg Reward: 74.9446583008, e: 0.155075833333, Learning Rate: 0.002, buffer_len: 209936
Loss: -0.302848160267
[NOR] Episode: 370, Length: 374, Avg Reward: 93.4154837075, e: 0.148183166667, Learning Rate: 0.002, buffer_len: 215844
Loss: -1.46220266819
[NOR] Episode: 380, Length: 132, Avg Reward: 68.9704364647, e: 0.1439295, Learning Rate: 0.002, buffer_len: 219490
Loss: -0.0121326446533
[MAX] Episode: 382, Length: 300, Reward: 236.468349315, buffer_len: 220621
[NOR] Episode: 390, Length: 586, Avg Reward: 96.3660434412, e: 0.139380666667, Learning Rate: 0.002, buffer_len: 223389
Loss: -0.34280538559
[NOR] Episode: 400, Length: 541, Avg Reward: 75.0311199044, e: 0.1356975, Learning Rate: 0.002, buffer_len: 226546
Loss: 0.0892119407654
[NOR] Episode: 410, Length: 85, Avg Reward: 97.1850769197, e: 0.131659666667, Learning Ra

[2017-03-23 01:30:21,402] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000512.mp4


[NOR] Episode: 520, Length: 524, Avg Reward: 125.649604513, e: 0.0804056666667, Learning Rate: 0.002, buffer_len: 273939
Loss: -0.329622209072
[NOR] Episode: 530, Length: 320, Avg Reward: 97.5111650268, e: 0.0762768333333, Learning Rate: 0.002, buffer_len: 277478
Loss: -1.62040007114
[NOR] Episode: 540, Length: 75, Avg Reward: 81.2694240828, e: 0.0726216666667, Learning Rate: 0.002, buffer_len: 280611
Loss: -0.0692533254623
[NOR] Episode: 550, Length: 1000, Avg Reward: 111.273605193, e: 0.0672188333333, Learning Rate: 0.002, buffer_len: 285242
Loss: -1.74000835419
[NOR] Episode: 560, Length: 353, Avg Reward: 61.6392953386, e: 0.0639206666667, Learning Rate: 0.002, buffer_len: 288069
Loss: -0.15679910779
[NOR] Episode: 570, Length: 591, Avg Reward: 81.2559800146, e: 0.0594908333333, Learning Rate: 0.002, buffer_len: 291866
Loss: -1.89224755764
[NOR] Episode: 580, Length: 216, Avg Reward: 83.4367117545, e: 0.0561028333333, Learning Rate: 0.002, buffer_len: 294770
Loss: -2.2788438797
[NOR

[2017-03-23 01:34:49,766] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video000729.mp4


[NOR] Episode: 730, Length: 286, Avg Reward: 103.85104574, e: 0.05, Learning Rate: 0.002, buffer_len: 347598
Loss: -0.425585031509
[NOR] Episode: 740, Length: 381, Avg Reward: 105.606009282, e: 0.05, Learning Rate: 0.002, buffer_len: 351287
Loss: 1.17494595051
[NOR] Episode: 750, Length: 366, Avg Reward: 161.404563968, e: 0.05, Learning Rate: 0.002, buffer_len: 355339
Loss: -1.02347683907
[NOR] Episode: 760, Length: 442, Avg Reward: 175.152166839, e: 0.05, Learning Rate: 0.002, buffer_len: 359028
Loss: -0.312816441059
[NOR] Episode: 770, Length: 228, Avg Reward: 113.00647592, e: 0.05, Learning Rate: 0.002, buffer_len: 363202
Loss: -0.612167298794
[NOR] Episode: 780, Length: 383, Avg Reward: 173.938103966, e: 0.05, Learning Rate: 0.002, buffer_len: 366960
Loss: 0.177760094404
[NOR] Episode: 790, Length: 279, Avg Reward: 120.685691378, e: 0.05, Learning Rate: 0.002, buffer_len: 371381
Loss: -0.781987369061
[NOR] Episode: 800, Length: 254, Avg Reward: 161.620636341, e: 0.05, Learning Rate

[2017-03-23 01:43:57,831] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video001000.mp4


[NOR] Episode: 1000, Length: 402, Avg Reward: 176.276595591, e: 0.05, Learning Rate: 0.002, buffer_len: 495783
Loss: -2.33706784248
[NOR] Episode: 1010, Length: 588, Avg Reward: 135.89198285, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -4.38565301895
[NOR] Episode: 1020, Length: 720, Avg Reward: 144.284986457, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.54419279099
[NOR] Episode: 1030, Length: 372, Avg Reward: 126.909941185, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.00876140594482
[NOR] Episode: 1040, Length: 1000, Avg Reward: 115.631675221, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.509763002396
[NOR] Episode: 1050, Length: 1000, Avg Reward: -45.6617702296, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.539056301117
[NOR] Episode: 1060, Length: 1000, Avg Reward: 24.278846772, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.26029330492
[NOR] Episode: 1070, Length: 1000, Avg Reward: 115.668029346, e: 0.05, 

[2017-03-23 02:12:43,866] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video002000.mp4


[NOR] Episode: 2000, Length: 105, Avg Reward: 74.8528093931, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.42925632
[NOR] Episode: 2010, Length: 330, Avg Reward: 202.118730405, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.61160850525
[NOR] Episode: 2020, Length: 237, Avg Reward: 183.332002496, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.939157903194
[NOR] Episode: 2030, Length: 281, Avg Reward: 192.565243475, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.328345358372
[NOR] Episode: 2040, Length: 269, Avg Reward: 222.145410345, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.397533714771
[NOR] Episode: 2050, Length: 334, Avg Reward: 189.040629324, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 3.76170086861
[NOR] Episode: 2060, Length: 347, Avg Reward: 219.696205163, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.67498338223
[NOR] Episode: 2070, Length: 229, Avg Reward: 193.72001469, e: 0.05, Learning

[2017-03-23 02:34:39,251] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video003000.mp4


[NOR] Episode: 3000, Length: 605, Avg Reward: 165.118672331, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.915148615837
[NOR] Episode: 3010, Length: 681, Avg Reward: 160.581038683, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.0497445575893
[NOR] Episode: 3020, Length: 511, Avg Reward: 123.571426859, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.670706152916
[NOR] Episode: 3030, Length: 496, Avg Reward: 157.657445991, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.712878465652
[NOR] Episode: 3040, Length: 734, Avg Reward: 148.606211654, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.478569447994
[NOR] Episode: 3050, Length: 999, Avg Reward: 141.376530958, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.263548433781
[NOR] Episode: 3060, Length: 551, Avg Reward: 110.372215424, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.399818629026
[NOR] Episode: 3070, Length: 403, Avg Reward: 169.476588211, e: 0.05, 

[2017-03-23 02:56:57,418] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video004000.mp4


[NOR] Episode: 4000, Length: 233, Avg Reward: 176.658699281, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.21478867531
[NOR] Episode: 4010, Length: 120, Avg Reward: 174.377655181, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.56213474274
[NOR] Episode: 4020, Length: 243, Avg Reward: 162.380363589, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.769235193729
[NOR] Episode: 4030, Length: 344, Avg Reward: 143.792913857, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.102557569742
[NOR] Episode: 4040, Length: 366, Avg Reward: 192.278477142, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.65838599205
[NOR] Episode: 4050, Length: 189, Avg Reward: 148.310425734, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.40101480484
[NOR] Episode: 4060, Length: 264, Avg Reward: 77.7165886792, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.756788432598
[NOR] Episode: 4070, Length: 303, Avg Reward: 203.336939419, e: 0.05, Lear

[2017-03-23 03:19:27,958] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video005000.mp4


[NOR] Episode: 5000, Length: 307, Avg Reward: 154.023435977, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.55990344286
[NOR] Episode: 5010, Length: 711, Avg Reward: 171.04682546, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.0824528485537
[NOR] Episode: 5020, Length: 316, Avg Reward: 116.730321648, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.524769425392
[NOR] Episode: 5030, Length: 457, Avg Reward: 199.843368775, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.426777362823
[NOR] Episode: 5040, Length: 459, Avg Reward: 190.58298995, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.21442848444
[NOR] Episode: 5050, Length: 383, Avg Reward: 149.232729199, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.447280943394
[NOR] Episode: 5060, Length: 404, Avg Reward: 178.318789531, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.288961470127
[NOR] Episode: 5070, Length: 1000, Avg Reward: 171.251713109, e: 0.05, Learn

[2017-03-23 03:52:02,647] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video006000.mp4


[NOR] Episode: 6000, Length: 164, Avg Reward: 158.018954451, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.337623983622
[NOR] Episode: 6010, Length: 431, Avg Reward: 206.344613971, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.389557719231
[NOR] Episode: 6020, Length: 277, Avg Reward: 161.106274687, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.75872451067
[NOR] Episode: 6030, Length: 310, Avg Reward: 201.376333989, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.45432555676
[NOR] Episode: 6040, Length: 318, Avg Reward: 220.77635926, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.253580302
[NOR] Episode: 6050, Length: 309, Avg Reward: 204.055070355, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.691224694252
[NOR] Episode: 6060, Length: 384, Avg Reward: 213.892085491, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.0146597623825
[NOR] Episode: 6070, Length: 400, Avg Reward: 209.223471892, e: 0.05, Learni

[2017-03-23 04:14:48,691] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video007000.mp4


[NOR] Episode: 7000, Length: 317, Avg Reward: 198.351995221, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.70604014397
[NOR] Episode: 7010, Length: 505, Avg Reward: 156.885776462, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.684836566448
[NOR] Episode: 7020, Length: 353, Avg Reward: 185.102464804, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.23024463654
[NOR] Episode: 7030, Length: 197, Avg Reward: 160.9582096, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.619274556637
[NOR] Episode: 7040, Length: 341, Avg Reward: 183.859810617, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.186781138182
[NOR] Episode: 7050, Length: 413, Avg Reward: 180.210002317, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.256481766701
[NOR] Episode: 7060, Length: 514, Avg Reward: 203.908873315, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.568216502666
[NOR] Episode: 7070, Length: 341, Avg Reward: 197.190153905, e: 0.05, Learni

[2017-03-23 04:43:23,446] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video008000.mp4


[NOR] Episode: 8000, Length: 322, Avg Reward: 193.255776059, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.27429035306
[NOR] Episode: 8010, Length: 312, Avg Reward: 204.967344088, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.783667206764
[NOR] Episode: 8020, Length: 328, Avg Reward: 207.874787064, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.542711019516
[NOR] Episode: 8030, Length: 529, Avg Reward: 212.622782263, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.21141338348
[NOR] Episode: 8040, Length: 649, Avg Reward: 208.434696595, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -8.94071674347
[NOR] Episode: 8050, Length: 319, Avg Reward: 180.965328985, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 1.0081486702
[NOR] Episode: 8060, Length: 276, Avg Reward: 215.494580145, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.0888994038105
[NOR] Episode: 8070, Length: 283, Avg Reward: 219.791781153, e: 0.05, Learn

[2017-03-23 05:13:18,838] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video009000.mp4


[NOR] Episode: 9000, Length: 787, Avg Reward: 152.481231526, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.606279850006
[NOR] Episode: 9010, Length: 297, Avg Reward: 133.314469092, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.145873874426
[NOR] Episode: 9020, Length: 205, Avg Reward: 65.5681959452, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.672143936157
[NOR] Episode: 9030, Length: 280, Avg Reward: 168.183262082, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.100618287921
[NOR] Episode: 9040, Length: 186, Avg Reward: 93.1996258895, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.184693410993
[NOR] Episode: 9050, Length: 1000, Avg Reward: 64.6543939753, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.87618303299
[NOR] Episode: 9060, Length: 255, Avg Reward: 165.490013234, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.564447045326
[NOR] Episode: 9070, Length: 658, Avg Reward: 139.358946674, e: 0.05, L

[2017-03-23 05:35:54,374] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video010000.mp4


[NOR] Episode: 10000, Length: 301, Avg Reward: 215.859526657, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.242112115026
[NOR] Episode: 10010, Length: 365, Avg Reward: 219.586125706, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.0429403334856
[NOR] Episode: 10020, Length: 373, Avg Reward: 211.933235394, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.363117396832
[NOR] Episode: 10030, Length: 298, Avg Reward: 220.782512331, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.969698309898
[NOR] Episode: 10040, Length: 395, Avg Reward: 218.53817196, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.172413855791
[NOR] Episode: 10050, Length: 388, Avg Reward: 212.009448546, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.532916128635
[NOR] Episode: 10060, Length: 379, Avg Reward: 220.118396625, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.909634709358
[NOR] Episode: 10070, Length: 401, Avg Reward: 209.600969164, e

[2017-03-23 06:12:11,979] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video011000.mp4


[NOR] Episode: 11000, Length: 356, Avg Reward: 185.688603719, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.821406841278
[NOR] Episode: 11010, Length: 341, Avg Reward: 171.263262975, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.406763285398
[NOR] Episode: 11020, Length: 352, Avg Reward: 198.668548282, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.767876029015
[NOR] Episode: 11030, Length: 395, Avg Reward: 195.289554831, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -2.3298535347
[NOR] Episode: 11040, Length: 188, Avg Reward: 173.680038327, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.45583570004
[NOR] Episode: 11050, Length: 364, Avg Reward: 195.171161247, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.482598572969
[NOR] Episode: 11060, Length: 545, Avg Reward: 196.808728911, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.97969174385
[NOR] Episode: 11070, Length: 345, Avg Reward: 217.918479607, e: 0

[2017-03-23 06:48:28,883] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video012000.mp4


[NOR] Episode: 12000, Length: 323, Avg Reward: 41.3672979409, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.00386797636747
[NOR] Episode: 12010, Length: 692, Avg Reward: -17.8602356071, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.532893836498
[NOR] Episode: 12020, Length: 951, Avg Reward: 51.7037693555, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.457917422056
[NOR] Episode: 12030, Length: 1000, Avg Reward: -36.5657848162, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.156159073114
[NOR] Episode: 12040, Length: 558, Avg Reward: 38.1504539473, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.118020638824
[NOR] Episode: 12050, Length: 437, Avg Reward: 46.2315207067, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.432966262102
[NOR] Episode: 12060, Length: 172, Avg Reward: -6.57488078849, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.393674612045
[NOR] Episode: 12070, Length: 428, Avg Reward: 18.982031831

[2017-03-23 07:20:11,074] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/24/openaigym.video.23.10254.video013000.mp4


[NOR] Episode: 13000, Length: 913, Avg Reward: 110.293591455, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.683110296726
[NOR] Episode: 13010, Length: 634, Avg Reward: 104.562120538, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.193180352449
[NOR] Episode: 13020, Length: 735, Avg Reward: 94.0510525536, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.745446681976
[NOR] Episode: 13030, Length: 733, Avg Reward: 82.3592709385, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.12540626526
[NOR] Episode: 13040, Length: 531, Avg Reward: 103.876386144, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.935826838017
[NOR] Episode: 13050, Length: 274, Avg Reward: 82.5586403889, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.36504709721
[NOR] Episode: 13060, Length: 121, Avg Reward: -104.283707728, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.30247426033
[NOR] Episode: 13070, Length: 163, Avg Reward: -98.4221063992, e

KeyboardInterrupt: 

In [6]:

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd(), name = name)


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    
    print(total)
    
env.render(close=True)


[2017-03-17 18:20:33,289] Making new env: LunarLander-v2


False
212.63485096
222.209029125
137.585769854
180.875673014
206.109792056
244.175226865
219.223174017
226.254220443
189.530083255
180.200875147
226.533856294
215.507410864
257.758179742


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type