In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os


name = "lunar-lander-base"

In [4]:
class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.V = base_model.define_critic_network(inputs, n_actions, n_states)

            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.P = base_model.define_actor_network(inputs, n_actions, n_states)

            self.Pa = select_columns(self.P, inputs.a)

            self.loss = - tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


In [5]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')

        self.replay_buffer = ExperienceReplay(max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])
            
            
    def define_actor_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        
        net = inputs.s
        
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
        net = tf.nn.dropout(net, inputs.keep_prob)
        
        net = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
        
        return net


    def define_critic_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        
        net = inputs.s
        
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)        
        net = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]
        
        return net
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                self.replay_buffer.append((s, a, r, s1, float(done)))
                
                
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}.max".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [6]:
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "tmp/monitor/{}".format(name))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{name}.model".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/{name}".format(path = os.getcwd(), name = name)

print("Run: {}".format(run))

model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.005
)

[2017-03-16 13:41:55,070] Making new env: LunarLander-v2
[2017-03-16 13:41:55,328] Creating monitor directory tmp/monitor57


False
Run: 57


In [7]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.01, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1
)

[2017-03-16 10:44:10,080] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000000.mp4
[2017-03-16 10:44:12,361] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000001.mp4


[MAX] Episode: 0, Length: 87, Reward: -457.01852629, buffer_len: 87
[MAX] Episode: 1, Length: 203, Reward: -390.848072174, buffer_len: 290
[MAX] Episode: 2, Length: 126, Reward: -368.230072013, buffer_len: 416


[2017-03-16 10:44:16,406] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000008.mp4


[MAX] Episode: 8, Length: 63, Reward: -144.60731326, buffer_len: 1151
[MAX] Episode: 9, Length: 109, Reward: -73.8010988155, buffer_len: 1260
[NOR] Episode: 10, Length: 92, Avg Reward: -428.579284157, e: 0.398423833333, Learning Rate: 0.01, buffer_len: 1352
Loss: -32.450428009
[MAX] Episode: 13, Length: 1000, Reward: 47.6789619196, buffer_len: 2690
[NOR] Episode: 20, Length: 102, Avg Reward: -148.688130463, e: 0.394859666667, Learning Rate: 0.01, buffer_len: 4407
Loss: -11.5019416809


[2017-03-16 10:44:31,873] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000027.mp4


[NOR] Episode: 30, Length: 176, Avg Reward: -72.148762292, e: 0.3917575, Learning Rate: 0.01, buffer_len: 7066
Loss: 1.22499370575
[NOR] Episode: 40, Length: 1000, Avg Reward: -69.7134553905, e: 0.387684666667, Learning Rate: 0.01, buffer_len: 10557
Loss: -10.9945850372
[NOR] Episode: 50, Length: 309, Avg Reward: -85.6355955524, e: 0.383800833333, Learning Rate: 0.01, buffer_len: 13886
Loss: -7.93014812469
[MAX] Episode: 53, Length: 1000, Reward: 79.2486911251, buffer_len: 15311
[MAX] Episode: 56, Length: 596, Reward: 109.417531459, buffer_len: 17907
[NOR] Episode: 60, Length: 123, Avg Reward: -4.36597404064, e: 0.376503333333, Learning Rate: 0.01, buffer_len: 20141
Loss: -8.08912658691


[2017-03-16 10:45:20,274] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000064.mp4


[NOR] Episode: 70, Length: 1000, Avg Reward: -31.3861294674, e: 0.3706035, Learning Rate: 0.01, buffer_len: 25198
Loss: 7.95368099213
[NOR] Episode: 80, Length: 1000, Avg Reward: -36.77623516, e: 0.365051333333, Learning Rate: 0.01, buffer_len: 29957
Loss: -9.58049297333
[NOR] Episode: 90, Length: 225, Avg Reward: -45.5070731288, e: 0.360147833333, Learning Rate: 0.01, buffer_len: 34160
Loss: -1.79070329666
[NOR] Episode: 100, Length: 134, Avg Reward: -71.2668845376, e: 0.356976833333, Learning Rate: 0.01, buffer_len: 36878
Loss: -6.25906276703
[NOR] Episode: 110, Length: 177, Avg Reward: -37.6401780045, e: 0.352029, Learning Rate: 0.01, buffer_len: 41119
Loss: 3.45888948441
[NOR] Episode: 120, Length: 116, Avg Reward: -51.6038872647, e: 0.345135166667, Learning Rate: 0.01, buffer_len: 47028
Loss: -3.90584683418


[2017-03-16 10:47:02,407] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000125.mp4


[MAX] Episode: 126, Length: 523, Reward: 134.833892431, buffer_len: 50359
[NOR] Episode: 130, Length: 1000, Avg Reward: -1.1841441287, e: 0.3373605, Learning Rate: 0.01, buffer_len: 53692
Loss: 21.6594333649
[NOR] Episode: 140, Length: 116, Avg Reward: 12.5260148717, e: 0.328520666667, Learning Rate: 0.01, buffer_len: 61269
Loss: -4.17341375351
[MAX] Episode: 145, Length: 559, Reward: 182.317735485, buffer_len: 65283
[NOR] Episode: 150, Length: 215, Avg Reward: 14.2347012856, e: 0.321661833333, Learning Rate: 0.01, buffer_len: 67148
Loss: -3.46813130379
[NOR] Episode: 160, Length: 1000, Avg Reward: -14.0461541302, e: 0.314299, Learning Rate: 0.01, buffer_len: 73459
Loss: 4.3194975853
[MAX] Episode: 164, Length: 408, Reward: 205.010610369, buffer_len: 75242
[NOR] Episode: 170, Length: 270, Avg Reward: -11.6110120232, e: 0.3093115, Learning Rate: 0.01, buffer_len: 77734
Loss: -10.7033576965
[MAX] Episode: 176, Length: 685, Reward: 216.928243653, buffer_len: 81878
[NOR] Episode: 180, Leng

[2017-03-16 10:50:26,663] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000216.mp4


[NOR] Episode: 220, Length: 128, Avg Reward: 38.5149973797, e: 0.267988166667, Learning Rate: 0.01, buffer_len: 113154
Loss: 2.68323755264
[NOR] Episode: 230, Length: 1000, Avg Reward: 47.2137635391, e: 0.2598775, Learning Rate: 0.01, buffer_len: 120106
Loss: -3.71074056625
[NOR] Episode: 240, Length: 307, Avg Reward: 31.2103931428, e: 0.250754166667, Learning Rate: 0.01, buffer_len: 127926
Loss: -1.64285564423
[NOR] Episode: 250, Length: 568, Avg Reward: 69.1927049243, e: 0.241725333333, Learning Rate: 0.01, buffer_len: 135665
Loss: -1.56709599495
[NOR] Episode: 260, Length: 632, Avg Reward: 155.307123289, e: 0.233192333333, Learning Rate: 0.01, buffer_len: 142979
Loss: -0.63961738348
[MAX] Episode: 263, Length: 374, Reward: 221.578576471, buffer_len: 145196
[NOR] Episode: 270, Length: 442, Avg Reward: 155.465071631, e: 0.223983833333, Learning Rate: 0.01, buffer_len: 150872
Loss: -6.01575517654
[NOR] Episode: 280, Length: 584, Avg Reward: 127.460696161, e: 0.214931666667, Learning Ra

[2017-03-16 10:56:29,522] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000343.mp4


[NOR] Episode: 350, Length: 844, Avg Reward: 142.625246151, e: 0.146838, Learning Rate: 0.01, buffer_len: 216997
Loss: 0.0976938009262
[NOR] Episode: 360, Length: 632, Avg Reward: 133.897176573, e: 0.1383575, Learning Rate: 0.01, buffer_len: 224266
Loss: -10.5042276382
[NOR] Episode: 370, Length: 589, Avg Reward: 153.500765458, e: 0.129966833333, Learning Rate: 0.01, buffer_len: 231458
Loss: -4.02126264572
[NOR] Episode: 380, Length: 659, Avg Reward: 148.4335002, e: 0.1228875, Learning Rate: 0.01, buffer_len: 237526
Loss: 3.96236777306
[MAX] Episode: 382, Length: 462, Reward: 222.873168397, buffer_len: 238454
[NOR] Episode: 390, Length: 654, Avg Reward: 184.117158588, e: 0.116476666667, Learning Rate: 0.01, buffer_len: 243021
Loss: -3.90280771255
[NOR] Episode: 400, Length: 699, Avg Reward: 176.947032007, e: 0.110235, Learning Rate: 0.01, buffer_len: 248371
Loss: 0.491518825293
[NOR] Episode: 410, Length: 453, Avg Reward: 126.078457626, e: 0.102749666667, Learning Rate: 0.01, buffer_le

[2017-03-16 11:02:21,762] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000512.mp4


[NOR] Episode: 520, Length: 704, Avg Reward: 102.145918062, e: 0.05, Learning Rate: 0.01, buffer_len: 326051
Loss: -2.73225212097
[NOR] Episode: 530, Length: 574, Avg Reward: 139.902687615, e: 0.05, Learning Rate: 0.01, buffer_len: 332902
Loss: -3.37321710587
[NOR] Episode: 540, Length: 937, Avg Reward: 138.346785993, e: 0.05, Learning Rate: 0.01, buffer_len: 340181
Loss: 2.24881196022
[NOR] Episode: 550, Length: 566, Avg Reward: 156.955330163, e: 0.05, Learning Rate: 0.01, buffer_len: 346494
Loss: -1.19764828682
[NOR] Episode: 560, Length: 611, Avg Reward: 133.053876399, e: 0.05, Learning Rate: 0.01, buffer_len: 352436
Loss: -1.76882362366
[MAX] Episode: 566, Length: 474, Reward: 229.69342739, buffer_len: 355709
[NOR] Episode: 570, Length: 683, Avg Reward: 126.008719404, e: 0.05, Learning Rate: 0.01, buffer_len: 358084
Loss: -1.83122324944
[NOR] Episode: 580, Length: 478, Avg Reward: 135.694028198, e: 0.05, Learning Rate: 0.01, buffer_len: 364483
Loss: -9.27827644348
[NOR] Episode: 59

[2017-03-16 11:09:25,194] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video000729.mp4


[NOR] Episode: 730, Length: 404, Avg Reward: 127.252255688, e: 0.05, Learning Rate: 0.01, buffer_len: 444958
Loss: -1.62529611588
[NOR] Episode: 740, Length: 505, Avg Reward: 52.0928802653, e: 0.05, Learning Rate: 0.01, buffer_len: 449163
Loss: -5.46403264999
[NOR] Episode: 750, Length: 243, Avg Reward: 99.3296176444, e: 0.05, Learning Rate: 0.01, buffer_len: 453202
Loss: -2.77278590202
[NOR] Episode: 760, Length: 420, Avg Reward: 40.8728026399, e: 0.05, Learning Rate: 0.01, buffer_len: 457099
Loss: -0.582754075527
[NOR] Episode: 770, Length: 291, Avg Reward: -8.29672123571, e: 0.05, Learning Rate: 0.01, buffer_len: 461084
Loss: -0.343167483807
[NOR] Episode: 780, Length: 637, Avg Reward: 71.205509793, e: 0.05, Learning Rate: 0.01, buffer_len: 465194
Loss: -4.40495967865
[NOR] Episode: 790, Length: 578, Avg Reward: 15.4993917253, e: 0.05, Learning Rate: 0.01, buffer_len: 469169
Loss: 13.8341121674
[NOR] Episode: 800, Length: 574, Avg Reward: 60.820480122, e: 0.05, Learning Rate: 0.01, 

[2017-03-16 11:17:12,423] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video001000.mp4


[NOR] Episode: 1000, Length: 367, Avg Reward: 194.484978962, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.15144920349
[NOR] Episode: 1010, Length: 364, Avg Reward: 208.253204158, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.837719917297
[NOR] Episode: 1020, Length: 307, Avg Reward: 206.633619102, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.265700161457
[NOR] Episode: 1030, Length: 350, Avg Reward: 169.098889432, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -3.81208229065
[NOR] Episode: 1040, Length: 357, Avg Reward: 192.049465174, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.50517392159
[NOR] Episode: 1050, Length: 314, Avg Reward: 173.199658965, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -1.5434602499
[NOR] Episode: 1060, Length: 360, Avg Reward: 204.912166051, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.92996001244
[NOR] Episode: 1070, Length: 382, Avg Reward: 199.603278539, e: 0.05, Learning Rat

[2017-03-16 11:51:10,890] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55/openaigym.video.0.27716.video002000.mp4


[NOR] Episode: 2000, Length: 127, Avg Reward: 43.995813235, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.0286655426
[NOR] Episode: 2010, Length: 240, Avg Reward: 84.0501289719, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -4.6355175972
[NOR] Episode: 2020, Length: 134, Avg Reward: 136.204084089, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 0.792154073715
[NOR] Episode: 2030, Length: 242, Avg Reward: 102.398188033, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: 4.03542375565
[NOR] Episode: 2040, Length: 225, Avg Reward: 136.963581985, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -0.33333337307
[NOR] Episode: 2050, Length: 260, Avg Reward: 116.022964447, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -2.22750210762
[NOR] Episode: 2060, Length: 204, Avg Reward: 125.769399224, e: 0.05, Learning Rate: 0.01, buffer_len: 500000
Loss: -4.71191978455
[NOR] Episode: 2070, Length: 275, Avg Reward: 162.933239741, e: 0.05, Learning Rate: 0.

KeyboardInterrupt: 

In [8]:

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{name}.model.{value}.max".format(path = os.getcwd(), name = name, value="271.434416936")
logs_path = "{path}/logs/{name}".format(path = os.getcwd(), name = name)


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    print(total)
env.render(close=True)


[2017-03-16 12:11:47,778] Making new env: LunarLander-v2


False
256.995211545
219.719334228
188.150320307
225.556655162
242.314481766
232.00207613
-20.6487954378
228.875880237
244.614869857
207.152128157
213.577989837
219.722156779
228.111420993
188.291081195
248.019976222
115.839584285
208.234549767
221.813891917
225.499633792
204.782779931
147.330067555
217.343215756
222.146525834
229.015016524
179.817766037
19.3131672435
251.024919233
218.619111896
244.743774959
231.125136202
212.506361632
31.9656486777
223.620479943
203.400869115
218.980732617
212.252630485
221.465818493
223.956464597
254.314942186
199.561276632
223.12134933
238.142993847
229.657699388
190.477349433
206.487812275
235.674270079
191.74325577
239.389023056
220.721559604
211.528843596
200.210569897


[2017-03-16 12:15:48,643] Finished writing results. You can upload them to the scoreboard via gym.upload('/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor55')


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type

In [1]:

from pynput.keyboard import Key, Listener
import time
import gym
from gym import wrappers

ACTION = 0

UP = False
LEFT = False
RIGHT = False

def set_action():
    global ACTION
    
    if RIGHT:
        ACTION = 3
    elif LEFT:
        ACTION = 1
    elif UP:
        ACTION = 0
    else:
        ACTION = 2
        

def on_press(key):
    global UP, LEFT, RIGHT

    if key == Key.left:
        LEFT = True
        RIGHT = False
    elif key == Key.right:
        RIGHT = True
        LEFT = False
    elif key == Key.down:
        UP = True
        
    set_action()

def on_release(key):
    global UP, LEFT, RIGHT
    
    if key == Key.left:
        LEFT = False
    elif key == Key.right:
        RIGHT = False
    elif key == Key.down:
        UP = False
        
    set_action()


# Collect events until released
with Listener(
        on_press=on_press,
        on_release=on_release):


    env = gym.make("LunarLander-v2")

    
    while True:
        s = env.reset()
        done = False
        total = 0.
        ep = 0
        while not done and ep < 700:
            ep += 1
            a = ACTION
            s, r, done, info = env.step(a)
            total += r
            env.render()
            time.sleep(0.02)
        print(total)
    
    env.render(close=True)


[2017-03-16 14:52:50,439] Making new env: LunarLander-v2


-197.609169383


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type