In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run, shifted_log_loss
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time


name = "actor-critic-base"

In [10]:


class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.bool, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)        
            
            self.V = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]

            self.target = tf.where(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base_model, inputs, critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 128, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)
            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer2", use_bias=True, **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)

            
            self.logits = tf.layers.dense(net, n_actions, name='P', use_bias=False, **ops)
            self.P = tf.nn.softmax(self.logits)
            
            self.Pa = select_columns(self.P, inputs.a)

            self.loss = shifted_log_loss(self.Pa) * critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


In [11]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')

        self.replay_buffer = ExperienceReplay(5, max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                self.replay_buffer.append(s, a, r, s1, done)
                
                
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [12]:
run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{run}".format(run = run))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{run}".format(path = os.getcwd(), run = run)
logs_path = "{path}/logs/{run}".format(path = os.getcwd(), run = run)


model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.005
)

[2017-03-23 16:11:05,707] Making new env: LunarLander-v2
[2017-03-23 16:11:05,710] Creating monitor directory monitor/27


False


In [None]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.002, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1
)

[2017-03-23 16:11:06,521] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000000.mp4
[2017-03-23 16:11:14,540] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000001.mp4


[MAX] Episode: 0, Length: 153, Reward: -229.244791235, buffer_len: 153
[MAX] Episode: 6, Length: 156, Reward: -199.190314485, buffer_len: 1374


[2017-03-23 16:11:20,165] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000008.mp4


[MAX] Episode: 10, Length: 73, Reward: -83.1431761836, buffer_len: 1984
[NOR] Episode: 10, Length: 73, Avg Reward: -349.558546986, e: 0.3976865, Learning Rate: 0.002, buffer_len: 1984
Loss: -4.00674438477
[NOR] Episode: 20, Length: 74, Avg Reward: -520.052531677, e: 0.396374, Learning Rate: 0.002, buffer_len: 3109
Loss: -5.40413951874


[2017-03-23 16:11:29,330] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000027.mp4


[NOR] Episode: 30, Length: 151, Avg Reward: -549.930262161, e: 0.395048666667, Learning Rate: 0.002, buffer_len: 4245
Loss: -4.74908542633
[NOR] Episode: 40, Length: 78, Avg Reward: -456.695379397, e: 0.393844666667, Learning Rate: 0.002, buffer_len: 5277
Loss: -7.82006978989
[MAX] Episode: 42, Length: 1000, Reward: 21.8118941444, buffer_len: 6355
[NOR] Episode: 50, Length: 325, Avg Reward: -134.876163275, e: 0.388686833333, Learning Rate: 0.002, buffer_len: 9698
Loss: -8.99314880371
[NOR] Episode: 60, Length: 135, Avg Reward: -191.0788197, e: 0.3847855, Learning Rate: 0.002, buffer_len: 13042
Loss: -9.96517467499


[2017-03-23 16:12:06,084] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000064.mp4


[MAX] Episode: 66, Length: 645, Reward: 97.4244092196, buffer_len: 15412
[NOR] Episode: 70, Length: 312, Avg Reward: -92.9634802731, e: 0.3801375, Learning Rate: 0.002, buffer_len: 17026
Loss: -3.62788248062
[NOR] Episode: 80, Length: 181, Avg Reward: -135.806014708, e: 0.375088166667, Learning Rate: 0.002, buffer_len: 21354
Loss: -4.69043779373
[NOR] Episode: 90, Length: 280, Avg Reward: -119.100028599, e: 0.3700155, Learning Rate: 0.002, buffer_len: 25702
Loss: -1.21044552326
[NOR] Episode: 100, Length: 121, Avg Reward: -100.119687985, e: 0.3650385, Learning Rate: 0.002, buffer_len: 29968
Loss: -1.13451874256
[NOR] Episode: 110, Length: 160, Avg Reward: -91.9696142684, e: 0.362153333333, Learning Rate: 0.002, buffer_len: 32441
Loss: -2.80442619324
[NOR] Episode: 120, Length: 154, Avg Reward: -175.738541751, e: 0.360591166667, Learning Rate: 0.002, buffer_len: 33780
Loss: -0.0747852921486


[2017-03-23 16:13:31,672] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000125.mp4


[NOR] Episode: 130, Length: 156, Avg Reward: -220.988399114, e: 0.3591305, Learning Rate: 0.002, buffer_len: 35032
Loss: -2.04960727692
[NOR] Episode: 140, Length: 109, Avg Reward: -127.140908591, e: 0.355793833333, Learning Rate: 0.002, buffer_len: 37892
Loss: -3.48042011261
[NOR] Episode: 150, Length: 99, Avg Reward: -296.739705315, e: 0.354736833333, Learning Rate: 0.002, buffer_len: 38798
Loss: -1.29190063477
[NOR] Episode: 160, Length: 176, Avg Reward: -172.053190404, e: 0.353332166667, Learning Rate: 0.002, buffer_len: 40002
Loss: -6.97999238968
[NOR] Episode: 170, Length: 1000, Avg Reward: -159.037032277, e: 0.3489035, Learning Rate: 0.002, buffer_len: 43798
Loss: -4.02130079269
[NOR] Episode: 180, Length: 281, Avg Reward: -58.9089999952, e: 0.344793333333, Learning Rate: 0.002, buffer_len: 47321
Loss: -2.51121640205
[MAX] Episode: 181, Length: 1000, Reward: 99.8685626909, buffer_len: 48321
[NOR] Episode: 190, Length: 197, Avg Reward: -84.4453929308, e: 0.3416515, Learning Rate:

[2017-03-23 16:14:59,481] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000216.mp4


[NOR] Episode: 220, Length: 174, Avg Reward: -97.018105782, e: 0.332966833333, Learning Rate: 0.002, buffer_len: 57458
Loss: 0.977512359619
[NOR] Episode: 230, Length: 155, Avg Reward: -114.464612211, e: 0.331258833333, Learning Rate: 0.002, buffer_len: 58922
Loss: -3.66474342346
[NOR] Episode: 240, Length: 224, Avg Reward: -101.437091019, e: 0.329322166667, Learning Rate: 0.002, buffer_len: 60582
Loss: -4.18593883514
[NOR] Episode: 250, Length: 128, Avg Reward: -128.302399929, e: 0.327424, Learning Rate: 0.002, buffer_len: 62209
Loss: -0.0509030222893
[NOR] Episode: 260, Length: 153, Avg Reward: -97.3189406017, e: 0.325270333333, Learning Rate: 0.002, buffer_len: 64055
Loss: -2.78596735001
[NOR] Episode: 270, Length: 127, Avg Reward: -124.103892438, e: 0.323422333333, Learning Rate: 0.002, buffer_len: 65639
Loss: -1.84047353268
[NOR] Episode: 280, Length: 167, Avg Reward: -127.16001214, e: 0.321488, Learning Rate: 0.002, buffer_len: 67297
Loss: -3.40476894379
[NOR] Episode: 290, Lengt

[2017-03-23 16:16:51,967] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000343.mp4


[NOR] Episode: 350, Length: 202, Avg Reward: -35.678343063, e: 0.294987166667, Learning Rate: 0.002, buffer_len: 90012
Loss: 1.81864070892
[MAX] Episode: 351, Length: 723, Reward: 146.523224822, buffer_len: 90735
[NOR] Episode: 360, Length: 244, Avg Reward: -6.86647791673, e: 0.2880595, Learning Rate: 0.002, buffer_len: 95950
Loss: -2.13402938843
[MAX] Episode: 361, Length: 340, Reward: 164.040888163, buffer_len: 96290
[NOR] Episode: 370, Length: 138, Avg Reward: -42.6090627936, e: 0.2823965, Learning Rate: 0.002, buffer_len: 100804
Loss: -2.3611676693
[NOR] Episode: 380, Length: 1000, Avg Reward: -21.6554577176, e: 0.273995333333, Learning Rate: 0.002, buffer_len: 108005
Loss: -5.26689958572
[NOR] Episode: 390, Length: 497, Avg Reward: 9.67759390557, e: 0.264995666667, Learning Rate: 0.002, buffer_len: 115719
Loss: -8.89630794525
[NOR] Episode: 400, Length: 1000, Avg Reward: -7.42759707553, e: 0.256623666667, Learning Rate: 0.002, buffer_len: 122895
Loss: -1.92040622234
[MAX] Episode:

[2017-03-23 16:23:59,898] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000512.mp4


[NOR] Episode: 520, Length: 398, Avg Reward: -54.3171931464, e: 0.173187166667, Learning Rate: 0.002, buffer_len: 194412
Loss: -4.18339443207
[NOR] Episode: 530, Length: 209, Avg Reward: -32.0010323107, e: 0.1660075, Learning Rate: 0.002, buffer_len: 200566
Loss: -1.37670195103
[NOR] Episode: 540, Length: 344, Avg Reward: 4.59464100836, e: 0.158885, Learning Rate: 0.002, buffer_len: 206671
Loss: 5.54103183746
[NOR] Episode: 550, Length: 1000, Avg Reward: 28.8735830419, e: 0.150909666667, Learning Rate: 0.002, buffer_len: 213507
Loss: -2.07209062576
[NOR] Episode: 560, Length: 452, Avg Reward: -63.7860949957, e: 0.146623333333, Learning Rate: 0.002, buffer_len: 217181
Loss: -0.932694673538
[NOR] Episode: 570, Length: 150, Avg Reward: -152.831682143, e: 0.142071, Learning Rate: 0.002, buffer_len: 221083
Loss: -3.21540880203
[NOR] Episode: 580, Length: 381, Avg Reward: -164.445426455, e: 0.138712166667, Learning Rate: 0.002, buffer_len: 223962
Loss: -1.38223290443
[NOR] Episode: 590, Leng

[2017-03-23 16:31:22,947] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video000729.mp4


[NOR] Episode: 730, Length: 380, Avg Reward: -64.3893151943, e: 0.0500233333333, Learning Rate: 0.002, buffer_len: 299981
Loss: -2.26348233223
[NOR] Episode: 740, Length: 576, Avg Reward: -45.489821629, e: 0.05, Learning Rate: 0.002, buffer_len: 305275
Loss: -0.522294938564
[NOR] Episode: 750, Length: 539, Avg Reward: -42.2189712273, e: 0.05, Learning Rate: 0.002, buffer_len: 309823
Loss: -0.705880284309
[NOR] Episode: 760, Length: 1000, Avg Reward: -11.9431309635, e: 0.05, Learning Rate: 0.002, buffer_len: 314959
Loss: -0.301007390022
[NOR] Episode: 770, Length: 740, Avg Reward: -15.7184341403, e: 0.05, Learning Rate: 0.002, buffer_len: 320479
Loss: 1.84604632854
[NOR] Episode: 780, Length: 762, Avg Reward: 12.2524531479, e: 0.05, Learning Rate: 0.002, buffer_len: 326242
Loss: -1.81003117561
[NOR] Episode: 790, Length: 453, Avg Reward: 28.2625820243, e: 0.05, Learning Rate: 0.002, buffer_len: 331708
Loss: -0.355839192867
[NOR] Episode: 800, Length: 559, Avg Reward: -45.2608920484, e: 

[2017-03-23 16:40:47,680] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video001000.mp4


[NOR] Episode: 1000, Length: 964, Avg Reward: -43.9108466078, e: 0.05, Learning Rate: 0.002, buffer_len: 435063
Loss: -1.95838820934
[NOR] Episode: 1010, Length: 944, Avg Reward: -12.4057315411, e: 0.05, Learning Rate: 0.002, buffer_len: 439978
Loss: 0.552754819393
[NOR] Episode: 1020, Length: 350, Avg Reward: -58.5776861254, e: 0.05, Learning Rate: 0.002, buffer_len: 444907
Loss: -3.17436933517
[NOR] Episode: 1030, Length: 569, Avg Reward: -48.386242217, e: 0.05, Learning Rate: 0.002, buffer_len: 449774
Loss: -2.94911241531
[NOR] Episode: 1040, Length: 411, Avg Reward: -22.2013142673, e: 0.05, Learning Rate: 0.002, buffer_len: 453901
Loss: -1.40606343746
[NOR] Episode: 1050, Length: 269, Avg Reward: 3.42773443422, e: 0.05, Learning Rate: 0.002, buffer_len: 459382
Loss: -1.31490254402
[NOR] Episode: 1060, Length: 728, Avg Reward: 15.1008029895, e: 0.05, Learning Rate: 0.002, buffer_len: 464356
Loss: 0.402203202248
[NOR] Episode: 1070, Length: 197, Avg Reward: -83.0089497378, e: 0.05, L

[2017-03-23 17:26:10,158] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/27/openaigym.video.2.25564.video002000.mp4


[NOR] Episode: 2000, Length: 288, Avg Reward: 141.942679739, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.165142059326
[NOR] Episode: 2010, Length: 457, Avg Reward: 150.630464735, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.13773488998
[NOR] Episode: 2020, Length: 530, Avg Reward: 56.651510002, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -1.57807087898
[NOR] Episode: 2030, Length: 637, Avg Reward: 96.1385635797, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.1858522892
[NOR] Episode: 2040, Length: 421, Avg Reward: 127.57834515, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.00994777679443
[NOR] Episode: 2050, Length: 571, Avg Reward: 125.094223443, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: 0.10862416029
[NOR] Episode: 2060, Length: 466, Avg Reward: 153.216891706, e: 0.05, Learning Rate: 0.002, buffer_len: 500000
Loss: -0.426585495472
[NOR] Episode: 2070, Length: 492, Avg Reward: 178.768384365, e: 0.05, Learni

In [6]:

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd(), name = name)


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    
    print(total)
    
env.render(close=True)


[2017-03-17 18:20:33,289] Making new env: LunarLander-v2


False
212.63485096
222.209029125
137.585769854
180.875673014
206.109792056
244.175226865
219.223174017
226.254220443
189.530083255
180.200875147
226.533856294
215.507410864
257.758179742


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type