In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run, map_gradients
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time
from itertools import groupby

name = "actor-critic-shared-base"

'module' object has no attribute '__module__'


In [3]:
def update_dict(d, key, default, f):
    if key in d:
        d[key] = f(d[key])
    else:
        d[key] = default

def combine_gradients(grads1, grads2):
    d = {}
    
    for g, v in grads1 + grads2:
        update_dict(d, v, g, (g1) -> g1 + g)
    
    return [ (g, v) for v, g in d.items() ]


grads1 = [(5, 0), (3, 1)]
grads2 = [(2, 0)]
combine_gradients(grads1, grads2)

[(7, 0), (3, 1)]

In [23]:
class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')
            self.episode_reward = tf.placeholder(tf.float32, [], name='episode_reward')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Base(object):
    def __init__(self, inputs, n_states, scope, ops):
        with tf.variable_scope(scope):
            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
            self.net = tf.nn.dropout(net, inputs.keep_prob)
            
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

class Critic(object):
    def __init__(self, base, inputs, n_actions, n_states, y, scope, ops):
        with tf.variable_scope(scope):
            
            self.V = (
                base.net
                |> tf.layers.dense(?, 32, name='relu_layer', activation=tf.nn.relu, **ops)
                |> tf.nn.dropout(?, inputs.keep_prob)
                |> tf.layers.dense(?, n_actions, name='V', **ops)
                |> (lambda net: net[:, 0])
            )

            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) + base.variables

            self.gradients = tf.train.AdamOptimizer(inputs.learning_rate).compute_gradients(self.loss, var_list=self.variables)

            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base, inputs, target_critic, n_actions, n_states, y, scope, ops):
        with tf.variable_scope(scope):
            self.P = (
                base.net
                |> tf.layers.dense(?, 32, name='relu_layer', activation=tf.nn.relu, **ops)
                |> tf.nn.dropout(?, inputs.keep_prob)
                |> tf.layers.dense(base.net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
            )
            
            
            self.Pa = select_columns(self.P, inputs.a)

            self.loss = - tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) + base.variables

            self.gradients = tf.train.AdamOptimizer(inputs.learning_rate).compute_gradients(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/coconut/icoconut/root.py", line 96, in cache
    compiled = memoized_parse_sys(code)
  File "/usr/local/lib/python2.7/dist-packages/coconut/icoconut/root.py", line 73, in memoized_parse_sys
    return COMPILER.header_proc(memoized_parse_block(code), header="sys", initial="none")
  File "/usr/local/lib/python2.7/dist-packages/coconut/icoconut/root.py", line 66, in memoized_parse_block
    raise result
CoconutParseError: parsing failed (line 34)
  self.V = (                 base.net                 |> tf.layers.dense(?, 32, name='relu_layer', activation=tf.nn.relu, **ops)                 |> tf.nn.dropout(?, inputs.keep_prob)                 |> tf.layers.dense(?, n_actions, name='V', **ops)                 |> (lambda net: net[:, 0])             )
                                                                        ^


SyntaxError: parsing failed
  self.V = (                 base.net                 |> tf.layers.dense(?, 32, name='relu_layer', activation=tf.nn.relu, **ops)                 |> tf.nn.dropout(?, inputs.keep_prob)                 |> tf.layers.dense(?, n_actions, name='V', **ops)                 |> (lambda net: net[:, 0])             )

In [39]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1, clip=10):
        self.global_max = float('-inf')
        self.replay_buffer = ExperienceReplay(max_length=buffer_length)
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")
            
            self.base = Base(self.inputs, n_states, "base", ops)
            self.target_base = Base(self.inputs, n_states, "target_base", ops)
            
            self.critic = Critic(self.base, self.inputs, n_actions, n_states, y, "critic", ops)
            self.target_critic = Critic(self.target_base, self.inputs, n_actions, n_states, y, "target_critic", ops)
            
            self.actor = Actor(self.base, self.inputs, self.target_critic, n_actions, n_states, y, "actor", ops)
            
            with tf.name_scope("combine_gradients"):
                self.gradients = (
                    combine_gradients(self.actor.gradients, self.critic.gradients)
                    |> map_gradients$(tf.clip_by_norm$(?, clip))
                )
            
            self.update = tf.train.AdamOptimizer(self.inputs.learning_rate).apply_gradients(self.gradients)

            self.episode_summaries = tf.summary.merge([
                tf.summary.scalar('episode_length', self.inputs.episode_length),
                tf.summary.scalar('episode_reward', self.inputs.episode_reward)
            ])

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])
            
            with tf.name_scope("update_targets"):
                self.update_target = tf.group(*[
                    t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
                ])
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                self.replay_buffer.append((s, a, r, s1, float(done)))
                
                
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_summaries = self.sess.run(self.episode_summaries,feed_dict={
                self.inputs.episode_length: episode_length,
                self.inputs.episode_reward: ep_reward
            })
            self.writer.add_summary(episode_summaries, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [40]:
run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{run}".format(run = run))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/{run}".format(path = os.getcwd(), name = name, run = run)


model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.001,
    clip = 1.
)

[2017-03-17 16:42:23,485] Making new env: LunarLander-v2
[2017-03-17 16:42:23,571] Creating monitor directory monitor/18


False


In [41]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.005, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1
)

[2017-03-17 16:42:24,272] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000000.mp4


[MAX] Episode: 0, Length: 107, Reward: -203.012263263, buffer_len: 107


[2017-03-17 16:42:26,694] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000001.mp4


[MAX] Episode: 1, Length: 103, Reward: -111.979307251, buffer_len: 210
[MAX] Episode: 2, Length: 1000, Reward: 77.1124244404, buffer_len: 1210
[MAX] Episode: 6, Length: 1000, Reward: 85.0371613394, buffer_len: 3095


[2017-03-17 16:42:38,609] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000008.mp4


[NOR] Episode: 10, Length: 229, Avg Reward: -183.836558465, e: 0.394752333333, Learning Rate: 0.005, buffer_len: 4499
Loss: -2.69007825851
[NOR] Episode: 20, Length: 96, Avg Reward: -166.329593138, e: 0.3888035, Learning Rate: 0.005, buffer_len: 9598
Loss: -2.90823554993


[2017-03-17 16:43:16,801] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000027.mp4


[NOR] Episode: 30, Length: 1000, Avg Reward: -154.126588885, e: 0.383674833333, Learning Rate: 0.005, buffer_len: 13994
Loss: -3.56683635712
[NOR] Episode: 40, Length: 585, Avg Reward: -139.990539321, e: 0.3757415, Learning Rate: 0.005, buffer_len: 20794
Loss: -5.59932327271
[NOR] Episode: 50, Length: 1000, Avg Reward: -166.049754661, e: 0.370069166667, Learning Rate: 0.005, buffer_len: 25656
Loss: -2.71342277527
[NOR] Episode: 60, Length: 1000, Avg Reward: -146.766133836, e: 0.361911833333, Learning Rate: 0.005, buffer_len: 32648
Loss: -2.85318017006


[2017-03-17 16:44:48,240] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000064.mp4


[NOR] Episode: 70, Length: 1000, Avg Reward: -124.471849263, e: 0.353283166667, Learning Rate: 0.005, buffer_len: 40044
Loss: 0.575408935547
[NOR] Episode: 80, Length: 239, Avg Reward: -212.367024655, e: 0.347483666667, Learning Rate: 0.005, buffer_len: 45015
Loss: -2.27229118347
[NOR] Episode: 90, Length: 1000, Avg Reward: -141.810697014, e: 0.337814333333, Learning Rate: 0.005, buffer_len: 53303
Loss: -2.23347210884
[NOR] Episode: 100, Length: 551, Avg Reward: -173.095287581, e: 0.330523833333, Learning Rate: 0.005, buffer_len: 59552
Loss: -6.47022914886
[NOR] Episode: 110, Length: 351, Avg Reward: -108.55942773, e: 0.319938666667, Learning Rate: 0.005, buffer_len: 68625
Loss: -6.21079349518
[NOR] Episode: 120, Length: 506, Avg Reward: -143.906085886, e: 0.311202666667, Learning Rate: 0.005, buffer_len: 76113
Loss: -3.97128748894


[2017-03-17 16:47:31,036] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000125.mp4


[NOR] Episode: 130, Length: 1000, Avg Reward: -198.704841922, e: 0.301809833333, Learning Rate: 0.005, buffer_len: 84164
Loss: -6.07128620148
[NOR] Episode: 140, Length: 1000, Avg Reward: -154.708287606, e: 0.291416, Learning Rate: 0.005, buffer_len: 93073
Loss: -4.73452615738
[NOR] Episode: 150, Length: 1000, Avg Reward: -162.787895553, e: 0.281310333333, Learning Rate: 0.005, buffer_len: 101735
Loss: -3.45110487938
[NOR] Episode: 160, Length: 1000, Avg Reward: -132.949199284, e: 0.2706715, Learning Rate: 0.005, buffer_len: 110854
Loss: -0.300063252449
[NOR] Episode: 170, Length: 1000, Avg Reward: -114.672772384, e: 0.260135333333, Learning Rate: 0.005, buffer_len: 119885
Loss: 0.494818031788
[NOR] Episode: 180, Length: 1000, Avg Reward: -133.22803155, e: 0.249337833333, Learning Rate: 0.005, buffer_len: 129140
Loss: -3.48935317993
[NOR] Episode: 190, Length: 1000, Avg Reward: -131.574531183, e: 0.237671166667, Learning Rate: 0.005, buffer_len: 139140
Loss: -4.56330966949
[NOR] Episod

[2017-03-17 16:53:05,722] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000216.mp4


[NOR] Episode: 220, Length: 1000, Avg Reward: -149.257113033, e: 0.203727, Learning Rate: 0.005, buffer_len: 168235
Loss: -1.82857179642
[NOR] Episode: 230, Length: 1000, Avg Reward: -136.781224701, e: 0.192060333333, Learning Rate: 0.005, buffer_len: 178235
Loss: 0.729239583015
[NOR] Episode: 240, Length: 1000, Avg Reward: -128.985386691, e: 0.181420333333, Learning Rate: 0.005, buffer_len: 187355
Loss: -1.62600064278
[NOR] Episode: 250, Length: 1000, Avg Reward: -133.786873503, e: 0.170719666667, Learning Rate: 0.005, buffer_len: 196527
Loss: -4.27626991272
[NOR] Episode: 260, Length: 1000, Avg Reward: -124.278950418, e: 0.159053, Learning Rate: 0.005, buffer_len: 206527
Loss: -1.99479913712
[NOR] Episode: 270, Length: 1000, Avg Reward: -152.247156899, e: 0.147386333333, Learning Rate: 0.005, buffer_len: 216527
Loss: -3.87882232666
[NOR] Episode: 280, Length: 1000, Avg Reward: -169.70437647, e: 0.1371115, Learning Rate: 0.005, buffer_len: 225334
Loss: -3.2510368824
[NOR] Episode: 290

[2017-03-17 17:00:50,273] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000343.mp4


[NOR] Episode: 350, Length: 1000, Avg Reward: -167.306676351, e: 0.0580313333333, Learning Rate: 0.005, buffer_len: 293117
Loss: -1.08064293861
[NOR] Episode: 360, Length: 1000, Avg Reward: -161.174088126, e: 0.05, Learning Rate: 0.005, buffer_len: 303009
Loss: -2.50993943214
[NOR] Episode: 370, Length: 1000, Avg Reward: -163.284908394, e: 0.05, Learning Rate: 0.005, buffer_len: 312850
Loss: -3.94488048553
[NOR] Episode: 380, Length: 1000, Avg Reward: -148.452704446, e: 0.05, Learning Rate: 0.005, buffer_len: 322344
Loss: -1.68650698662
[NOR] Episode: 390, Length: 368, Avg Reward: -152.5871093, e: 0.05, Learning Rate: 0.005, buffer_len: 331712
Loss: -1.82364737988
[NOR] Episode: 400, Length: 1000, Avg Reward: -147.729386431, e: 0.05, Learning Rate: 0.005, buffer_len: 341712
Loss: -2.0093536377
[NOR] Episode: 410, Length: 1000, Avg Reward: -141.439928737, e: 0.05, Learning Rate: 0.005, buffer_len: 351712
Loss: -2.58661818504
[NOR] Episode: 420, Length: 1000, Avg Reward: -146.861370317, 

[2017-03-17 17:11:49,041] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000512.mp4


[NOR] Episode: 520, Length: 1000, Avg Reward: -157.997559895, e: 0.05, Learning Rate: 0.005, buffer_len: 457316
Loss: -2.69373369217
[NOR] Episode: 530, Length: 1000, Avg Reward: -135.72006186, e: 0.05, Learning Rate: 0.005, buffer_len: 467316
Loss: -1.22300744057
[NOR] Episode: 540, Length: 1000, Avg Reward: -136.443225475, e: 0.05, Learning Rate: 0.005, buffer_len: 477316
Loss: -0.171847641468
[NOR] Episode: 550, Length: 1000, Avg Reward: -167.210920983, e: 0.05, Learning Rate: 0.005, buffer_len: 486952
Loss: -3.0469751358
[NOR] Episode: 560, Length: 1000, Avg Reward: -140.870162724, e: 0.05, Learning Rate: 0.005, buffer_len: 496322
Loss: -0.0920520424843
[NOR] Episode: 570, Length: 1000, Avg Reward: -159.263061959, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: -0.00500243902206
[NOR] Episode: 580, Length: 198, Avg Reward: -159.439990167, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: -1.48093414307
[NOR] Episode: 590, Length: 571, Avg Reward: -181.326126216, e: 0.

[2017-03-17 17:24:02,740] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/18/openaigym.video.13.4296.video000729.mp4


[NOR] Episode: 730, Length: 301, Avg Reward: -279.703557127, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: -0.644678711891
[NOR] Episode: 740, Length: 219, Avg Reward: -355.41284555, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: 3.04566311836
[NOR] Episode: 750, Length: 225, Avg Reward: -384.149518644, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: 6.12514543533
[NOR] Episode: 760, Length: 209, Avg Reward: -370.535625788, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: -2.7898235321
[NOR] Episode: 770, Length: 129, Avg Reward: -343.310018219, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: -3.31606960297
[NOR] Episode: 780, Length: 169, Avg Reward: -356.807460591, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: -2.72251462936
[NOR] Episode: 790, Length: 225, Avg Reward: -381.701390722, e: 0.05, Learning Rate: 0.005, buffer_len: 500000
Loss: -1.02618277073
[NOR] Episode: 800, Length: 227, Avg Reward: -278.949455841, e: 0.05, Learning 

[2017-03-17 17:29:10,285] Finished writing results. You can upload them to the scoreboard via gym.upload(u'/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/17')


KeyboardInterrupt: 

In [6]:

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd(), name = name)


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    
    print(total)
    
env.render(close=True)


[2017-03-17 12:07:02,890] Making new env: LunarLander-v2
[2017-03-17 12:07:02,892] Finished writing results. You can upload them to the scoreboard via gym.upload('/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/actor-critic-base')


False
196.104081847
149.867198645
215.673060563
217.593832845
201.192692875
198.152117614
-35.4402700038
218.394082887
193.044127714
120.793062792
234.628837747
213.755804367
0.398475525016
219.955313615
219.817810851
-31.113964878
204.808844432
218.3261461
223.759720668
230.32005815
192.594561856
192.463367622


KeyboardInterrupt: 