In [1]:
import numpy as np
import gym
from gym import wrappers
from numpy.random import choice
import random
from phi.api import *
import tensorflow as tf
from tfinterface.reinforcement import DeepActorCritic, ExpandedStateEnv
from tfinterface.interfaces import EnvironmentInterface
from tfinterface.model_base import ModelBase
from tensorflow.python import debug as tf_debug
import os
from scipy.interpolate import interp1d
import numbers


def get_run():
    """Return the next run number and persist it in ``run.txt``.

    The counter is read from the first line of ``run.txt`` in the current
    working directory, incremented by one, written back (replacing the
    previous contents) and returned. If the file is missing, unreadable or
    does not start with an integer, counting restarts and 0 is returned.

    Returns:
        int: the run number to use for this execution.
    """
    try:
        with open("run.txt") as f:
            # Only the first line carries the counter; anything after the
            # first newline is ignored.
            run = int(f.read().split("\n")[0])
    except (IOError, OSError, ValueError):
        # No usable counter on disk: start at -1 so the increment below
        # yields run 0.
        run = -1

    run += 1

    # Mode "w" truncates the file on open, so the old value is fully replaced.
    with open("run.txt", "w") as f:
        f.write(str(run))

    return run

'module' object has no attribute '__module__'


In [2]:
class LanderAC(DeepActorCritic):
    """Actor-critic agent that defines its own networks and training loop.

    The class supplies the actor and critic network definitions and a
    replay-buffer based ``fit`` loop. Everything else that is used here
    (``sess``, ``replay_buffer``, ``target_critic``, ``update``, ``writer``,
    ``choose_action``, ``fit_feed``, ``save``, ...) is expected to come from
    ``DeepActorCritic``.
    """

    def define_actor_network(self, inputs, n_actions, n_states):
        """Build the actor: dense ReLU (128 units) -> dropout -> dense softmax.

        Args:
            inputs: object exposing ``s`` (network input) and ``keep_prob``
                (keep probability handed to ``tf.nn.dropout``).
            n_actions: number of units of the softmax output layer.
            n_states: not used by this network.

        Returns:
            The output tensor of the softmax layer.
        """
        # Both layers are bias-free, with weights drawn uniformly from [0, 0.01).
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            use_bias=False,
            bias_initializer=None
        )

        return (
            inputs.s
            |> tf.layers.dense$(?, 128, activation=tf.nn.relu, name='relu_layer', **ops)
            |> tf.nn.dropout$(?, inputs.keep_prob)
            |> tf.layers.dense$(?, n_actions, activation=tf.nn.softmax, name='softmax_layer', **ops)
        )


    def define_critic_network(self, inputs, n_actions, n_states):
        """Build the critic: dense ReLU (128 units) -> dense linear (1 unit).

        Args:
            inputs: object exposing ``s`` (network input).
            n_actions: not used by this network.
            n_states: not used by this network.

        Returns:
            Column 0 of the single-unit linear layer's output, i.e. one value
            per input row.
        """
        # Same layer options as the actor: no bias, uniform [0, 0.01) weights.
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            use_bias=False,
            bias_initializer=None
        )

        return (
            inputs.s
            |> tf.layers.dense$(?, 128, activation=tf.nn.relu, name='relu_layer', **ops)
            |> tf.layers.dense$(?, 1, name='linear_layer', **ops)
            |> (lambda t: t[:, 0])
        )

    def fit(self, env, keep_prob=0.5, e=0., learning_rate=0.01, print_step=10, update_target=1, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        """Train on ``env``, running one update per environment step.

        Each step stores the transition in ``self.replay_buffer``, samples a
        random batch from it, evaluates ``self.target_critic.V`` on the batch's
        next states and runs ``self.update``.

        Args:
            env: environment exposing ``reset()`` and
                ``step(a) -> (s1, r, done, info)``.
            keep_prob: value forwarded to ``self.choose_action`` as its second
                argument.
            e: value, or callable of the global step, forwarded to
                ``self.choose_action`` as ``e`` (presumably the exploration
                rate; confirm against ``DeepActorCritic``).
            learning_rate: value, or callable of the global step, forwarded to
                ``self.fit_feed``.
            print_step: a report is printed and the model saved every
                ``print_step`` episodes (never at episode 0).
            update_target: ``self.update_target`` is run whenever the global
                step is a multiple of this value.
            episodes: number of episodes to run.
            max_episode_length: maximum number of steps in a single episode.
            batch_size: size of the random batch drawn from the replay buffer.
        """
        r_total = 0.
        # Number of episodes accumulated in r_total since the last report, so
        # the printed average is correct even for the first window (which
        # covers episodes 0..print_step inclusive).
        window_episodes = 0

        # Last values seen inside the step loop; initialised so the report
        # below cannot hit an undefined name if no step has run yet.
        feed_dict = None
        _learning_rate = None if callable(learning_rate) else learning_rate
        _e = None if callable(e) else e

        for episode in range(episodes):
            done = False
            s = env.reset()
            episode_length = 0
            ep_reward = 0.

            # Strict '<' so that an episode never exceeds max_episode_length steps.
            while not done and episode_length < max_episode_length:
                self.global_step += 1
                episode_length += 1

                # Schedules may be plain numbers or callables of the global step.
                _learning_rate = learning_rate(self.global_step) if callable(learning_rate) else learning_rate
                _e = e(self.global_step) if callable(e) else e

                a = self.choose_action(s, keep_prob, e=_e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r

                self.replay_buffer.append((s, a, r, s1, float(done)))

                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                # The target critic is evaluated with keep_prob fed as 1.0.
                V1 = self.sess.run(self.target_critic.V, feed_dict={self.inputs.s: S1, self.inputs.keep_prob: 1.0})

                # NOTE(review): the meaning of the trailing True is defined by
                # fit_feed in the base class; confirm.
                feed_dict = self.fit_feed(S, A, R, V1, Done, _learning_rate, True)

                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=feed_dict)
                self.writer.add_summary(summaries)

                if self.global_step % update_target == 0:
                    self.sess.run(self.update_target)

                s = s1

            window_episodes += 1

            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary)

            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
#                 self.save(model_path = self.model_path + ".max")
                self.global_max = ep_reward

            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / window_episodes
                print("[NOR] Episode: {}, Length: {}, e: {}, Avg Reward: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, _e, avg_r, _learning_rate, len(self.replay_buffer)))
                if feed_dict is not None:
                    # Actor loss on the most recent training batch.
                    actor_loss = self.sess.run(self.actor.loss, feed_dict=feed_dict)
                    print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.
                window_episodes = 0

In [15]:
# Training setup: allocate a run number, build the monitored environment and
# create the agent.

# Second argument of ExpandedStateEnv; the same factor scales n_states below,
# so both are taken from this single constant to keep them in sync.
N_STACKED_STATES = 3

run = get_run()
env = gym.make("LunarLander-v2")
# The monitor writes its output under a per-run directory.
env = wrappers.Monitor(env, "tmp/monitor{}".format(run))
env = ExpandedStateEnv(env, N_STACKED_STATES)
print(env.action_space)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * N_STACKED_STATES
model_path = os.path.join(os.getcwd(), "actor-critic.model")
logs_path = "logs/run{}".format(run)

print("Run: {}".format(run))

# NOTE(review): y, buffer_length and pi are interpreted by DeepActorCritic
# (presumably discount factor, replay-buffer capacity and a target-update
# rate); confirm against the base class.
model = LanderAC(
    n_actions, n_states, y=0.9999,
    buffer_length=500000, pi=0.1,
    model_path = model_path,
    logs_path = logs_path,
    restore = True
)

[2017-03-08 23:28:36,190] Making new env: LunarLander-v2
[2017-03-08 23:28:36,195] Creating monitor directory tmp/monitor43


False
Discrete(4)
Run: 43


In [16]:
# Only referenced by the commented-out decaying learning-rate schedule below.
k = 40000.

# Start training with a fixed learning rate and a linearly interpolated `e`
# schedule: 0.3 at step 0 down to 0.05 at step 100000, 0.05 outside that range.
model.fit(
    env,
    keep_prob=0.9,
    e=interp1d([0, 100000], [0.3, 0.05], bounds_error=False, fill_value=0.05),
    learning_rate=0.01,  # lambda t: 0.05 * k / (k + t)
    update_target=1,
    batch_size=32,
    max_episode_length=10000,
    episodes=int(1e5),
    print_step=10,
)

[2017-03-08 23:28:37,557] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000000.mp4
[2017-03-08 23:28:47,419] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000001.mp4


[MAX] Episode: 0, Length: 734, Reward: 186.581236974, buffer_len: 734


[2017-03-08 23:29:08,659] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000008.mp4


[NOR] Episode: 10, Length: 132, e: 0.288245, Avg Reward: 8.76421429705, Learning Rate: 0.01, buffer_len: 4703
Loss: -3.33818101883
[NOR] Episode: 20, Length: 1000, e: 0.28064, Avg Reward: -37.3684527483, Learning Rate: 0.01, buffer_len: 7745
Loss: -6.54964876175
[MAX] Episode: 24, Length: 978, Reward: 213.573827224, buffer_len: 10038


[2017-03-08 23:29:32,126] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000027.mp4


[NOR] Episode: 30, Length: 1000, e: 0.260975, Avg Reward: 64.2716256871, Learning Rate: 0.01, buffer_len: 15611
Loss: -4.47060441971
[NOR] Episode: 40, Length: 288, e: 0.245155, Avg Reward: 47.8759990875, Learning Rate: 0.01, buffer_len: 21939
Loss: -3.02051925659
[MAX] Episode: 42, Length: 585, Reward: 237.618499196, buffer_len: 22710
[NOR] Episode: 50, Length: 219, e: 0.2314925, Avg Reward: 27.7660034664, Learning Rate: 0.01, buffer_len: 27404
Loss: -1.26924729347
[NOR] Episode: 60, Length: 1000, e: 0.2139775, Avg Reward: 94.4065900239, Learning Rate: 0.01, buffer_len: 34410
Loss: -1.14252781868


[2017-03-08 23:31:15,838] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000064.mp4


[NOR] Episode: 70, Length: 369, e: 0.1974925, Avg Reward: 151.98138393, Learning Rate: 0.01, buffer_len: 41004
Loss: 1.15239810944
[NOR] Episode: 80, Length: 660, e: 0.1810275, Avg Reward: 131.586723522, Learning Rate: 0.01, buffer_len: 47590
Loss: -1.15628731251
[MAX] Episode: 81, Length: 526, Reward: 239.807085716, buffer_len: 48116
[NOR] Episode: 90, Length: 377, e: 0.16731, Avg Reward: 146.080888223, Learning Rate: 0.01, buffer_len: 53077
Loss: -1.77882409096
[NOR] Episode: 100, Length: 477, e: 0.149095, Avg Reward: 106.732988924, Learning Rate: 0.01, buffer_len: 60363
Loss: -3.39716291428
[MAX] Episode: 109, Length: 271, Reward: 260.499954768, buffer_len: 65435
[NOR] Episode: 110, Length: 647, e: 0.1347975, Avg Reward: 165.909044415, Learning Rate: 0.01, buffer_len: 66082
Loss: 1.04456341267
[NOR] Episode: 120, Length: 212, e: 0.1219175, Avg Reward: 150.496565588, Learning Rate: 0.01, buffer_len: 71234
Loss: -1.59784340858


[2017-03-08 23:33:37,887] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000125.mp4


[NOR] Episode: 130, Length: 213, e: 0.1128025, Avg Reward: 153.927652825, Learning Rate: 0.01, buffer_len: 74880
Loss: -1.42150211334
[MAX] Episode: 132, Length: 459, Reward: 261.399608601, buffer_len: 75712
[NOR] Episode: 140, Length: 337, e: 0.10506, Avg Reward: 55.349863492, Learning Rate: 0.01, buffer_len: 77977
Loss: -0.557442307472
[NOR] Episode: 150, Length: 1000, e: 0.09368, Avg Reward: 140.156037309, Learning Rate: 0.01, buffer_len: 82529
Loss: -1.90528202057
[NOR] Episode: 160, Length: 503, e: 0.0794675, Avg Reward: 160.491117633, Learning Rate: 0.01, buffer_len: 88214
Loss: 0.143522560596
[NOR] Episode: 170, Length: 501, e: 0.06763, Avg Reward: 192.250683488, Learning Rate: 0.01, buffer_len: 92949
Loss: 2.47724103928
[NOR] Episode: 180, Length: 416, e: 0.05553, Avg Reward: 192.446568575, Learning Rate: 0.01, buffer_len: 97789
Loss: -2.74128437042
[NOR] Episode: 190, Length: 332, e: 0.05, Avg Reward: 161.977755564, Learning Rate: 0.01, buffer_len: 102133
Loss: -2.06088757515


[2017-03-08 23:35:48,833] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000216.mp4


[NOR] Episode: 220, Length: 300, e: 0.05, Avg Reward: 181.374250832, Learning Rate: 0.01, buffer_len: 114216
Loss: -5.18792486191
[NOR] Episode: 230, Length: 142, e: 0.05, Avg Reward: 173.993680611, Learning Rate: 0.01, buffer_len: 117434
Loss: -2.24826216698
[NOR] Episode: 240, Length: 341, e: 0.05, Avg Reward: 161.029299104, Learning Rate: 0.01, buffer_len: 121242
Loss: 1.23340845108
[NOR] Episode: 250, Length: 355, e: 0.05, Avg Reward: 185.099105843, Learning Rate: 0.01, buffer_len: 125808
Loss: -0.663695812225
[NOR] Episode: 260, Length: 278, e: 0.05, Avg Reward: 177.04394541, Learning Rate: 0.01, buffer_len: 129618
Loss: 1.45378124714
[NOR] Episode: 270, Length: 1000, e: 0.05, Avg Reward: 123.120892663, Learning Rate: 0.01, buffer_len: 134749
Loss: 2.85730266571
[NOR] Episode: 280, Length: 349, e: 0.05, Avg Reward: 57.9963975971, Learning Rate: 0.01, buffer_len: 137886
Loss: -3.53489947319
[NOR] Episode: 290, Length: 342, e: 0.05, Avg Reward: 142.663286478, Learning Rate: 0.01, bu

[2017-03-08 23:37:59,015] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000343.mp4


[NOR] Episode: 350, Length: 166, e: 0.05, Avg Reward: 48.8927487185, Learning Rate: 0.01, buffer_len: 157419
Loss: -3.04878330231
[NOR] Episode: 360, Length: 193, e: 0.05, Avg Reward: 68.3675388163, Learning Rate: 0.01, buffer_len: 162062
Loss: 8.08561134338
[NOR] Episode: 370, Length: 373, e: 0.05, Avg Reward: 169.870368258, Learning Rate: 0.01, buffer_len: 166297
Loss: 2.64764308929
[NOR] Episode: 380, Length: 362, e: 0.05, Avg Reward: 179.130460558, Learning Rate: 0.01, buffer_len: 170802
Loss: -3.52459049225
[NOR] Episode: 390, Length: 445, e: 0.05, Avg Reward: 154.739487229, Learning Rate: 0.01, buffer_len: 174234
Loss: 5.67033004761
[NOR] Episode: 400, Length: 402, e: 0.05, Avg Reward: 151.101330451, Learning Rate: 0.01, buffer_len: 179202
Loss: -2.79199528694
[NOR] Episode: 410, Length: 339, e: 0.05, Avg Reward: 156.19050289, Learning Rate: 0.01, buffer_len: 183980
Loss: -0.380550384521
[NOR] Episode: 420, Length: 312, e: 0.05, Avg Reward: 67.3777637877, Learning Rate: 0.01, buf

[2017-03-08 23:41:53,652] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000512.mp4


[NOR] Episode: 520, Length: 466, e: 0.05, Avg Reward: 124.545270191, Learning Rate: 0.01, buffer_len: 232075
Loss: -3.6137008667
[NOR] Episode: 530, Length: 409, e: 0.05, Avg Reward: 90.0197450786, Learning Rate: 0.01, buffer_len: 238531
Loss: 5.89645957947
[NOR] Episode: 540, Length: 420, e: 0.05, Avg Reward: 164.446920094, Learning Rate: 0.01, buffer_len: 243366
Loss: 5.87877035141
[NOR] Episode: 550, Length: 1000, e: 0.05, Avg Reward: 79.6580206459, Learning Rate: 0.01, buffer_len: 246833
Loss: -2.02893853188
[NOR] Episode: 560, Length: 158, e: 0.05, Avg Reward: 138.184247546, Learning Rate: 0.01, buffer_len: 252224
Loss: 0.116968482733
[NOR] Episode: 570, Length: 394, e: 0.05, Avg Reward: 167.501094221, Learning Rate: 0.01, buffer_len: 256108
Loss: -1.6836514473
[NOR] Episode: 580, Length: 524, e: 0.05, Avg Reward: 186.251079863, Learning Rate: 0.01, buffer_len: 260879
Loss: -1.28253304958
[NOR] Episode: 590, Length: 334, e: 0.05, Avg Reward: 205.562821323, Learning Rate: 0.01, buf

[2017-03-08 23:47:10,819] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video000729.mp4


[NOR] Episode: 730, Length: 445, e: 0.05, Avg Reward: 155.49336141, Learning Rate: 0.01, buffer_len: 323951
Loss: 1.35682320595
[NOR] Episode: 740, Length: 281, e: 0.05, Avg Reward: 197.293985441, Learning Rate: 0.01, buffer_len: 327942
Loss: -3.08729600906
[NOR] Episode: 750, Length: 264, e: 0.05, Avg Reward: 217.015940356, Learning Rate: 0.01, buffer_len: 331230
Loss: 0.377942085266
[NOR] Episode: 760, Length: 388, e: 0.05, Avg Reward: 202.407697194, Learning Rate: 0.01, buffer_len: 335269
Loss: -0.0894748270512
[NOR] Episode: 770, Length: 390, e: 0.05, Avg Reward: 185.952365653, Learning Rate: 0.01, buffer_len: 340056
Loss: -0.041238874197
[NOR] Episode: 780, Length: 598, e: 0.05, Avg Reward: 173.576232568, Learning Rate: 0.01, buffer_len: 345355
Loss: -3.795327425
[NOR] Episode: 790, Length: 441, e: 0.05, Avg Reward: 170.063655872, Learning Rate: 0.01, buffer_len: 350719
Loss: -2.32701253891
[NOR] Episode: 800, Length: 405, e: 0.05, Avg Reward: 184.265564166, Learning Rate: 0.01, b

[2017-03-08 23:52:55,906] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/tmp/monitor43/openaigym.video.5.4373.video001000.mp4


[NOR] Episode: 1000, Length: 1000, e: 0.05, Avg Reward: 100.007555524, Learning Rate: 0.01, buffer_len: 433411
Loss: -0.0355783104897
[NOR] Episode: 1010, Length: 1000, e: 0.05, Avg Reward: 159.441273339, Learning Rate: 0.01, buffer_len: 437824
Loss: -0.753508210182
[NOR] Episode: 1020, Length: 232, e: 0.05, Avg Reward: 213.414597847, Learning Rate: 0.01, buffer_len: 440960
Loss: -5.68595314026
[NOR] Episode: 1030, Length: 375, e: 0.05, Avg Reward: 169.392162183, Learning Rate: 0.01, buffer_len: 444111
Loss: -1.78346991539
[NOR] Episode: 1040, Length: 190, e: 0.05, Avg Reward: 197.883674202, Learning Rate: 0.01, buffer_len: 446994
Loss: 5.00374126434
[NOR] Episode: 1050, Length: 263, e: 0.05, Avg Reward: 166.884857618, Learning Rate: 0.01, buffer_len: 449841
Loss: -3.50586009026
[NOR] Episode: 1060, Length: 358, e: 0.05, Avg Reward: 214.492849972, Learning Rate: 0.01, buffer_len: 452911
Loss: -0.321946591139
[NOR] Episode: 1070, Length: 299, e: 0.05, Avg Reward: 198.930713135, Learning

KeyboardInterrupt: 

In [8]:
import time

# Evaluation: restore the saved model and render up to 100 episodes.
# Note that this rebinds `env` to a fresh environment without the Monitor wrapper.
env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
print(env.action_space)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = os.getcwd() + "/actor-critic.model"
# NOTE(review): logs_path is assigned but not passed to LanderAC below.
logs_path = "logs/run0"


model_run = LanderAC(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

try:
    for i in range(100):
        s = env.reset()
        done = False
        total = 0.
        ep = 0
        # Each episode is capped at 700 steps.
        while not done and ep < 700:
            ep += 1
            a = model_run.choose_action(s, 1.0)
            s, r, done, info = env.step(a)
            total += r
            env.render()
            # Brief pause between rendered frames.
            time.sleep(0.01)
        print(total)
finally:
    # Close the render window even when the loop is interrupted
    # (e.g. by KeyboardInterrupt) or raises.
    env.render(close=True)


[2017-03-08 21:48:38,328] Making new env: LunarLander-v2


False
Discrete(4)
147.728599704


KeyboardInterrupt: 

In [None]:

# gym.upload("tmp/monitor{}".format(run), api_key=os.environ["OPENAI_GYM_API_KEY"], ignore_open_monitors=True)  # read the API key from the environment; never hard-code it in the notebook

In [None]:
# IPython help query: displays the documentation of gym.upload (interactive use only).
gym.upload?