In [1]:
import numpy as np
import gym
from gym import wrappers
from numpy.random import choice
import random
from phi.api import *
import tensorflow as tf
from tfinterface.reinforcement import DeepActorCritic, ExpandedStateEnv
from tfinterface.interfaces import EnvironmentInterface
from tfinterface.model_base import ModelBase
from tensorflow.python import debug as tf_debug
import os
from scipy.interpolate import interp1d
import numbers


def get_run():
    try:
        with open("run.txt") as f:
            run = int(f.read().split("/n")[0])
    except:
        run = -1
    
    with open("run.txt", 'w+') as f:
        run += 1
        
        f.seek(0)
        f.write(str(run))
        f.truncate()
        
    return run

'module' object has no attribute '__module__'


In [2]:
class LanderAC(DeepActorCritic):
    
    def define_actor_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            use_bias=False,
            bias_initializer=None
        )

        return (
            inputs.s
            |> tf.layers.dense$(?, 128, activation=tf.nn.relu, name='relu_layer', **ops)
            |> tf.nn.dropout$(?, inputs.keep_prob)
            |> tf.layers.dense$(?, n_actions, activation=tf.nn.softmax, name='softmax_layer', **ops)
        )


    def define_critic_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            use_bias=False,
            bias_initializer=None
        )

        return (
            inputs.s
            |> tf.layers.dense$(?, 128, activation=tf.nn.relu, name='relu_layer', **ops)
            |> tf.layers.dense$(?, 1, name='linear_layer', **ops)
            |> (lambda t: t[:, 0])
        )
    
    def fit(self, env, keep_prob=0.5, e=0., learning_rate=0.01, print_step=10, update_target=1, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.


            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1

                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e

                a = self.choose_action(s, keep_prob, e=_e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r

                self.replay_buffer.append((s, a, r, s1, float(done)))

                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                V1 = self.sess.run(self.target_critic.V, feed_dict={self.inputs.s: S1, self.inputs.keep_prob: 1.0})

                feed_dict = self.fit_feed(S, A, R, V1, Done, _learning_rate, True)

                
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=feed_dict)
                self.writer.add_summary(summaries)

                if self.global_step % update_target == 0:
                    self.sess.run(self.update_target)

                s = s1



            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".max")
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=feed_dict)
                print("[NOR] Episode: {}, Length: {}, e: {}, Avg Reward: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, _e, avg_r, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.

In [6]:
run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "tmp/monitor{}".format(run))
env = ExpandedStateEnv(env, 3)
print(env.action_space)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = os.getcwd() + "/actor-critic.model"
logs_path = "logs/run{}".format(run)

print("Run: {}".format(run))

model = LanderAC(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000, pi=0.1,
    model_path = model_path,
    logs_path = logs_path
)

[2017-03-04 19:11:13,215] Making new env: LunarLander-v2
[2017-03-04 19:11:13,219] Creating monitor directory tmp/monitor36


False
Discrete(4)
Run: 36


In [None]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.01, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 400000], [0.3, 0.05], fill_value=0.05, bounds_error=False),
    update_target = 1,
    keep_prob = 0.9
)

[2017-03-04 19:11:14,404] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000000.mp4
[2017-03-04 19:11:15,605] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000001.mp4


[MAX] Episode: 0, Length: 85, Reward: -571.101488109, buffer_len: 85
[MAX] Episode: 2, Length: 72, Reward: -558.427635262, buffer_len: 227
[MAX] Episode: 3, Length: 56, Reward: -374.769981865, buffer_len: 283


[2017-03-04 19:11:17,682] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000008.mp4


[NOR] Episode: 10, Length: 69, e: 0.29948875, Avg Reward: -617.693441672, Learning Rate: 0.01, buffer_len: 819
Loss: -4.87128400803
[MAX] Episode: 17, Length: 59, Reward: -362.720455936, buffer_len: 1257
[NOR] Episode: 20, Length: 61, e: 0.299084375, Avg Reward: -482.511761697, Learning Rate: 0.01, buffer_len: 1466
Loss: 3.48610496521
[MAX] Episode: 21, Length: 54, Reward: -306.262725344, buffer_len: 1520


[2017-03-04 19:11:22,536] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000027.mp4


[NOR] Episode: 30, Length: 60, e: 0.298651875, Avg Reward: -531.426202984, Learning Rate: 0.01, buffer_len: 2158
Loss: 0.833386421204
[NOR] Episode: 40, Length: 68, e: 0.298190625, Avg Reward: -570.186718292, Learning Rate: 0.01, buffer_len: 2896
Loss: 4.84059238434
[NOR] Episode: 50, Length: 74, e: 0.29779125, Avg Reward: -496.402465529, Learning Rate: 0.01, buffer_len: 3535
Loss: 1.01878547668
[MAX] Episode: 52, Length: 57, Reward: -284.899332524, buffer_len: 3659
[NOR] Episode: 60, Length: 60, e: 0.297381875, Avg Reward: -465.73124642, Learning Rate: 0.01, buffer_len: 4190
Loss: 2.1835258007


[2017-03-04 19:11:30,733] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000064.mp4


[NOR] Episode: 70, Length: 71, e: 0.296945625, Avg Reward: -590.644976548, Learning Rate: 0.01, buffer_len: 4888
Loss: -0.034855723381
[MAX] Episode: 75, Length: 58, Reward: -238.899010409, buffer_len: 5238
[NOR] Episode: 80, Length: 89, e: 0.296493125, Avg Reward: -533.939736028, Learning Rate: 0.01, buffer_len: 5612
Loss: 4.02430725098
[NOR] Episode: 90, Length: 96, e: 0.29605, Avg Reward: -522.408789834, Learning Rate: 0.01, buffer_len: 6321
Loss: 4.97219276428
[NOR] Episode: 100, Length: 87, e: 0.29560375, Avg Reward: -542.612176749, Learning Rate: 0.01, buffer_len: 7035
Loss: 1.28779625893
[NOR] Episode: 110, Length: 62, e: 0.29517875, Avg Reward: -506.627986272, Learning Rate: 0.01, buffer_len: 7715
Loss: 5.41174030304
[NOR] Episode: 120, Length: 58, e: 0.2947325, Avg Reward: -569.7389836, Learning Rate: 0.01, buffer_len: 8429
Loss: -1.61611795425


[2017-03-04 19:11:43,936] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000125.mp4


[NOR] Episode: 130, Length: 74, e: 0.294261875, Avg Reward: -551.849909444, Learning Rate: 0.01, buffer_len: 9182
Loss: -7.53889751434
[MAX] Episode: 133, Length: 67, Reward: -152.193558724, buffer_len: 9396
[NOR] Episode: 140, Length: 88, e: 0.29381625, Avg Reward: -492.695489815, Learning Rate: 0.01, buffer_len: 9895
Loss: -7.35953712463
[MAX] Episode: 150, Length: 58, Reward: -92.5424785727, buffer_len: 10667
[NOR] Episode: 150, Length: 58, e: 0.29333375, Avg Reward: -408.062066576, Learning Rate: 0.01, buffer_len: 10667
Loss: -8.86532783508
[MAX] Episode: 151, Length: 72, Reward: -67.6398392542, buffer_len: 10739
[NOR] Episode: 160, Length: 68, e: 0.292845625, Avg Reward: -315.556604986, Learning Rate: 0.01, buffer_len: 11448
Loss: -16.2126865387
[NOR] Episode: 170, Length: 99, e: 0.29232, Avg Reward: -223.784820495, Learning Rate: 0.01, buffer_len: 12289
Loss: -5.08924198151
[MAX] Episode: 174, Length: 123, Reward: -23.7613187799, buffer_len: 12674
[NOR] Episode: 180, Length: 66, 

[2017-03-04 19:12:10,069] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000216.mp4


[NOR] Episode: 220, Length: 92, e: 0.28924875, Avg Reward: -148.127760581, Learning Rate: 0.01, buffer_len: 17203
Loss: -12.268453598
[NOR] Episode: 230, Length: 84, e: 0.288654375, Avg Reward: -103.972376968, Learning Rate: 0.01, buffer_len: 18154
Loss: -13.0516500473
[NOR] Episode: 240, Length: 95, e: 0.28798125, Avg Reward: -161.962200794, Learning Rate: 0.01, buffer_len: 19231
Loss: -8.52717399597
[NOR] Episode: 250, Length: 104, e: 0.28731375, Avg Reward: -164.761372662, Learning Rate: 0.01, buffer_len: 20299
Loss: -14.1318006516
[NOR] Episode: 260, Length: 146, e: 0.286723125, Avg Reward: -150.81542635, Learning Rate: 0.01, buffer_len: 21244
Loss: -26.0001525879
[NOR] Episode: 270, Length: 99, e: 0.28604, Avg Reward: -162.677949998, Learning Rate: 0.01, buffer_len: 22337
Loss: -26.1550369263
[MAX] Episode: 277, Length: 105, Reward: -9.17961721343, buffer_len: 23025
[NOR] Episode: 280, Length: 119, e: 0.285415625, Avg Reward: -114.827483527, Learning Rate: 0.01, buffer_len: 23336


[2017-03-04 19:12:50,792] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000343.mp4


[NOR] Episode: 350, Length: 98, e: 0.280575, Avg Reward: -203.80922394, Learning Rate: 0.01, buffer_len: 31081
Loss: -8.45300483704
[NOR] Episode: 360, Length: 80, e: 0.27983125, Avg Reward: -138.310061803, Learning Rate: 0.01, buffer_len: 32271
Loss: -24.2673435211
[NOR] Episode: 370, Length: 112, e: 0.27908875, Avg Reward: -175.494501214, Learning Rate: 0.01, buffer_len: 33459
Loss: -16.7844581604
[NOR] Episode: 380, Length: 81, e: 0.278395625, Avg Reward: -193.94890201, Learning Rate: 0.01, buffer_len: 34568
Loss: -5.3514919281
[NOR] Episode: 390, Length: 105, e: 0.2776675, Avg Reward: -145.110874766, Learning Rate: 0.01, buffer_len: 35733
Loss: -24.3319129944
[NOR] Episode: 400, Length: 83, e: 0.27699, Avg Reward: -128.641432049, Learning Rate: 0.01, buffer_len: 36817
Loss: -27.4935417175
[NOR] Episode: 410, Length: 149, e: 0.276208125, Avg Reward: -143.588854345, Learning Rate: 0.01, buffer_len: 38068
Loss: -1.73762726784
[MAX] Episode: 414, Length: 1000, Reward: 43.1435405663, bu

[2017-03-04 19:13:54,865] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000512.mp4


[NOR] Episode: 520, Length: 136, e: 0.26577, Avg Reward: -126.311275635, Learning Rate: 0.01, buffer_len: 54769
Loss: 21.3416976929
[NOR] Episode: 530, Length: 138, e: 0.265001875, Avg Reward: -83.6070518493, Learning Rate: 0.01, buffer_len: 55998
Loss: -5.23163795471
[NOR] Episode: 540, Length: 167, e: 0.26429875, Avg Reward: -140.607679203, Learning Rate: 0.01, buffer_len: 57123
Loss: 1.7297334671
[NOR] Episode: 550, Length: 124, e: 0.2634575, Avg Reward: -103.510191416, Learning Rate: 0.01, buffer_len: 58469
Loss: 3.47176456451
[NOR] Episode: 560, Length: 126, e: 0.26267375, Avg Reward: -150.505023216, Learning Rate: 0.01, buffer_len: 59723
Loss: -20.0330142975
[NOR] Episode: 570, Length: 135, e: 0.261885625, Avg Reward: -141.318813177, Learning Rate: 0.01, buffer_len: 60984
Loss: -1.41389131546
[NOR] Episode: 580, Length: 119, e: 0.260984375, Avg Reward: -107.892814457, Learning Rate: 0.01, buffer_len: 62426
Loss: -2.45063281059
[NOR] Episode: 590, Length: 157, e: 0.260166875, Avg 

[2017-03-04 19:15:06,120] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video000729.mp4


[NOR] Episode: 730, Length: 160, e: 0.24954875, Avg Reward: -84.3950131498, Learning Rate: 0.01, buffer_len: 80723
Loss: -14.778585434
[NOR] Episode: 740, Length: 117, e: 0.24880875, Avg Reward: -114.684216888, Learning Rate: 0.01, buffer_len: 81907
Loss: -14.3544216156
[NOR] Episode: 750, Length: 115, e: 0.248088125, Avg Reward: -121.595614412, Learning Rate: 0.01, buffer_len: 83060
Loss: 1.81228160858
[NOR] Episode: 760, Length: 152, e: 0.2472775, Avg Reward: -101.477921548, Learning Rate: 0.01, buffer_len: 84357
Loss: -12.2868366241
[NOR] Episode: 770, Length: 140, e: 0.246573125, Avg Reward: -109.785850455, Learning Rate: 0.01, buffer_len: 85484
Loss: -27.5905380249
[NOR] Episode: 780, Length: 90, e: 0.24576625, Avg Reward: -127.51100721, Learning Rate: 0.01, buffer_len: 86775
Loss: -3.17818832397
[NOR] Episode: 790, Length: 151, e: 0.245010625, Avg Reward: -103.812341884, Learning Rate: 0.01, buffer_len: 87984
Loss: -18.2061004639
[NOR] Episode: 800, Length: 133, e: 0.24425375, Av

[2017-03-04 19:16:52,144] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video001000.mp4


[NOR] Episode: 1000, Length: 100, e: 0.22566625, Avg Reward: -77.1760817529, Learning Rate: 0.01, buffer_len: 118935
Loss: -3.14226031303
[NOR] Episode: 1010, Length: 108, e: 0.2239075, Avg Reward: -83.9033328478, Learning Rate: 0.01, buffer_len: 121749
Loss: -2.67626905441
[NOR] Episode: 1020, Length: 102, e: 0.2228825, Avg Reward: -113.370661759, Learning Rate: 0.01, buffer_len: 123389
Loss: 0.577072143555
[NOR] Episode: 1030, Length: 81, e: 0.222238125, Avg Reward: -92.8782097531, Learning Rate: 0.01, buffer_len: 124420
Loss: -11.7655467987
[NOR] Episode: 1040, Length: 97, e: 0.2216075, Avg Reward: -109.5065765, Learning Rate: 0.01, buffer_len: 125429
Loss: -15.7261075974
[NOR] Episode: 1050, Length: 139, e: 0.22093875, Avg Reward: -103.379439651, Learning Rate: 0.01, buffer_len: 126499
Loss: -8.03443527222
[NOR] Episode: 1060, Length: 73, e: 0.2202775, Avg Reward: -95.7564398895, Learning Rate: 0.01, buffer_len: 127557
Loss: -6.34093523026
[NOR] Episode: 1070, Length: 92, e: 0.2195

[2017-03-04 19:24:44,949] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander-actor-critic/tmp/monitor36/openaigym.video.1.15919.video002000.mp4


[NOR] Episode: 2000, Length: 108, e: 0.123595, Avg Reward: -64.189974407, Learning Rate: 0.01, buffer_len: 282249
Loss: -51.3012275696
[NOR] Episode: 2010, Length: 476, e: 0.12242875, Avg Reward: -48.4146722731, Learning Rate: 0.01, buffer_len: 284115
Loss: -5.73314094543
[NOR] Episode: 2020, Length: 1000, e: 0.120453125, Avg Reward: -39.1983509844, Learning Rate: 0.01, buffer_len: 287276
Loss: -2.28845453262
[NOR] Episode: 2030, Length: 1000, e: 0.118, Avg Reward: -36.2720403629, Learning Rate: 0.01, buffer_len: 291201
Loss: -7.12934017181
[NOR] Episode: 2040, Length: 786, e: 0.116169375, Avg Reward: -89.0619551062, Learning Rate: 0.01, buffer_len: 294130
Loss: -38.3872871399
[NOR] Episode: 2050, Length: 146, e: 0.114808125, Avg Reward: -47.6634918727, Learning Rate: 0.01, buffer_len: 296308
Loss: -16.9381504059
[NOR] Episode: 2060, Length: 153, e: 0.1127075, Avg Reward: -52.1363387606, Learning Rate: 0.01, buffer_len: 299669
Loss: -11.5435142517
[NOR] Episode: 2070, Length: 88, e: 0.

In [26]:
import time

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
print(env.action_space)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = os.getcwd() + "/actor-critic.model"
logs_path = "logs/run0"


model_run = LanderAC(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 1200:
        ep += 1
        a = model_run.choose_action(s, 1.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    print(total)
env.render(close=True)


[2017-03-04 20:55:43,848] Making new env: LunarLander-v2


False
Discrete(4)
150.723606528
209.676953188
161.72128111
81.9737582643
171.20761489
73.1185110583


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type

In [None]:

# gym.upload("tmp/monitor{}".format(run), api_key='sk_WASyK12rQxais3gwyG4Vg', ignore_open_monitors=True)

In [None]:
gym.upload?