In [1]:
import numpy as np
import gym
from gym.wrappers import Monitor
from numpy.random import choice
import random
from phi.api import *
import tensorflow as tf
from tfinterface.reinforcement import DQN, ExpandedStateEnv
import os
from scipy.interpolate import interp1d
import numbers



def get_run():
    try:
        with open("run.txt") as f:
            run = int(f.read().split("/n")[0])
    except:
        run = -1
    
    with open("run.txt", 'w+') as f:
        run += 1
        
        f.seek(0)
        f.write(str(run))
        f.truncate()
        
    return run

In [24]:


run = get_run()
env_logs = '/tmp/cartpole-{}'.format(run)
expansion = 3

env = gym.make('LunarLander-v2')
# env = Monitor(env, env_logs)
env = ExpandedStateEnv(env, expansion)
                
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * expansion
model_path = os.getcwd() + "/Q-network-full.model"
logs_path = "logs/run{}".format(run)

[2017-03-03 14:10:54,022] Making new env: LunarLander-v2


In [25]:
class LunarDQN(DQN):
    
    def define_Qs(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            use_bias=False,
            bias_initializer=None
        )

        net = tf.layers.dense(inputs.s, 64, activation=tf.nn.relu, name='relu_layer', **ops)
        net = tf.layers.dense(inputs.s, 32, activation=tf.nn.relu, name='relu_layer2', **ops)
        return tf.layers.dense(inputs.s, n_actions, name='linear_layer', **ops)

In [26]:

model = LunarDQN(
    n_actions, n_states,
    model_path = model_path,
    logs_path = logs_path,
    flush_secs = 3.0,
    y = 0.98,
    buffer_length=500000,
    restore = False
)

print("run: {},\n s: {},\n a: {},\n r: {},\n Qs: {},\n update: {}".format(
    run, model.inputs.s, model.inputs.a, model.inputs.r, model.network.Qs, model.update
))

run: 70,
 s: Tensor("inputs/s:0", shape=(?, 24), dtype=float32, device=/device:CPU:0),
 a: Tensor("inputs/a:0", shape=(?,), dtype=int32, device=/device:CPU:0),
 r: Tensor("inputs/r:0", shape=(?,), dtype=float32, device=/device:CPU:0),
 Qs: Tensor("network/linear_layer/MatMul:0", shape=(?, 4), dtype=float32, device=/device:CPU:0),
 update: name: "network/Adam"
op: "NoOp"
input: "^network/Adam/update_network/linear_layer/kernel/ApplyAdam"
input: "^network/Adam/Assign"
input: "^network/Adam/Assign_1"
device: "/device:CPU:0"



In [27]:
k = 5000.
model.fit(
    env, 
    episodes=50000,
    max_episode_length = 2000,
    learning_rate = 0.01, #lambda t: max(0.001, k / (k + t)),
    e = 0.1 #interp1d([0, 4000], [1, 0.05], fill_value=0.05, bounds_error=False)
)

[MAX] Episode: 0, Reward: -427.44619018, e: 0.1, learning_rate: 0.01, buffer_len: 90, episode_length: 90
[NOR] Episode: 10, avg reward: -286.982311246, e: 0.1, learning_rate: 0.01, buffer_len: 740, episode_length: 60
[NOR] Episode: 20, avg reward: -181.130319211, e: 0.1, learning_rate: 0.01, buffer_len: 1388, episode_length: 56
[NOR] Episode: 30, avg reward: -242.727730193, e: 0.1, learning_rate: 0.01, buffer_len: 2354, episode_length: 65
[NOR] Episode: 40, avg reward: -238.480080357, e: 0.1, learning_rate: 0.01, buffer_len: 3070, episode_length: 64
[NOR] Episode: 50, avg reward: -243.644812841, e: 0.1, learning_rate: 0.01, buffer_len: 3738, episode_length: 52
[NOR] Episode: 60, avg reward: -365.74857608, e: 0.1, learning_rate: 0.01, buffer_len: 4469, episode_length: 68
[NOR] Episode: 70, avg reward: -314.768971952, e: 0.1, learning_rate: 0.01, buffer_len: 5167, episode_length: 55
[NOR] Episode: 80, avg reward: -292.654297829, e: 0.1, learning_rate: 0.01, buffer_len: 5857, episode_leng

KeyboardInterrupt: 

In [None]:
import time

model_run = DQN(
    n_actions, n_states,
    model_path = model_path + ".max",
    flush_secs = 3.0,
    restore = True
)



s = env.reset()
done = False

while not done:
    a = model_run.choose_action(s, e=0.2)
    s, r, done, info = env.step(a)
    env.render()
    time.sleep(0.01)