In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time


name = "lunar-lander-base"

In [3]:
class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.V = base_model.define_critic_network(inputs, n_actions, n_states)

            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.P = base_model.define_actor_network(inputs, n_actions, n_states)

            self.Pa = select_columns(self.P, inputs.a)

            self.loss = - tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


In [4]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')

        self.replay_buffer = ExperienceReplay(max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])
            
            
    def define_actor_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        
        net = inputs.s
        
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
        net = tf.nn.dropout(net, inputs.keep_prob)
        
        net = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
        
        return net


    def define_critic_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        
        net = inputs.s
        
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)        
        net = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]
        
        return net
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                self.replay_buffer.append((s, a, r, s1, float(done)))
                
                
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}.max".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [6]:
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{name}".format(name = name))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{name}/{name}.model".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/{name}".format(path = os.getcwd(), name = name)


model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.005
)

[2017-03-16 22:21:38,604] Making new env: LunarLander-v2
[2017-03-16 22:21:38,607] Creating monitor directory tmp/monitor/lunar-lander-base


In [None]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.01, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1
)

[2017-03-16 22:21:40,438] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor/lunar-lander-base/openaigym.video.0.9977.video000000.mp4
[2017-03-16 22:21:42,271] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor/lunar-lander-base/openaigym.video.0.9977.video000001.mp4


[MAX] Episode: 0, Length: 141, Reward: -169.862621605, buffer_len: 141
[MAX] Episode: 5, Length: 138, Reward: -87.858969857, buffer_len: 808
[MAX] Episode: 6, Length: 139, Reward: -72.7414348717, buffer_len: 947


[2017-03-16 22:21:47,404] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor/lunar-lander-base/openaigym.video.0.9977.video000008.mp4


[MAX] Episode: 10, Length: 178, Reward: -45.7347469854, buffer_len: 2418
[NOR] Episode: 10, Length: 178, Avg Reward: -286.245647511, e: 0.397180166667, Learning Rate: 0.01, buffer_len: 2418
Loss: -10.3632049561
[MAX] Episode: 14, Length: 137, Reward: -24.5288992475, buffer_len: 3577
[NOR] Episode: 20, Length: 143, Avg Reward: -127.396939849, e: 0.392185666667, Learning Rate: 0.01, buffer_len: 6699
Loss: -5.88282966614


[2017-03-16 22:22:07,444] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor/lunar-lander-base/openaigym.video.0.9977.video000027.mp4


[NOR] Episode: 30, Length: 236, Avg Reward: -141.697028313, e: 0.389630666667, Learning Rate: 0.01, buffer_len: 8889
Loss: -7.21165847778
[MAX] Episode: 34, Length: 158, Reward: 33.342165944, buffer_len: 10008
[MAX] Episode: 36, Length: 189, Reward: 40.2056866199, buffer_len: 10324
[NOR] Episode: 40, Length: 202, Avg Reward: -75.9001141286, e: 0.387244833333, Learning Rate: 0.01, buffer_len: 10934
Loss: -8.61496925354


In [7]:

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/stable/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/{name}".format(path = os.getcwd(), name = name)


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    
    print(total)
    
env.render(close=True)


[2017-03-17 11:16:03,985] Making new env: LunarLander-v2


False
217.756050565
188.086913388
181.011019837
161.089294676
238.233424924
215.999584346
192.102182956
218.579888014
228.065414563
208.693242141
191.151074058
195.356141588
242.339825009
232.464957469
21.7095646673
110.20470237
244.406228216
218.302161378
225.698059032
232.152458049
218.204175846
211.620227937
180.965451426
256.655153409
232.169926635
229.138606903
216.834140515
225.96814146
189.482300345
200.076429375
219.731258434
171.762874711
215.469253484
240.624357465
243.565446264
233.941404502
250.973142898
106.525419426
206.150799492
226.261741038
252.126767133
185.472303977
199.486188986
227.656093808
214.995035299
215.586125584
120.878694468
191.403415595
234.423343972
219.110877774
130.046218617
227.173051944
194.980972653
111.148596646
205.654850739
110.275561839
238.438564254
218.205193657
5.96065455885
224.305563159
210.370194785
233.885822779
196.956010019
225.152002145
185.547915962
212.454979071
249.842348963
214.075882199
216.144952439
176.300983654
185.331293429
22

In [1]:

from pynput.keyboard import Key, Listener
import time
import gym
from gym import wrappers

ACTION = 0

UP = False
LEFT = False
RIGHT = False

def set_action():
    global ACTION
    
    if RIGHT:
        ACTION = 3
    elif LEFT:
        ACTION = 1
    elif UP:
        ACTION = 0
    else:
        ACTION = 2
        

def on_press(key):
    global UP, LEFT, RIGHT

    if key == Key.left:
        LEFT = True
        RIGHT = False
    elif key == Key.right:
        RIGHT = True
        LEFT = False
    elif key == Key.down:
        UP = True
        
    set_action()

def on_release(key):
    global UP, LEFT, RIGHT
    
    if key == Key.left:
        LEFT = False
    elif key == Key.right:
        RIGHT = False
    elif key == Key.down:
        UP = False
        
    set_action()


# Collect events until released
with Listener(
        on_press=on_press,
        on_release=on_release):


    env = gym.make("LunarLander-v2")

    
    while True:
        s = env.reset()
        done = False
        total = 0.
        ep = 0
        while not done and ep < 700:
            ep += 1
            a = ACTION
            s, r, done, info = env.step(a)
            total += r
            env.render()
            time.sleep(0.02)
        print(total)
    
    env.render(close=True)


[2017-03-16 14:52:50,439] Making new env: LunarLander-v2


-197.609169383


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type