In [1]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os

In [2]:
class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.V = base_model.define_critic_network(inputs, n_actions, n_states)

            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            self.P = base_model.define_actor_network(inputs, n_actions, n_states)

            self.Pa = select_columns(self.P, inputs.a)

            self.loss = - tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


In [3]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')

        self.replay_buffer = ExperienceReplay(max_length=buffer_length)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])
            
            
    def define_actor_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        
        net = inputs.s
        
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
        net = tf.nn.dropout(net, inputs.keep_prob)
        
        net = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
        
        return net


    def define_critic_network(self, inputs, n_actions, n_states):
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )
        
        net = inputs.s
        
        net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)        
        net = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]
        
        return net
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                self.replay_buffer.append((s, a, r, s1, float(done)))
                
                
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries)
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".max")
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [6]:


run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "tmp/monitor{}".format(run))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = os.getcwd() + "/actor-critic.model"
logs_path = "logs/run{}".format(run)

print("Run: {}".format(run))

model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.01
)

[2017-03-13 07:35:16,029] Making new env: LunarLander-v2
[2017-03-13 07:35:16,032] Creating monitor directory tmp/monitor44


False
Run: 44


In [None]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.01, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.9,
    update_target_step = 1
)

[2017-03-13 07:35:18,586] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor44/openaigym.video.1.5687.video000000.mp4
[2017-03-13 07:35:20,900] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor44/openaigym.video.1.5687.video000001.mp4


[MAX] Episode: 0, Length: 146, Reward: -487.201210903, buffer_len: 146
[MAX] Episode: 2, Length: 93, Reward: -440.555031959, buffer_len: 361
[MAX] Episode: 4, Length: 127, Reward: -366.041002919, buffer_len: 608


[2017-03-13 07:35:24,067] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor44/openaigym.video.1.5687.video000008.mp4


[MAX] Episode: 9, Length: 154, Reward: -353.837952822, buffer_len: 1181
[NOR] Episode: 10, Length: 192, Avg Reward: -504.126328467, e: 0.398399333333, Learning Rate: 0.01, buffer_len: 1373
Loss: -1.76620423794
[MAX] Episode: 12, Length: 95, Reward: -305.273658475, buffer_len: 1579
[NOR] Episode: 20, Length: 76, Avg Reward: -485.680605657, e: 0.396886166667, Learning Rate: 0.01, buffer_len: 2670
Loss: -6.82359743118


[2017-03-13 07:35:31,602] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor44/openaigym.video.1.5687.video000027.mp4


[NOR] Episode: 30, Length: 80, Avg Reward: -459.651298405, e: 0.395667, Learning Rate: 0.01, buffer_len: 3715
Loss: -1.32222127914
[NOR] Episode: 40, Length: 278, Avg Reward: -612.816022161, e: 0.393554166667, Learning Rate: 0.01, buffer_len: 5526
Loss: 1.33149123192
[NOR] Episode: 50, Length: 400, Avg Reward: -548.016095995, e: 0.3917085, Learning Rate: 0.01, buffer_len: 7108
Loss: 2.06883955002
[MAX] Episode: 58, Length: 125, Reward: -304.985537927, buffer_len: 8074
[NOR] Episode: 60, Length: 77, Avg Reward: -437.51974016, e: 0.390282833333, Learning Rate: 0.01, buffer_len: 8330
Loss: 5.31168365479


[2017-03-13 07:35:46,603] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor44/openaigym.video.1.5687.video000064.mp4


[NOR] Episode: 70, Length: 322, Avg Reward: -586.209293548, e: 0.388630833333, Learning Rate: 0.01, buffer_len: 9746
Loss: 20.5478191376
[NOR] Episode: 80, Length: 225, Avg Reward: -467.000000011, e: 0.386961333333, Learning Rate: 0.01, buffer_len: 11177
Loss: 9.28769493103
[MAX] Episode: 84, Length: 80, Reward: -292.983151946, buffer_len: 11656
[NOR] Episode: 90, Length: 112, Avg Reward: -488.58446504, e: 0.385491333333, Learning Rate: 0.01, buffer_len: 12437
Loss: 7.60430049896
[MAX] Episode: 98, Length: 112, Reward: -240.93151075, buffer_len: 13561
[NOR] Episode: 100, Length: 76, Avg Reward: -491.118079788, e: 0.383958333333, Learning Rate: 0.01, buffer_len: 13751
Loss: 7.21946144104
[NOR] Episode: 110, Length: 169, Avg Reward: -507.340658958, e: 0.382516333333, Learning Rate: 0.01, buffer_len: 14987
Loss: 2.98401641846
[NOR] Episode: 120, Length: 101, Avg Reward: -468.433089428, e: 0.3809635, Learning Rate: 0.01, buffer_len: 16318
Loss: 64.6607131958


[2017-03-13 07:36:10,126] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor44/openaigym.video.1.5687.video000125.mp4


[MAX] Episode: 126, Length: 94, Reward: -58.1730999017, buffer_len: 17089
[NOR] Episode: 130, Length: 124, Avg Reward: -415.583284089, e: 0.3795425, Learning Rate: 0.01, buffer_len: 17536
Loss: 14.9348573685
[NOR] Episode: 140, Length: 184, Avg Reward: -469.024431493, e: 0.378295333333, Learning Rate: 0.01, buffer_len: 18605
Loss: 6.33234500885
[NOR] Episode: 150, Length: 136, Avg Reward: -515.950330443, e: 0.3766235, Learning Rate: 0.01, buffer_len: 20038
Loss: 11.6594953537
[NOR] Episode: 160, Length: 65, Avg Reward: -373.379121, e: 0.3754265, Learning Rate: 0.01, buffer_len: 21064
Loss: 73.8516845703
[NOR] Episode: 170, Length: 73, Avg Reward: -194.825130711, e: 0.374469833333, Learning Rate: 0.01, buffer_len: 21884
Loss: -8.87912559509
[NOR] Episode: 180, Length: 79, Avg Reward: -169.896969712, e: 0.373582, Learning Rate: 0.01, buffer_len: 22645
Loss: -8.15791416168
[MAX] Episode: 186, Length: 62, Reward: -50.4956587649, buffer_len: 23063
[NOR] Episode: 190, Length: 73, Avg Reward:

[2017-03-13 07:36:37,670] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic-2/tmp/monitor44/openaigym.video.1.5687.video000216.mp4


[NOR] Episode: 220, Length: 74, Avg Reward: -178.614766684, e: 0.370227833333, Learning Rate: 0.01, buffer_len: 25520
Loss: -8.62942028046
[NOR] Episode: 230, Length: 58, Avg Reward: -169.095927362, e: 0.369417, Learning Rate: 0.01, buffer_len: 26215
Loss: -5.03403520584
[NOR] Episode: 240, Length: 105, Avg Reward: -184.84595621, e: 0.368460333333, Learning Rate: 0.01, buffer_len: 27035
Loss: -1.22948789597
[NOR] Episode: 250, Length: 56, Avg Reward: -175.794996152, e: 0.367485, Learning Rate: 0.01, buffer_len: 27871
Loss: -13.2601833344
[NOR] Episode: 260, Length: 110, Avg Reward: -213.587966733, e: 0.3665995, Learning Rate: 0.01, buffer_len: 28630
Loss: -2.08478569984
[NOR] Episode: 270, Length: 78, Avg Reward: -259.463327466, e: 0.365607833333, Learning Rate: 0.01, buffer_len: 29480
Loss: -18.1402740479
[NOR] Episode: 280, Length: 57, Avg Reward: -188.93427588, e: 0.364727, Learning Rate: 0.01, buffer_len: 30235
Loss: -3.64950275421
[NOR] Episode: 290, Length: 59, Avg Reward: -161.9

In [None]:
import time

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
print(env.action_space)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path = os.getcwd() + "/actor-critic.model"
logs_path = "logs/run0"


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    print(total)
env.render(close=True)
