In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run, map_gradients
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time
from itertools import groupby

name = "actor-critic-shared-base"

'module' object has no attribute '__module__'


In [3]:
def update_dict(d, key, default, f):
    if key in d:
        d[key] = f(d[key])
    else:
        d[key] = default

def combine_gradients(grads1, grads2):
    d = {}
    
    for g, v in grads1 + grads2:
        update_dict(d, v, g, (g1) -> g1 + g)
    
    return [ (g, v) for v, g in d.items() ]


grads1 = [(5, 0), (3, 1)]
grads2 = [(2, 0)]
combine_gradients(grads1, grads2)

[(7, 0), (3, 1)]

In [50]:
class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')
            self.episode_reward = tf.placeholder(tf.float32, [], name='episode_reward')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Base(object):
    def __init__(self, inputs, n_states, scope, ops):
        with tf.variable_scope(scope):
            net = inputs.s

            net = tf.layers.dense(net, 128, activation=tf.nn.relu, name="relu_layer", use_bias=True, **ops)
            self.net = tf.nn.dropout(net, inputs.keep_prob)
            
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

class Critic(object):
    def __init__(self, base, inputs, n_actions, n_states, y, scope, ops):
        with tf.variable_scope(scope):
            
            self.V = (
                base.net
                |> tf.layers.dense$(?, 64, name='relu_layer', activation=tf.nn.relu, **ops)
                |> tf.nn.dropout$(?, inputs.keep_prob)
                |> tf.layers.dense$(?, n_actions, name='V', **ops)
                |> (lambda net: net[:, 0])
            )

            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) + base.variables

            self.gradients = tf.train.AdamOptimizer(inputs.learning_rate).compute_gradients(self.loss, var_list=self.variables)

            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', (
                    inputs.a
                    |> tf.one_hot$(?, n_actions)
                    |> tf.reduce_mean$(?, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base, inputs, target_critic, n_actions, n_states, y, scope, ops):
        with tf.variable_scope(scope):
            self.P = (
                base.net
                |> tf.layers.dense$(?, 64, name='relu_layer', activation=tf.nn.relu, **ops)
                |> tf.nn.dropout$(?, inputs.keep_prob)
                |> tf.layers.dense$(?, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
            )
            
            
            self.Pa = select_columns(self.P, inputs.a)

            self.loss = - tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope) + base.variables

            self.gradients = tf.train.AdamOptimizer(inputs.learning_rate).compute_gradients(self.loss, var_list=self.variables)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


In [56]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1, clip=10):
        self.global_max = float('-inf')
        self.replay_buffer = ExperienceReplay(max_length=buffer_length)
        ops = dict(
            trainable=True,
            kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
            bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
        )


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")
            
            self.base = Base(self.inputs, n_states, "base", ops)
            self.target_base = Base(self.inputs, n_states, "target_base", ops)
            
            self.critic = Critic(self.base, self.inputs, n_actions, n_states, y, "critic", ops)
            self.target_critic = Critic(self.target_base, self.inputs, n_actions, n_states, y, "target_critic", ops)
            
            self.actor = Actor(self.base, self.inputs, self.target_critic, n_actions, n_states, y, "actor", ops)
            
            with tf.name_scope("combine_gradients"):
                self.gradients = (
                    combine_gradients(self.actor.gradients, self.critic.gradients)
                    |> map_gradients$(tf.clip_by_norm$(?, clip))
                )
            
            self.update = tf.train.AdamOptimizer(self.inputs.learning_rate).apply_gradients(self.gradients)

            self.episode_summaries = tf.summary.merge([
                tf.summary.scalar('episode_length', self.inputs.episode_length),
                tf.summary.scalar('episode_reward', self.inputs.episode_reward)
            ])

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])
            
            with tf.name_scope("update_targets"):
                self.update_target = tf.group(*[
                    t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
                ])
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        
        r_total = 0.

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                self.replay_buffer.append((s, a, r, s1, float(done)))
                
                
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                s = s1
                
            
            episode_summaries = self.sess.run(self.episode_summaries,feed_dict={
                self.inputs.episode_length: episode_length,
                self.inputs.episode_reward: ep_reward
            })
            self.writer.add_summary(episode_summaries, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, buffer_len: {}".format(episode, episode_length, ep_reward, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, avg_r, _e, _learning_rate, len(self.replay_buffer)))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [57]:
run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{run}".format(run = run))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/{run}".format(path = os.getcwd(), name = name, run = run)


model = LunarLander(
    n_actions, n_states, y=0.9999, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.005,
    clip = 5.
)

[2017-03-17 18:12:07,172] Making new env: LunarLander-v2
[2017-03-17 18:12:07,176] Creating monitor directory monitor/23


False


In [58]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.05, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.4, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1
)

[2017-03-17 18:12:08,253] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/23/openaigym.video.18.4296.video000000.mp4


[MAX] Episode: 0, Length: 86, Reward: -405.336138105, buffer_len: 86


[2017-03-17 18:12:12,682] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/23/openaigym.video.18.4296.video000001.mp4


[MAX] Episode: 2, Length: 63, Reward: -360.878044705, buffer_len: 283
[MAX] Episode: 5, Length: 57, Reward: -250.139148962, buffer_len: 470


[2017-03-17 18:12:16,105] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/23/openaigym.video.18.4296.video000008.mp4


[NOR] Episode: 10, Length: 82, Avg Reward: -470.926908496, e: 0.399077166667, Learning Rate: 0.05, buffer_len: 792
Loss: -17.195520401
[NOR] Episode: 20, Length: 80, Avg Reward: -513.807438635, e: 0.3982185, Learning Rate: 0.05, buffer_len: 1528
Loss: 4.6593503952


[2017-03-17 18:12:23,083] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/23/openaigym.video.18.4296.video000027.mp4


[NOR] Episode: 30, Length: 97, Avg Reward: -442.845449959, e: 0.3969305, Learning Rate: 0.05, buffer_len: 2632
Loss: -3.78609538078
[NOR] Episode: 40, Length: 95, Avg Reward: -437.822317873, e: 0.395338, Learning Rate: 0.05, buffer_len: 3997
Loss: -0.462967276573
[NOR] Episode: 50, Length: 126, Avg Reward: -560.481444936, e: 0.393851666667, Learning Rate: 0.05, buffer_len: 5271
Loss: 2.2222533226
[MAX] Episode: 52, Length: 138, Reward: -209.829761904, buffer_len: 5492
[NOR] Episode: 60, Length: 218, Avg Reward: -497.69082713, e: 0.392027, Learning Rate: 0.05, buffer_len: 6835
Loss: 7.23928546906


[2017-03-17 18:12:43,446] Starting new video recorder writing to /data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/23/openaigym.video.18.4296.video000064.mp4


[NOR] Episode: 70, Length: 332, Avg Reward: -514.454895625, e: 0.3903715, Learning Rate: 0.05, buffer_len: 8254
Loss: 0.323273897171
[NOR] Episode: 80, Length: 155, Avg Reward: -573.204097233, e: 0.389011166667, Learning Rate: 0.05, buffer_len: 9420
Loss: -1.08817768097
[MAX] Episode: 86, Length: 232, Reward: -165.252241254, buffer_len: 10361
[NOR] Episode: 90, Length: 82, Avg Reward: -410.147070293, e: 0.387163166667, Learning Rate: 0.05, buffer_len: 11004
Loss: 34.1177482605
[MAX] Episode: 98, Length: 97, Reward: -148.124360316, buffer_len: 11781
[NOR] Episode: 100, Length: 83, Avg Reward: -410.864578699, e: 0.386022166667, Learning Rate: 0.05, buffer_len: 11982
Loss: 2.2966837883


[2017-03-17 18:13:06,078] Finished writing results. You can upload them to the scoreboard via gym.upload(u'/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/shared-base/monitor/22')


KeyboardInterrupt: 

In [6]:

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/{name}".format(path = os.getcwd(), name = name)
logs_path = "{path}/logs/".format(path = os.getcwd(), name = name)


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    
    print(total)
    
env.render(close=True)


[2017-03-17 12:07:02,890] Making new env: LunarLander-v2
[2017-03-17 12:07:02,892] Finished writing results. You can upload them to the scoreboard via gym.upload('/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/actor-critic-base')


False
196.104081847
149.867198645
215.673060563
217.593832845
201.192692875
198.152117614
-35.4402700038
218.394082887
193.044127714
120.793062792
234.628837747
213.755804367
0.398475525016
219.955313615
219.817810851
-31.113964878
204.808844432
218.3261461
223.759720668
230.32005815
192.594561856
192.463367622


KeyboardInterrupt: 