In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
from tfinterface.model_base import ModelBase
from tfinterface.reinforcement import ExperienceReplay
from tfinterface.utils import select_columns, soft_if, get_run, map_gradients
from phi.api import *
import tensorflow as tf
import random
from scipy.interpolate import interp1d
import numpy as np
import gym
from gym import wrappers
from tfinterface.reinforcement import ExpandedStateEnv
import os
import time


name = "actor-critic-base"

In [12]:
class Inputs(object):
    def __init__(self, n_states, scope):
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')
            self.v1 = tf.placeholder(tf.float32, [None], name='V1')
            self.done = tf.placeholder(tf.float32, [None], name='done')
            
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')
            self.training = tf.placeholder(tf.bool, [], name='training')
            
            self.pi = tf.placeholder(tf.float32, [], name='pi')
            

class Critic(object):
    def __init__(self, base_model, inputs, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)
            
            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer2", **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)
            
            self.V = tf.layers.dense(net, n_actions, name='V', **ops)[:, 0]

            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.v1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

#             self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)
            trainer = tf.train.AdamOptimizer(inputs.learning_rate)
            gradients = trainer.compute_gradients(self.loss, var_list=self.variables)
            gradients = map_gradients(lambda g: tf.clip_by_norm(g, 1), gradients)
            self.update = trainer.apply_gradients(gradients)
            
            avg_error, std_error = tf.nn.moments(self.error, [0])
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
class Actor(object):
    def __init__(self, base_model, inputs, target_critic, n_actions, n_states, y, scope):
        with tf.variable_scope(scope):
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                bias_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01)
            )

            net = inputs.s

            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer", **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)
            
            net = tf.layers.dense(net, 64, activation=tf.nn.relu, name="relu_layer2", **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)

            self.P = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='P', use_bias=False, **ops)
            self.Pa = select_columns(self.P, inputs.a)
            
            r_pred = net
            r_pred = tf.concat([r_pred, tf.one_hot(inputs.a, n_actions)], 1)
            r_pred = tf.layers.dense(net, 32, activation=tf.nn.softmax, name='r_pred_relu', **ops)
            r_pred = tf.layers.dense(r_pred, 1, activation=tf.nn.softmax, name='r_pred', **ops)[:,0]
            r_loss = 0.01 * Pipe(r_pred - inputs.r, tf.nn.l2_loss, tf.reduce_mean)

            self.loss = - tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss) + r_loss

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

#             self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)
            trainer = tf.train.AdamOptimizer(inputs.learning_rate)
            gradients = trainer.compute_gradients(self.loss, var_list=self.variables)
            gradients = map_gradients(lambda g: tf.clip_by_norm(g, 1), gradients)
            self.update = trainer.apply_gradients(gradients)
            
            
            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            


In [13]:
class LunarLander(ModelBase):
    
    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.1):
        self.global_max = float('-inf')

        self.positive_buffer = ExperienceReplay(max_length=buffer_length/2)
        self.negative_buffer = ExperienceReplay(max_length=buffer_length/2)


        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self, self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self, self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self, self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])
    
    
    def predict_feed(self, S):
        return {
            self.inputs.s: S,
            self.inputs.keep_prob: 1.0,
            self.inputs.training: False
        }
    
    def predict(self, state, e = 0.0):
        predict_feed = self.predict_feed([state])
        actions = self.sess.run(self.actor.P, feed_dict=predict_feed)
        actions = actions[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)
    
    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        return {
            self.inputs.s: S,
            self.inputs.a: A,
            self.inputs.r: R,
            self.inputs.v1: V1,
            self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate,
            self.inputs.keep_prob: keep_prob,
            self.inputs.training: True
        }
    
    
    def fit(self, env, keep_prob=0.5, e=0.01, learning_rate=0.01, print_step=10, 
            update_target_step = 32, episodes=100000, max_episode_length=float('inf'), batch_size=32,
            min_buffer_size=10000):
        
        r_total = 0.
        batch_size /= 2

        for episode in range(episodes):
            done = False
            ep_step = 0
            s = env.reset()
            episode_length = 0
            ep_reward = 0.
            
            while not done and ep_step <= max_episode_length:
                self.global_step += 1
                episode_length += 1
                ep_step += 1
                
                
                _learning_rate = learning_rate(self.global_step) if hasattr(learning_rate, '__call__') else learning_rate
                _e = e(self.global_step) if hasattr(e, '__call__') else e
                
                
                a = self.predict(s, e = _e)
                s1, r, done, info = env.step(a)
                r_total += r
                ep_reward += r
                
                
                if r > 0:
                    self.positive_buffer.append((s, a, r, s1, float(done)))
                else:
                    self.negative_buffer.append((s, a, r, s1, float(done)))
                    
                s = s1
                
                
                
                Sp, Ap, Rp, S1p, Donep = self.positive_buffer.random_batch(batch_size).unzip() if len(self.positive_buffer) > 0 else ([], [], [], [], [])
                Sn, An, Rn, S1n, Donen = self.negative_buffer.random_batch(batch_size).unzip() if len(self.negative_buffer) > 0 else ([], [], [], [], [])
                
                S, A, R, S1, Done = Sp + Sn, Ap + An, Rp + Rn, S1p + S1n, Donep + Donen
                
                predict_feed = self.predict_feed(S1)
                V1 = self.sess.run(self.target_critic.V, feed_dict=predict_feed)

                
                fit_feed = self.fit_feed(S, A, R, V1, Done, _learning_rate, keep_prob)
                
                if self.global_step < min_buffer_size:
                    continue
                
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=fit_feed)
                self.writer.add_summary(summaries, self.global_step)
                
                
                if self.global_step % update_target_step == 0:
                    self.sess.run(self.update_target)
                
                
                
                
            
            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, self.global_step)


            if ep_reward >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, Reward: {}, global_step: {}".format(episode, episode_length, ep_reward, self.global_step))
                self.save(model_path = self.model_path + ".{score}".format(score = ep_reward))
                self.global_max = ep_reward


            if episode % print_step == 0 and episode > 0:
                avg_r = r_total / print_step
                actor_loss = self.sess.run(self.actor.loss, feed_dict=fit_feed)
                print("[NOR] Episode: {}, Length: {}, Avg Reward: {}, e: {}, Learning Rate: {}, global_step: {}".format(episode, episode_length, avg_r, _e, _learning_rate, self.global_step))
                print("Loss: {}".format(actor_loss))
                self.save()
                r_total = 0.


In [None]:
run = get_run()
env = gym.make("LunarLander-v2")
env = wrappers.Monitor(env, "monitor/{run}".format(run = run))
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{run}".format(path = os.getcwd(), run = run)
logs_path = "{path}/logs/{run}".format(path = os.getcwd(), run = run)


model = LunarLander(
    n_actions, n_states, y=0.95, 
    buffer_length=500000,
    model_path = model_path,
    logs_path = logs_path,
    restore = False,
    pi = 0.01
)

[2017-03-20 17:57:19,869] Making new env: LunarLander-v2
[2017-03-20 17:57:19,872] Creating monitor directory monitor/36


In [None]:
k = 40000.
model.fit(
    env, print_step=10, 
    episodes=int(1e5), max_episode_length=10000, batch_size=32,
    learning_rate = 0.005, # lambda t: 0.05 * k / (k + t)
    e = interp1d([0, 300000], [0.5, 0.05], fill_value=0.05, bounds_error=False),
    keep_prob = 0.5,
    update_target_step = 1,
    min_buffer_size = 20000
)

[2017-03-20 17:57:21,088] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/36/openaigym.video.7.18417.video000000.mp4


[MAX] Episode: 0, Length: 100, Reward: -329.910421965, global_step: 99


[2017-03-20 17:57:22,439] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/36/openaigym.video.7.18417.video000001.mp4


[MAX] Episode: 1, Length: 81, Reward: -134.660737674, global_step: 180


[2017-03-20 17:57:24,230] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/36/openaigym.video.7.18417.video000008.mp4


[NOR] Episode: 10, Length: 63, Avg Reward: -227.863212541, e: 0.498389, Learning Rate: 0.005, global_step: 1074
Loss: 1.24029505253
[MAX] Episode: 12, Length: 93, Reward: -126.548560835, global_step: 1256
[NOR] Episode: 20, Length: 103, Avg Reward: -275.01815288, e: 0.49715, Learning Rate: 0.005, global_step: 1900
Loss: 47.4998779297


[2017-03-20 17:57:27,945] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/36/openaigym.video.7.18417.video000027.mp4


[MAX] Episode: 26, Length: 72, Reward: 5.05415454995, global_step: 2460
[NOR] Episode: 30, Length: 85, Avg Reward: -214.417759002, e: 0.4956635, Learning Rate: 0.005, global_step: 2891
Loss: 93.9633789062
[NOR] Episode: 40, Length: 121, Avg Reward: -269.71775062, e: 0.4942505, Learning Rate: 0.005, global_step: 3833
Loss: 1.8716584444
[NOR] Episode: 50, Length: 130, Avg Reward: -307.864052393, e: 0.4927955, Learning Rate: 0.005, global_step: 4803
Loss: 1.18475544453
[NOR] Episode: 60, Length: 127, Avg Reward: -228.385374496, e: 0.4914335, Learning Rate: 0.005, global_step: 5711
Loss: 3.64567470551


[2017-03-20 17:57:34,535] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/36/openaigym.video.7.18417.video000064.mp4


[NOR] Episode: 70, Length: 101, Avg Reward: -248.667964503, e: 0.4898915, Learning Rate: 0.005, global_step: 6739
Loss: 2.64568042755
[NOR] Episode: 80, Length: 120, Avg Reward: -285.513358014, e: 0.488354, Learning Rate: 0.005, global_step: 7764
Loss: 0.731885552406
[NOR] Episode: 90, Length: 90, Avg Reward: -199.440076773, e: 0.4871315, Learning Rate: 0.005, global_step: 8579
Loss: 5.14626502991
[NOR] Episode: 100, Length: 102, Avg Reward: -267.710517089, e: 0.4857635, Learning Rate: 0.005, global_step: 9491
Loss: 124.74156189
[NOR] Episode: 110, Length: 94, Avg Reward: -197.561789133, e: 0.48446, Learning Rate: 0.005, global_step: 10360
Loss: 1.06783723831
[NOR] Episode: 120, Length: 122, Avg Reward: -210.188726574, e: 0.482993, Learning Rate: 0.005, global_step: 11338
Loss: 50.0718841553


[2017-03-20 17:57:43,577] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/36/openaigym.video.7.18417.video000125.mp4


[NOR] Episode: 130, Length: 70, Avg Reward: -211.904131771, e: 0.4815995, Learning Rate: 0.005, global_step: 12267
Loss: 47.9377593994
[NOR] Episode: 140, Length: 117, Avg Reward: -194.618146794, e: 0.4801505, Learning Rate: 0.005, global_step: 13233
Loss: 3.96183943748
[NOR] Episode: 150, Length: 106, Avg Reward: -276.924482145, e: 0.4787, Learning Rate: 0.005, global_step: 14200
Loss: 0.917749583721
[NOR] Episode: 160, Length: 69, Avg Reward: -233.496597722, e: 0.4773905, Learning Rate: 0.005, global_step: 15073
Loss: 2.17873096466
[NOR] Episode: 170, Length: 95, Avg Reward: -246.64443882, e: 0.4760465, Learning Rate: 0.005, global_step: 15969
Loss: 47.1853790283
[NOR] Episode: 180, Length: 126, Avg Reward: -249.046203727, e: 0.4746005, Learning Rate: 0.005, global_step: 16933
Loss: 2.43584799767
[NOR] Episode: 190, Length: 102, Avg Reward: -191.253429562, e: 0.4731695, Learning Rate: 0.005, global_step: 17887
Loss: 0.62024974823
[NOR] Episode: 200, Length: 100, Avg Reward: -168.4091

[2017-03-20 17:57:57,396] Starting new video recorder writing to /home/cristian/data/neura-lab/q-learning/notebooks/lunar-lander/actor-critic/base/monitor/36/openaigym.video.7.18417.video000216.mp4


[NOR] Episode: 220, Length: 283, Avg Reward: -290.204984836, e: 0.468194, Learning Rate: 0.005, global_step: 21204
Loss: 23.3327674866
[NOR] Episode: 230, Length: 182, Avg Reward: -230.668763061, e: 0.46529, Learning Rate: 0.005, global_step: 23140
Loss: -2.55795383453
[MAX] Episode: 237, Length: 1000, Reward: 46.0603061899, global_step: 25551
[NOR] Episode: 240, Length: 384, Avg Reward: -138.51013865, e: 0.460382, Learning Rate: 0.005, global_step: 26412
Loss: 32.1707992554


In [16]:

env = gym.make("LunarLander-v2")
env = ExpandedStateEnv(env, 3)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0] * 3
model_path =  "{path}/models/{name}".format(path = os.getcwd(), name = 36)
logs_path = "{path}/logs/".format(path = os.getcwd(), name = name)


model_run = LunarLander(
    n_actions, n_states,
    model_path = model_path,
    flush_secs = 3.0,
    restore = True
)

for i in range(100):
    s = env.reset()
    done = False
    total = 0.
    ep = 0
    while not done and ep < 700:
        ep += 1
        a = model_run.predict(s, 0.0)
        s, r, done, info = env.step(a)
        total += r
        env.render()
        time.sleep(0.01)
    
    print(total)
    
env.render(close=True)


[2017-03-20 18:04:48,763] Making new env: LunarLander-v2


-207.369028694
-126.784393113


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type

In [18]:
run

3