In [2]:
import numpy as np
import gym
from numpy.random import choice
import random
from phi.api import *
import tensorflow as tf
from tfinterface.reinforcement import OnBatchModel, ExperienceReplay
from tfinterface.interfaces import EnvironmentInterface
from tfinterface.model_base import ModelBase
from tensorflow.python import debug as tf_debug
import os
from scipy.interpolate import interp1d
import numbers


def get_run():
    """Return the next run number, persisting it in ``run.txt``.

    The counter file is looked up in the current working directory and is
    expected to hold the last run number on its first line. When the file is
    missing, empty or does not start with an integer, counting restarts and
    the call returns 0.

    Returns:
        int: the incremented run number, which is also written back to the file.
    """
    try:
        with open("run.txt") as f:
            # Only the first line carries the counter, so split on a real
            # newline ("/n" never matches anything).
            run = int(f.read().split("\n")[0])
    except (IOError, ValueError):
        # No counter file yet, or its first line is not an integer.
        run = -1

    run += 1

    # Mode "w" truncates on open, so the file ends up holding just the number.
    with open("run.txt", "w") as f:
        f.write(str(run))

    return run

In [40]:
def select_columns(tensor, indexes):
    """Pick one entry per row: ``tensor[i, indexes[i]]`` for each ``i``.

    Builds ``(row, column)`` index pairs by stacking a running row counter
    with ``indexes`` along axis 1 and hands them to ``tf.gather_nd``.

    Args:
        tensor: tensor to read from (assumed to have at least two
            dimensions - TODO confirm against callers).
        indexes: 1-D integer tensor holding one column index per row.

    Returns:
        Tensor with one gathered element per entry of ``indexes``.
    """
    row_ids = tf.range(tf.shape(indexes)[0])
    coords = tf.stack((row_ids, indexes), 1)
    return tf.gather_nd(tensor, coords)

def soft_if(cond, then, else_):
    """Blend ``then`` and ``else_`` arithmetically using ``cond`` as the weight.

    Computes ``cond * then + (1 - cond) * else_``. With ``cond`` equal to 1
    the result is ``then``; with ``cond`` equal to 0 it is ``else_``;
    values in between interpolate linearly.
    """
    taken = cond * then
    not_taken = (1.0 - cond) * else_
    return taken + not_taken


def map_gradients(gradients, clip_fun):
    """Apply ``clip_fun`` to the gradient of every ``(gradient, variable)`` pair.

    Args:
        gradients: iterable of ``(gradient, variable)`` pairs.
        clip_fun: callable applied to each gradient that is not ``None``.

    Returns:
        list of ``(new_gradient, variable)`` tuples in input order. A pair
        whose gradient is ``None`` is passed through unchanged, because
        optimizers report ``None`` for variables that do not influence the
        loss and clipping functions generally cannot handle it.
    """
    return [
        (g if g is None else clip_fun(g), v)
        for g, v in gradients
    ]

class ExpandedStateEnv(EnvironmentInterface):
    """Environment wrapper whose state is the last few observations concatenated.

    After ``reset`` the state is the first observation repeated ``n_frames``
    times; every ``step`` drops the oldest observation and appends the newest.
    Any attribute not defined here is looked up on the wrapped environment.
    """

    def __init__(self, env, n_frames=3):
        """
        Args:
            env: an environment instance, or an id string passed to ``gym.make``.
            n_frames: number of consecutive observations kept in the state
                (defaults to 3).

        Raises:
            ValueError: if ``n_frames`` is smaller than 1.
        """
        if n_frames < 1:
            raise ValueError("n_frames must be >= 1, got {}".format(n_frames))

        # Accept both byte and unicode strings (they differ on Python 2).
        string_types = (str, type(u""))
        self.env = gym.make(env) if isinstance(env, string_types) else env
        self.n_frames = n_frames

    def reset(self):
        """Reset the wrapped environment and return the initial stacked state."""
        s = self.env.reset()
        self.s = np.hstack([s] * self.n_frames)
        return self.s

    def step(self, a):
        """Advance one step and return ``(stacked_state, reward, done, info)``."""
        s, r, done, info = self.env.step(a)
        n = len(s)
        # Slide the window: drop the oldest observation, append the newest.
        self.s = np.hstack((self.s[n:], s))

        return self.s, r, done, info

    def __getattr__(self, attr):
        # __getattr__ only runs when normal lookup fails. If ``env`` itself is
        # missing (e.g. while the object is being copied or unpickled),
        # delegating to ``self.env`` would recurse without end.
        if attr == "env":
            raise AttributeError(attr)
        return getattr(self.env, attr)
    
        

class Inputs(object):
    """Placeholders fed to the actor and critic graphs, created under ``scope``."""

    def __init__(self, n_states, scope):
        """Create the placeholders.

        Args:
            n_states: width of the state vector fed through ``s``.
            scope: variable scope the placeholders are created in.
        """
        scalar_shape = []
        per_sample = [None]

        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, scalar_shape, name='episode_length')

            # Per-sample batch inputs.
            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, per_sample, name='a')
            self.r = tf.placeholder(tf.float32, per_sample, name='r')
            self.V1 = tf.placeholder(tf.float32, per_sample, name='V1')
            self.done = tf.placeholder(tf.float32, per_sample, name='done')

            # Scalar hyper-parameters; note the keep_prob node is named 'training'.
            self.learning_rate = tf.placeholder(tf.float32, scalar_shape, name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, scalar_shape, name='training')

    def fit_feed(self, S, A, R, V1, Done, learning_rate, keep_prob):
        """Return a feed dict mapping every training placeholder to its value."""
        feed = {}
        feed[self.s] = S
        feed[self.a] = A
        feed[self.r] = R
        feed[self.V1] = V1
        feed[self.done] = Done
        feed[self.learning_rate] = learning_rate
        feed[self.keep_prob] = keep_prob
        return feed
            
            
class Critic(object):
    """State-value network with its TD target, loss, optimizer step and summaries.

    Attributes set by the constructor:
        V: predicted value per sample (column 0 of the linear layer).
        target: ``r`` for terminal samples, ``r + y * V1`` otherwise, blended
            through ``soft_if`` with ``inputs.done`` as the weight.
        error: ``target - V``.
        loss: ``tf.nn.l2_loss`` of the error piped through ``tf.reduce_mean``.
        variables: global variables whose name matches ``scope``.
        update: Adam step minimising ``loss`` over ``variables``.
        summaries: merged scalar and histogram summaries.
    """

    def __init__(self, inputs, n_actions, n_states, y, scope):
        """
        Args:
            inputs: ``Inputs`` instance providing the placeholders.
            n_actions: number of discrete actions (only used for the
                action histogram summary).
            n_states: unused here; kept for a signature parallel to ``Actor``.
            y: discount factor applied to ``inputs.V1``.
            scope: variable scope the network is built in.
        """
        with tf.variable_scope(scope):

            # Settings shared by both dense layers: small positive uniform
            # kernels and no bias term.
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                use_bias=False,
                bias_initializer=None
            )

            net = tf.layers.dense(inputs.s, 32, activation=tf.nn.relu, name='relu_layer', **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)
            self.V = tf.layers.dense(net, 1, name='linear_layer', **ops)[:, 0]

            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.V1)

            self.error = self.target - self.V
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            # Collected before the optimizer is created, so (presumably) the
            # optimizer's own bookkeeping variables are not part of the list.
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            # tf.nn.moments yields (mean, variance); take the square root so the
            # 'std_error' summary really is a standard deviation.
            avg_error, var_error = tf.nn.moments(self.error, [0])
            std_error = tf.sqrt(var_error)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.scalar('avg_error', avg_error),
                tf.summary.scalar('std_error', std_error),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
            
            
class Actor(object):
    """Softmax policy network trained against the target critic's error.

    Attributes set by the constructor:
        P: action probabilities per sample (softmax over ``n_actions``).
        Pa: probability of the action actually taken (``inputs.a``).
        loss: mean of ``-log(clip(Pa, 1e-3, 1)) * target_critic.error``.
        variables: global variables whose name matches ``scope``.
        update: Adam step minimising ``loss`` over ``variables`` only.
        summaries: merged scalar and histogram summaries.
    """

    def __init__(self, inputs, target_critic, n_actions, n_states, y, scope):
        """
        Args:
            inputs: ``Inputs`` instance providing the placeholders.
            target_critic: object exposing an ``error`` tensor used to weight
                the log-probabilities.
            n_actions: number of discrete actions.
            n_states: unused here; kept for a signature parallel to ``Critic``.
            y: unused here; kept for a signature parallel to ``Critic``.
            scope: variable scope the network is built in.
        """
        with tf.variable_scope(scope):

            # Settings shared by both dense layers: small positive uniform
            # kernels and no bias term.
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                use_bias=False,
                bias_initializer=None
            )

            net = tf.layers.dense(inputs.s, 32, activation=tf.nn.relu, name='relu_layer', **ops)
            net = tf.nn.dropout(net, inputs.keep_prob)
            self.P = tf.layers.dense(net, n_actions, activation=tf.nn.softmax, name='softmax_layer', **ops)

            self.Pa = select_columns(self.P, inputs.a)

            # Clip the probability away from zero so the logarithm stays finite.
            self.loss = - tf.log(tf.clip_by_value(self.Pa, 1e-3, 1.0)) * target_critic.error
            self.loss = tf.reduce_mean(self.loss)

            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss, var_list=self.variables)

            summaries = [tf.summary.scalar('loss', self.loss)]

            # One mean-probability scalar per action ('Pa0', 'Pa1', ...), so the
            # summaries work for any number of actions rather than exactly two.
            summaries += [
                tf.summary.scalar('Pa{}'.format(i), tf.reduce_mean(self.P[:, i]))
                for i in range(n_actions)
            ]

            summaries += [
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]

            summaries += [
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ]

            self.summaries = tf.summary.merge(summaries)

class Model(ModelBase):
    """Actor-critic agent with a replay buffer and a slowly tracking target critic.

    Relies on ``ModelBase`` (not visible here) for ``self.graph``,
    ``self.sess``, ``self.writer``, ``self.global_step``, ``self.model_path``
    and ``self.save``.
    """

    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000, pi=0.01):
        """Build the placeholders, the two critics, the actor and the update ops.

        Args:
            n_actions: number of discrete actions.
            n_states: width of the state vector.
            y: discount factor used in the critics' targets.
            buffer_length: maximum size of the experience replay buffer.
            pi: fraction of the (critic - target) difference added to each
                target-critic variable per ``update_target`` run.
        """
        self.global_max = 0.0
        self.replay_buffer = ExperienceReplay(max_length=buffer_length)

        with self.graph.as_default(), tf.device("cpu:0"):

            self.inputs = Inputs(n_states, "inputs")

            self.critic = Critic(self.inputs, n_actions, n_states, y, "critic")
            self.target_critic = Critic(self.inputs, n_actions, n_states, y, "target_critic")
            self.actor = Actor(self.inputs, self.target_critic, n_actions, n_states, y, "actor")

            # A single training step updates the critic and the actor together.
            self.update = tf.group(self.critic.update, self.actor.update)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.inputs.episode_length)

            self.summaries = tf.summary.merge([self.actor.summaries, self.critic.summaries, self.target_critic.summaries])

            # Move every target variable a fraction ``pi`` of the way towards the
            # matching critic variable (pairs are formed by collection order).
            self.update_target = tf.group(*[
                t.assign_add(pi * (a - t)) for t, a in zip(self.target_critic.variables, self.critic.variables)
            ])

    def fit_feed(self, *args, **kwargs):
        """Forward to ``Inputs.fit_feed`` to build the training feed dict."""
        return self.inputs.fit_feed(*args, **kwargs)

    def choose_action(self, state, keep_prob, e=0.0):
        """Sample an action for a single state.

        Args:
            state: one state vector (it is wrapped into a batch of one).
            keep_prob: dropout keep probability fed to the actor.
            e: probability of ignoring the policy and picking uniformly.

        Returns:
            int: index of the chosen action.
        """
        actions = self.sess.run(self.actor.P, feed_dict={
            self.inputs.s: [state],
            self.inputs.keep_prob: keep_prob
        })[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.random.choice(n, p=actions)

    def fit(self, env, keep_prob=0.5, learning_rate=0.01, print_step=10, update_target=32, episodes=100000, max_episode_length=float('inf'), batch_size=32):
        """Run the interaction/training loop.

        Every environment step is stored in the replay buffer, a random batch
        is drawn from it and one joint actor/critic update is run. The model
        is saved to ``model_path + ".max"`` whenever an episode is at least as
        long as the longest one so far, and to the default path every
        ``print_step`` episodes.

        Args:
            env: environment exposing ``reset()`` and ``step(action)``.
            keep_prob: dropout keep probability used both when acting and
                when training.
            learning_rate: learning rate fed to the optimizers.
            print_step: report and save every this many episodes.
            update_target: run the target-critic update every this many
                global steps.
            episodes: number of episodes to run.
            max_episode_length: maximum number of steps taken per episode.
            batch_size: number of transitions sampled per update.
        """
        for episode in range(episodes):
            done = False
            episode_length = 0
            # Stays None if the episode takes no step at all.
            feed_dict = None
            s = env.reset()

            # Strict comparison: at most ``max_episode_length`` steps per episode.
            while not done and episode_length < max_episode_length:
                self.global_step += 1
                episode_length += 1

                a = self.choose_action(s, keep_prob)
                s1, r, done, info = env.step(a)

                self.replay_buffer.append((s, a, r, s1, float(done)))

                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                # Bootstrap values come from the target critic with dropout off.
                V1 = self.sess.run(self.target_critic.V, feed_dict={self.inputs.s: S1, self.inputs.keep_prob: 1.0})

                # Train with the requested dropout rate (feeding the literal True
                # would be read as keep_prob == 1.0, i.e. no dropout).
                feed_dict = self.fit_feed(S, A, R, V1, Done, learning_rate, keep_prob)

                if self.global_step > 1:
                    _, summaries = self.sess.run([self.update, self.summaries], feed_dict=feed_dict)
                    # Record the step so the points are spread along the x axis.
                    self.writer.add_summary(summaries, global_step=self.global_step)

                if self.global_step % update_target == 0:
                    self.sess.run(self.update_target)

                s = s1

            episode_length_summary = self.sess.run(self.episode_length_summary,
                                                   feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, global_step=self.global_step)

            if episode_length >= self.global_max:
                print("[MAX] Episode: {}, Length: {}, buffer_len: {}".format(episode, episode_length, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".max")
                self.global_max = episode_length

            if episode % print_step == 0:
                print("[NOR] Episode: {}, Length: {}, Learning Rate: {}, buffer_len: {}".format(episode, episode_length, learning_rate, len(self.replay_buffer)))
                if feed_dict is not None:
                    # Loss on the last batch used in this episode.
                    actor_loss = self.sess.run(self.actor.loss, feed_dict=feed_dict)
                    print("Loss: {}".format(actor_loss))
                self.save()
            
# Allocate a fresh run id and build the stacked-observation CartPole environment.
run = get_run()
env = ExpandedStateEnv("CartPole-v1")

# The wrapper concatenates three consecutive observations, so the network
# input is three times as wide as a single raw observation.
n_actions = env.action_space.n
n_states = 3 * env.observation_space.shape[0]

# Checkpoints go next to the notebook; logs go to a per-run directory.
model_path = os.getcwd() + "/actor-critic.model"
logs_path = "/logs/run{}".format(run)

print("Run: {}".format(run))

[2017-02-26 06:42:02,527] Making new env: CartPole-v1


Run: 167


In [41]:
# Build the agent: near-undiscounted returns, a large replay buffer and a
# target critic that moves 10% of the way towards the critic per update.
model = Model(
    n_actions,
    n_states,
    y=0.9999,
    buffer_length=1000000,
    pi=0.1,
    model_path=model_path,
    logs_path=logs_path,
)

In [42]:
# Train; interrupt the kernel to stop early (checkpoints are saved as it goes).
model.fit(
    env,
    keep_prob=0.5,
    learning_rate=0.01,
    print_step=10,
    episodes=100000,
    max_episode_length=200e3,
    batch_size=32,
)

[MAX] Episode: 0, Length: 12, buffer_len: 12
[NOR] Episode: 0, Length: 12, Learning Rate: 0.01, buffer_len: 12
Loss: 0.693122327328
[MAX] Episode: 2, Length: 14, buffer_len: 36
[MAX] Episode: 4, Length: 23, buffer_len: 68
[NOR] Episode: 10, Length: 8, Learning Rate: 0.01, buffer_len: 142
Loss: 0.552762031555
[NOR] Episode: 20, Length: 12, Learning Rate: 0.01, buffer_len: 280
Loss: 0.512374162674
[MAX] Episode: 27, Length: 28, buffer_len: 385
[NOR] Episode: 30, Length: 19, Learning Rate: 0.01, buffer_len: 424
Loss: 0.385224342346
[NOR] Episode: 40, Length: 9, Learning Rate: 0.01, buffer_len: 547
Loss: 0.291245102882
[NOR] Episode: 50, Length: 9, Learning Rate: 0.01, buffer_len: 677
Loss: 0.476347327232
[NOR] Episode: 60, Length: 9, Learning Rate: 0.01, buffer_len: 792
Loss: 0.53714710474
[NOR] Episode: 70, Length: 9, Learning Rate: 0.01, buffer_len: 913
Loss: 0.290581226349
[MAX] Episode: 71, Length: 30, buffer_len: 943
[NOR] Episode: 80, Length: 11, Learning Rate: 0.01, buffer_len: 107

KeyboardInterrupt: 

In [None]:
import time

# Reload the best checkpoint written during training (the ".max" file) and
# play one rendered episode with dropout disabled and no random exploration.
model_run = Model(
    n_actions, n_states,
    model_path = model_path + ".max",
    flush_secs = 3.0,
    restore = True
)
env = ExpandedStateEnv("CartPole-v1")
s = env.reset()
done = False
total = 0
try:
    while not done:
        total += 1
        a = model_run.choose_action(s, 1.0, e=0.)
        s, r, done, info = env.step(a)
        env.render()
        # Slow the loop down so the rendering can be followed by eye.
        time.sleep(0.01)
finally:
    # Release the render window even if the rollout is interrupted.
    env.close()

# Function-call form works on both Python 2 and 3, like the other prints here.
print(total)