In [1]:
import numpy as np
import gym
from numpy.random import choice
import random
from phi.api import *
import tensorflow as tf
from tfinterface.reinforcement import OnBatchModel, OnExperienceTrainer, ExperienceReplay
from tfinterface.interfaces import EnvironmentInterface
from tfinterface.model_base import ModelBase
import os
from scipy.interpolate import interp1d
import numbers


# Create the raw CartPole-v1 environment. The original right-hand side was
# missing (a syntax error); the id is the one this cell's recorded log reports.
env = gym.make("CartPole-v1")

def select_columns(tensor, indexes):
    """Gather one entry per row: element i of the result is tensor[i, indexes[i]].

    Args:
        tensor: tensor to read from, indexed as (row, column).
        indexes: column index to take from each row.
            Assumes a 1-D integer tensor with one entry per row of `tensor`
            -- TODO confirm against callers.

    Returns:
        The tensor produced by `tf.gather_nd` over the (row, column) pairs.
    """
    # Row numbers 0..len(indexes)-1, paired with the requested column of each row.
    row_ids = tf.range(tf.shape(indexes)[0])
    coords = tf.stack((row_ids, indexes), 1)
    return tf.gather_nd(tensor, coords)

# Base run counter; a later cell increments it and uses it to name the log directory.
run = 20

[2017-02-21 17:22:11,572] Making new env: CartPole-v1


In [6]:
class ExtendedEnv(EnvironmentInterface):
    """Environment wrapper whose state is the last `n_frames` raw observations, concatenated.

    `reset()` fills the window with copies of the first observation; each
    `step()` drops the oldest observation and appends the newest one. Any
    attribute not defined here is looked up on the wrapped environment.

    Args:
        env: either an environment id string (passed to `gym.make`) or an
            already constructed environment object.
        n_frames: how many consecutive observations make up one state.
            Defaults to 3, the value that was previously hard-coded.

    Raises:
        ValueError: if `n_frames` is smaller than 1.
    """

    def __init__(self, env, n_frames=3):
        if n_frames < 1:
            raise ValueError("n_frames must be >= 1, got {}".format(n_frames))
        # isinstance (instead of an exact type comparison) also accepts str subclasses.
        self.env = gym.make(env) if isinstance(env, str) else env
        self.n_frames = n_frames

    def reset(self):
        """Reset the wrapped env and return the first observation repeated `n_frames` times."""
        s = self.env.reset()
        self.s = np.hstack((s,) * self.n_frames)
        return self.s

    def step(self, a):
        """Advance the wrapped env by action `a` and return (stacked state, r, done, info)."""
        s, r, done, info = self.env.step(a)
        n = len(s)
        # Slide the window: discard the oldest observation, append the newest.
        self.s = np.hstack((self.s[n:], s))

        return self.s, r, done, info

    def __getattr__(self, attr):
        # __getattr__ only runs when normal lookup fails. Read `env` straight
        # from the instance dict: going through `self.env` would re-enter this
        # method and recurse forever whenever `env` has not been set yet
        # (e.g. an instance created via __new__, copy or unpickling).
        try:
            wrapped = self.__dict__["env"]
        except KeyError:
            raise AttributeError(attr)
        return getattr(wrapped, attr)
    
        
# Rebind the notebook-level `env` to the observation-stacking wrapper around CartPole-v1.
env = ExtendedEnv("CartPole-v1")

class Object(object):
    """Empty class used as a plain container for arbitrary attributes."""

class Model(ModelBase):
    """Q-learning agent for a discrete-action environment, built from two identical networks.

    The "actor" network is trained and used to choose actions. The "target"
    network supplies the bootstrap value max_a Q(s1, a) and is overwritten with
    the actor's variables every 20 episodes (see `fit`).

    Relies on members supplied by `ModelBase`, which is not visible in this
    file: `graph`, `sess`, `writer`, `global_step`, `model_path`, `logs_path`
    and `save()`.
    """

    @staticmethod
    def set_network(self, n_actions, n_states, y, buffer_length, scope):
        """Build one Q-network under variable scope `scope` and attach its tensors to `self`.

        Declared static and called as `Model.set_network(obj, ...)` so that it
        can populate any attribute container, not only a `Model` instance.

        Args:
            self: object that receives the placeholders, tensors and ops.
            n_actions: number of discrete actions (width of the Q output).
            n_states: length of the state vector fed to the network.
            y: discount factor applied to `max_Qs1` when forming the target.
            buffer_length: unused here; kept so the signature stays unchanged.
            scope: variable scope name; also used to collect `self.variables`.
        """
        with tf.variable_scope(scope):
            self.episode_length = tf.placeholder(tf.int64, [], name='episode_length')

            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = tf.placeholder(tf.int32, [None], name='a')
            self.r = tf.placeholder(tf.float32, [None], name='r')

            # Fed as floats (1.0 = terminal) so it can weight the target in soft_if.
            self.done = tf.placeholder(tf.float32, [None], name='done')

            self.max_Qs1 = tf.placeholder(tf.float32, [None], name='max_Qs1')
            self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')

            # Both layers are bias-free with small positive uniform initial weights.
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                use_bias=False,
                bias_initializer=None
            )

            net = tf.layers.dense(self.s, 32, activation=tf.nn.relu, name='relu_layer', **ops)
            self.Qs = tf.layers.dense(net, n_actions, name='linear_layer', **ops)

            # Q-value of the action that was actually taken in each sample.
            self.Qsa = select_columns(self.Qs, self.a)

            self.max_Qs = tf.reduce_max(self.Qs, 1)

            # Terminal samples use r alone; the others bootstrap from max_Qs1.
            self.target = Model.soft_if(self.done, self.r,  self.r + y * self.max_Qs1)

            self.error = self.target - self.Qsa
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            # Collected before the optimizer is created, so optimizer slot
            # variables are not part of this list.
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

            self.episode_length_summary = tf.summary.scalar('episode_length', self.episode_length)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    self.a,
                    # Depth follows n_actions (was hard-coded to 2, which is
                    # only right for two-action environments).
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])

    def define_model(self, n_actions, n_states, y=0.98, buffer_length=50000):
        """Create the replay buffer, the actor and target networks, and the sync op.

        Presumably invoked by `ModelBase` with the positional arguments given
        to `Model(...)` -- confirm against ModelBase.

        Args:
            n_actions: number of discrete actions.
            n_states: length of the state vector.
            y: discount factor used in the Q-learning target.
            buffer_length: maximum number of transitions kept for replay.
        """
        self.replay_buffer = ExperienceReplay(max_length=buffer_length)
        self.target_network = Object()
        # Longest episode seen so far; `fit` saves a ".max" checkpoint when it is beaten.
        self.global_max = 0.0

        with self.graph.as_default(), tf.device("cpu:0"):

            Model.set_network(self, n_actions, n_states, y, buffer_length, "actor")

            Model.set_network(self.target_network, n_actions, n_states, y, buffer_length, "target")

            # Copies every actor variable onto the matching target variable.
            self.update_target = tf.group(*[
                tf.assign(t, a) for t, a in zip(self.target_network.variables, self.variables)
            ])

    @staticmethod
    def soft_if(cond, then, else_):
        """Return cond * then + (1 - cond) * else_.

        With `cond` equal to 1.0 this yields `then`, with 0.0 it yields `else_`;
        values in between blend the two linearly.
        """
        return (cond * then) + (1.0 - cond) * else_

    @staticmethod
    def clip(gradients, clip_fun):
        """Apply `clip_fun` to the gradient of each (gradient, variable) pair.

        Not called anywhere in this file.
        """
        return [ (clip_fun(g), v) for g, v in gradients ]

    def fit_feed(self, S, A, R, Max_Qs1, Done, learning_rate):
        """Map one training batch onto the actor network's placeholders."""
        return {self.s: S, self.a: A, self.r: R, self.max_Qs1: Max_Qs1, self.done: Done, self.learning_rate: learning_rate}

    def choose_action(self, state, e=0.1):
        """Epsilon-greedy action for a single state.

        Args:
            state: one state vector (it is wrapped into a batch of size one).
            e: probability of returning a uniformly random action.

        Returns:
            A random action index with probability `e`, otherwise the index of
            the largest Q-value.
        """
        actions = self.sess.run(self.Qs, feed_dict={self.s: [state]})[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.argmax(actions)

    def fit(self, env, learning_rate=0.05, print_step=10, episodes=100000, max_episode_length=float('inf'), discount=0.9, batch_size=32):
        """Run episodes in `env`, store the transitions and train from replay.

        Each episode: act epsilon-greedily until the episode ends or
        `max_episode_length` steps were taken, then run one replay-batch update
        per step taken in that episode.

        Args:
            env: environment exposing `reset()` and `step(a)`.
            learning_rate: NOTE(review): currently ignored -- the rate is
                recomputed every episode from the decay schedule below.
            print_step: print progress and save a checkpoint every this many episodes.
            episodes: number of episodes to run.
            max_episode_length: maximum number of steps taken per episode.
            discount: NOTE(review): unused; the discount baked into the graph
                is the `y` given to `define_model`.
            batch_size: number of transitions sampled per training update.
        """
        for episode in range(episodes):
            done = False
            s = env.reset()
            episode_length = 0

            # One 1 / (1 + episode/10) decay drives both the learning rate and
            # the exploration rate; each has its own floor.
            decay = 1. / (1. + (episode / 10.))
            learning_rate = max(0.0001, decay)
            e = max(0.01, decay)

            if episode % 20 == 0:
                self.sess.run(self.update_target)

            # Bounded by the step counter that is actually incremented. The
            # previous condition tested a counter that always stayed at 0, so
            # max_episode_length was never enforced.
            while not done and episode_length < max_episode_length:
                self.global_step += 1
                episode_length += 1

                a = self.choose_action(s, e)
                s1, r, done, info = env.step(a)

                # The reward of the final transition is replaced by a fixed penalty.
                if done:
                    r = -100.0

                self.replay_buffer.append((s, a, r, s1, float(done)))

                s = s1

            episode_length_summary = self.sess.run(self.episode_length_summary, feed_dict={self.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary)

            if episode_length > self.global_max:
                print("[MAX] Episode: {}, Length: {}, e: {}, buffer_len: {}".format(episode, episode_length, e, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".max")
                self.save(model_path = self.logs_path + "/Q-network-full.max")
                self.global_max = episode_length

            # One replay update per environment step taken in this episode.
            for _ in range(episode_length):
                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                # Bootstrap values come from the target network, not the actor.
                MaxQs1 = self.sess.run(self.target_network.max_Qs, feed_dict={self.target_network.s: S1})

                feed_dict = self.fit_feed(S, A, R, MaxQs1, Done, learning_rate)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=feed_dict)
                self.writer.add_summary(summaries)

            if episode % print_step == 0:
                print("[NOR] Episode: {}, Length: {}, e: {}, buffer_len: {}".format(episode, episode_length, e, len(self.replay_buffer)))
                self.save()
                self.save(model_path = self.logs_path + "/Q-network-full.model")
            
                
                
# Advance the run counter so this execution gets its own log directory name.
# NOTE(review): re-running this cell increments `run` again (not idempotent).
run += 1
n_actions = env.action_space.n
# The factor 3 matches the three observations ExtendedEnv.reset() concatenates.
n_states = env.observation_space.shape[0] * 3
model_path = os.getcwd() + "/Q-network-full.model"
# NOTE(review): this is an absolute path at the filesystem root -- confirm it is intended.
logs_path = "/logs/run{}".format(run)

[2017-02-21 17:26:27,125] Making new env: CartPole-v1


In [7]:
# Build the trainable agent from the sizes and paths computed in the previous cell.
model = Model(
    n_actions, n_states,
    model_path = model_path,
    logs_path = logs_path,
    flush_secs = 3.0,
)

# Print the run number and the main graph handles as a quick sanity check.
print("run: {},\n s: {},\n a: {},\n r: {},\n Qs: {},\n update: {}".format(
    run, model.s, model.a, model.r, model.Qs, model.update
))

run: 25,
 s: Tensor("actor/s:0", shape=(?, 12), dtype=float32, device=/device:CPU:0),
 a: Tensor("actor/a:0", shape=(?,), dtype=int32, device=/device:CPU:0),
 r: Tensor("actor/r:0", shape=(?,), dtype=float32, device=/device:CPU:0),
 Qs: Tensor("actor/linear_layer/MatMul:0", shape=(?, 2), dtype=float32, device=/device:CPU:0),
 update: name: "actor/Adam"
op: "NoOp"
input: "^actor/Adam/update_actor/relu_layer/kernel/ApplyAdam"
input: "^actor/Adam/update_actor/linear_layer/kernel/ApplyAdam"
input: "^actor/Adam/Assign"
input: "^actor/Adam/Assign_1"
device: "/device:CPU:0"



In [None]:
# Start training: 50000 episodes, passing a 30000-step limit per episode.
model.fit(
    env, 
    episodes=50000,
    max_episode_length = 30000
)

[MAX] Episode: 0, Length: 13, e: 1.0, buffer_len: 13
[NOR] Episode: 0, Length: 13, e: 1.0, buffer_len: 13
[MAX] Episode: 1, Length: 19, e: 0.909090909091, buffer_len: 32
[MAX] Episode: 4, Length: 33, e: 0.714285714286, buffer_len: 96
[NOR] Episode: 10, Length: 18, e: 0.5, buffer_len: 180
[NOR] Episode: 20, Length: 12, e: 0.333333333333, buffer_len: 309
[MAX] Episode: 24, Length: 44, e: 0.294117647059, buffer_len: 400
[NOR] Episode: 30, Length: 12, e: 0.25, buffer_len: 498
[NOR] Episode: 40, Length: 22, e: 0.2, buffer_len: 642
[NOR] Episode: 50, Length: 14, e: 0.166666666667, buffer_len: 827
[NOR] Episode: 60, Length: 11, e: 0.142857142857, buffer_len: 960
[MAX] Episode: 67, Length: 46, e: 0.12987012987, buffer_len: 1121
[NOR] Episode: 70, Length: 11, e: 0.125, buffer_len: 1160
[NOR] Episode: 80, Length: 35, e: 0.111111111111, buffer_len: 1306
[MAX] Episode: 82, Length: 52, e: 0.108695652174, buffer_len: 1394
[NOR] Episode: 90, Length: 28, e: 0.1, buffer_len: 1600
[MAX] Episode: 100, Le

In [9]:
import time

# Second Model instance, pointed at the ".max" checkpoint that training saves
# whenever an episode sets a new length record.
model_run = Model(
    n_actions, n_states,
    model_path = model_path + ".max",
    flush_secs = 3.0,
    restore = True
)

# `env` is rebound to an ExtendedEnv earlier in the notebook. Wrapping it a
# second time would stack the already stacked state again, giving a state three
# times wider than the network input, so only a raw environment is wrapped.
ext_env = env if isinstance(env, ExtendedEnv) else ExtendedEnv(env)
s = ext_env.reset()
done = False

# Play one greedy episode (e=0 disables exploration), rendering every step.
while not done:
    a = model_run.choose_action(s, e=0)
    s, r, done, info = ext_env.step(a)
    ext_env.render()
    time.sleep(0.1)

ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type