In [1]:
import numpy as np
import gym
from gym.wrappers import Monitor
from numpy.random import choice
import random
from phi.api import *
import tensorflow as tf
from tfinterface.reinforcement import OnBatchModel, OnExperienceTrainer, ExperienceReplay
from tfinterface.interfaces import EnvironmentInterface
from tfinterface.model_base import ModelBase
import os
from scipy.interpolate import interp1d
import numbers



def get_run():
    """Return the next run number, persisting it in ``run.txt``.

    Reads the previous run number from the first line of ``run.txt`` in the
    current working directory, increments it, writes the new value back and
    returns it. If the file is missing, unreadable, or its first line is not
    an integer, numbering restarts at 0.

    Returns:
        int: the run number for this session (0 on the very first call).
    """
    try:
        with open("run.txt") as f:
            # Only the first line holds the counter; anything after it is ignored.
            run = int(f.read().split("\n")[0])
    except (IOError, OSError, ValueError):
        # No usable counter on disk: start over so the increment below yields 0.
        run = -1

    run += 1

    # Mode 'w' truncates on open, so no explicit seek/truncate is required.
    with open("run.txt", "w") as f:
        f.write(str(run))

    return run

In [36]:
def select_columns(tensor, indexes):
    """Gather ``tensor[i, indexes[i]]`` for every position ``i`` of ``indexes``.

    Builds (row, column) coordinate pairs from a running row counter and the
    given ``indexes`` and looks them up with ``tf.gather_nd``.
    """
    n_rows = tf.shape(indexes)[0]
    row_ids = tf.range(n_rows)
    coords = tf.stack((row_ids, indexes), 1)
    return tf.gather_nd(tensor, coords)


def soft_if(cond, then, else_):
    """Arithmetic if/else: returns ``cond * then + (1.0 - cond) * else_``.

    With ``cond`` equal to 1 the result is ``then``, with 0 it is ``else_``;
    values in between blend the two linearly.
    """
    then_part = cond * then
    else_part = (1.0 - cond) * else_
    return then_part + else_part
    

def clip(gradients, clip_fun):
    """Apply ``clip_fun`` to the first element of every (gradient, variable) pair.

    Returns a new list of ``(clip_fun(gradient), variable)`` tuples in the
    same order as ``gradients``.
    """
    clipped = []
    for grad, var in gradients:
        clipped.append((clip_fun(grad), var))
    return clipped


class ExpandedStateEnv(EnvironmentInterface):
    """Environment wrapper whose state is the last few observations stacked.

    ``reset`` returns the initial observation repeated ``n_frames`` times and
    ``step`` drops the oldest observation and appends the newest one, so the
    state always has ``n_frames * len(observation)`` entries. Any attribute
    not defined on the wrapper is looked up on the wrapped environment.
    """

    def __init__(self, env, n_frames=3):
        """
        Args:
            env: an environment id (string), which is passed to ``gym.make``,
                or an already constructed environment.
            n_frames: how many consecutive observations make up the state.
                Defaults to 3.

        Raises:
            ValueError: if ``n_frames`` is smaller than 1.
        """
        if n_frames < 1:
            raise ValueError("n_frames must be >= 1, got {}".format(n_frames))

        # Accept both byte and unicode strings (they differ on Python 2).
        string_types = (str, type(u""))
        self.env = gym.make(env) if isinstance(env, string_types) else env
        self.n_frames = n_frames

    def reset(self):
        """Reset the wrapped env and return the first observation repeated ``n_frames`` times."""
        s = self.env.reset()
        self.s = np.hstack((s,) * self.n_frames)
        return self.s

    def step(self, a):
        """Take action ``a`` and return ``(stacked_state, reward, done, info)``."""
        s, r, done, info = self.env.step(a)
        n = len(s)
        # Slide the window: discard the oldest observation, append the newest.
        self.s = np.hstack((self.s[n:], s))

        return self.s, r, done, info

    def __getattr__(self, attr):
        # __getattr__ only runs when normal lookup fails. If `env` itself is
        # missing (e.g. while copying/unpickling, before __init__ has run),
        # evaluating `self.env` below would re-enter this method forever.
        if attr == "env":
            raise AttributeError(attr)
        return getattr(self.env, attr)
    


class Inputs(object):
    """Container for the placeholders created under variable scope ``scope``.

    ``n_actions``, ``y`` and ``buffer_length`` are accepted but not used by
    this class; only ``n_states`` (width of ``s``) and ``scope`` matter here.
    """
    def __init__(self, n_actions, n_states, y, buffer_length, scope):

        def scalar(dtype, name):
            # Placeholder holding a single value.
            return tf.placeholder(dtype, [], name=name)

        def per_sample(dtype, name):
            # Placeholder holding one value per batch element.
            return tf.placeholder(dtype, [None], name=name)

        with tf.variable_scope(scope):
            self.episode_length = scalar(tf.int64, 'episode_length')

            # Transition data: states, chosen actions and rewards.
            self.s = tf.placeholder(tf.float32, [None, n_states], name='s')
            self.a = per_sample(tf.int32, 'a')
            self.r = per_sample(tf.float32, 'r')

            # Termination flag, fed as a float.
            self.done = per_sample(tf.float32, 'done')

            # Values fed from outside the graph at training time.
            self.max_Qs1 = per_sample(tf.float32, 'max_Qs1')
            self.learning_rate = scalar(tf.float32, 'learning_rate')
            
class Network(object):
    """Q-value network (one ReLU hidden layer + linear output) with loss, optimizer and summaries.

    Attributes created:
        Qs: Q-values for every action, one row per input state.
        Qsa: Q-value of the action actually taken (``inputs.a``).
        max_Qs: maximum Q-value per input state.
        target: ``r`` for terminal transitions, ``r + y * max_Qs1`` otherwise.
        error, loss: difference between target and ``Qsa``, and its L2 loss.
        variables: global variables found under ``scope`` before the optimizer is built.
        update: Adam training op minimizing ``loss``.
        episode_length_summary, summaries: TensorBoard summary ops.
    """
    def __init__(self, inputs, n_actions, n_states, y, buffer_length, scope):
        """
        Args:
            inputs: object exposing the placeholders ``s``, ``a``, ``r``, ``done``,
                ``max_Qs1``, ``learning_rate`` and ``episode_length`` (see ``Inputs``).
            n_actions: number of actions, i.e. width of the output layer.
            n_states: unused here; the input width comes from ``inputs.s``.
            y: discount factor applied to ``inputs.max_Qs1`` in the target.
            buffer_length: unused here.
            scope: variable scope name; also used to collect ``self.variables``.
        """
        with tf.variable_scope(scope):
            # Shared options for both dense layers: small positive weights, no bias.
            ops = dict(
                trainable=True,
                kernel_initializer=tf.random_uniform_initializer(minval=0.0, maxval=0.01),
                use_bias=False,
                bias_initializer=None
            )

            net = tf.layers.dense(inputs.s, 32, activation=tf.nn.relu, name='relu_layer', **ops)
            self.Qs = tf.layers.dense(net, n_actions, name='linear_layer', **ops)

            self.Qsa = select_columns(self.Qs, inputs.a)

            self.max_Qs = tf.reduce_max(self.Qs, 1)

            # done == 1 -> target is just r; done == 0 -> bootstrapped target.
            self.target = soft_if(inputs.done, inputs.r,  inputs.r + y * inputs.max_Qs1)

            self.error = self.target - self.Qsa
            # NOTE: tf.nn.l2_loss already returns a scalar (sum(x ** 2) / 2), so the
            # reduce_mean that follows leaves the value unchanged.
            self.loss = Pipe(self.error, tf.nn.l2_loss, tf.reduce_mean)

            # Collected before the optimizer is created, so variables the optimizer
            # adds afterwards are not part of this list.
            self.variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

            self.update = tf.train.AdamOptimizer(inputs.learning_rate).minimize(self.loss)

            self.episode_length_summary = tf.summary.scalar('episode_length', inputs.episode_length)

            self.summaries = tf.summary.merge([
                tf.summary.scalar('loss', self.loss),
                tf.summary.scalar('avg_target', tf.reduce_mean(self.target)),
                tf.summary.scalar('variables_sum', sum([ tf.reduce_sum(v) for v in self.variables ])),
                tf.summary.histogram(
                    'avg_action', Pipe(
                    inputs.a,
                    # One-hot depth follows the action count instead of a fixed 2.
                    Then(tf.one_hot, n_actions),
                    Then(tf.reduce_mean, axis=0)
                ))
            ]+[
                tf.summary.histogram('var{}'.format(i), self.variables[i]) for i in range(len(self.variables))
            ])
            
    
    

class DQN(ModelBase):
    """Deep Q-learning agent with an online network, a target network and experience replay.

    Relies on attributes that are not defined in this class and are expected
    to come from ``ModelBase``: ``graph``, ``sess``, ``writer``, ``global_step``,
    ``model_path``, ``logs_path`` and ``save``.
    """

    def define_model(self, n_actions, n_states, y=0.98, buffer_length=500000):
        """Build the replay buffer, placeholders, both networks and the target-sync op.

        Args:
            n_actions: number of discrete actions.
            n_states: length of the state vector fed to the networks.
            y: discount factor used in the Q-learning target.
            buffer_length: maximum number of transitions kept in the replay buffer.
        """
        self.replay_buffer = ExperienceReplay(max_length=buffer_length)
        # Longest episode seen so far; used to decide when to save a ".max" checkpoint.
        self.global_max = 0.0

        with self.graph.as_default(), tf.device("cpu:0"):
            self.inputs = Inputs(n_actions, n_states, y, buffer_length, "inputs")
            self.network = Network(self.inputs, n_actions, n_states, y, buffer_length, "network")
            self.target_network = Network(self.inputs, n_actions, n_states, y, buffer_length, "target_network")

            self.update = self.network.update
            # Copies every online-network variable into its target-network counterpart.
            self.update_target = tf.group(*[
                tf.assign(t, a) for t, a in zip(self.target_network.variables, self.network.variables)
            ])

            self.summaries = tf.summary.merge([self.network.summaries, self.target_network.summaries])

    def fit_feed(self, S, A, R, Max_Qs1, Done, learning_rate):
        """Return the feed dict for one training step from a batch of transitions."""
        return {
            self.inputs.s: S, self.inputs.a: A, self.inputs.r: R,
            self.inputs.max_Qs1: Max_Qs1, self.inputs.done: Done,
            self.inputs.learning_rate: learning_rate
        }

    def choose_action(self, state, e=0.1):
        """Epsilon-greedy action: random with probability ``e``, otherwise argmax of the Q-values."""
        actions = self.sess.run(self.network.Qs, feed_dict={self.inputs.s: [state]})[0]
        n = len(actions)

        if random.random() < e:
            return random.randint(0, n-1)
        else:
            return np.argmax(actions)

    def fit(self, env, k=100., learning_rate=0.05, print_step=10, episodes=100000, max_episode_length=float('inf'), discount=0.9, batch_size=32):
        """Train on ``env`` for ``episodes`` episodes.

        Args:
            env: environment with ``reset()`` and ``step(a) -> (s1, r, done, info)``.
            k: decay constant; both epsilon and the learning rate follow
                ``1 / (1 + global_step / k)`` with floors of 0.01 and 0.0001.
            learning_rate: NOTE(review): overwritten by the schedule on the
                first step, so the passed value is effectively ignored.
            print_step: print progress and save a checkpoint every this many episodes.
            episodes: number of episodes to run.
            max_episode_length: maximum number of steps per episode.
            discount: NOTE(review): unused; the discount is the ``y`` given to ``define_model``.
            batch_size: number of transitions sampled from the replay buffer per step.
        """
        # Avoid integer division in the schedule when k is passed as an int.
        k = float(k)
        # Defined up front so the progress messages work even if an episode takes no step.
        e = None

        for episode in range(episodes):
            done = False
            s = env.reset()
            episode_length = 0

            # Sync the target network every 20 episodes.
            if episode % 20 == 0:
                self.sess.run(self.update_target)

            # Bound the episode by the steps actually taken. The previous counter
            # was never incremented, so the limit was never enforced.
            while not done and episode_length < max_episode_length:
                self.global_step += 1
                episode_length += 1

                learning_rate = max(0.0001, 1. / (1. + (self.global_step / k)))
                e = max(0.01, 1. / (1. + (self.global_step / k)))

                a = self.choose_action(s, e)
                s1, r, done, info = env.step(a)

                # Terminal transitions get a fixed penalty instead of the env reward.
                if done:
                    r = -100.0

                self.replay_buffer.append((s, a, r, s1, float(done)))

                S, A, R, S1, Done = self.replay_buffer.random_batch(batch_size).unzip()
                # Bootstrap values come from the target network, evaluated on the next states.
                MaxQs1 = self.sess.run(self.target_network.max_Qs, feed_dict={self.inputs.s: S1})

                feed_dict = self.fit_feed(S, A, R, MaxQs1, Done, learning_rate)
                _, summaries = self.sess.run([self.update, self.summaries], feed_dict=feed_dict)
                # Record the step so the summaries are ordered along the x axis.
                # Assumes self.writer is a tf.summary.FileWriter; confirm against ModelBase.
                self.writer.add_summary(summaries, global_step=self.global_step)

                s = s1

            episode_length_summary = self.sess.run(self.network.episode_length_summary, feed_dict={self.inputs.episode_length: episode_length})
            self.writer.add_summary(episode_length_summary, global_step=self.global_step)

            # New record: keep a separate ".max" checkpoint of the best model so far.
            if episode_length > self.global_max:
                print("[MAX] Episode: {}, Length: {}, e: {}, learning_rate: {}, buffer_len: {}".format(episode, episode_length, e, learning_rate, len(self.replay_buffer)))
                self.save(model_path = self.model_path + ".max")
                self.save(model_path = self.logs_path + "/Q-network-full.max")
                self.global_max = episode_length

            if episode % print_step == 0:
                print("[NOR] Episode: {}, Length: {}, e: {}, learning_rate: {}, buffer_len: {}".format(episode, episode_length, e, learning_rate, len(self.replay_buffer)))
                self.save()
                self.save(model_path = self.logs_path + "/Q-network-full.model")

# Allocate a fresh run id; log locations are derived from it.
run = get_run()
logs_path = "logs/run{}".format(run)
env_logs = '/tmp/cartpole-{}'.format(run)

# CartPole wrapped so that each state is three stacked observations.
# To record episodes, wrap the raw env first: Monitor(gym.make('CartPole-v1'), env_logs)
env = ExpandedStateEnv(gym.make('CartPole-v1'))

# Problem dimensions: the stacked state is three observations wide.
n_actions = env.action_space.n
n_states = 3 * env.observation_space.shape[0]

# Checkpoints are written next to the notebook.
model_path = "".join((os.getcwd(), "/Q-network-full.model"))

[2017-03-02 23:08:28,037] Making new env: CartPole-v1


In [37]:

# Build the agent for this run; y is the discount factor.
model = DQN(
    n_actions,
    n_states,
    y=0.9999,
    buffer_length=500000,
    flush_secs=3.0,
    model_path=model_path,
    logs_path=logs_path,
)

# Show the run id and the main graph handles as a sanity check.
print("run: {},\n s: {},\n a: {},\n r: {},\n Qs: {},\n update: {}".format(*[
    run,
    model.inputs.s,
    model.inputs.a,
    model.inputs.r,
    model.network.Qs,
    model.update,
]))

run: 32,
 s: Tensor("inputs/s:0", shape=(?, 12), dtype=float32, device=/device:CPU:0),
 a: Tensor("inputs/a:0", shape=(?,), dtype=int32, device=/device:CPU:0),
 r: Tensor("inputs/r:0", shape=(?,), dtype=float32, device=/device:CPU:0),
 Qs: Tensor("network/linear_layer/MatMul:0", shape=(?, 2), dtype=float32, device=/device:CPU:0),
 update: name: "network/Adam"
op: "NoOp"
input: "^network/Adam/update_network/relu_layer/kernel/ApplyAdam"
input: "^network/Adam/update_network/linear_layer/kernel/ApplyAdam"
input: "^network/Adam/Assign"
input: "^network/Adam/Assign_1"
device: "/device:CPU:0"



In [None]:
# Train: k sets how fast epsilon and the learning rate decay.
model.fit(env, k=1000., episodes=50000, max_episode_length=60000)

[NOR] Episode: 0, Length: 43, e: 0.137023842149, learning_rate: 0.137023842149, buffer_len: 6299
[MAX] Episode: 4, Length: 172, e: 0.129382843835, learning_rate: 0.129382843835, buffer_len: 6730
[NOR] Episode: 10, Length: 37, e: 0.126008064516, learning_rate: 0.126008064516, buffer_len: 6937
[NOR] Episode: 20, Length: 10, e: 0.117453605826, learning_rate: 0.117453605826, buffer_len: 7515
[NOR] Episode: 30, Length: 13, e: 0.112511251125, learning_rate: 0.112511251125, buffer_len: 7889
[NOR] Episode: 40, Length: 161, e: 0.104460461715, learning_rate: 0.104460461715, buffer_len: 8574
[MAX] Episode: 41, Length: 179, e: 0.102543068089, learning_rate: 0.102543068089, buffer_len: 8753
[MAX] Episode: 45, Length: 194, e: 0.100050025013, learning_rate: 0.100050025013, buffer_len: 8996
[NOR] Episode: 50, Length: 167, e: 0.0946700747894, learning_rate: 0.0946700747894, buffer_len: 9564
[MAX] Episode: 55, Length: 213, e: 0.09055510278, learning_rate: 0.09055510278, buffer_len: 10044
[MAX] Episode: 

In [34]:
import time

# Restore the best checkpoint (saved with the ".max" suffix during training).
model_run = DQN(
    n_actions, n_states,
    model_path = model_path + ".max",
    flush_secs = 3.0,
    restore = True
)

env = gym.make('CartPole-v1')
# env = Monitor(env, env_logs)
env = ExpandedStateEnv(env)

# Play one episode greedily (e=0 disables random actions) and render it.
# The finally clause closes the environment, and with it the render window,
# even when rendering raises.
try:
    s = env.reset()
    done = False

    while not done:
        a = model_run.choose_action(s, e=0)
        s, r, done, info = env.step(a)
        env.render()
        time.sleep(0.01)
finally:
    env.close()

[2017-03-02 18:00:32,674] Making new env: CartPole-v1


ArgumentError: argument 2: <type 'exceptions.TypeError'>: wrong type