In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import os

  from ._conv import register_converters as _register_converters


In [14]:

# Model - this section builds the networks (including the policy).

# We have the soft Q function Q, the parameterised value function V, and the parameterised tractable policy pi.

# TODOs
# Make comments properly

# Make run time routine
# Whitening of inputs

# Need batching

# Need recall

# Train, test routines


# Helper class for building a simple fully connected neural network. Importantly, it lets us easily retrieve the network's parameters and rebuild the same network on other input tensors.
# The get_output functionality is borrowed from the SAC reference code.

class MLP():
    """Simple fully connected network built with TF-Slim.

    The network is a stack of `n_layers` ReLU hidden layers of width
    `n_hidden`, followed by a linear output layer of width `output_size`.
    All variables live under the variable scope `name`, which is what
    `get_params_internal` uses to find them again.

    Args:
        name: variable-scope name for this network.
        inputs: a `tf.Tensor`, or a sequence of tensors that is concatenated
            along axis 1 before being fed to the first layer.
        output_size: number of units in the output layer.
        n_hidden: number of units in every hidden layer.
        n_layers: number of hidden layers.
    """
    def __init__(self, name, inputs, output_size, n_hidden, n_layers):
        self._name = name
        self.inputs = inputs
        self.output_size = output_size
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        # First build: create the variables on the default inputs.
        self.output = self.make_network(reuse=False)

    def make_network(self, inputs=False, reuse=tf.AUTO_REUSE):
        """Build the network's ops on `inputs` and return the output tensor.

        Args:
            inputs: tensor (or sequence of tensors) to feed the network.
                `False` (the legacy sentinel) or `None` means "use
                `self.inputs`".
            reuse: forwarded to `tf.variable_scope`.

        Returns:
            The output tensor of the final (linear) layer.

        Note:
            `self.hidden` is overwritten on every call, so it always refers
            to the hidden activations of the most recently built copy.
        """
        if inputs is False or inputs is None:
            inputs = self.inputs

        with tf.variable_scope(self._name, reuse=reuse):
            if not isinstance(inputs, tf.Tensor):
                inputs = tf.concat(inputs, axis=1)

            # To do understand weight initialization!
            self.hidden = slim.stack(
                inputs,
                slim.fully_connected,
                [self.n_hidden] * self.n_layers,
                scope='fc',
                activation_fn=tf.nn.relu,
            )
            # slim.fully_connected applies ReLU unless told otherwise. The
            # output layer must be linear so that values / logits can be
            # negative, hence the explicit activation_fn=None.
            outputs = slim.fully_connected(
                self.hidden, self.output_size, activation_fn=None
            )
        return outputs

    def get_params_internal(self):
        """Return the trainable variables under `<current scope>/<name>/`.

        The prefix is built from the variable scope that is active when this
        method is called, so calling it inside `tf.variable_scope('target')`
        returns the variables of the copy built under that scope.
        """
        scope = tf.get_variable_scope().name
        scope += '/' + self._name + '/' if len(scope) else self._name + '/'

        return tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope
        )

class Qnet(MLP):
    """Soft Q-function network: takes (action, observation), outputs one value."""
    def __init__(self, actions, obs, n_hidden, n_layers):
        # Keep handles to both input tensors on the instance.
        self.actions, self.obs = actions, obs

        # The inputs are handed to the parent as an (actions, obs) tuple; the
        # parent concatenates non-Tensor inputs along axis 1.
        q_inputs = (self.actions, self.obs)
        super().__init__(
            name='qNet',
            inputs=q_inputs,
            output_size=1,
            n_hidden=n_hidden,
            n_layers=n_layers,
        )


class Vnet(MLP):
    """Value-function network: takes an observation, outputs one value."""
    def __init__(self, obs, n_hidden, n_layers):
        self.obs = obs

        # The observation tensor is passed straight through as the only input.
        super().__init__(
            name='vNet',
            inputs=self.obs,
            output_size=1,
            n_hidden=n_hidden,
            n_layers=n_layers,
        )

class Policy_Discrete(MLP):
    """Policy network for a discrete action space.

    The underlying MLP maps an observation to `action_size` outputs, which
    are treated as logits of a categorical distribution over actions.
    For now we assume a discrete action space so that the policy is
    tractable - this slightly defeats the point of SAC, which is about the
    partition function being hard to compute.

    Attributes set by `make_policy_outputs`:
        policy_output: softmax of the network output (action probabilities).
        log_policy_output: log-softmax of the network output.
        action: one sampled action index for the first row of the batch.
    """
    def __init__(self, action_size, obs, n_hidden, n_layers):

        self.obs = obs

        super(Policy_Discrete, self).__init__('policy', (self.obs), action_size, n_hidden, n_layers)
        self.make_policy_outputs(reuse=False)

    def make_policy_outputs(self, reuse=tf.AUTO_REUSE):
        """Create the probability, log-probability and sampling ops.

        Args:
            reuse: forwarded to `tf.variable_scope`.
        """
        with tf.variable_scope(self._name + '_outs', reuse=reuse):
            self.policy_output = tf.nn.softmax(self.output)  # Automatically sums to one.
            # Use log_softmax rather than log(softmax(x)): the latter yields
            # -inf (and NaN gradients) once a probability underflows to 0.
            self.log_policy_output = tf.nn.log_softmax(self.output)
            # tf.multinomial returns [batch, num_samples]; [0] keeps only the
            # sample drawn for the first row of the batch.
            self.action = tf.multinomial(self.log_policy_output, num_samples=1)[0]  # Will generate an action

    def get_actions(self):
        """Placeholder; not implemented yet."""
        pass

#Ok, so pass the Pnet, Qnet, Vnet

class Soft_Actor_Critic():
    """Builds the losses and training ops that tie the Q, V and policy networks together.

    Args:
        Qnet: Q-function network; its `output` and `get_params_internal()` are used.
        Vnet: value network; its `output`, `make_network()` and
            `get_params_internal()` are used.
        Policy: policy network; its `log_policy_output` and
            `get_params_internal()` are used.
        actions: actions tensor (stored, not used directly by the losses here).
        obs: observations tensor (stored).
        next_obs: next-observations tensor, fed to the target copy of the V network.
        rewards: rewards tensor; reshaped to a column before use in the Q loss.
        lr: learning rate for the shared Adam optimizer.
        discount: discount factor applied to the target value.

    Attributes created:
        Q_Loss, V_Loss, P_Loss: scalar loss tensors.
        Q_train_op, V_train_op, P_train_op: ops that apply one Adam step to
            the Q, V and policy parameters respectively.
        target_Vs, target_V_params: output and variables of the target V copy.
    """
    def __init__(self, Qnet, Vnet, Policy, actions, obs, next_obs, rewards, lr=1e-3, discount=0.99):
        self.lr = lr
        self.discount = discount

        self.actions = actions
        self.obs = obs
        self.next_obs = next_obs
        self.rewards = rewards

        self.Qnet = Qnet
        self.Vnet = Vnet
        self.Policy = Policy

        self.Qs = self.Qnet.output
        self.Vs = self.Vnet.output
        self.policy_log_a = self.Policy.log_policy_output

        # Duplicate v network for target. Need to add training for this
        # (nothing here ever updates the target variables yet).
        with tf.variable_scope('target'):
            self.target_Vs = self.Vnet.make_network(inputs=self.next_obs, reuse=False)
            # Called inside the 'target' scope so the lookup prefix is 'target/<vnet name>/'.
            self.target_V_params = self.Vnet.get_params_internal()

        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        self.init_Q_net_training()
        self.init_V_net_training()
        self.init_Policy_training()

    def init_Q_net_training(self):
        """Create `Q_Loss` and `Q_train_op` (updates only the Q network's variables)."""
        training_variables = self.Qnet.get_params_internal()
        with tf.variable_scope('Q_loss'):
            # Force rewards into a column so they line up element-wise with
            # Qs / target_Vs; a rank-1 rewards tensor would otherwise
            # broadcast against the column into a square matrix.
            rewards = tf.reshape(self.rewards, [-1, 1])
            self.Q_Loss = 0.5 * tf.reduce_sum(
                tf.square(self.Qs - rewards - self.discount * self.target_Vs)
            )

        # Keep the op: minimize() only builds it, it has to be run in a session.
        self.Q_train_op = self.optimizer.minimize(self.Q_Loss, var_list=training_variables)

    def init_V_net_training(self):
        """Create `V_Loss` and `V_train_op` (updates only the V network's variables)."""
        training_variables = self.Vnet.get_params_internal()

        with tf.variable_scope('V_loss'):
            # NOTE(review): policy_log_a holds one entry per action, so this
            # broadcasts Vs - Qs across the action axis - confirm intended.
            self.V_Loss = 0.5 * tf.reduce_sum(tf.square(self.Vs - self.Qs + self.policy_log_a))

        self.V_train_op = self.optimizer.minimize(self.V_Loss, var_list=training_variables)

    def init_Policy_training(self):
        """Create `P_Loss` and `P_train_op` (updates only the policy's variables)."""
        training_variables = self.Policy.get_params_internal()

        with tf.variable_scope('P_loss'):
            # NOTE(review): same broadcast across the action axis as in V_Loss.
            self.P_Loss = 0.5 * tf.reduce_sum(tf.square(self.policy_log_a - self.Qs + self.Vs))

        self.P_train_op = self.optimizer.minimize(self.P_Loss, var_list=training_variables)

    def train(self):
        """Placeholder; not implemented yet."""
        pass

# action = tf.clip_by_value(logits,tf.expand_dims(env.action_space.low,0),tf.expand_dims(env.action_space.high,0)) # Have to clip the action space. This might be a bad idea


#should make so that the pi can be easily changed

#Algorithm ie Soft Actor Critic - training etc makes the ops

#Env

#optimizer

class envMaker():
    """Creates a gym environment and records its input/output sizes."""
    def __init__(self, env_name):
        # Instantiate the gym environment by name and keep it on the instance.
        environment = gym.make(env_name)
        self.env = environment

        # First dimension of the observation space's shape.
        observation_shape = environment.observation_space.shape
        self.n_inputs = observation_shape[0]

        # `n` attribute of the action space.
        self.n_outputs = environment.action_space.n

In [15]:
# Build the environment wrapper for the chosen gym task.
env_name = 'LunarLander-v2'
env = envMaker(env_name)

# Log directory: rooted at $TEST_TMPDIR when set, otherwise /tmp.
log_dir = os.path.join(
    os.environ.get('TEST_TMPDIR', '/tmp'),
    'tensorflow/logs/soft_actor_critic',
)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [16]:
tf.reset_default_graph() # THIS IS NECESSARY BEFORE MAKING NEW SESSION TO STOP IT ERRORING!!

# Close a session left over from a previous run of this cell, if there is one.
# Only NameError (no `sess` yet) is expected; anything else should surface.
try:
    sess
except NameError:
    pass
else:
    sess.close()
    del sess
sess = tf.InteractiveSession()

n_hidden = 10
n_layers = 1
# Todo make these into lists so that can define each layer separately

# env.n_inputs is the observation size and env.n_outputs the number of
# actions, so observations (current and next) use n_inputs and actions use
# n_outputs.
rewards = tf.placeholder(tf.float32,shape = [None],name = 'rewards')
actions = tf.placeholder(tf.float32,shape = [None,env.n_outputs],name = 'action')
obs = tf.placeholder(tf.float32,shape = [None,env.n_inputs],name = 'obs')
next_obs = tf.placeholder(tf.float32,shape = [None,env.n_inputs],name = 'next_obs')

qnet = Qnet(actions,obs,n_hidden,n_layers)
vnet = Vnet(obs,n_hidden,n_layers)
pnet = Policy_Discrete(env.n_outputs,obs,n_hidden,n_layers)

sac = Soft_Actor_Critic(qnet,vnet,pnet,actions,obs,next_obs,rewards)

# Write the graph definition to the log directory.
writer = tf.summary.FileWriter(log_dir, sess.graph)

tf.global_variables_initializer().run()
writer.flush()

In [None]:
# writer.close()