In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import os

  from ._conv import register_converters as _register_converters


In [2]:

# Model - this builds the networks (i.e. the policy and the critics).

# We have the soft Q-function, the parameterised value function V, and the parameterised tractable policy pi.

# TODOs
# Write proper comments

# Write the run-time routine
# Whiten the inputs

# Needs batching

# Needs recall

# Train and test routines


# Here is a helper class to make a simple neural network. Importantly, it allows us to easily get the parameters, and hopefully to link the inputs to other variables.
# The get_output functionality is borrowed from the SAC reference code.

class MLP():
    """Simple fully connected network built with TF-Slim under a named variable scope.

    The network is ``n_layers`` ReLU hidden layers of width ``n_hidden`` followed by a
    linear output layer of width ``output_size``. The graph for ``inputs`` is built once
    in the constructor and exposed as ``self.output``; ``make_network`` can be called
    again to build another copy of the graph on different inputs.

    Args:
        name: variable-scope name under which the layers are created.
        inputs: a single tensor, or a sequence of tensors that is concatenated on axis 1.
        output_size: width of the output layer.
        n_hidden: width of every hidden layer.
        n_layers: number of hidden layers.
    """
    def __init__(self,name,inputs,output_size,n_hidden,n_layers):
        self._name = name
        self.inputs = inputs
        self.output_size = output_size
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        self.output = self.make_network(reuse = False)

    def make_network(self,inputs = False,reuse = tf.AUTO_REUSE):
        """Build the network graph and return its output tensor.

        Args:
            inputs: tensor or sequence of tensors to feed the network. ``False`` (the
                default) or ``None`` means "use the inputs given to the constructor".
            reuse: forwarded to ``tf.variable_scope`` for this network's scope.

        Returns:
            The output tensor of the final (linear) layer.

        Note:
            ``self.hidden`` is overwritten on every call and always refers to the hidden
            stack of the most recently built copy.
        """
        if inputs is False or inputs is None:
            inputs = self.inputs

        with tf.variable_scope(self._name,reuse = reuse):
            if not isinstance(inputs,tf.Tensor):
                inputs = tf.concat(inputs,axis=1)

            # To do understand weight initialization!
            self.hidden = slim.stack(inputs, slim.fully_connected, [self.n_hidden]*self.n_layers, scope='fc',activation_fn=tf.nn.relu)
            # slim.fully_connected defaults to a ReLU activation, which would clamp the
            # network output to be non-negative (so negative Q/V values could never be
            # represented). The output layer must be linear.
            # An explicit scope is given so that a rebuild with reuse enabled shares this
            # layer's variables instead of creating a uniquified 'fully_connected_N' scope.
            outputs = slim.fully_connected(self.hidden,self.output_size,activation_fn=None,scope='out')
        return outputs

    def get_params_internal(self):
        """Return the trainable variables under ``<current scope>/<name>/``.

        The lookup is relative to the variable scope that is active when this method is
        called, so calling it inside ``tf.variable_scope('x')`` returns the variables of
        the copy that was built under ``x/<name>/``.
        """
        scope = tf.get_variable_scope().name
        scope += '/' + self._name + '/' if len(scope) else self._name + '/'

        return tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope
        )

class Qnet(MLP):
    """Q-function network: takes (action, observation) tensors and outputs one value."""

    def __init__(self,actions,obs,n_hidden,n_layers):
        # Keep handles to the input tensors so callers can feed them later.
        self.actions, self.obs = actions, obs

        # The parent constructor builds the graph under the 'qNet' scope; the two
        # inputs are handed over as a pair, actions first.
        net_inputs = (self.actions, self.obs)
        super(Qnet, self).__init__(
            name='qNet',
            inputs=net_inputs,
            output_size=1,
            n_hidden=n_hidden,
            n_layers=n_layers,
        )


class Vnet(MLP):
    """Value-function network: takes an observation tensor and outputs one value."""

    def __init__(self,obs,n_hidden,n_layers):
        # Keep a handle to the observation tensor for later feeding.
        self.obs = obs

        # The parent constructor builds the graph under the 'vNet' scope.
        super(Vnet, self).__init__(
            name='vNet',
            inputs=self.obs,
            output_size=1,
            n_hidden=n_hidden,
            n_layers=n_layers,
        )

class Policy_Discrete(MLP):
    """Categorical policy over a discrete action space.

    The underlying MLP maps an observation to ``action_size`` logits. From those logits
    this class derives the action probabilities, their logarithms, and a sampling op.

    For now a discrete action space is assumed so that the policy is tractable -
    obviously this slightly defeats the point of a SAC implementation, since that is all
    about how hard it is to compute the partition function.

    Args:
        action_size: number of discrete actions (width of the logit layer).
        obs: observation tensor fed to the network (also used as the feed key in
            ``get_action``).
        n_hidden: width of every hidden layer.
        n_layers: number of hidden layers.
    """
    def __init__(self,action_size,obs,n_hidden,n_layers):

        self.obs = obs

        super(Policy_Discrete,self).__init__('policy',(self.obs),action_size,n_hidden,n_layers)
        self.make_policy_outputs(reuse=False)

    def make_policy_outputs(self, reuse = tf.AUTO_REUSE):
        """Create ``policy_output``, ``log_policy_output`` and ``action`` from the logits."""
        with tf.variable_scope(self._name + '_outs',reuse = reuse):
            self.policy_output = tf.nn.softmax(self.output) # Automatically sum to one.
            # log_softmax is the numerically stable form of log(softmax(x)): it cannot
            # produce -inf/NaN when a probability underflows to zero.
            self.log_policy_output = tf.nn.log_softmax(self.output)
            # Sample one action per row, then keep only the first row of the batch.
            self.action = tf.multinomial(self.log_policy_output, num_samples=1)[0] # Will generate an action

    def get_action(self,obs):
        """Sample an action for a single observation.

        Evaluates the sampling op in the default session, feeding ``obs`` as a batch of
        one, and returns the sampled action index.
        """
        return self.action.eval(feed_dict = {self.obs : [obs]})[0]
    

# The algorithm class below is passed the policy, Q and V networks.

class Soft_Actor_Critic():
    """Builds the losses and training ops that tie the Q, V and policy networks together.

    Args:
        Qnet, Vnet, Policy: already-constructed network objects. They must expose
            ``output`` and ``get_params_internal()``; the V network must also expose
            ``make_network`` and the policy must expose ``log_policy_output``.
        actions, obs, next_obs, rewards, dones: the tensors that ``train`` feeds from a
            sample batch.
        lr: learning rate of the shared Adam optimizer.
        discount: reward discount factor used in the Q target.
        tau: weight kept on the old target-V parameters at each target update, i.e.
            ``target <- tau * target + (1 - tau) * v``.

    After construction ``self.train_ops`` groups the three training steps and the
    target-network update.
    """
    def __init__(self,Qnet,Vnet,Policy,actions,obs,next_obs,rewards,dones,lr=1e-3,discount = 0.99, tau=0.99):
        self.lr = lr
        self.discount = discount
        self.tau = tau

        # Maybe would be nicer to not pass these but define here, but this seems to be messy
        self.actions = actions
        self.obs = obs
        self.next_obs = next_obs
        self.rewards = rewards
        self.dones = dones

        self.Qnet = Qnet
        self.Vnet = Vnet
        self.Policy = Policy

        self.Qs = self.Qnet.output
        self.Vs = self.Vnet.output
        self.policy_log_a = self.Policy.log_policy_output

        # Duplicate the V network under its own scope to act as the target network,
        # evaluated on the next observations. Its parameters are collected while the
        # 'vNet_T' scope is still active, because get_params_internal is scope-relative.
        with tf.variable_scope('vNet_T'):
            self.target_Vs = self.Vnet.make_network(inputs = self.next_obs,reuse=False)
            self.target_V_params = self.Vnet.get_params_internal()

        self.optimizer = tf.train.AdamOptimizer(learning_rate = self.lr)

        # Order matters: the target update is chained after train_V.
        self.init_Q_net_training()
        self.init_V_net_training()
        self.init_Policy_training()
        self.init_target_v_update()

        self.train_ops = tf.group(self.train_Q,self.train_V,self.train_P,self.tvnet_update)

    def init_Q_net_training(self):
        """Create ``Q_Loss`` and ``train_Q``: regress Q towards r + discount * (1 - done) * V_target."""
        training_variables = self.Qnet.get_params_internal()
        with tf.variable_scope('Q_loss'):
            # The network outputs are [batch, 1]. Rewards and dones may be rank-1
            # ([batch]); left as-is they would broadcast against the target values into
            # a [batch, batch] matrix and silently mix samples. Force column shape.
            rewards = tf.reshape(self.rewards, [-1, 1])
            not_done = 1 - tf.reshape(self.dones, [-1, 1])
            Q_t = tf.stop_gradient(rewards + self.discount * not_done * self.target_Vs)
            self.Q_Loss = 0.5*tf.reduce_mean(tf.square(self.Qs - Q_t))

        self.train_Q = self.optimizer.minimize(self.Q_Loss,var_list = training_variables)

    def init_V_net_training(self):
        """Create ``V_Loss`` and ``train_V``: regress V towards Q - log pi."""
        training_variables = self.Vnet.get_params_internal()

        with tf.variable_scope('V_loss'):
            # NOTE(review): Qs is one value per sample while policy_log_a has one column
            # per action, so this target broadcasts to one column per action - confirm
            # this is the intended form of the soft value target.
            V_t = tf.stop_gradient(self.Qs - self.policy_log_a)
            self.V_Loss = 0.5*tf.reduce_mean(tf.square(self.Vs - V_t))

        self.train_V = self.optimizer.minimize(self.V_Loss,var_list = training_variables)

    def init_Policy_training(self):
        """Create ``P_Loss`` and ``train_P``: regress log pi towards Q - V."""
        training_variables = self.Policy.get_params_internal()

        with tf.variable_scope('P_loss'):
            # NOTE(review): the target is one value per sample and is broadcast across
            # every action column of policy_log_a - confirm against the intended loss.
            P_t = tf.stop_gradient(self.Qs - self.Vs)
            self.P_Loss = 0.5*tf.reduce_mean(tf.square(self.policy_log_a - P_t))

        self.train_P = self.optimizer.minimize(self.P_Loss,var_list = training_variables)

    def init_target_v_update(self):
        """Create ``tvnet_update``: move the target-V parameters towards the V parameters.

        Each target variable is paired with the V-network variable whose name contains
        the target's name with its leading scope component stripped.
        """
        vnet_params = self.Vnet.get_params_internal()

        with tf.variable_scope('Vt_loss'):
            tvnet_update = []
            # Every update op must run after the V network's own training step.
            with tf.control_dependencies([self.train_V]):
                for tv_p in self.target_V_params:
                    relative_name = tv_p.name[(tv_p.name.index('/')+1):]
                    v_p = [v for v in vnet_params if relative_name in v.name]
                    assert(len(v_p) == 1) # Check that only found one variable
                    v_p = v_p[0]
                    tvnet_update.append(tv_p.assign(self.tau * tv_p + (1-self.tau)*v_p))
            # tf.group takes the ops as separate positional arguments.
            self.tvnet_update = tf.group(*tvnet_update)

    def _construct_feed_dict(self,samples):
        """Map a sample dict (as returned by the replay buffer) onto the input tensors."""
        return {self.actions : samples['actions'],
                    self.obs : samples['observations'],
                    self.next_obs : samples['next_observations'],
                    self.dones : samples['dones'],
                    self.rewards : samples['rewards']}

    def train(self, samples):
        """Run one step of all training ops on ``samples`` in the default session."""
        feed_dict = self._construct_feed_dict(samples)
        return tf.get_default_session().run(self.train_ops, feed_dict = feed_dict)

class replayBuffer():
    """Fixed-capacity ring buffer of transitions backed by preallocated numpy arrays.

    Once the buffer is full, new samples overwrite the oldest ones.
    """

    def __init__(self,n_inputs,n_outputs,max_buffer_size = 1e4):
        capacity = int(max_buffer_size)
        self._max_size = capacity
        self._pos = 0   # slot the next sample is written to
        self._size = 0  # number of valid samples currently stored

        # One row per stored transition.
        self.observations = np.zeros([capacity, n_inputs])
        self.next_observations = np.zeros([capacity, n_inputs])
        self.actions = np.zeros([capacity, n_outputs])
        self.rewards = np.zeros(capacity)
        self.dones = np.zeros(capacity)

    def add_sample(self,action,obs,next_obs,reward,done):
        """Write one transition into the current slot, then move to the next slot."""
        slot = self._pos
        self.observations[slot] = obs
        self.next_observations[slot] = next_obs
        self.actions[slot] = action
        self.rewards[slot] = reward
        self.dones[slot] = done

        self._advance()

    def _advance(self):
        """Step the write position (wrapping around) and grow the size up to capacity."""
        self._pos = (self._pos + 1) % self._max_size
        self._size = min(self._size + 1, self._max_size)

    def get_samples(self,n_samples):
        """Return a dict of ``n_samples`` transitions drawn uniformly with replacement."""
        inds = np.random.randint(0, self._size, n_samples)
        field_names = ('actions', 'observations', 'next_observations', 'rewards', 'dones')
        return {field: getattr(self, field)[inds] for field in field_names}

    def ready_to_sample(self,n_samples):
        """True once at least ``n_samples`` transitions have been stored."""
        return not (self._size < n_samples)
        
class logger():
    """Minimal in-memory logger: keeps a list of recorded values per variable name."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything recorded so far."""
        self.vars = {}

    def record(self,var_name,value):
        """Append ``value`` to the history stored under ``var_name``.

        The history list is created on first use. (A dict must be queried with a
        membership test; ``hasattr`` looks at the dict object's attributes, not its
        keys, so it would make every call overwrite the history.)
        """
        self.vars.setdefault(var_name, []).append(value)

    def get(self,var_name):
        """Return the list of values recorded under ``var_name``, or False if none."""
        return self.vars.get(var_name, False)

# Module-level logger instance shared by the cells below.
log = logger()

# action = tf.clip_by_value(logits,tf.expand_dims(env.action_space.low,0),tf.expand_dims(env.action_space.high,0)) # Have to clip the action space. This might be a bad idea


# TODO: make it easy to swap in a different policy (pi).

# Algorithm, i.e. Soft Actor-Critic: training etc.; builds the ops.

# Environment

# Optimizer


In [3]:
# --- Environment -----------------------------------------------------------
env_name = 'LunarLander-v2'
env = gym.make(env_name)

# Network input/output widths are taken from the environment's spaces.
n_inputs = env.observation_space.shape[0]
n_outputs = env.action_space.n

# --- Logging ---------------------------------------------------------------
# Logs go under $TEST_TMPDIR when it is set, otherwise under /tmp.
log_dir = os.path.join(
    os.environ.get('TEST_TMPDIR', '/tmp'),
    'tensorflow/logs/soft_actor_critic',
)

# --- Training schedule -----------------------------------------------------
max_epochs = 100
epoch_length = 1000  # environment steps per epoch

# --- Replay buffer ---------------------------------------------------------
max_buffer_size = 1e4
samples_per_env_step = 4  # transitions sampled for each training call

# --- Network size ----------------------------------------------------------
# TODO: make these into lists so that each layer can be defined separately.
n_layers = 1
n_hidden = 10


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
# Close any session left over from a previous run of this cell BEFORE resetting the
# graph: resetting the default graph while a session is still active is undefined
# behaviour in TensorFlow. Only the "sess was never defined" case is ignored, so a
# genuine failure while closing the session is not silently swallowed.
try:
    sess
except NameError:
    pass
else:
    sess.close()
    del sess

tf.reset_default_graph() # THIS IS NECESSARY BEFORE MAKING NEW SESSION TO STOP IT ERRORING!!
sess = tf.InteractiveSession()

# Placeholders fed from replay-buffer batches.
rewards = tf.placeholder(tf.float32,shape = [None],name = 'rewards')
actions = tf.placeholder(tf.float32,shape = [None,n_outputs],name = 'action')
observations = tf.placeholder(tf.float32,shape = [None,n_inputs],name = 'observations')
next_observations = tf.placeholder(tf.float32,shape = [None,n_inputs],name = 'next_observations')
dones = tf.placeholder(tf.float32,shape = [None],name = 'dones')

# The three networks share the same observation placeholder.
qnet = Qnet(actions,observations,n_hidden,n_layers)
vnet = Vnet(observations,n_hidden,n_layers)
pnet = Policy_Discrete(n_outputs,observations,n_hidden,n_layers)

sac = Soft_Actor_Critic(qnet,vnet,pnet,actions,observations,next_observations,rewards,dones)

rb = replayBuffer(n_inputs,n_outputs,max_buffer_size)
writer = tf.summary.FileWriter(log_dir, sess.graph)

# Must run after every network and optimizer variable has been created.
tf.global_variables_initializer().run()


In [None]:
# Main loop: each epoch runs `epoch_length` environment steps, storing every transition
# in the replay buffer and training on a sampled batch after each step.
for i in range(max_epochs):
    obs = env.reset()
    episodes = 0
    episode_steps = 0
    episode_reward = 0
    mean_episode_reward = 0

    for t in range(epoch_length):
        action = pnet.get_action(obs)
        next_obs, reward, done, info = env.step(action)
        episode_steps += 1
        episode_reward += reward

        # The action slot is n_outputs wide; store the discrete action one-hot rather
        # than letting the integer index broadcast across the whole row.
        action_one_hot = np.zeros(n_outputs)
        action_one_hot[action] = 1.0
        rb.add_sample(action_one_hot,obs,next_obs,reward,done)

        if done:
            obs = env.reset()
            episodes += 1

            log.record('episode_reward',episode_reward)
            # Running mean over the episodes completed so far in this epoch.
            mean_episode_reward = (mean_episode_reward * (episodes - 1) + episode_reward) / episodes
            episode_steps = 0
            episode_reward = 0
        else:
            # Advance the state; otherwise the policy would keep acting on, and the
            # buffer keep storing, the first observation of the episode.
            obs = next_obs

        if rb.ready_to_sample(samples_per_env_step):
            samples = rb.get_samples(samples_per_env_step)
            sac.train(samples)

    log.record('mean_episode_reward',mean_episode_reward)
    # Both values must be passed to % as one tuple; `fmt % i, x` formats with `i` only
    # and raises "not enough arguments for format string".
    print('Epoch %i, mean_reward %d' % (i, mean_episode_reward))

writer.flush()