In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import os

  from ._conv import register_converters as _register_converters


In [2]:

#Model - this makes the network (ie the policy)

# We have the soft Q function, the parameterised value function V, and the parameterised tractable policy pi.

# TODOs
# Make comments properly

# Whitening of inputs

# Need batching

# Need recall

# Train, test routines


# Here is a helper class to make a simple neural network. Importantly, it allows us to easily get the parameters, and hopefully to link the inputs to other variables
# The get_output functionality is borrowed from the SAC reference code.

class MLP():
    """Simple fully connected network built with TF-Slim.

    The constructor stores its arguments, builds the network once inside a
    variable scope named ``name`` and keeps the resulting tensor in
    ``self.output``.

    Args:
        name: variable-scope name under which the layers are created.
        inputs: a ``tf.Tensor``, or a collection of tensors that is
            concatenated along axis 1 before entering the first layer.
        output_size: number of units of the final (linear) layer.
        n_hidden: number of units in every hidden layer.
        n_layers: number of hidden layers.
    """

    def __init__(self, name, inputs, output_size, n_hidden, n_layers):
        self._name = name
        self.inputs = inputs
        self.output_size = output_size
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        # Initial build of the graph: ask for fresh variables.
        self.output = self.make_network(reuse=False)

    def make_network(self, inputs=False, reuse=tf.AUTO_REUSE):
        """Build (or rebuild) the network and return its output tensor.

        Args:
            inputs: tensor(s) to feed the network; the sentinel ``False``
                means "use the inputs given to the constructor".
            reuse: forwarded to ``tf.variable_scope``.

        Returns:
            The output of the final linear layer.

        Side effects:
            ``self.hidden`` is overwritten with the output of the last hidden
            layer of the most recent build.
        """
        net_inputs = self.inputs if inputs is False else inputs

        with tf.variable_scope(self._name, reuse=reuse):
            # Anything that is not a single tensor is treated as a collection
            # of tensors and joined along axis 1.
            if not isinstance(net_inputs, tf.Tensor):
                net_inputs = tf.concat(net_inputs, axis=1)

            # To do: understand weight initialization!
            layer_widths = [self.n_hidden] * self.n_layers
            self.hidden = slim.stack(
                net_inputs,
                slim.fully_connected,
                layer_widths,
                scope='fc',
                activation_fn=tf.nn.relu,
            )
            return slim.fully_connected(
                self.hidden, self.output_size, activation_fn=None
            )

    def get_params_internal(self):
        """Return the trainable variables filed under this network's scope.

        The lookup prefix is the variable scope that is active at call time
        followed by this network's name, so calling the method inside an
        enclosing ``tf.variable_scope`` selects the copy built in that scope.
        """
        enclosing = tf.get_variable_scope().name
        if len(enclosing):
            prefix = enclosing + '/' + self._name + '/'
        else:
            prefix = self._name + '/'

        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=prefix)

class Qnet(MLP):
    """Q network: an MLP named 'qNet' fed with ``obs`` that emits
    ``action_size`` outputs (one Q value per action)."""

    def __init__(self, action_size, obs, n_hidden, n_layers):
        # Keep a handle on the observation tensor before the parent builds
        # the graph.
        self.obs = obs
        super().__init__(
            name='qNet',
            inputs=self.obs,
            output_size=action_size,
            n_hidden=n_hidden,
            n_layers=n_layers,
        )

class Policy_Discrete(MLP):
    """Categorical policy over a discrete action space, with its own MLP.

    The network (scope 'policy') maps the observation tensor to
    ``action_size`` logits. ``make_policy_outputs`` turns those logits into
    probabilities, log-probabilities and a sampling op.

    Args:
        action_size: number of discrete actions (network output size).
        obs: observation tensor fed to the network; also the tensor that
            ``get_action`` feeds.
        n_hidden: number of units per hidden layer.
        n_layers: number of hidden layers.
    """

    def __init__(self, action_size, obs, n_hidden, n_layers):
        self.obs = obs
        self.action_size = action_size
        self.discrete = True

        super(Policy_Discrete, self).__init__('policy', self.obs, action_size, n_hidden, n_layers)
        self.make_policy_outputs(reuse=False)

    def make_policy_outputs(self, reuse=tf.AUTO_REUSE):
        """Create ``policy_output``, ``log_policy_output`` and ``action`` ops."""
        with tf.variable_scope(self._name + '_outs', reuse=reuse):
            # Probabilities sum to one along the action axis.
            self.policy_output = tf.nn.softmax(self.output, axis=1)
            # log_softmax works directly on the logits, so it stays finite
            # where log(softmax(x)) would underflow to log(0) = -inf.
            self.log_policy_output = tf.nn.log_softmax(self.output, axis=1)
            # Draw one action per row and keep only the first row's draw.
            self.action = tf.multinomial(self.log_policy_output, num_samples=1)[0]

    def get_action(self, obs):
        """Sample an action for a single observation.

        ``obs`` is wrapped in a list to form a batch of one. Evaluation uses
        ``Tensor.eval``, so a default session must be active.
        """
        return self.action.eval(feed_dict={self.obs: [obs]})[0]
    
class Policy_Qnet_Discrete(MLP):
    """Categorical policy defined directly by a Q network's outputs.

    No network of its own is built: ``MLP.__init__`` is deliberately not
    called. The action distribution is the softmax of ``Qnet.output``.

    Args:
        Qnet: network object exposing ``output`` (the Q-value tensor),
            ``output_size`` and ``obs`` (the observation tensor to feed).
    """

    def __init__(self, Qnet):
        self.Qnet = Qnet
        self.action_size = Qnet.output_size
        self.discrete = True
        self._name = 'Policy'
        self.make_policy_outputs(reuse=False)

    def make_policy_outputs(self, reuse=tf.AUTO_REUSE):
        """Create ``policy_output``, ``log_policy_output`` and ``action`` ops."""
        with tf.variable_scope(self._name + '_outs', reuse=reuse):
            # Probabilities sum to one along the action axis.
            self.policy_output = tf.nn.softmax(self.Qnet.output, axis=1)
            # log_softmax works directly on the Q values, so it stays finite
            # where log(softmax(x)) would underflow to log(0) = -inf.
            self.log_policy_output = tf.nn.log_softmax(self.Qnet.output, axis=1)
            # Draw one action per row and keep only the first row's draw.
            self.action = tf.multinomial(self.log_policy_output, num_samples=1)[0]
            # Not built here: a soft value V = logsumexp(Q) was sketched in
            # an earlier revision but left disabled.

    def get_action(self, obs):
        """Sample an action for a single observation.

        ``obs`` is wrapped in a list to form a batch of one and fed to the Q
        network's observation tensor. Evaluation uses ``Tensor.eval``, so a
        default session must be active.
        """
        return self.action.eval(feed_dict={self.Qnet.obs: [obs]})[0]

class Soft_Actor_Critic():
    """Builds and runs the training ops for the Q network.

    Two ops are created and collected in ``self.train_ops``:

    * an Adam step on the squared error between Q(s, a) and the target
      ``reward_scale * r + discount * (1 - done) * V_target(s')``, where
      ``V_target`` is the log-sum-exp of the target network's outputs;
    * a soft update of the target network,
      ``target <- tau * online + (1 - tau) * target``.

    Args:
        Qnet: online Q network (needs ``output``, ``make_network`` and
            ``get_params_internal``).
        Policy: policy object (needs ``log_policy_output``).
        actions: placeholder for actions; multiplied element-wise with the
            Q outputs and summed over axis 1 (the notebook feeds one-hot rows).
        obs: placeholder for observations.
        next_obs: placeholder for next observations; input of the target net.
        rewards: placeholder for rewards.
        dones: placeholder for episode-termination flags (1 = terminal).
        lr: Adam learning rate.
        discount: discount factor applied to the next-state value.
        tau: mixing coefficient of the target-network update.
        reward_scale: multiplier applied to rewards inside the target.
    """

    def __init__(self, Qnet, Policy, actions, obs, next_obs, rewards, dones,
                 lr=3e-4, discount=0.99, tau=0.05, reward_scale=1.0):
        self.lr = lr
        self.discount = discount
        self.tau = tau
        self.reward_scale = reward_scale

        # Placeholders are created by the caller and passed in; defining them
        # here was judged messier.
        self.actions = actions
        self.obs = obs
        self.next_obs = next_obs
        self.rewards = rewards
        self.dones = dones

        self.Qnet = Qnet
        self.Policy = Policy

        self.Qs = self.Qnet.output
        self.policy_log_a = self.Policy.log_policy_output

        # Duplicate the Q network for the target. Building inside 'qNet_T'
        # with reuse=False yields a separate set of weights, and
        # get_params_internal() called inside the scope returns exactly them.
        with tf.variable_scope('qNet_T'):
            self.target_Q_out = self.Qnet.make_network(inputs=self.next_obs, reuse=False)
            self.target_Q_params = self.Qnet.get_params_internal()
            # Soft value V(s') = log sum_a exp Q(s', a). reduce_logsumexp is
            # the overflow-safe form of log(reduce_sum(exp(.))).
            self.target_Vs = tf.reduce_logsumexp(self.target_Q_out, axis=1)

        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        self.train_ops = []
        self.init_Q_net_training()
        self.init_target_Q_update()

    def init_Q_net_training(self):
        """Create the Q loss, its summary and the Adam op that minimises it."""
        training_variables = self.Qnet.get_params_internal()
        with tf.variable_scope('Q_loss'):
            # The target is treated as a constant: no gradient flows into it.
            Q_t = tf.stop_gradient(
                self.rewards * self.reward_scale
                + self.discount * (1 - self.dones) * self.target_Vs
            )
            # Q value of the action actually taken, selected by the action mask.
            Q_taken = tf.reduce_sum(self.Qs * self.actions, axis=1)
            self.Q_Loss = 0.5 * tf.reduce_mean(tf.square(Q_taken - Q_t))
            tf.summary.scalar('Q_loss', self.Q_Loss)

        self.train_Q = self.optimizer.minimize(self.Q_Loss, var_list=training_variables)
        self.train_ops.append(self.train_Q)

    def init_target_Q_update(self):
        """Create the op that moves each target weight towards its online twin."""
        qnet_params = self.Qnet.get_params_internal()

        with tf.variable_scope('QT_loss'):
            updates = []
            for tQ_p in self.target_Q_params:
                # Strip the leading scope component from the target variable's
                # name and look that suffix up among the online variables.
                suffix = tQ_p.name[(tQ_p.name.index('/') + 1):]
                matches = [v for v in qnet_params if suffix in v.name]
                assert len(matches) == 1  # exactly one online counterpart
                Q_p = matches[0]

                updates.append(tQ_p.assign(self.tau * Q_p + (1 - self.tau) * tQ_p))
            self.tQnet_update = tf.group(*updates)

        self.train_ops.append(self.tQnet_update)

    def _construct_feed_dict(self, samples):
        """Map a sample dict (keys 'actions', 'observations',
        'next_observations', 'dones', 'rewards') onto the placeholders."""
        return {
            self.actions: samples['actions'],
            self.obs: samples['observations'],
            self.next_obs: samples['next_observations'],
            self.dones: samples['dones'],
            self.rewards: samples['rewards'],
        }

    def train(self, samples, *args):
        """Run every training op once on ``samples``.

        Args:
            samples: dict as accepted by ``_construct_feed_dict``.
            *args: extra tensors/ops to evaluate in the same ``run`` call.

        Returns:
            A list with the evaluated values of ``args``, in order. The
            results of the training ops themselves are dropped.
        """
        feed_dict = self._construct_feed_dict(samples)
        fetches = [self.train_ops] + list(args)
        return tf.get_default_session().run(fetches, feed_dict=feed_dict)[1:]

class replayBuffer():
    """Fixed-capacity circular store of transitions backed by NumPy arrays.

    Args:
        n_inputs: width of an observation row.
        n_outputs: width of an action row.
        max_buffer_size: capacity (cast to int); the oldest entries are
            overwritten once it is reached.
        min_pool_size: number of stored samples (cast to int) from which
            ``batch_ready`` starts returning True.
        batch_size: number of rows returned by ``get_samples``.
    """

    _FIELDS = ('actions', 'observations', 'next_observations', 'rewards', 'dones')

    def __init__(self, n_inputs, n_outputs, max_buffer_size=1e4, min_pool_size=1000, batch_size=128):
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self._batch_size = batch_size
        self._min_pool_size = int(min_pool_size)
        self._max_size = int(max_buffer_size)
        self.reset()

    def reset(self):
        """Empty the buffer and reallocate zero-filled storage."""
        self._pos = 0
        self._size = 0

        capacity = self._max_size
        self.actions = np.zeros([capacity, self.n_outputs])
        self.observations = np.zeros([capacity, self.n_inputs])
        self.next_observations = np.zeros([capacity, self.n_inputs])
        self.rewards = np.zeros(capacity)
        self.dones = np.zeros(capacity)

    def add_sample(self, action, obs, next_obs, reward, done):
        """Write one transition at the current position, then advance it."""
        slot = self._pos
        self.actions[slot] = action
        self.observations[slot] = obs
        self.next_observations[slot] = next_obs
        self.rewards[slot] = reward
        self.dones[slot] = done

        self._advance()

    def _advance(self):
        """Move the write position one step, wrapping at capacity, and grow
        the fill counter until it saturates at capacity."""
        self._pos = (self._pos + 1) % self._max_size
        self._size = min(self._size + 1, self._max_size)

    def _gather(self, index):
        """Return a dict holding the rows selected by ``index`` of every field."""
        return {field: getattr(self, field)[index] for field in self._FIELDS}

    def get_samples(self):
        """Return ``batch_size`` rows drawn uniformly, with replacement, from
        the filled part of the buffer."""
        inds = np.random.randint(0, self._size, self._batch_size)
        return self._gather(inds)

    def get_last_sample(self):
        """Return the most recently written transition as a batch of one."""
        last_pos = [(self._pos - 1) % self._max_size]
        return self._gather(last_pos)

    def batch_ready(self):
        """True once at least ``min_pool_size`` samples are stored."""
        return not (self._size < self._min_pool_size)
        
class logger():
    """Minimal in-memory log: a dict mapping a variable name to the list of
    values recorded under that name."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything recorded so far."""
        self.vars = {}

    def record(self, var_name, value):
        """Append ``value`` to the history of ``var_name``.

        Membership is tested on the dict's keys. The previous
        ``hasattr(self.vars, var_name)`` test looked at dict *attributes*, so
        every call overwrote the history, and names such as 'keys' raised
        KeyError.
        """
        self.vars.setdefault(var_name, []).append(value)

    def get(self, var_name):
        """Return the list recorded under ``var_name``, or False if the name
        has never been recorded."""
        return self.vars.get(var_name, False)

class Sampler():
    """Steps an environment with a policy and stores each transition.

    Args:
        policy: object with ``get_action(obs)``, ``discrete`` and
            ``action_size``.
        env: environment with ``reset()`` and ``step(action)`` returning
            ``(next_obs, reward, done, info)``.
        replaybuffer: object with
            ``add_sample(action, obs, next_obs, reward, done)``.
        episode_logger: optional object with ``record(name, value)`` that
            receives each finished episode's return. When omitted, the
            module-level ``log`` object is used, as before.
    """

    def __init__(self, policy, env, replaybuffer, episode_logger=None):
        self.policy = policy
        self.env = env
        self.replaybuffer = replaybuffer
        self.episode_logger = episode_logger

        self.reset()

    def reset(self):
        """Forget the current observation and zero the episode statistics."""
        # False is the sentinel for "no episode in progress": the next call
        # to sample() resets the environment.
        self.current_obs = False
        self.episodes = 0
        self.current_episode_reward = 0
        self.mean_episode_reward = 0

    def sample(self):
        """Take one environment step and append the transition to the buffer.

        Uses the policy, environment and buffer given to the constructor
        rather than same-named notebook globals.
        """
        if self.current_obs is False:
            self.current_obs = self.env.reset()

        action = self.policy.get_action(self.current_obs)
        next_obs, reward, done, _ = self.env.step(action)
        if self.policy.discrete:
            # Store discrete actions as one-hot rows.
            action = np.eye(self.policy.action_size)[action]
        self.replaybuffer.add_sample(action, self.current_obs, next_obs, reward, done)
        self.current_obs = next_obs

        self.current_episode_reward += reward

        if done:
            self.current_obs = False
            self.episodes += 1

            # Fall back to the notebook-level `log` only when no logger was
            # injected, so existing Sampler(policy, env, buffer) calls keep
            # their behaviour.
            recorder = self.episode_logger if self.episode_logger is not None else log
            recorder.record('episode_reward', self.current_episode_reward)
            # Incremental mean of episode returns since the last reset().
            self.mean_episode_reward = (
                self.mean_episode_reward * (self.episodes - 1) + self.current_episode_reward
            ) / self.episodes
            self.current_episode_reward = 0
# action = tf.clip_by_value(logits,tf.expand_dims(env.action_space.low,0),tf.expand_dims(env.action_space.high,0)) # Have to clip the action space. This might be a bad idea


# TODO: make it easy to swap in a different policy (pi).

#Algorithm ie Soft Actor Critic - training etc makes the ops

#Env

#optimizer


In [3]:
# Hyper-parameters per experiment. Each entry has four groups:
#   'base'          - environment name and training-loop settings
#   'replay_buffer' - replay buffer sizing
#   'algorithm'     - reward scale, learning rate and target-update rate
#   'nnet'          - hidden-layer width and depth
params = {
    'CartPole-v0_online': {
        'base': {
            'env_name': 'CartPole-v0',
            'epoch_length': 1000,
            'max_epochs': 100,
            'online_training': True,
            'grad_steps_per_t': 1,
        },
        'replay_buffer': {
            'batch_size': 2,
            'max_buffer_size': 1e5,
            'min_pool_size': 1000,
        },
        'algorithm': {
            'reward_scale': 1.0,
            'lr': 4e-3,
            'tau': 0.05,
        },
        'nnet': {
            'n_hidden': 10,
            'n_layers': 2,
        },
    },
    'CartPole-v0_offline': {
        'base': {
            'env_name': 'CartPole-v0',
            'epoch_length': 1000,
            'max_epochs': 100,
            'online_training': False,
            'grad_steps_per_t': 1,
        },
        'replay_buffer': {
            'batch_size': 128,
            'max_buffer_size': 1e6,
            'min_pool_size': 1000,
        },
        'algorithm': {
            'reward_scale': 0.5,
            'lr': 1e-2,
            'tau': 0.05,
        },
        'nnet': {
            'n_hidden': 10,
            'n_layers': 2,
        },
    },
    'LunarLander-v2': {
        'base': {
            'env_name': 'LunarLander-v2',
            'epoch_length': 1000,
            'max_epochs': 100,
            'online_training': True,
            'grad_steps_per_t': 1,
        },
        'replay_buffer': {
            'batch_size': 128,
            'max_buffer_size': 1e5,
            'min_pool_size': 1000,
        },
        'algorithm': {
            'reward_scale': 1.0,
            'lr': 1e-5,
            'tau': 0.01,
        },
        'nnet': {
            'n_hidden': 10,
            'n_layers': 2,
        },
    },
}

In [4]:
# Where TensorBoard summaries are written (TEST_TMPDIR overrides /tmp).
log_dir = os.path.join(
    os.getenv('TEST_TMPDIR', '/tmp'),
    'tensorflow/logs/soft_actor_critic',
)

# Key into `params` selecting the experiment to run.
expm_name = 'LunarLander-v2'

base_params = params[expm_name]['base']
env = gym.make(base_params['env_name'])

# Network dimensions are read from the environment's spaces.
n_inputs = env.observation_space.shape[0]
n_outputs = env.action_space.n

# Unpack each parameter group into notebook-level names.
epoch_length, max_epochs, online_training, grad_steps_per_t = (
    base_params[key]
    for key in ('epoch_length', 'max_epochs', 'online_training', 'grad_steps_per_t')
)

lr, reward_scale, tau = (
    params[expm_name]['algorithm'][key]
    for key in ('lr', 'reward_scale', 'tau')
)

n_hidden, n_layers = (
    params[expm_name]['nnet'][key]
    for key in ('n_hidden', 'n_layers')
)

batch_size, max_buffer_size, min_pool_size = (
    params[expm_name]['replay_buffer'][key]
    for key in ('batch_size', 'max_buffer_size', 'min_pool_size')
)
# Todo make these into lists so that can define each layer separately


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [5]:
# Start from an empty graph; this must happen before a new session is made,
# otherwise re-running the cell errors.
tf.reset_default_graph()

# Close the session left over from a previous run of this cell, if any.
# Only the "name not defined yet" case is expected, so only NameError is
# swallowed.
try:
    sess
except NameError:
    pass
else:
    sess.close()
    del sess
sess = tf.InteractiveSession()

# Placeholders fed from replay-buffer samples; the leading dimension is the
# batch size.
rewards = tf.placeholder(tf.float32, shape=[None], name='rewards')
actions = tf.placeholder(tf.float32, shape=[None, n_outputs], name='actions')
observations = tf.placeholder(tf.float32, shape=[None, n_inputs], name='observations')
next_observations = tf.placeholder(tf.float32, shape=[None, n_inputs], name='next_observations')
dones = tf.placeholder(tf.float32, shape=[None], name='dones')

qnet = Qnet(n_outputs, observations, n_hidden=n_hidden, n_layers=n_layers)
# The policy is derived from the Q network. Policy_Discrete cannot be used
# here: its constructor takes (action_size, obs, n_hidden, n_layers), so
# calling it with just `qnet` raises TypeError on a fresh kernel.
pnet = Policy_Qnet_Discrete(qnet)

sac = Soft_Actor_Critic(qnet, pnet, actions, observations, next_observations, rewards, dones,
                        reward_scale=reward_scale, lr=lr, tau=tau)

rb = replayBuffer(n_inputs, n_outputs, max_buffer_size, min_pool_size=min_pool_size, batch_size=batch_size)
sampler = Sampler(pnet, env, rb)

log = logger()
# Merge every summary registered while building the graph above.
merged = tf.summary.merge_all()

writer = tf.summary.FileWriter(log_dir, sess.graph)

tf.global_variables_initializer().run()


In [6]:
# Main loop: each epoch takes `epoch_length` environment steps and trains
# after every step once training is possible.
for i in range(max_epochs):
    sampler.reset()
    epoch_avg_losses = 0
    n_updates = 0             # gradient steps taken in this epoch
    summary, qnet_o = None, None

    for t in range(epoch_length):
        sampler.sample()

        # Online mode trains once on the transition just collected; offline
        # mode waits for the buffer to warm up, then draws
        # `grad_steps_per_t` random batches.
        if online_training:
            n_steps, draw_batch = 1, rb.get_last_sample
        elif rb.batch_ready():
            n_steps, draw_batch = grad_steps_per_t, rb.get_samples
        else:
            n_steps, draw_batch = 0, None

        for _ in range(n_steps):
            samples = draw_batch()
            summary, losses, qnet_o = sac.train(samples, merged, sac.Q_Loss, qnet.output)
            # Running mean over the updates actually performed. Dividing by
            # the timestep instead would dilute the mean with steps where no
            # training happened and would drop all but the last loss when
            # several gradient steps are taken per timestep.
            n_updates += 1
            epoch_avg_losses = epoch_avg_losses + (np.array(losses) - epoch_avg_losses) / n_updates

    log.record('mean_episode_reward', sampler.mean_episode_reward)
    # Nothing to write or show if no update ran during this epoch.
    if summary is not None:
        writer.add_summary(summary, i)
    print(epoch_avg_losses)
    if qnet_o is not None:
        print(qnet_o[0])
    writer.flush()

    print('Epoch %i, mean_reward %d' % (i, sampler.mean_episode_reward))
    


59.26752275764205
[0.00419009 0.06072711 0.24655947 0.        ]
Epoch 0, mean_reward -294
56.08219911085396
[0.03569135 0.26418635 0.38578597 0.        ]
Epoch 1, mean_reward -209
56.326273789398144
[0.07685288 0.10097591 0.3480607  0.01388737]
Epoch 2, mean_reward -182
65.0976764597873
[0.         0.15631676 0.29078314 0.05547515]
Epoch 3, mean_reward -205
54.835568962456094
[0.00626669 0.12601984 0.2743091  0.        ]
Epoch 4, mean_reward -236
53.95506767972
[0.         0.0159335  0.23441038 0.20884317]
Epoch 5, mean_reward -217
67.14849610328
[0.00876944 0.06001875 0.42267472 0.02509613]
Epoch 6, mean_reward -273
50.05812411942332
[0.         0.         0.5935617  0.24710184]
Epoch 7, mean_reward -193
50.93986101289108
[0.11323611 0.03168104 0.35769764 0.        ]
Epoch 8, mean_reward -196
50.713939072269405
[0.         0.         0.3395669  0.12618797]
Epoch 9, mean_reward -268
59.09864540064954
[0.         0.10127079 0.21203655 0.09070032]
Epoch 10, mean_reward -216
68.1805773362

KeyboardInterrupt: 