In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import os

  from ._conv import register_converters as _register_converters


In [44]:

# Model - this builds the network (i.e. the policy).

# We have the soft Q function, the parameterised value function V, and the parameterised tractable policy pi.

# TODOs
# Make comments properly

# Whitening of inputs

# Train, test routines


class replayBuffer():
    """Fixed-capacity circular store of (action, obs, next_obs, reward, done) transitions.

    Each field lives in its own pre-allocated NumPy array. Once the buffer is
    full, new samples overwrite the oldest slot.

    Args:
        n_inputs: width of an observation row.
        n_outputs: width of an action row.
        max_buffer_size: capacity (cast to int).
        min_pool_size: number of stored samples required before
            `batch_ready()` returns True (cast to int).
        batch_size: number of rows returned by `get_samples()`.
    """

    # Field arrays, in the key order used by the returned sample dicts.
    _FIELDS = ('actions', 'observations', 'next_observations', 'rewards', 'dones')

    def __init__(self,n_inputs,n_outputs,max_buffer_size = 1e4,min_pool_size=1000,batch_size=128):
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self._batch_size = batch_size
        self._min_pool_size = int(min_pool_size)
        self._max_size = int(max_buffer_size)
        self.reset()

    def reset(self):
        """Empty the buffer and re-allocate zero-filled storage."""
        capacity = self._max_size
        self._pos = 0
        self._size = 0

        self.actions = np.zeros((capacity, self.n_outputs))
        self.observations = np.zeros((capacity, self.n_inputs))
        self.next_observations = np.zeros((capacity, self.n_inputs))
        self.rewards = np.zeros(capacity)
        self.dones = np.zeros(capacity)

    def add_sample(self,action,obs,next_obs,reward,done):
        """Write one transition at the current write position, then advance."""
        slot = self._pos
        self.actions[slot] = action
        self.observations[slot] = obs
        self.next_observations[slot] = next_obs
        self.rewards[slot] = reward
        self.dones[slot] = done

        self._advance()

    def _advance(self):
        """Move the write position forward (wrapping) and grow the size up to capacity."""
        self._pos = (self._pos + 1) % self._max_size
        self._size = min(self._size + 1, self._max_size)

    def _gather(self, index):
        """Index every field array with `index` and return the results as a dict."""
        return {field: getattr(self, field)[index] for field in self._FIELDS}

    def get_samples(self):
        """Return `batch_size` transitions drawn uniformly (with replacement) from the filled slots."""
        chosen = np.random.randint(0,self._size,self._batch_size)
        return self._gather(chosen)

    def get_last_sample(self):
        """Return the most recently written transition as a batch of one."""
        newest = [(self._pos - 1) % self._max_size]
        return self._gather(newest)

    def batch_ready(self):
        """True once at least `min_pool_size` samples are stored."""
        return not (self._size < self._min_pool_size)
        
class logger():
    """Minimal in-memory recorder mapping a variable name to the list of values logged for it."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything recorded so far."""
        self.vars = {}

    def record(self,var_name,value):
        """Append `value` to the history of `var_name`, creating the history on first use.

        Membership is tested on the dict's keys. `hasattr` on a dict inspects
        its attributes (e.g. 'keys', 'items'), so it never finds a recorded
        name and every call would overwrite the history.
        """
        self.vars.setdefault(var_name, []).append(value)

    def get(self,var_name):
        """Return the list of values recorded for `var_name`, or False if nothing was recorded."""
        return self.vars.get(var_name, False)

class Sampler():
    """Steps an environment with a policy and stores each transition in a replay buffer.

    Args:
        policy: object exposing `get_action(obs)`, a boolean `discrete` flag and,
            when discrete, `action_size`.
        env: environment exposing `reset()` and `step(action)`; `step` must
            return `(next_obs, reward, done, info)`.
        replaybuffer: object exposing `add_sample(action, obs, next_obs, reward, done)`.
        episode_log: optional object exposing `record(name, value)` that receives
            the total reward of every finished episode under 'episode_reward'.
            When omitted, a module-level variable named `log` is looked up at
            call time (if one exists), so notebooks that create `log` after
            the sampler keep working.
    """

    def __init__(self,policy,env,replaybuffer,episode_log=None):
        self.policy = policy
        self.env = env
        self.replaybuffer = replaybuffer
        self.episode_log = episode_log

        self.reset()

    def reset(self):
        """Forget the current episode and zero the episode statistics."""
        # False is the sentinel for "no episode in progress"; the next
        # sample() call resets the environment.
        self.current_obs = False
        self.episodes = 0
        self.current_episode_reward = 0
        self.mean_episode_reward = 0

    def _episode_recorder(self):
        """Return the logger to use for episode rewards, or None if there is none."""
        if self.episode_log is not None:
            return self.episode_log
        return globals().get('log')

    def sample(self):
        """Take one environment step and store the resulting transition."""
        # Use the environment and buffer given to the constructor, not
        # whatever globals named `env` / `rb` happen to exist.
        if self.current_obs is False:
            self.current_obs = self.env.reset()

        action = self.policy.get_action(self.current_obs)
        next_obs, reward, done, info = self.env.step(action)
        if self.policy.discrete:
            # Discrete actions are stored one-hot encoded.
            action = np.eye(self.policy.action_size)[action]
        self.replaybuffer.add_sample(action,self.current_obs,next_obs,reward,done)
        self.current_obs = next_obs

        self.current_episode_reward += reward

        if done:
            self.current_obs = False
            self.episodes += 1

            recorder = self._episode_recorder()
            if recorder is not None:
                recorder.record('episode_reward',self.current_episode_reward)
            # Incremental mean over the episodes finished since the last reset().
            self.mean_episode_reward = (self.mean_episode_reward * (self.episodes - 1) + self.current_episode_reward) / self.episodes
            self.current_episode_reward = 0


In [45]:

# Here is a helper class to make a simple neural network. Importantly, it allows us to easily get the parameters, and hopefully to link the inputs to other variables
# The get_output functionality is borrowed from the SAC reference code.

class MLP():
    """Small fully connected network built with tf.contrib.slim.

    The constructor stores the configuration and immediately builds the network
    on `inputs`, exposing the result as `self.output`.

    Args:
        name: variable-scope name the layers are created under.
        inputs: a tf.Tensor, or a collection of tensors that is concatenated
            along axis 1 before entering the first layer.
        output_size: width of the final (linear) layer.
        n_hidden: width of every hidden layer.
        n_layers: number of hidden layers.
    """

    def __init__(self,name,inputs,output_size,n_hidden,n_layers):
        self._name = name
        self.inputs = inputs
        self.output_size = output_size
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        # First build: create the variables rather than reuse existing ones.
        self.output = self.make_network(reuse = False)

    def make_network(self,inputs = False,reuse = tf.AUTO_REUSE):
        """Build the network under this MLP's variable scope and return its output tensor.

        Args:
            inputs: tensor(s) to feed the network; False (the default) means
                use the inputs given to the constructor.
            reuse: forwarded to tf.variable_scope.

        Side effect: `self.hidden` is overwritten with the last hidden-layer
        tensor of the network built by this call.
        """
        net_inputs = self.inputs if inputs is False else inputs

        with tf.variable_scope(self._name, reuse=reuse):
            # Anything that is not already a single tensor is treated as a
            # collection of tensors and concatenated feature-wise.
            if not isinstance(net_inputs, tf.Tensor):
                net_inputs = tf.concat(net_inputs, axis=1)

            # To do: understand weight initialization!
            layer_widths = [self.n_hidden for _ in range(self.n_layers)]
            self.hidden = slim.stack(
                net_inputs,
                slim.fully_connected,
                layer_widths,
                activation_fn=tf.nn.relu,
                scope='fc',
            )
            return slim.fully_connected(self.hidden, self.output_size, activation_fn=None)

    def get_params_internal(self):
        """Return the trainable variables collected under '<current scope>/<name>/'.

        The enclosing variable scope active at call time is used as a prefix,
        so the result depends on where this method is called from.
        """
        enclosing = tf.get_variable_scope().name
        if len(enclosing):
            prefix = enclosing + '/' + self._name + '/'
        else:
            prefix = self._name + '/'

        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=prefix)

In [46]:
class Qnet(MLP):
    """Q-network: an MLP under the scope name 'qNet'.

    It is fed the observation tensor `obs` and produces `action_size` outputs.
    """

    def __init__(self,action_size,obs,n_hidden,n_layers):
        # Keep a handle on the observation input so callers can feed it.
        self.obs = obs
        # Hand everything else to the MLP constructor, which builds the network.
        super().__init__('qNet', obs, action_size, n_hidden, n_layers)
      

In [108]:

class Policy_Discrete():
    """Action selection for a discrete action space on top of a Q-network.

    Two schemes are supported:
      * 'Bltz'    - sample from softmax(reward_scale * Q(s, .)).
      * 'Epsilon' - epsilon-greedy on Q(s, .), with epsilon decaying
                    geometrically from `epsilon_start` towards `epsilon_end`
                    on every `get_action` call.

    Args:
        Qnet: network exposing `output` (Q-value tensor, one row per observation),
            `output_size` and the observation input tensor `obs`.
        scheme: 'Bltz' or 'Epsilon'.
        reward_scale: multiplier applied to the Q-values before the softmax.
        epsilon_start: initial exploration probability.
        epsilon_end: asymptotic exploration probability.
        epsilon_decay: decay horizon; each call shrinks (e - e_end) by (1 - 1/epsilon_decay).

    Raises:
        ValueError: if `scheme` is not one of the supported names.
    """

    def __init__(self,Qnet,scheme = 'Bltz',reward_scale = 1.0,epsilon_start = 1,epsilon_end=0.1,epsilon_decay=10000):

        self.Qnet = Qnet
        self.action_size = Qnet.output_size
        self.reward_scale = reward_scale
        self.scheme = scheme
        self.e = epsilon_start
        self.e_end = epsilon_end
        # 1.0 forces true division even if epsilon_decay is an int under Python 2.
        self.e_decay_frac = (1 - 1.0/epsilon_decay)

        self.discrete = True
        self._name = 'Policy'
        self.make_policy_outputs(reuse=False)

    def make_policy_outputs(self, reuse = tf.AUTO_REUSE):
        """Create the action-selection ops (`self.action`, plus the softmax outputs for 'Bltz')."""
        with tf.variable_scope(self._name,reuse = reuse):
            if self.scheme == 'Bltz':
                # Use this policy's own reward_scale, not a global of the same name.
                self.policy_output = tf.nn.softmax(self.reward_scale*self.Qnet.output,axis=1) # Rows sum to one.
                self.log_policy_output = tf.log(self.policy_output)
                # One sampled action for the first row of the batch.
                self.action = tf.multinomial(self.log_policy_output, num_samples=1)[0]
            elif self.scheme == 'Epsilon':
                # argmax over the action axis. tf.argmax defaults to axis 0
                # (the batch axis), which returns 0 for any single observation.
                self.action = tf.argmax(self.Qnet.output, axis=1)
            else:
                raise ValueError("Unknown scheme %r; expected 'Bltz' or 'Epsilon'" % (self.scheme,))

    def _network_action(self, obs):
        """Evaluate `self.action` for a single observation in the default session."""
        return self.action.eval(feed_dict = {self.Qnet.obs : [obs]})[0]

    def get_action(self,obs):
        """Return the action index chosen for the single observation `obs`.

        Requires a default TensorFlow session, because the op is run via `.eval`.
        """
        if self.scheme == 'Bltz':
            a = self._network_action(obs)

        elif self.scheme == 'Epsilon':
            # With probability e take a uniformly random action, otherwise the greedy one.
            if np.random.rand() < self.e:
                a = np.random.randint(0,self.action_size)
            else:
                a = self._network_action(obs)

            # Decay e geometrically towards e_end.
            self.e = (self.e - self.e_end) * self.e_decay_frac + self.e_end

        else:
            raise ValueError("Unknown scheme %r; expected 'Bltz' or 'Epsilon'" % (self.scheme,))

        return a
                

    
    
class Deep_Q_Learning():
    """Builds the TensorFlow ops that train a Q-network with a separate target network.

    Construction adds to the current graph:
      * a second copy of the network under the scope 'qNet_T' (the target network),
        fed with `next_obs`;
      * a squared-error loss between Q(obs, action) and a bootstrapped target;
      * an Adam training op for the online network's variables;
      * a soft update op moving each target variable towards its online counterpart.

    Args:
        Qnet: network object exposing `output`, `output_size`, `make_network(...)`
            and `get_params_internal()`.
        actions, obs, next_obs, rewards, dones: tensors used as feed targets in
            `_construct_feed_dict`. `actions` is multiplied element-wise with the
            Q outputs, so it is presumably one-hot -- confirm against the caller.
        lr: learning rate given to the Adam optimizer.
        discount: discount factor applied to the bootstrapped next-state value.
        tau: mixing coefficient of the soft target update.
    """
    # This class handles the training of the networks
    def __init__(self,Qnet,actions,obs,next_obs,rewards,dones,lr=3e-4,discount = 0.99, tau=0.005):
        self.lr = lr
        self.discount = discount
        self.tau = tau 
        
        # Maybe would be nicer to not pass these but define here, but this seems to be messy. Once check if works, could go back to defining here
        self.actions = actions
        self.obs = obs
        self.next_obs = next_obs
        self.rewards = rewards
        self.dones = dones
        
        self.Qnet = Qnet
        self.Q_outputs = Qnet.output
        
        # Target network: the same architecture built again inside the 'qNet_T'
        # scope with reuse=False, evaluated on next_obs. Its parameters are
        # collected while still inside that scope, because get_params_internal
        # prefixes its search with the current variable scope.
        with tf.variable_scope('qNet_T'):
            self.target_Q_outputs = Qnet.make_network(inputs = self.next_obs,reuse=False) 
            self.target_Q_params = Qnet.get_params_internal()
        
        # Network built on next_obs outside the target scope, using
        # make_network's default reuse setting.
        # NOTE(review): the network is built on next_obs twice here (once for
        # next_obs_out, once inside predict); predict could probably be derived
        # from next_obs_out -- confirm before changing, as op names would shift.
        self.next_obs_out = Qnet.make_network(inputs = self.next_obs)
        # One-hot encoding of the arg-max action (over axis 1) for next_obs.
        self.predict = tf.one_hot(tf.argmax(Qnet.make_network(inputs = self.next_obs),axis=1),self.Qnet.output_size)
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate = self.lr)
        
        # Ops executed on every train() call; filled in by the two init_* methods below.
        self.train_ops = []
        self.init_Q_net_training()
        self.init_target_Q_update()
        
        
    def init_Q_net_training(self):
        """Create the Q loss and the optimizer step, and append the step to `train_ops`."""
        training_variables = self.Qnet.get_params_internal()
        with tf.variable_scope('Q_loss'):
            # Bootstrapped target: reward plus the discounted target-network value
            # of the action selected by `predict`, zeroed where dones == 1.
            # stop_gradient keeps the target out of the backward pass.
            Q_t = tf.stop_gradient(self.rewards +  self.discount * (1-self.dones) * tf.reduce_sum(self.target_Q_outputs*self.predict,axis=1))
            # Half the mean squared difference between the Q-value of the taken
            # action (selected by multiplying with `actions`) and the target.
            self.Q_Loss = 0.5*tf.reduce_mean(tf.square(tf.reduce_sum(self.Q_outputs*self.actions,axis=1) - Q_t))
            tf.summary.scalar('Q_loss', self.Q_Loss)
        

#         Qnet_regularization_losses = tf.get_collection(
#             tf.GraphKeys.REGULARIZATION_LOSSES,
#             scope=self.Qnet._name)
#         Qnet_regularization_loss = tf.reduce_sum(
#             Qnet_regularization_losses)
    
#         gradients, variables = zip(*self.optimizer.compute_gradients(self.Q_Loss + Qnet_regularization_loss,var_list = training_variables))
#         gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
#         self.train_Q = self.optimizer.apply_gradients(zip(gradients, variables))

        # Only the variables returned by get_params_internal() are optimized.
        self.train_Q = self.optimizer.minimize(self.Q_Loss,var_list = training_variables)

        self.train_ops.append(self.train_Q)
        
    def init_target_Q_update(self):
        """Create the soft target-network update and append it to `train_ops`."""
        # Pull the qnet params
        qnet_params = self.Qnet.get_params_internal()
        
        with tf.variable_scope('Target_Q_update'):
            self.tQnet_update = []
            for tQ_p in self.target_Q_params:
                # Match each target net param with its equivalent from the Q-net:
                # strip everything up to the first '/' from the target variable's
                # name and look for the remainder inside the online variables' names.
                Q_p = [v for v in qnet_params if tQ_p.name[(tQ_p.name.index('/')+1):] in v.name]
                assert(len(Q_p) == 1) # Check that only found one variable
                Q_p = Q_p[0]
                # The assign op is created under a control dependency on train_Q.
                with tf.control_dependencies([self.train_Q]):
                    # Soft update: target <- tau * online + (1 - tau) * target.
                    self.tQnet_update.append(tQ_p.assign(self.tau * Q_p + (1-self.tau)*tQ_p))
            # NOTE(review): tf.group is documented with a *inputs signature; a list
            # is passed as a single argument here -- confirm this is accepted by
            # the TensorFlow version in use.
            self.tQnet_update = tf.group(self.tQnet_update)
            
        self.train_ops.append(self.tQnet_update)
        
    def _construct_feed_dict(self,samples):  
        """Map the placeholders to the arrays of a sample dict.

        `samples` must provide the keys 'actions', 'observations',
        'next_observations', 'dones' and 'rewards'.
        """
        return {self.actions : samples['actions'],
                    self.obs : samples['observations'],
                    self.next_obs : samples['next_observations'],
                    self.dones : samples['dones'],
                    self.rewards : samples['rewards']}
                    
    def train(self, samples, *args):
        """Run all training ops on `samples` in the default session.

        Any extra tensors/ops passed in `*args` are evaluated in the same
        session run. Returns a list with one result per entry of `*args`, in
        order; the results of the training ops themselves are dropped by the
        `[1:]` slice.
        """
        feed_dict = self._construct_feed_dict(samples)
        return tf.get_default_session().run([self.train_ops] + list(args), feed_dict = feed_dict)[1:]


In [109]:
# Per-experiment settings, keyed by gym environment id. Every experiment has four
# sections: 'base' (run settings), 'replay_buffer', 'algorithm' and 'nnet'.
params = {
    'CartPole-v0': {
        'base': {
            'env_name': 'CartPole-v0',
            'epoch_length': 1000,
            'max_epochs': 100,
            'online_training': True,
            'grad_steps_per_t': 1,
        },
        'replay_buffer': {
            'batch_size': 100,
            'max_buffer_size': 50000,
            'min_pool_size': 500,
        },
        'algorithm': {
            'reward_scale': 1,
            'epsilon_start': 1.0,
            'epsilon_end': 0.1,
            'epsilon_decay': 20000,
            'scheme': 'Epsilon',
            'lr': 1e-5,
            'tau': 0.01,
        },
        'nnet': {
            'n_hidden': 10,
            'n_layers': 1,
        },
    },

    'LunarLander-v2': {
        'base': {
            'env_name': 'LunarLander-v2',
            'epoch_length': 1000,
            'max_epochs': 100,
            'online_training': False,
            'grad_steps_per_t': 1,
        },
        'replay_buffer': {
            'batch_size': 256,
            'max_buffer_size': 1e6,
            'min_pool_size': 1000,
        },
        'algorithm': {
            'reward_scale': 2,
            'epsilon_start': 1.0,
            'epsilon_end': 0.1,
            'epsilon_decay': 10000,
            'scheme': 'Epsilon',
            'lr': 3e-4,
            'tau': 0.01,
        },
        'nnet': {
            'n_hidden': 20,
            'n_layers': 2,
        },
    },
}

In [110]:
# Directory for summary logs: under $TEST_TMPDIR when set, otherwise under /tmp.
log_dir = os.path.join(
    os.getenv('TEST_TMPDIR', '/tmp'),
    'tensorflow/logs/soft_actor_critic',
)

# Which entry of `params` to run.
expm_name = 'CartPole-v0'

base_params = params[expm_name]['base']
env = gym.make(base_params['env_name'])

# Network input/output widths come from the environment's spaces.
n_inputs = env.observation_space.shape[0]
n_outputs = env.action_space.n

# Run settings.
epoch_length, max_epochs, online_training, grad_steps_per_t = (
    base_params[key]
    for key in ('epoch_length', 'max_epochs', 'online_training', 'grad_steps_per_t')
)

# Algorithm settings.
lr, reward_scale, tau, epsilon_start, epsilon_end, epsilon_decay, scheme = (
    params[expm_name]['algorithm'][key]
    for key in ('lr', 'reward_scale', 'tau',
                'epsilon_start', 'epsilon_end', 'epsilon_decay', 'scheme')
)

# Network size.
# Todo make these into lists so that can define each layer separately
n_hidden, n_layers = (
    params[expm_name]['nnet'][key] for key in ('n_hidden', 'n_layers')
)

# Replay buffer settings.
batch_size, max_buffer_size, min_pool_size = (
    params[expm_name]['replay_buffer'][key]
    for key in ('batch_size', 'max_buffer_size', 'min_pool_size')
)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [111]:
# Re-running this cell must start from a clean graph. Close any session left over
# from a previous run FIRST, and only then reset the default graph: resetting the
# graph while a session is still active is documented as undefined behaviour.
try:
    sess
except NameError:
    # First run of the cell: there is no previous session to clean up.
    pass
else:
    sess.close()
    del sess
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Placeholders fed from replay-buffer samples (one row per transition).
rewards = tf.placeholder(tf.float32,shape = [None],name = 'rewards')
actions = tf.placeholder(tf.float32,shape = [None,n_outputs],name = 'actions')
observations = tf.placeholder(tf.float32,shape = [None,n_inputs],name = 'observations')
next_observations = tf.placeholder(tf.float32,shape = [None,n_inputs],name = 'next_observations')
dones = tf.placeholder(tf.float32,shape = [None],name = 'dones')

# Q-network on the observations, and the policy that picks actions from it.
qnet = Qnet(n_outputs,observations,n_hidden=n_hidden,n_layers=n_layers)
policy = Policy_Discrete(qnet,reward_scale=reward_scale,epsilon_start = epsilon_start,epsilon_end=epsilon_end,epsilon_decay=epsilon_decay,scheme=scheme)

# Training ops (loss, optimizer step, target-network update).
algo = Deep_Q_Learning(qnet,actions,observations,next_observations,rewards,dones,lr=lr,tau=tau)

# Experience storage and the object that steps the environment.
rb = replayBuffer(n_inputs,n_outputs,max_buffer_size,min_pool_size = min_pool_size,batch_size=batch_size)
sampler = Sampler(policy,env,rb)

log = logger()
# Merge every summary registered while the graph was built above.
merged = tf.summary.merge_all()

writer = tf.summary.FileWriter(log_dir, sess.graph)

# Initialize all variables in the interactive session.
tf.global_variables_initializer().run()


In [112]:
# Set to True to print, after every training step, the one-hot greedy action and
# the online / target network outputs for the samples just trained on. This costs
# three extra session runs per step and floods the cell output, so it is off by default.
PRINT_Q_DEBUG = False

for i in range(max_epochs):
    sampler.reset()
    epoch_avg_losses = 0
    n_updates = 0     # training steps taken in this epoch
    summary = None    # last summary produced in this epoch, if any

    for t in range(epoch_length):
        sampler.sample()

        # Online mode trains on every step; otherwise wait until the replay
        # buffer holds enough samples.
        if not (online_training or rb.batch_ready()):
            continue

        if online_training:
            samples = rb.get_last_sample()
            summary,losses,qnet_o = algo.train(samples,merged,algo.Q_Loss,qnet.output)
        else:
            for j in range(grad_steps_per_t):
                samples = rb.get_samples()
                summary,losses= algo.train(samples,merged,algo.Q_Loss)

        # Running mean over the training steps actually taken. Using the
        # environment step index as the count skews the mean whenever training
        # starts after step 0.
        epoch_avg_losses = (epoch_avg_losses*n_updates + np.array(losses))/(n_updates+1)
        n_updates += 1

        if PRINT_Q_DEBUG:
            debug_feed = algo._construct_feed_dict(samples)
            print(algo.predict.eval(feed_dict = debug_feed))
            print(algo.next_obs_out.eval(feed_dict = debug_feed))
            print(algo.target_Q_outputs.eval(feed_dict = debug_feed))

    log.record('mean_episode_reward',sampler.mean_episode_reward)
    # No training step may have run this epoch (buffer still filling), in which
    # case there is no summary to write.
    if summary is not None:
        writer.add_summary(summary, i)
    print(epoch_avg_losses)

    writer.flush()

    print('Epoch %i, mean_reward %d' % (i, sampler.mean_episode_reward))
    


0.8856529944334177
Epoch 0, mean_reward 22
0.9718393453322495
Epoch 1, mean_reward 18
0.8851114018789813
Epoch 2, mean_reward 21
0.8692759827123956
Epoch 3, mean_reward 20


KeyboardInterrupt: 

[[0. 1.]]
[[0.02752121 0.25922558]]
[[0.02610591 0.25770772]]
[[0.00681379 0.05556605]]


array([[2.0556815, 0.5648127]], dtype=float32)