# Evolution strategy

Here, for fun, I try to understand the Evolution Strategies (ES) algorithm described in "Evolution Strategies as a Scalable Alternative to Reinforcement Learning" (https://arxiv.org/pdf/1703.03864.pdf).

I implemented a simple serial (single-process) version of the algorithm, along with a few basic optimizers (plain gradient ascent, momentum and Adam) that use the estimated gradient for training.


In [1]:
import tensorflow as tf
import numpy as np
  
from tensorflow.python.ops import variables
from tensorflow.python.framework import ops
from scipy.stats import rankdata


  from ._conv import register_converters as _register_converters


In [2]:
# Setup the gym environment!
import gym
env_name = 'BipedalWalker-v2'  # Environment id; later cells reuse it when they re-create the environment
env = gym.make(env_name)  # Its observation/action spaces are used below to size the policy network

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


## Optimizer code (plain gradient ascent, momentum and Adam)

In [3]:

# Adam optimizer reference: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
# TODO: learn more about how Adam works

# Recipe:
# t <- t + 1
# lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)

# m_t <- beta1 * m_{t-1} + (1 - beta1) * g
# v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
# variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)

class Simple_Optimizer(object):
    """Plain gradient-ascent optimizer: variable <- variable + alpha * gradient.

    Args:
        variables: list of TF variables to update.
        grad_vars: gradient estimates, paired one-to-one with `variables`.
        alpha: step size, kept in the non-trainable variable `self.a`.
        beta1: stored as the constant `self.b1`; the update itself does not use it.

    Running `self.update` applies one step to every variable.
    """
    def __init__(self, variables,grad_vars, alpha = 1e-3,beta1 = 1e-1):
        # Step size lives in a variable so it can be re-assigned between runs
        self.a = tf.Variable(alpha,trainable=False)
        self.b1 = tf.constant(beta1)

        self.vars, self.grad_vars = variables, grad_vars

        self.create_update_ops()

    def create_update_ops(self):
        """Build one `assign_add` per (variable, gradient) pair and group them into `self.update`."""
        step_ops = [
            target.assign_add(self.a * grad)
            for target, grad in zip(self.vars, self.grad_vars)
        ]
        self.update = tf.group(*step_ops)

    
class Momentum_Optimizer(object):
    """Gradient ascent using an exponential moving average (momentum) of the gradient.

    For every variable the update is:
        m        <- beta1 * m + (1 - beta1) * g
        variable <- variable + alpha * m

    Args:
        variables: list of TF variables to update.
        grad_vars: gradient estimates, paired one-to-one with `variables`.
        alpha: step size, kept in the non-trainable variable `self.a`.
        beta1: decay rate of the moving average.

    Running `self.update` applies one step to every variable.
    """
    def __init__(self, variables,grad_vars, alpha = 1e-3,beta1 = 0.9):
        self.a = tf.Variable(alpha,trainable=False)
        self.b1 = tf.constant(beta1)

        self.vars = variables
        self.grad_vars = grad_vars

        # One moving-average accumulator per variable, initialised to zero
        self.m = [tf.Variable(tf.zeros(var.get_shape()),trainable=False) for var in self.vars]

        self.create_update_ops()

    def create_update_ops(self):
        """Build the per-variable momentum updates and group them into `self.update`."""
        ops = []
        for var,g,m in zip(self.vars, self.grad_vars, self.m):
            # Use the tensor returned by `assign` so the variable step consumes the
            # freshly updated average through a data dependency, instead of relying
            # on a control dependency around a separate read of `m`.
            m_new = m.assign(self.b1 * m + (1 - self.b1) * g)
            var_op = var.assign_add(self.a * m_new)
            ops += [m_new,var_op]
        self.update = tf.group(*ops)
    
class Adam_Optimizer(object):
    """Adam-style optimizer that ascends along the supplied gradient estimate.

    For every variable the update is:
        t        <- t + 1
        lr_t     <- alpha * sqrt(1 - beta2^t) / (1 - beta1^t)
        m        <- beta1 * m + (1 - beta1) * g
        v        <- beta2 * v + (1 - beta2) * g * g
        variable <- variable + lr_t * m / (sqrt(v) + eps)

    Args:
        variables: list of TF variables to update.
        grad_vars: gradient estimates, paired one-to-one with `variables`.
        alpha: base step size, kept in the non-trainable variable `self.a`.
        beta1: decay rate of the first-moment average.
        beta2: decay rate of the second-moment average.
        eps: small constant added to the denominator.

    Running `self.update` applies one step to every variable.
    """
    def __init__(self, variables,grad_vars, alpha = 1e-3,beta1 = 0.9, beta2 = 0.999, eps = 1e-8):
        self.t = tf.Variable(0.0,trainable=False)
        self.a = tf.Variable(alpha,trainable=False)
        self.b1 = tf.constant(beta1)
        self.b2 = tf.constant(beta2)
        self.eps = tf.constant(eps)
        self.vars = variables
        self.grad_vars = grad_vars

        # First- and second-moment accumulators, one pair per variable
        self.m = []
        self.v = []
        for var in self.vars:
            self.m.append(tf.Variable(tf.zeros(var.get_shape()),trainable=False))
            self.v.append(tf.Variable(tf.zeros(var.get_shape()),trainable=False))

        self.create_update_ops()

    def create_update_ops(self):
        """Build the step counter, bias-corrected step size and per-variable updates."""
        # The tensor returned by `assign_add` is the incremented step count
        t_new = self.t.assign_add(1.0)
        # Bias-corrected step size for this step. It is computed from the base alpha
        # and NOT written back into `self.a`: writing it back would multiply the
        # corrections of successive steps together and destroy the base step size.
        lr_t = self.a * tf.sqrt(1 - tf.pow(self.b2,t_new)) / (1 - tf.pow(self.b1,t_new))
        ops = [t_new]

        for var,g,m,v in zip(self.vars, self.grad_vars, self.m, self.v):
            # Consume the assign outputs so the variable step is ordered after the
            # moment updates by data dependency
            m_new = m.assign(self.b1 * m + (1 - self.b1) * g)
            v_new = v.assign(self.b2 * v + (1 - self.b2) * tf.square(g))
            var_op = var.assign_add(lr_t * m_new / (tf.sqrt(v_new) + self.eps))
            ops += [m_new,v_new,var_op]
        self.update = tf.group(*ops)
    

## Evolution strategy gradient estimator

In [4]:

def rescale_to_normal(array):
    """Standardise `array` to zero mean and unit standard deviation.

    Args:
        array: array-like of numbers.

    Returns:
        numpy array `(array - mean) / std`. When every value is identical the
        standard deviation is zero, and an all-zero array is returned instead
        of dividing by zero (which would give NaNs).
    """
    values = np.asarray(array)
    centred = values - np.mean(values)
    spread = np.std(values)
    if spread == 0:
        # All values are equal: there is nothing to rescale
        return np.zeros_like(centred)
    return centred / spread


class ES_gradient_estimator(object):
    """Evolution-strategy gradient estimator for the trainable variables of the default graph.

    For every trainable variable this builds:
      * a noise variable holding `mini_batch_size` perturbations (mirrored pairs),
      * a copy variable used to snapshot and restore the unperturbed weights,
      * a gradient variable filled in by `calc_gradient_ops`.

    Args:
        sigma: standard deviation of the weight perturbations.
        weight_decay: coefficient of the `- weight_decay * var` term in the gradient.
        mini_batch_size: number of perturbed policies per update. Must be even and
            at least 2, because the noise is generated in mirrored (+/-) pairs.

    Raises:
        ValueError: if `mini_batch_size` is odd or smaller than 2.
    """
    def __init__(self, sigma = 0.01,weight_decay=0.005, mini_batch_size = 30):
        if mini_batch_size < 2 or mini_batch_size % 2 != 0:
            raise ValueError(
                "mini_batch_size must be an even number >= 2 for mirrored sampling, got %r"
                % (mini_batch_size,))

        # Plain Python copy of sigma so the weight helpers need no session to read it
        self.sigma_value = float(sigma)
        self.sigma = tf.constant(self.sigma_value) # Standard deviation of weight adjustments
        self.weight_decay = tf.constant(weight_decay)
        self.mini_batch_size = mini_batch_size

        # Index of the perturbation to apply, and the per-perturbation update weights
        self.current_ind = tf.placeholder("int32")
        self.update_weights = tf.placeholder("float32",shape = [mini_batch_size])

        # Used for fitness shaping http://www.jmlr.org/papers/volume15/wierstra14a/wierstra14a.pdf
        # Entry 0 is the weight of the best-ranked sample; the weights sum to 1.
        ranks = np.arange(1, self.mini_batch_size + 1)
        raw_vals = np.maximum(0, np.log(self.mini_batch_size/2.0+1) - np.log(ranks))
        self.fitness_shape_rank_vals = raw_vals/np.sum(raw_vals)

        # Get the weights and biases from TF
        self.set_links_to_vars()

    def set_links_to_vars(self):
        """Collect the trainable variables and build the ops that perturb, restore and differentiate them."""
        self.noise_vars = []
        self.grad_vars = []

        self.new_noise_vals_ops = []
        self.set_noise_ops = []
        self.copy_state_ops = []
        self.reset_state_ops = []
        self.calc_gradient_ops = []

        half_batch = self.mini_batch_size // 2

        self.vars = variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        for var in self.vars:
            # Mirrored sampling: every noise sample is paired with its negative
            tmp_noise_var = tf.random_normal([half_batch] + var.get_shape().as_list())
            noise_var = tf.concat([tmp_noise_var,-tmp_noise_var],0)

            # The random op would re-sample on every evaluation, so the noise is stored in
            # a variable that only changes when `new_noise_vals_ops` is run
            current_noise_vals = tf.Variable(tf.zeros(noise_var.get_shape()),trainable=False)
            self.new_noise_vals_ops.append(current_noise_vals.assign(noise_var))
            self.noise_vars.append(current_noise_vals)
            copy_var = tf.Variable(var.initialized_value(),trainable=False)

            # var <- snapshot + sigma * noise[current_ind]
            self.set_noise_ops.append(var.assign(copy_var + self.sigma * current_noise_vals[self.current_ind]))
            # snapshot <- var, and var <- snapshot
            self.copy_state_ops.append(copy_var.assign(var))
            self.reset_state_ops.append(var.assign(copy_var))

            grad_var = tf.Variable(tf.zeros(var.get_shape()),trainable=False)
            self.grad_vars.append(grad_var)
            # Weighted sum of the noise samples minus the weight-decay term
            self.calc_gradient_ops.append(grad_var.assign(tf.tensordot(self.update_weights, current_noise_vals,axes=[0,0]) - self.weight_decay * var)) # Note weight decay

    # Three different flavours of update mechanism. In practice only using the fitness shaping version
    def calc_update_based_on_reward(self,rewards):
        """Set `self.calc_weights` to the standardised rewards scaled by 1 / (mini_batch_size * sigma)."""
        self.calc_weights = 1.0/(self.mini_batch_size * self.sigma_value) * rescale_to_normal(rewards)

    def calc_update_based_on_highest_reward(self,rewards):
        """Set `self.calc_weights` to sigma for the sample(s) with the highest reward and 0 elsewhere."""
        rewards = np.asarray(rewards)
        self.calc_weights = self.sigma_value * (rewards == np.max(rewards)).astype(float)

    def calc_update_fitness_shaping(self,rewards):
        """Set `self.calc_weights` from the rank of each reward (fitness shaping).

        Args:
            rewards: one reward per perturbation; must have `mini_batch_size` entries.

        Raises:
            ValueError: if the number of rewards differs from `mini_batch_size`.
        """
        if len(rewards) != self.mini_batch_size:
            raise ValueError(
                "expected %d rewards, got %d" % (self.mini_batch_size, len(rewards)))
        # Ordinal ranks run from 1 (lowest reward) to n (highest), so the best sample
        # maps to index 0 of the shaping table. Uses the `rewards` argument rather than
        # a global variable.
        k = self.mini_batch_size - rankdata(rewards, method='ordinal').astype(int)
        self.calc_weights = self.fitness_shape_rank_vals[k]

## Build the network

In [22]:
# Close any session left over from a previous run of this cell BEFORE resetting the
# graph: resetting the default graph while a session is still active is documented
# as undefined behaviour.
try:
    sess
except NameError:
    # First run on a fresh kernel: there is no previous session to close
    pass
else:
    sess.close()
    del sess
tf.reset_default_graph() # THIS IS NECESSARY BEFORE MAKING NEW SESSION TO STOP IT ERRORING!!
sess = tf.InteractiveSession()

# Policy network: maps an observation to a clipped continuous action.
# Borrowed some bits from http://mat.univie.ac.at/~grohs/tmp/DeepLearningClass_Jun28_1.html
n_inputs = env.observation_space.shape[0]  # Size of the observation vector
n_hidden = 100  
n_hlayers = 2
n_outputs = env.action_space.shape[0]  # Size of the action vector
initializer = tf.contrib.layers.variance_scaling_initializer()

# 2. Build the neural network
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
# NOTE(review): Y is not referenced anywhere else in the visible notebook
Y = tf.placeholder(tf.int32, shape=[None])

# Stack n_hlayers fully connected ReLU layers of n_hidden units each
layer = X
for _ in range(n_hlayers):
    layer = tf.layers.dense(layer, n_hidden, activation=tf.nn.relu,
                         kernel_initializer=initializer)

# Linear output layer, one unit per action dimension
raw_action = tf.layers.dense(layer, n_outputs,
                          kernel_initializer=initializer)
action = tf.clip_by_value(raw_action,tf.expand_dims(env.action_space.low,0),tf.expand_dims(env.action_space.high,0)) # Have to clip the action space. This might be a bad idea
# NOTE(review): the Saver is created before the ES/optimizer variables exist, so presumably
# it only covers the policy network weights -- confirm that is intended
saver = tf.train.Saver()

# The estimator picks up the trainable variables created above
es = ES_gradient_estimator(sigma = 0.05,mini_batch_size=200)

# Alternative optimizers, kept for experimentation:
# optimizer = Adam_Optimizer(es.vars,es.grad_vars,alpha=0.03,beta1=0.7,beta2=0.99)
# optimizer = Simple_Optimizer(es.vars,es.grad_vars,alpha=0.1)
optimizer = Momentum_Optimizer(es.vars,es.grad_vars,alpha=0.2,beta1=0.7)

# Initialise network, estimator and optimizer variables in the interactive session
tf.global_variables_initializer().run()



In [23]:
# Quick smoke test for new environments: one forward pass through the policy,
# then a single environment step with the resulting action
observation = env.reset()
act = sess.run(action, feed_dict={X: np.asarray(observation)[np.newaxis, ...]})
print(act)
observation, reward, done, info = env.step(act[0])
print(observation, reward, sep='\n')

[[ 1.         -0.37650865  0.69396424  0.18345767]]
[-0.02121598 -0.03145527 -0.02868943 -0.01274514  0.47453269  1.00028253
  0.08405423 -1.00017548  1.          0.37595776  0.90991032  0.08919561
 -0.88032214  1.          0.4465037   0.45157441  0.46737847  0.49586892
  0.54099655  0.61023712  0.71830201  0.89736676  1.          1.        ]
-0.23493897485857443


## Run the training

In [24]:
# Train the policy with ES. Each iteration evaluates one mini batch of perturbed
# policies (one episode each), then takes a single optimizer step along the
# estimated gradient.
env = gym.make(env_name)

render = False

n_iterations = 1000  # Number of ES updates (mini batches) to run
max_t = 1000  # Maximum number of environment steps per episode

sess.run(optimizer.a.assign(0.15))  # Override the optimizer step size for this run

# True while the network weights hold a perturbed copy instead of the real parameters
weights_perturbed = False

try:
    for i in range(n_iterations):
        # Snapshot the current weights and draw fresh noise for this mini batch
        sess.run(es.copy_state_ops)
        sess.run(es.new_noise_vals_ops)

        rewards_t = np.zeros([es.mini_batch_size])

        weights_perturbed = True
        for j in range(es.mini_batch_size):
            observation = env.reset()
            # Weights <- snapshot + sigma * noise[j]
            sess.run(es.set_noise_ops, feed_dict={es.current_ind : j})
            reward_t = 0
            for t in range(max_t):
                if render:
                    env.render()

                act = sess.run(action, feed_dict={X: np.expand_dims(observation,axis=0)})[0]
                observation, reward, done, info = env.step(act)
                reward_t += reward
                if done:
                    break
            # Total reward of the episode, whether it terminated or hit max_t
            rewards_t[j] = reward_t
        print('mini batch results for run %d : %f' % (i + 1,np.mean(rewards_t)))

        es.calc_update_fitness_shaping(rewards_t)
        # Restore the unperturbed weights before computing and applying the gradient
        sess.run(es.reset_state_ops)
        weights_perturbed = False
        sess.run(es.calc_gradient_ops,feed_dict={es.update_weights : es.calc_weights})
        sess.run(optimizer.update)
finally:
    # Runs on normal completion AND on interruption (e.g. KeyboardInterrupt)
    try:
        if weights_perturbed:
            # Interrupted mid batch: put the last unperturbed weights back so later
            # cells do not evaluate or save a randomly perturbed policy
            sess.run(es.reset_state_ops)
    finally:
        env.close()

# saver.save(sess, "./my_policy_net_basic.ckpt")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
mini batch results for run 1 : -107.273033
mini batch results for run 2 : -93.529358
mini batch results for run 3 : -78.803114
mini batch results for run 4 : -70.769058
mini batch results for run 5 : -73.679605
mini batch results for run 6 : -69.722070
mini batch results for run 7 : -66.959893
mini batch results for run 8 : -78.672353
mini batch results for run 9 : -84.937350
mini batch results for run 10 : -86.326880
mini batch results for run 11 : -74.901914
mini batch results for run 12 : -68.593762
mini batch results for run 13 : -65.427668
mini batch results for run 14 : -64.081339
mini batch results for run 15 : -65.264838
mini batch results for run 16 : -52.673349
mini batch results for run 17 : -65.465931
mini batch results for run 18 : -58.385733
mini batch results

KeyboardInterrupt: 

In [25]:
# saver.save(sess, "./my_policy_net_basic.ckpt")

'./my_policy_net_basic.ckpt'

In [28]:
# Roll out the current policy for one rendered episode (at most 1000 steps)
# and report the total reward
rewardsum = 0
env = gym.make(env_name)
try:
    obs = env.reset()
    for step in range(1000):
        env.render()
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
        obs, reward, done, info = env.step(action_val[0])
        rewardsum += reward
        if done:
            break
finally:
    # Close the environment (and its render window) even if the rollout is interrupted
    env.close()
print(rewardsum)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
130.21838680337558
