In [16]:
"""
A bare-bones example of optimizing a black-box function (f) using
Natural Evolution Strategies (NES), where the parameter distribution is a
Gaussian of fixed standard deviation.
"""

import numpy as np
#np.random.seed(0)

# the function we want to optimize
def f(w, target=None):
  """Reward for parameter vector `w`: negative squared L2 distance to a target.

  In a real application this function would instead:
    1) create a neural network with weights w
    2) run the neural network on the environment for some time
    3) sum up and return the total reward

  For the purposes of an example, let's minimize the L2 distance to a
  specific solution vector instead.

  Args:
    w: candidate parameter vector.
    target: vector to approach. When omitted, the module-level `solution`
      is used; it is looked up at call time, so it only has to be defined
      before the first call that leaves `target` out.

  Returns:
    -sum((target - w)**2). The highest achievable reward is 0, reached when
    w is exactly equal to the target.
  """
  if target is None:
    # fall back to the notebook-wide target so existing f(w) calls keep working
    target = solution
  reward = -np.sum(np.square(np.asarray(target) - w))
  return reward

# hyperparameters
npop = 50 # population size
sigma = 0.1 # noise standard deviation
alpha = 0.001 # learning rate
n_iters = 300 # number of optimization steps
print_every = 20 # report progress once every this many steps

# initialize the optimization
solution = np.array([0.5, 0.1, -0.3])
n_params = solution.size # dimensionality follows the target instead of being hard-coded
w = np.random.randn(n_params) # our initial guess is random

# start the optimization
for i in range(n_iters):

  # print current fitness of the most likely parameter setting
  if i % print_every == 0:
    print('iter %d. w: %s, solution: %s, reward: %f' % (i, str(w), str(solution), f(w)))

  # initialize memory for a population of w's, and their rewards
  N = np.random.randn(npop, n_params) # samples from a normal distribution N(0,1)
  R = np.zeros(npop)
  for j in range(npop):
    w_try = w + sigma*N[j] # jitter w using gaussian noise of standard deviation sigma
    R[j] = f(w_try) # evaluate the jittered version

  # standardize the rewards to zero mean and unit variance (this rescales
  # them; it does not make their distribution gaussian)
  R_std = np.std(R)
  if R_std == 0:
    # every sample scored the same, so there is no direction to move in;
    # dividing by zero here would turn w into NaN for all later iterations
    continue
  A = (R - np.mean(R)) / R_std

  # perform the parameter update. The matrix multiply below
  # is just an efficient way to sum up all the rows of the noise matrix N,
  # where each row N[j] is weighted by A[j]
  w = w + alpha/(npop*sigma) * np.dot(N.T, A)

iter 0. w: [-0.24698585  1.30580815  0.67329242], solution: [ 0.5  0.1 -0.3], reward: -2.959259
iter 20. w: [-0.16284468  1.17202958  0.5606038 ], solution: [ 0.5  0.1 -0.3], reward: -2.329249
iter 40. w: [-0.07899048  1.03274563  0.44787205], solution: [ 0.5  0.1 -0.3], reward: -1.764557
iter 60. w: [0.00931179 0.90124559 0.3425777 ], solution: [ 0.5  0.1 -0.3], reward: -1.295676
iter 80. w: [0.0968119  0.76291701 0.22995821], solution: [ 0.5  0.1 -0.3], reward: -0.882875
iter 100. w: [0.18019401 0.61013542 0.129085  ], solution: [ 0.5  0.1 -0.3], reward: -0.546628
iter 120. w: [0.26248082 0.46936003 0.012722  ], solution: [ 0.5  0.1 -0.3], reward: -0.290637
iter 140. w: [ 0.34445169  0.33505541 -0.089466  ], solution: [ 0.5  0.1 -0.3], reward: -0.123771
iter 160. w: [ 0.42114351  0.21879481 -0.20889024], solution: [ 0.5  0.1 -0.3], reward: -0.028632
iter 180. w: [ 0.46928359  0.13442792 -0.2776589 ], solution: [ 0.5  0.1 -0.3], reward: -0.002628
iter 200. w: [ 0.48535004  0.1091224  

In [18]:
 # ratio of the learning rate (alpha) to the noise standard deviation (sigma)
 alpha/sigma

0.01