In [5]:
"""
A bare-bones example of optimizing a black-box function (f) using
Natural Evolution Strategies (NES), where the parameter distribution is a
Gaussian with a fixed standard deviation.
"""

import numpy as np
#np.random.seed(0)

# the function we want to optimize
def f(w, target=None):
  """Reward for parameter vector `w`: negative squared L2 distance to a target.

  In a real setting this function would:
    1) create a neural network with weights w
    2) run the neural network on the environment for some time
    3) sum up and return the total reward
  For the purposes of an example we instead score `w` by how close it is
  to a specific solution vector.

  Args:
    w: candidate parameter vector (array-like).
    target: vector to compare against (array-like). When omitted, the
      module-level `solution` is looked up at call time, which preserves
      the original one-argument behaviour.

  Returns:
    -sum((target - w)**2). The highest achievable reward is 0, reached
    when `w` is exactly equal to the target.
  """
  if target is None:
    # fall back to the global; it only has to exist by the time f is called
    target = solution
  diff = np.asarray(target) - np.asarray(w)
  reward = -np.sum(np.square(diff))
  return reward

# hyperparameters
npop = 50 # population size
sigma = 0.1 # noise standard deviation
alpha = 0.001 # learning rate
n_iters = 300 # number of optimization steps
print_every = 1 # report progress every this many iterations

# initialize the optimization
solution = np.array([0.5, 0.1, -0.3])
ndim = solution.size # search-space dimension follows the solution vector
w = np.random.randn(ndim) # our initial guess is random

# start the optimization
for i in range(n_iters):

  # print current fitness of the most likely parameter setting
  if i % print_every == 0:
    print('iter %d. w: %s, solution: %s, reward: %f' % (i, str(w), str(solution), f(w)))

  # draw one noise vector per population member: samples from N(0,1)
  N = np.random.randn(npop, ndim)
  # evaluate every jittered version of w (jitter has standard deviation sigma)
  R = np.array([f(w + sigma*noise) for noise in N])

  # standardize the rewards to zero mean and unit variance
  R_std = np.std(R)
  if R_std == 0:
    # every candidate scored the same, so there is no search direction;
    # dividing by zero here would turn w into NaN for all later iterations
    continue
  A = (R - np.mean(R)) / R_std

  # perform the parameter update. The matrix multiply below
  # is just an efficient way to sum up all the rows of the noise matrix N,
  # where each row N[j] is weighted by A[j]
  w = w + alpha/(npop*sigma) * np.dot(N.T, A)

iter 0. w: [-0.27916276 -0.12851063 -0.02231567], solution: [ 0.5  0.1 -0.3], reward: -0.736420
iter 1. w: [-0.27117474 -0.12768884 -0.02717357], solution: [ 0.5  0.1 -0.3], reward: -0.720987
iter 2. w: [-0.2610864  -0.12365639 -0.03065439], solution: [ 0.5  0.1 -0.3], reward: -0.701822
iter 3. w: [-0.25102923 -0.12283951 -0.03233998], solution: [ 0.5  0.1 -0.3], reward: -0.685344
iter 4. w: [-0.24196168 -0.12277463 -0.03488263], solution: [ 0.5  0.1 -0.3], reward: -0.670423
iter 5. w: [-0.23511739 -0.11943736 -0.03772421], solution: [ 0.5  0.1 -0.3], reward: -0.657339
iter 6. w: [-0.22812307 -0.11747227 -0.04007989], solution: [ 0.5  0.1 -0.3], reward: -0.645016
iter 7. w: [-0.21985687 -0.1144399  -0.04109246], solution: [ 0.5  0.1 -0.3], reward: -0.631211
iter 8. w: [-0.21079789 -0.11302076 -0.04449796], solution: [ 0.5  0.1 -0.3], reward: -0.615893
iter 9. w: [-0.20235613 -0.11212618 -0.04956268], solution: [ 0.5  0.1 -0.3], reward: -0.601021
iter 10. w: [-0.19495199 -0.10819967 -0.

iter 132. w: [ 0.49762237  0.09429063 -0.30772073], solution: [ 0.5  0.1 -0.3], reward: -0.000098
iter 133. w: [ 0.49944239  0.09734053 -0.30885552], solution: [ 0.5  0.1 -0.3], reward: -0.000086
iter 134. w: [ 0.50188333  0.09579501 -0.30784944], solution: [ 0.5  0.1 -0.3], reward: -0.000083
iter 135. w: [ 0.5022813   0.09161767 -0.30927906], solution: [ 0.5  0.1 -0.3], reward: -0.000162
iter 136. w: [ 0.50348804  0.09263906 -0.30971572], solution: [ 0.5  0.1 -0.3], reward: -0.000161
iter 137. w: [ 0.50475925  0.09721965 -0.30913419], solution: [ 0.5  0.1 -0.3], reward: -0.000114
iter 138. w: [ 0.49934568  0.10041869 -0.30485869], solution: [ 0.5  0.1 -0.3], reward: -0.000024
iter 139. w: [ 0.49713698  0.10004025 -0.30660733], solution: [ 0.5  0.1 -0.3], reward: -0.000052
iter 140. w: [ 0.49730501  0.09896246 -0.30687631], solution: [ 0.5  0.1 -0.3], reward: -0.000056
iter 141. w: [ 0.50036884  0.100503   -0.30377844], solution: [ 0.5  0.1 -0.3], reward: -0.000015
iter 142. w: [ 0.500

iter 222. w: [ 0.50405389  0.09987829 -0.29249195], solution: [ 0.5  0.1 -0.3], reward: -0.000073
iter 223. w: [ 0.50570496  0.09812459 -0.29575804], solution: [ 0.5  0.1 -0.3], reward: -0.000054
iter 224. w: [ 0.50144419  0.0984789  -0.29683962], solution: [ 0.5  0.1 -0.3], reward: -0.000014
iter 225. w: [ 0.50404062  0.09791421 -0.29557762], solution: [ 0.5  0.1 -0.3], reward: -0.000040
iter 226. w: [ 0.49724303  0.09733416 -0.2944958 ], solution: [ 0.5  0.1 -0.3], reward: -0.000045
iter 227. w: [ 0.49339763  0.09808149 -0.29434862], solution: [ 0.5  0.1 -0.3], reward: -0.000079
iter 228. w: [ 0.49483566  0.0974987  -0.29529813], solution: [ 0.5  0.1 -0.3], reward: -0.000055
iter 229. w: [ 0.49625295  0.09994741 -0.29312448], solution: [ 0.5  0.1 -0.3], reward: -0.000061
iter 230. w: [ 0.49594094  0.10256998 -0.29467418], solution: [ 0.5  0.1 -0.3], reward: -0.000051
iter 231. w: [ 0.4934631   0.10270838 -0.29826988], solution: [ 0.5  0.1 -0.3], reward: -0.000053
iter 232. w: [ 0.492

In [18]:
 alpha/sigma  # learning rate divided by the noise standard deviation (both set in the hyperparameters above)

0.01