# CMA-ES for CartPole TF2
### Christian Igel, 2019

If you have suggestions for improvement, [let me know](mailto:igel@diku.dk).

In [2]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import cma

# Define task
env = gym.make('CartPole-v0')
# Length of the state vector; each state is reshaped to (1, state_space_dimension) before it is fed to the policy network
state_space_dimension = 4
# Size of the policy output: a single scalar that is thresholded at 0 to choose the action
action_space_dimension = 1

First we define some helper functions to count the network weights and to convert a flat parameter vector into arrays with the shapes of the network weights:

In [3]:
def number_of_weights(weights):
    """Return the total number of scalar entries over all arrays in `weights`.

    `weights` is an iterable of objects exposing a `.size` attribute
    (e.g. the list returned by a Keras model's `get_weights()`).
    """
    return sum(w.size for w in weights)

def weights_from_vector(weights, parameter_vector):
    """Cut a flat parameter vector into arrays shaped like the entries of `weights`.

    `weights` serves only as a template for the shapes and sizes; consecutive
    slices of `parameter_vector` are reshaped to match each template array, in
    order. The total number of template entries must equal
    `parameter_vector.size`, otherwise the assertion fails.
    """
    total = number_of_weights(weights)
    assert total == parameter_vector.size, \
      'number of parameters do not match: %r vs. %r' % (total, parameter_vector.size)
    reshaped = []
    start = 0
    for template in weights:
        stop = start + template.size
        reshaped.append(parameter_vector[start:stop].reshape(template.shape))
        start = stop
    return reshaped

The task is only regarded as being solved if the network can balance the pole successfully from five different initializations.
In that case, the fitness function returns -1000.

Define the policy network:

In [4]:
# Model definition: a bias-free network with one tanh hidden layer and a linear output
no_hidden = 10
policy_net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(no_hidden, use_bias=False, activation='tanh', input_shape=(state_space_dimension,)),
    tf.keras.layers.Dense(action_space_dimension, use_bias=False, activation='linear')
])
# Dimension of the search space, i.e. the total number of network weights
d = number_of_weights(policy_net.get_weights())
# summary() prints the table itself and returns None, so wrapping it in print() would emit a stray "None"
policy_net.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                40        
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 10        
Total params: 50
Trainable params: 50
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
# Return that counts as a successful episode (CartPole-v0 caps an episode at 200 steps, reward 1 per step)
MAX_EPISODE_RETURN = 200
# Fitness reported for a solution that passed every trial; the optimizer's ftarget is set just above it
SOLVED_FITNESS = -1000


def _run_episode():
    """Run one episode with the current weights of `policy_net` and return the accumulated reward."""
    state = env.reset()
    R = 0
    while True:
        out = policy_net(state.reshape((1, state_space_dimension)))
        # The sign of the single network output selects one of the two actions
        a = int(out > 0)
        state, reward, done, _ = env.step(a)
        R += reward
        if done:
            return R


# Can we do it k times?
def fitness_cart_pole_k(x, k=4):
    """Check whether the policy given by `x` reaches the maximum return in `k` consecutive episodes.

    Args:
        x: flat parameter vector with one entry per network weight.
        k: number of episodes, each started from a fresh `env.reset()`.

    Returns:
        True if every episode reaches the maximum return, False as soon as one falls short.
    """
    # The weights do not change between episodes, so load them once instead of once per episode
    policy_net.set_weights(weights_from_vector(policy_net.get_weights(), x))
    for _ in range(k):
        if _run_episode() < MAX_EPISODE_RETURN:
            return False
    return True


def fitness_cart_pole(x):
    """Objective function for CMA-ES (to be minimized).

    Runs one episode with the policy given by the flat parameter vector `x`.
    If that episode reaches the maximum return, the policy is additionally
    tested with `fitness_cart_pole_k`; only if all of those episodes succeed
    as well is the task regarded as solved.

    Returns:
        -1000 if the task is solved, otherwise the negated return of the first episode.
    """
    policy_net.set_weights(weights_from_vector(policy_net.get_weights(), x))
    R = _run_episode()
    if R == MAX_EPISODE_RETURN and fitness_cart_pole_k(x):
        return SOLVED_FITNESS
    return -R
    

Do the learning:

In [6]:
# Initial global step-size and a small random initial search point (one entry per network weight)
initial_sigma = .01
initial_weights = np.random.normal(0, 0.01, d)

# Run CMA-ES; stop as soon as the objective drops below the target,
# which only the "solved" fitness of -1000 does
cma_options = {'ftarget': -999.9}
res = cma.fmin(fitness_cart_pole, initial_weights, initial_sigma, options=cma_options)

W1015 14:28:50.176019 140360641607360 base_layer.py:1814] Layer dense is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



(7_w,15)-aCMA-ES (mu_w=4.5,w_1=34%) in dimension 50 (seed=203755, Tue Oct 15 14:28:50 2019)
Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
    1     15 -6.300000000000000e+01 1.0e+00 9.53e-03  1e-02  1e-02 0:00.2
    2     30 -6.100000000000000e+01 1.0e+00 9.23e-03  9e-03  9e-03 0:00.5
    3     45 -2.000000000000000e+02 1.1e+00 8.93e-03  9e-03  9e-03 0:01.2
    5     75 -1.000000000000000e+03 1.1e+00 8.67e-03  9e-03  9e-03 0:05.4
termination on ftarget=-999.9 (Tue Oct 15 14:28:55 2019)
final/bestever f-value = -2.000000e+02 -1.000000e+03
incumbent solution: [-0.00984044 -0.0145872   0.00897445  0.00604748  0.00295987  0.01058524
  0.0148456   0.0166346  ...]
std deviations: [0.00859662 0.00864387 0.008681   0.00869661 0.0086245  0.0086752
 0.00862933 0.00867832 ...]


In [8]:
# Learn even more on CMA-ES
cma.CMAOptions() 
cma.fmin?