# Cross-Entropy Method

---

In this notebook, we will train an agent with the Cross-Entropy Method in OpenAI Gym's MountainCarContinuous environment.

### 1. Import the Necessary Packages

In [1]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

!python -m pip install pyvirtualdisplay
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

Collecting pyvirtualdisplay
  Downloading https://files.pythonhosted.org/packages/39/37/f285403a09cc261c56b6574baace1bdcf4b8c7428c8a7239cbba137bc0eb/PyVirtualDisplay-0.2.1.tar.gz
Collecting EasyProcess (from pyvirtualdisplay)
  Downloading https://files.pythonhosted.org/packages/45/3a/4eecc0c7995a13a64739bbedc0d3691fc574245b7e79cff81905aa0c2b38/EasyProcess-0.2.5.tar.gz
Building wheels for collected packages: pyvirtualdisplay, EasyProcess
  Running setup.py bdist_wheel for pyvirtualdisplay ... [?25ldone
[?25h  Stored in directory: /Users/cab/Library/Caches/pip/wheels/d1/8c/16/1c64227974ae29c687e4cc30fd691d5c0fd40f54446dde99da
  Running setup.py bdist_wheel for EasyProcess ... [?25ldone
[?25h  Stored in directory: /Users/cab/Library/Caches/pip/wheels/41/22/19/af15ef6264c58b625a82641ed7483ad05e258fbd8925505227
Successfully built pyvirtualdisplay EasyProcess
Installing collected packages: EasyProcess, pyvirtualdisplay
Successfully installed EasyProcess-0.2.5 pyvirtualdisplay-0.2.1


### 2. Instantiate the Environment and Agent

In [2]:
# Use the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Create the environment, then seed it and NumPy's global generator.
env = gym.make('MountainCarContinuous-v0')
env.seed(101)
np.random.seed(101)

# Report the observation/action spaces and the action bounds.
for label, value in (
    ('observation space:', env.observation_space),
    ('action space:', env.action_space),
    ('  - low:', env.action_space.low),
    ('  - high:', env.action_space.high),
):
    print(label, value)

class Agent(nn.Module):
    """Two-layer policy network whose parameters are loaded from a flat vector.

    An observation passes through one ReLU hidden layer and a tanh output, so
    each action component lies in [-1, 1].

    Params
    ======
        env: environment object; must expose `observation_space.shape`,
            `action_space.shape`, `reset()` and `step(action)`
        h_size (int): number of hidden units
    """

    def __init__(self, env, h_size=16):
        super(Agent, self).__init__()
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)

    def set_weights(self, weights):
        """Copy a flat weight vector into the two linear layers.

        The vector is consumed in order: fc1 weight, fc1 bias, fc2 weight,
        fc2 bias. Each chunk fills its parameter in row-major order.

        Params
        ======
            weights (array-like): `get_weights_dim()` values

        Raises
        ======
            ValueError: if `weights` does not hold exactly `get_weights_dim()` values
        """
        flat = np.asarray(weights).reshape(-1)
        expected = self.get_weights_dim()
        if flat.size != expected:
            raise ValueError(
                'expected {} weights, got {}'.format(expected, flat.size))
        offset = 0
        for param in (self.fc1.weight, self.fc1.bias, self.fc2.weight, self.fc2.bias):
            count = param.numel()
            chunk = torch.from_numpy(flat[offset:offset + count])
            # copy_ converts dtype and device to match the parameter
            param.data.copy_(chunk.reshape(param.shape))
            offset += count

    def get_weights_dim(self):
        """Return the total number of scalars in both layers (weights + biases)."""
        return (self.s_size+1)*self.h_size + (self.h_size+1)*self.a_size

    def forward(self, x):
        """Map an observation tensor to an action; returns a detached CPU tensor."""
        x = F.relu(self.fc1(x))
        # torch.tanh is the same function as F.tanh, which older PyTorch
        # releases flag as deprecated
        x = torch.tanh(self.fc2(x))
        return x.cpu().detach()

    def evaluate(self, weights, gamma=1.0, max_t=5000):
        """Load `weights`, run one episode in `self.env`, return the discounted return.

        Params
        ======
            weights (array-like): flat weight vector, see `set_weights`
            gamma (float): discount rate applied per timestep
            max_t (int): maximum number of timesteps before the rollout stops
        """
        self.set_weights(weights)
        # Use the device the parameters live on instead of a notebook-level
        # global, so the module works wherever it has been moved.
        param_device = self.fc1.weight.device
        episode_return = 0.0
        state = self.env.reset()
        # No gradients are needed: the weights are set directly, never trained
        # by backpropagation.
        with torch.no_grad():
            for t in range(max_t):
                state = torch.from_numpy(state).float().to(param_device)
                action = self.forward(state)
                state, reward, done, _ = self.env.step(action)
                episode_return += reward * math.pow(gamma, t)
                if done:
                    break
        return episode_return
    
# build the policy network for this environment and move its parameters to the selected device
agent = Agent(env).to(device)

observation space: Box(2,)
action space: Box(1,)
  - low: [-1.]
  - high: [1.]


In [3]:
# total number of scalar parameters (weights + biases) across the agent's two linear layers
agent.get_weights_dim()

65

In [6]:
# scratch: ten samples from the standard normal distribution (advances the global NumPy RNG)
np.random.randn(10)

array([-0.03116048,  1.93993231, -1.00518692, -0.7417897 ,  0.18712452,
       -0.73284515, -1.3829201 ,  1.4824955 ,  0.96145816, -2.14121229])

In [21]:
# Scratch cell: same values as the cem() defaults, used to inspect one sampled candidate.
n_iterations=500
max_t=1000
gamma=1.0
print_every=10
pop_size=50
elite_frac=0.2
sigma=0.5
# initial mean weight vector: standard-normal noise scaled by sigma
best_weight = sigma*np.random.randn(agent.get_weights_dim()) 
# build pop_size candidates, each equal to best_weight plus independent Gaussian noise
# with standard deviation sigma (randn is standard normal, so the noise is unbounded),
# and display only the first candidate
[best_weight + (sigma*np.random.randn(agent.get_weights_dim())) for i in range(pop_size)][0]



array([-4.39180307e-01, -1.11373437e-02, -9.83940604e-01,  1.66577328e+00,
        1.47737193e+00, -4.33266121e-01, -1.48367919e-01, -3.77272719e-02,
        8.82153021e-01, -6.68804671e-01, -2.87663462e-01,  7.75706230e-01,
        7.16072012e-01, -6.60248240e-01,  1.28844455e+00,  2.07581368e-03,
        5.23357993e-01,  9.84152338e-02, -1.47121203e+00,  3.71045092e-01,
       -9.42653760e-01,  1.52176296e-02,  6.00632076e-01,  1.09786701e+00,
        6.15448644e-01,  1.03160602e+00, -6.31029911e-01,  1.31209094e+00,
       -1.85295573e+00,  9.84638761e-01,  3.33260608e-01,  6.77112509e-01,
       -7.46962684e-01,  1.34293938e-01, -5.81316961e-01,  9.56347785e-02,
        1.80477868e-01, -3.65326936e-01, -1.14796179e+00, -1.89336535e-01,
        7.12000046e-01,  1.26975461e-01, -7.23269725e-02, -7.75567682e-01,
       -1.18760828e-01, -5.52733685e-01,  1.39891197e+00,  6.12331762e-01,
        3.37473821e-01,  2.31146669e+00,  8.41545764e-01,  1.35225508e-01,
       -1.04884499e-01, -

In [28]:
# scratch: 50 uniform samples in [0, 1)
rs = np.random.rand(50)

# indices that would sort rs in ascending order (smallest value first)
rs.argsort()

array([47, 27, 37, 26,  5, 33, 15, 13,  8, 10, 41, 17, 38, 48, 30,  9, 18,
       23,  1, 40,  6, 46, 11, 45, 32,  0, 36, 29, 24, 16, 25, 49, 14, 43,
        3, 39, 31,  7,  2, 34, 20, 21, 35, 28, 44, 22, 42, 19,  4, 12])

In [31]:
# Show the four smallest values of rs followed by its largest value.
# The positions come from argsort rather than indices copied from an earlier
# output, so the cell stays correct when rs is re-drawn.
order = rs.argsort()
rs[np.concatenate([order[:4], order[-1:]])]

array([2.72898504e-04, 6.08727689e-03, 8.79484353e-02, 9.08028300e-02,
       9.79996895e-01])

In [26]:
# IPython help lookup for np.argsort (development aid; produces no result used later)
?np.argsort

### 3. Train the Agent with a Cross-Entropy Method

Run the code cell below to train the agent from scratch.  Alternatively, you can skip to the next code cell to load the pre-trained weights from file.

In [None]:
def cem(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5,
        policy=None, checkpoint_path='checkpoint.pth'):
    """PyTorch implementation of a cross-entropy method.

    Each iteration samples `pop_size` candidate weight vectors around the
    current mean, evaluates each for one episode, and replaces the mean with
    the average of the best-scoring candidates.

    Params
    ======
        n_iterations (int): maximum number of training iterations
        max_t (int): maximum number of timesteps per episode
        gamma (float): discount rate used when ranking candidates
        print_every (int): how often to print the average score (over the last 100 iterations)
        pop_size (int): size of population at each iteration
        elite_frac (float): fraction of top performers to use in the update
        sigma (float): standard deviation of additive noise
        policy: object providing `get_weights_dim()`, `evaluate(weights, gamma, max_t)`
            and `state_dict()`; defaults to the notebook-level `agent`
        checkpoint_path (str): file the policy's state dict is saved to after every iteration

    Returns
    =======
        list: undiscounted score of the mean weight vector after each iteration
    """
    # Fall back to the notebook-level agent so that a plain `cem()` call keeps working.
    policy = agent if policy is None else policy
    weights_dim = policy.get_weights_dim()
    # Keep at least one elite: with n_elite == 0 the slice `[-0:]` below would
    # silently select the whole population.
    n_elite = max(1, int(pop_size*elite_frac))

    scores_deque = deque(maxlen=100)
    scores = []
    best_weight = sigma*np.random.randn(weights_dim)

    for i_iteration in range(1, n_iterations+1):
        # Sample the population: the current mean plus independent Gaussian noise.
        # randn draws from a standard normal, so each perturbation has standard
        # deviation sigma and is unbounded (it is not uniform in (-sigma, sigma)).
        weights_pop = [best_weight + (sigma*np.random.randn(weights_dim)) for _ in range(pop_size)]
        # Run one episode per candidate and collect its discounted return.
        rewards = np.array([policy.evaluate(weights, gamma, max_t) for weights in weights_pop])

        # argsort is ascending, so the last n_elite indices are the best candidates.
        elite_idxs = rewards.argsort()[-n_elite:]
        elite_weights = [weights_pop[i] for i in elite_idxs]
        # The elite average becomes the mean for the next iteration.
        best_weight = np.array(elite_weights).mean(axis=0)

        # Score the new mean with the undiscounted return (gamma=1.0), using the
        # same episode length limit as the candidates.
        reward = policy.evaluate(best_weight, gamma=1.0, max_t=max_t)
        scores_deque.append(reward)
        scores.append(reward)

        # The policy currently holds best_weight (set by the evaluate call above).
        torch.save(policy.state_dict(), checkpoint_path)

        if i_iteration % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque)))

        if np.mean(scores_deque)>=90.0:
            # Report the iterations that preceded the averaging window. Subtracting
            # the window's actual length (not a fixed 100) keeps the count
            # non-negative when training stops before the window is full.
            print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration-len(scores_deque), np.mean(scores_deque)))
            break
    return scores

# Train with the default hyperparameters; keep the score recorded at each iteration.
scores = cem()

# Plot the recorded scores against their 1-based position in the run.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores)+1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

Episode 10	Average Score: -1.44
Episode 20	Average Score: -3.98
Episode 30	Average Score: -4.18
Episode 40	Average Score: 2.57
Episode 50	Average Score: 18.74


### 4. Watch a Smart Agent!

In the next code cell, you will load the trained weights from file to watch a smart agent!

In [None]:
# load the weights from file; map_location remaps the stored tensors onto the
# current device, so a checkpoint saved on a GPU also loads on a CPU-only machine
agent.load_state_dict(torch.load('checkpoint.pth', map_location=device))

state = env.reset()
img = plt.imshow(env.render(mode='rgb_array'))
# hide the axes once; the setting persists while only the image data is updated
plt.axis('off')
try:
    while True:
        state = torch.from_numpy(state).float().to(device)
        with torch.no_grad():
            action = agent(state)
        # redraw the current frame in place of the previous output
        img.set_data(env.render(mode='rgb_array'))
        display.display(plt.gcf())
        display.clear_output(wait=True)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break
finally:
    # release the environment even if the loop is interrupted or raises
    env.close()