# Cross Entropy for CartPole Problem

In [7]:
# !pip install tensorboardX
# Import required packages
import gym
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)
[K     |█                               | 10kB 16.8MB/s eta 0:00:01[K     |██▏                             | 20kB 1.7MB/s eta 0:00:01[K     |███▏                            | 30kB 2.1MB/s eta 0:00:01[K     |████▎                           | 40kB 2.2MB/s eta 0:00:01[K     |█████▎                          | 51kB 2.0MB/s eta 0:00:01[K     |██████▍                         | 61kB 2.2MB/s eta 0:00:01[K     |███████▍                        | 71kB 2.4MB/s eta 0:00:01[K     |████████▌                       | 81kB 2.6MB/s eta 0:00:01[K     |█████████▌                      | 92kB 2.8MB/s eta 0:00:01[K     |██████████▋                     | 102kB 2.8MB/s eta 0:00:01[K     |███████████▊                    | 112kB 2.8MB/s eta 0:00:01[K     |████████████▊                   | 122kB 2.

## Agent: NN Model

One hidden layer NN with 128 neurons and a ReLU activation function.

We will use a batch size of 16.

For the cross-entropy method, we will use the 70th percentile of the episode rewards as the reward boundary, i.e. we keep only the top 30% of episodes for training.

The NN should produce a probability distribution over actions, so the straightforward approach would be to add a softmax nonlinearity after the last layer. We deliberately leave it out and let the network output raw scores, which improves the numerical stability of training.


The PyTorch class `nn.CrossEntropyLoss` combines both softmax and cross-entropy in a single, more numerically stable expression. `CrossEntropyLoss` requires raw, unnormalized values from the NN (also called logits). The downside is that we need to remember to apply softmax every time we want probabilities from our NN's output.

In [8]:
# Width of the network's hidden layer (passed to Net as hidden_size).
HIDDEN_SIZE = 128
# Number of episodes collected by iterate_batches before a batch is yielded.
BATCH_SIZE = 16
# Reward percentile handed to filter_batch; episodes below it are discarded.
PERCENTILE = 70

In [14]:
class Net(nn.Module):
  """Feed-forward network with a single ReLU hidden layer.

  Maps an input of ``obs_size`` features to ``n_actions`` raw scores. No
  softmax is applied, so callers that need probabilities must normalize the
  output themselves.

  Args:
    obs_size: number of input features of the first linear layer.
    hidden_size: number of units in the hidden layer.
    n_actions: number of output scores produced by the last linear layer.
  """

  def __init__(self, obs_size, hidden_size, n_actions):
    super().__init__()
    # Layers are created in input-to-output order and stored under ``net``.
    layers = [
        nn.Linear(obs_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, n_actions),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    """Return the unnormalized scores produced by the layer stack for ``x``."""
    scores = self.net(x)
    return scores

## Cross-entropy Algorithm

  1. Play N episodes using our current model and environment.
  2. Calculate the total reward for every episode and decide on a reward boundary. Usually, we use some percentile of all rewards, such as 50th or 70th.
  3. Throw away all episodes with a reward below the boundary.
  4. Train on the remaining "elite" episodes using observations as the input and issued actions as the desired output.
  5. Repeat from step 1 until we become satisfied with the result

In [11]:
# One finished episode: its total undiscounted reward and the list of
# EpisodeStep records collected while playing it.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step the agent made in an episode: the observation it saw and the
# action it chose in response.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])


def _reset_env(env):
  '''
  Reset ``env`` and return only the initial observation.

  Accepts both reset conventions: a bare observation, or an
  ``(observation, info_dict)`` pair.
  '''
  result = env.reset()
  if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
    return result[0]
  return result


def _step_env(env, action):
  '''
  Advance ``env`` by one step and return ``(next_obs, reward, is_done)``.

  Accepts both step conventions: the 4-tuple ``(obs, reward, done, info)``
  and the 5-tuple ``(obs, reward, terminated, truncated, info)``.
  '''
  result = env.step(action)
  if len(result) == 5:
    next_obs, reward, terminated, truncated, _ = result
    return next_obs, reward, bool(terminated or truncated)
  next_obs, reward, is_done, _ = result
  return next_obs, reward, is_done


def iterate_batches(env, net, batch_size):
  '''
  Generate batches with episodes.

  Plays ``env`` endlessly, sampling each action from the softmax of the
  scores returned by ``net``. Every time ``batch_size`` episodes have
  finished, the list of Episode records is yielded and a new list is started.

  Args:
    env: environment exposing ``reset()`` and ``step(action)``.
    net: callable mapping a float tensor of shape (1, obs_size) to a tensor
      of shape (1, n_actions) holding raw action scores.
    batch_size: number of finished episodes per yielded batch (>= 1).

  Yields:
    A list of ``batch_size`` Episode tuples.

  Raises:
    ValueError: if ``batch_size`` is smaller than 1 (the generator could
      never yield and would spin forever).
  '''
  if batch_size < 1:
    raise ValueError(
        "batch_size must be at least 1, got %r" % (batch_size,))

  batch = []
  episode_reward = 0.0
  episode_steps = []
  obs = _reset_env(env)
  sm = nn.Softmax(dim=1)  # turns the raw scores into action probabilities
  while True:
    # Convert through numpy first: building a tensor from a list of ndarrays
    # is slow. unsqueeze adds the batch dimension the network expects.
    obs_v = torch.tensor(np.asarray(obs), dtype=torch.float32).unsqueeze(0)
    # Playing episodes needs no gradients. The no_grad block is kept tight
    # around the forward pass and never spans the ``yield`` below, so the
    # caller's gradient mode is left untouched.
    with torch.no_grad():
      act_probs = sm(net(obs_v)).numpy()[0]
    # Random choice according to the obtained probabilities
    action = np.random.choice(len(act_probs), p=act_probs)
    next_obs, reward, is_done = _step_env(env, action)
    episode_reward += reward
    # Store the observation the action was chosen for, not the resulting one.
    episode_steps.append(EpisodeStep(observation=obs, action=action))
    if is_done:
      batch.append(Episode(reward=episode_reward, steps=episode_steps))
      episode_reward = 0.0
      episode_steps = []
      next_obs = _reset_env(env)
      if len(batch) == batch_size:
        yield batch
        batch = []
    obs = next_obs


The training of our NN and the generation of our episodes are performed at the same time.

Every time the loop in `iterate_batches` accumulates enough episodes (16), it yields the batch back to the caller, which is expected to train the NN on it using gradient descent.

In [12]:
def filter_batch(batch, percentile):
  '''
  Keep the best episodes of a batch and turn them into training tensors.

  Args:
    batch: sequence of episodes; each has a ``reward`` attribute and unpacks
      into ``(reward, steps)``, where every step has ``observation`` and
      ``action`` attributes.
    percentile: percentile (0-100) of the batch rewards used as the cut-off.

  Returns:
    A tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
    float32 tensor of the kept observations, int64 tensor of the matching
    actions, the reward cut-off and the mean reward of the whole batch.

  Raises:
    ValueError: if ``batch`` is empty, since no percentile can be computed.
  '''
  if len(batch) == 0:
    raise ValueError("filter_batch() needs at least one episode")

  rewards = [episode.reward for episode in batch]
  reward_bound = np.percentile(rewards, percentile)
  reward_mean = float(np.mean(rewards))

  train_obs = []
  train_act = []
  for reward, steps in batch:
    # Episodes strictly below the boundary are dropped; ties are kept.
    if reward < reward_bound:
      continue
    train_obs.extend(step.observation for step in steps)
    train_act.extend(step.action for step in steps)

  # Stack into one ndarray first: building a tensor straight from a list of
  # ndarrays is very slow.
  train_obs_v = torch.as_tensor(np.asarray(train_obs), dtype=torch.float32)
  train_act_v = torch.as_tensor(np.asarray(train_act), dtype=torch.int64)
  return train_obs_v, train_act_v, reward_bound, reward_mean

In [15]:
if __name__ == "__main__":
  # Training driver: play batches of episodes, keep the elite ones, fit the
  # network to their (observation -> action) pairs, and log progress until
  # the mean batch reward exceeds 199.
  env = gym.make("CartPole-v0")
  # env = gym.wrappers.Monitor(env, directory="mon", force=True)
  obs_size = env.observation_space.shape[0]
  n_actions = env.action_space.n

  net = Net(obs_size, HIDDEN_SIZE, n_actions)
  # CrossEntropyLoss takes the raw scores from the network, so no softmax
  # is applied before computing the loss.
  objective = nn.CrossEntropyLoss()
  optimizer = optim.Adam(params=net.parameters(), lr=0.01)
  writer = SummaryWriter(comment="-cartpole")

  # try/finally guarantees the writer and the environment are closed even if
  # training raises or is interrupted from the keyboard.
  try:
    for iter_no, batch in enumerate(iterate_batches(
        env, net, BATCH_SIZE)):
      obs_v, acts_v, reward_b, reward_m = \
        filter_batch(batch, PERCENTILE)
      optimizer.zero_grad()
      action_scores_v = net(obs_v)
      loss_v = objective(action_scores_v, acts_v)
      loss_v.backward()
      optimizer.step()
      print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
        iter_no, loss_v.item(), reward_m, reward_b))
      writer.add_scalar("loss", loss_v.item(), iter_no)
      writer.add_scalar("reward_bound", reward_b, iter_no)
      writer.add_scalar("reward_mean", reward_m, iter_no)
      if reward_m > 199:
        print("Solved!")
        break
  finally:
    writer.close()
    env.close()

0: loss=0.681, reward_mean=16.8, rw_bound=19.0
1: loss=0.695, reward_mean=27.9, rw_bound=30.5
2: loss=0.674, reward_mean=31.8, rw_bound=42.0
3: loss=0.659, reward_mean=30.9, rw_bound=32.0
4: loss=0.684, reward_mean=31.3, rw_bound=36.0
5: loss=0.648, reward_mean=29.6, rw_bound=27.5
6: loss=0.641, reward_mean=33.5, rw_bound=34.5
7: loss=0.639, reward_mean=32.7, rw_bound=40.5
8: loss=0.636, reward_mean=39.0, rw_bound=48.0
9: loss=0.624, reward_mean=43.6, rw_bound=51.0
10: loss=0.619, reward_mean=57.2, rw_bound=73.5
11: loss=0.603, reward_mean=48.1, rw_bound=58.5
12: loss=0.597, reward_mean=58.6, rw_bound=60.0
13: loss=0.593, reward_mean=51.1, rw_bound=51.5
14: loss=0.586, reward_mean=51.7, rw_bound=61.5
15: loss=0.581, reward_mean=52.4, rw_bound=61.0
16: loss=0.587, reward_mean=51.7, rw_bound=58.0
17: loss=0.583, reward_mean=49.6, rw_bound=55.5
18: loss=0.552, reward_mean=51.5, rw_bound=61.0
19: loss=0.551, reward_mean=56.6, rw_bound=67.0
20: loss=0.546, reward_mean=56.0, rw_bound=71.0
21