# Cross Entropy for CartPole Problem

In [1]:
# !pip install tensorboardX
# Import required packages
import gym
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

## Agent: NN Model

One hidden layer NN with 128 neurons and a ReLU activation function.

We will use a batch size of 16.

For the cross-entropy method, we will use the 70th percentile of episode rewards as the reward boundary, i.e. we keep only the top 30% of episodes for training.

The output from the NN is a probability distribution over actions, so a straightforward way to proceed would be to include a softmax nonlinearity after the last layer. We deliberately omit it in order to improve the numerical stability of training.


The PyTorch class nn.CrossEntropyLoss combines both softmax and cross-entropy in a single, more numerically stable expression. CrossEntropyLoss requires raw, unnormalized values from the NN (also called logits). The downside of this is that we need to remember to apply softmax every time we need to get probabilities from our NN's output.

In [2]:
# Hyperparameters shared by the network and the training loop
HIDDEN_SIZE = 128  # width of the single hidden layer
BATCH_SIZE = 16    # number of episodes collected per training batch
PERCENTILE = 70    # reward percentile used as the elite-episode boundary

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Bare expression so the notebook cell displays the selected device
device

device(type='cuda')

In [3]:
class Net(nn.Module):
    """Policy network: Linear -> ReLU -> Linear.

    The forward pass returns raw action scores (logits); no softmax is
    applied inside the network.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are created in the same order as they are applied
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits

## Cross-entropy Algorithm

  1. Play N number of episodes using our current model and environment.
  2. Calculate the total reward for every episode and decide on a reward boundary. Usually, we use some percentile of all rewards, such as 50th or 70th.
  3. Throw away all episodes with a reward below the boundary.
  4. Train on the remaining "elite" episodes using observations as the input and issued actions as the desired output.
  5. Repeat from step 1 until we become satisfied with the result

In [4]:
# A single finished episode: its total undiscounted reward and its list of EpisodeStep
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# One step taken by the agent: the observation it acted on and the action it chose
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])


def _reset_env(env):
    """Reset *env* and return only the initial observation.

    Handles both the classic Gym API (``reset()`` returns the observation)
    and the newer API (``reset()`` returns ``(observation, info_dict)``).
    """
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Advance *env* one step and return ``(next_obs, reward, is_done)``.

    Handles both the classic 4-tuple ``(obs, reward, done, info)`` and the
    newer 5-tuple ``(obs, reward, terminated, truncated, info)`` results.
    """
    result = env.step(action)
    if len(result) == 5:
        next_obs, reward, terminated, truncated, _ = result
        return next_obs, reward, bool(terminated or truncated)
    next_obs, reward, is_done, _ = result
    return next_obs, reward, is_done


def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy and yield them in batches.

    This is an infinite generator: it never returns on its own, the caller
    decides when to stop iterating.

    Args:
        env: Gym-style environment exposing ``reset()`` and ``step(action)``.
        net: callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of action scores (logits).
        batch_size: number of finished episodes per yielded batch.

    Yields:
        A list of ``batch_size`` ``Episode`` tuples.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = _reset_env(env)
    sm = nn.Softmax(dim=1)  # turns the logits into action probabilities
    while True:
        # Go through one NumPy array instead of a list of arrays, which
        # PyTorch converts element by element
        obs_v = torch.as_tensor(np.asarray(obs), dtype=torch.float32).unsqueeze(0)
        # Sampling needs no gradients. The no_grad block is closed before
        # any ``yield`` so the caller's autograd mode is never affected.
        with torch.no_grad():
            act_probs = sm(net(obs_v)).numpy()[0]
        # Random choice according to the obtained probabilities
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done = _step_env(env, action)
        episode_reward += reward
        # Store the observation the action was chosen for, not the resulting one
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = _reset_env(env)
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs


The training of our NN and the generation of our episodes are performed at the same time.

Every time the loop in `iterate_batches` accumulates enough episodes (16), it yields the batch back to the caller, which is expected to train the NN on it using gradient descent.

In [5]:
def filter_batch(batch, percentile):
    """Keep the elite episodes of a batch and flatten them into training tensors.

    Args:
        batch: sequence of episodes; each has a ``reward`` attribute and
            unpacks as ``(reward, steps)``, where every step has
            ``observation`` and ``action`` attributes.
        percentile: percentile (0-100) of the batch rewards used as the
            reward boundary.

    Returns:
        A tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        a float32 tensor of observations and an int64 tensor of actions from
        the kept episodes, the reward boundary, and the mean reward of the
        whole (unfiltered) batch.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for reward, steps in batch:
        # Keep only episodes whose total reward is at or above the boundary
        if reward < reward_bound:
            continue
        # Add the observations and actions to the training lists
        train_obs.extend(step.observation for step in steps)
        train_act.extend(step.action for step in steps)

    # Stack into one NumPy array first: building a tensor directly from a
    # list of ndarrays is converted element by element and is very slow
    train_obs_v = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
    train_act_v = torch.as_tensor(np.asarray(train_act, dtype=np.int64))
    return train_obs_v, train_act_v, reward_bound, reward_mean

## Main training loop

1. Initialize environment and NN agent (as well as optimizer and loss function)
2. Start a loop by creating batches using *iterate_batches*. The loop stops only when the target mean reward is reached.
3. Filter the batch so that only episodes whose reward is at or above the reward bound are used.
4. Train the NN on the batch for 1 epoch as follows (Standard backprop process):

    a) Forward pass

    b) Calculate loss
    
    c) Backpropagate loss and update weights

In [6]:
if __name__ == "__main__":
    # Training stops once the mean reward of a batch exceeds this value
    target_mean = 199
    env = gym.make("CartPole-v0")
    # NOTE: downgrade pyglet to 1.3.2 -- otherwise it breaks gym;
    # install ffmpeg as well. To record episodes, wrap the environment:
    #   env = gym.wrappers.Monitor(env, directory="mon", force=True)
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # CrossEntropyLoss takes the raw logits produced by the network
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)
    writer = SummaryWriter(comment="-cartpole")

    # try/finally so the writer and environment are released even when
    # training is interrupted or raises
    try:
        for iter_no, batch in enumerate(iterate_batches(
                env, net, BATCH_SIZE)):
            # Filter the batch to get only the episodes at or above the
            # reward boundary
            obs_v, acts_v, reward_b, reward_m = \
                filter_batch(batch, PERCENTILE)
            # Train on the filtered batch for 1 epoch
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
                iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            if reward_m > target_mean:
                print("Solved!")
                break
    finally:
        writer.close()
        env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
0: loss=0.690, reward_mean=17.6, rw_bound=20.5
1: loss=0.672, reward_mean=22.2, rw_bound=23.0
2: loss=0.662, reward_mean=28.3, rw_bound=28.0
3: loss=0.649, reward_mean=25.8, rw_bound=30.0
4: loss=0.642, reward_mean=43.3, rw_bound=37.0
5: loss=0.628, reward_mean=52.4, rw_bound=67.5
6: loss=0.618, reward_mean=64.2, rw_bound=69.5
7: loss=0.625, reward_mean=47.8, rw_bound=58.5
8: loss=0.608, reward_mean=63.0, rw_bound=65.0
9: loss=0.602, reward_mean=63.2, rw_bound=73.5
10: loss=0.582, reward_mean=62.8, rw_bound=73.5
11: loss=0.584, reward_mean=79.3, rw_bound=74.0
12: loss=0.575, reward_mean=69.8, rw_bound=84.0
13: loss=0.585, reward_mean=85.1, rw_bound=96.5
14: loss=0.582, reward_mean=90.9, rw_bound=103.0
15: loss=0.579, reward_mean=113.4, rw_bound=139.5
16: loss=0.563, reward_mean=95.8, rw_bound=101.0
17: loss=0.565, reward_mean=112.3, rw_bound=158.0
18: loss=0.550, reward_mean=109.