# Cross-Entropy Method
Let's start with several ways one can describe an RL method:
* **model-free** vs **model-based**: Does the method construct a model of the environment, or does it simply associate observations with appropriate actions?
* **policy-based** vs **value-based**: Does it learn a policy directly, or does it learn a value function and then pick the action that maximizes this value?
* **off-policy** vs **on-policy**: Can it learn from historical data (collected by a previous version of the agent, by the same agent several episodes ago, or by a human — off-policy), or does it require fresh observations (on-policy)?

The *Cross-Entropy Method* is a *model-free* (does not build a model of the environment), *policy-based* (approximates the policy), *on-policy* (requires fresh data from the environment) method that, despite being quite simple, works pretty well in simple environments where episodes are expected to be short.

We represent the agent by a NN (or any other classifier) that takes observations on the input and outputs class probabilities for each action (the policy). The outline of the method is as follows:
1. We let the agent play N episodes in the environment with current model
1. Then we compute the total (discounted) reward for each episode and drop those below a reward boundary (e.g. 70th percentile)
1. Next we train the NN on a batch of the remaining episodes with observations as inputs and corresponding actions as targets (i.e. fit the model to produce actions that lead to high rewards)
1. Go to step 1, or terminate if the policy is good enough or we ran out of time

## CartPole
Let's use the Cross-Entropy method on the CartPole environment.

In [1]:
import random
from dataclasses import dataclass
from typing import Iterable, List, NamedTuple, Sequence, Tuple

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter

# Maximum total reward in CartPole
#  - Training stops once the mean reward of a batch reaches this value
MAX_REWARD = 200

# Hyperparameters
#  - HIDDEN_UNITS: width of the hidden layer of the policy net
#  - BATCH_SIZE: number of episodes played before each training step
#  - PERCENTILE: reward percentile used as the bound for keeping episodes
HIDDEN_UNITS = 128
BATCH_SIZE = 16
PERCENTILE = 70


class PolicyNet(nn.Module):
    """
    Small feed-forward policy: observation -> one raw score per action.

    The scores are unnormalised; apply a softmax to obtain action
    probabilities. During training `nn.CrossEntropyLoss` is applied to the
    raw scores instead, which takes care of the normalisation and is
    numerically more stable.
    """

    def __init__(
        self,
        obs_dim: int,
        n_hidden: int,
        n_actions: int,
    ) -> None:
        super().__init__()

        # One hidden layer with a ReLU non-linearity
        layers = [
            nn.Linear(obs_dim, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Raw action scores, no softmax applied here
        scores = self.net(x)
        return scores


class TimeStep(NamedTuple):
    """
    One recorded step of an episode: an observation and the action taken on it.
    """

    # Observation the action was sampled for
    observation: np.ndarray
    # Index of the action that was applied to the environment
    action: int


@dataclass
class Episode:
    """
    Running record of a single episode.

    Holds the accumulated plain and discounted rewards together with
    the recorded (observation, action) steps.
    """

    # Sum of the rewards received so far
    reward: float
    # Sum of the discounted rewards received so far
    discounted_reward: float
    # Recorded steps in the order they were taken
    steps: List[TimeStep]

    @classmethod
    def new(cls) -> "Episode":
        # Fresh episode: zero rewards and its own empty step list
        return cls(0.0, 0.0, [])

    @property
    def length(self) -> int:
        # Number of steps recorded so far
        n_steps = len(self.steps)
        return n_steps


def iterate_batches(
    env: gym.Env, net: PolicyNet, batch_size: int, gamma: float = 1.0
) -> Iterable[List[Episode]]:
    """
    Play episodes with the current policy and yield them in batches.

    Actions are sampled from the softmax of the scores produced by `net`.
    Each finished episode is appended to the current batch and the
    environment is reset; once the batch holds `batch_size` episodes it is
    yielded and a new one is started. The generator never terminates on its
    own, the consumer decides when to stop iterating.

    The reward of the step taken at index t of an episode contributes
    `reward * gamma ** t` to the episode's discounted reward
    (`gamma` must satisfy 0 < gamma <= 1; with `gamma == 1` the discounted
    and the plain reward coincide).
    """
    assert 0 < gamma <= 1

    def sample_action(obs: np.ndarray) -> int:
        # Stack into a single (1, obs_dim) float32 array before handing it
        # to torch; building a tensor from a list of arrays is the slow path
        obs_batch = torch.as_tensor(np.asarray([obs], dtype=np.float32))

        # Sampling is pure inference, there is no need to record a graph
        with torch.no_grad():
            action_probs = torch.softmax(net(obs_batch), dim=1)[0].numpy()

        return int(np.random.choice(len(action_probs), p=action_probs))

    # Current batch and episode
    batch = []
    episode = Episode.new()

    # Initialize the environment
    obs = env.reset()

    while True:
        # Sample an action from the policy net
        #  - and apply it to the environment
        action = sample_action(obs)
        next_obs, reward, done, _ = env.step(action)

        # Discount the reward by the number of steps already taken
        #  - with gamma == 1 the factor is exactly 1
        discounted_reward = reward * gamma ** episode.length

        # Add new step to current episode
        #  - the stored observation is the one the action was sampled for
        episode.reward += reward
        episode.discounted_reward += discounted_reward
        episode.steps.append(TimeStep(obs, action))

        if done:
            # Record old and start new episode
            batch.append(episode)
            episode = Episode.new()
            next_obs = env.reset()

            # Output new batch if current one is of full size
            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs


def filter_batch(
    batch: Sequence["Episode"],
    percentile: float,
) -> Tuple[List["Episode"], torch.FloatTensor, torch.LongTensor, float, float]:
    """
    Keep the episodes whose discounted reward reaches the percentile bound.

    The bound is the `percentile`-th percentile of the discounted rewards in
    `batch`; an episode is kept when its discounted reward is greater than
    or equal to it, so episodes tied with the bound are kept as well.

    Returns a tuple of
      - the kept ("elite") episodes, in their original order,
      - a float32 tensor with the observations of all their steps,
      - an int64 tensor with the corresponding actions,
      - the reward bound,
      - the mean of the plain (undiscounted) rewards over the whole batch,
        intended for monitoring.
    """
    # Collect discounted rewards and compute new reward bound
    #  - Also compute the mean for monitoring
    discounted_rewards = [e.discounted_reward for e in batch]
    reward_bound = float(np.percentile(discounted_rewards, percentile))
    reward_mean = float(np.mean([e.reward for e in batch]))

    # Episodes that are good enough
    elite_batch = [
        episode
        for episode, reward in zip(batch, discounted_rewards)
        if reward >= reward_bound
    ]

    # Flatten the steps of all elite episodes
    steps = [step for episode in elite_batch for step in episode.steps]

    # Stack into single numpy arrays first; creating a tensor directly from
    # a list of arrays is slow and makes torch emit a warning
    observations = torch.as_tensor(
        np.asarray([observation for observation, _ in steps], dtype=np.float32)
    )
    actions = torch.as_tensor(
        np.asarray([action for _, action in steps], dtype=np.int64)
    )
    return elite_batch, observations, actions, reward_bound, reward_mean

In [2]:
# Create CartPole environment
env = gym.make("CartPole-v0")

# Sizes of the network input and output are taken from the environment
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# Build the policy net
#  - The net outputs raw action scores, the cross-entropy loss is applied to them
net = PolicyNet(obs_dim, HIDDEN_UNITS, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)

with SummaryWriter(comment="-cartpole") as writer:

    # Each iteration plays BATCH_SIZE episodes and makes one training step
    for i, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        optimizer.zero_grad()

        # Find the best episodes and collect their observations and actions
        #  - The list of kept episodes itself is not needed here
        _, obs, actions, r_bound, r_mean = filter_batch(batch, PERCENTILE)

        # Use the net to compute action scores for episodic observations
        action_scores = net(obs)

        # Compute the cross-entropy loss and gradients
        #  - We use all the episodic actions as targets
        loss = objective(action_scores, actions)
        loss.backward()

        # Make a gradient descent step
        optimizer.step()

        # Log the progress
        #  - The carriage return keeps the progress on a single line
        print(
            f"{i}: loss={loss.item():.3f}, "
            f"r_mean={r_mean:.1f}, r_bound={r_bound:.1f}",
            end="\r",
        )

        # Record current statistics
        writer.add_scalar("loss", loss.item(), i)
        writer.add_scalar("reward_bound", r_bound, i)
        writer.add_scalar("reward_mean", r_mean, i)

        # Stop if we can expect maximum possible using this policy
        #  - i.e. the mean reward of the batch reached MAX_REWARD
        if r_mean >= MAX_REWARD:
            break

42: loss=0.526, r_mean=200.0, r_bound=200.0

## FrozenLake

### Naive Version

In [3]:
# Mean batch reward at which the training below stops
#  - Rebinds the value used for CartPole above
MAX_REWARD = 0.8
# Training loop index at which the training below gives up
MAX_ITERS = 1000


class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """
    Converts each discrete observation to a 1-hot encoded vector.

    The wrapped environment must have a `Discrete` observation space with
    `n` values; the wrapper exposes a `Box` space of shape `(n,)` with
    float32 entries in [0, 1] instead.
    """

    def __init__(self, env: gym.Env) -> None:
        super().__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(
            low=0.0,
            high=1.0,
            shape=(env.observation_space.n,),
            dtype=np.float32,
        )

    def observation(self, observation: int) -> np.ndarray:
        # The incoming observation is the index of the discrete state;
        # start from explicit zeros and set just that position
        one_hot = np.zeros(
            self.observation_space.shape,
            dtype=self.observation_space.dtype,
        )
        one_hot[observation] = 1.0
        return one_hot


# Create FrozenLake environment
#  - Discrete observations are converted to 1-hot vectors for the net
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))

# Sizes of the network input and output are taken from the environment
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# Build the policy net
#  - Same setup as for CartPole
net = PolicyNet(obs_dim, HIDDEN_UNITS, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)

with SummaryWriter(comment="-frozenlake-naive") as writer:

    # Each iteration plays BATCH_SIZE episodes and makes one training step
    for i, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        optimizer.zero_grad()

        # Find the best episodes and collect their observations and actions
        #  - The list of kept episodes itself is not needed here
        _, obs, actions, r_bound, r_mean = filter_batch(batch, PERCENTILE)

        # Use the net to compute action scores for episodic observations
        action_scores = net(obs)

        # Compute the cross-entropy loss and gradients
        #  - We use all the episodic actions as targets
        loss = objective(action_scores, actions)
        loss.backward()

        # Make a gradient descent step
        optimizer.step()

        # Log the progress
        #  - The carriage return keeps the progress on a single line
        print(
            f"{i}: loss={loss.item():.3f}, "
            f"r_mean={r_mean:.1f}, r_bound={r_bound:.1f}",
            end="\r",
        )

        # Record current statistics
        writer.add_scalar("loss", loss.item(), i)
        writer.add_scalar("reward_bound", r_bound, i)
        writer.add_scalar("reward_mean", r_mean, i)

        # Stop if we can expect maximum possible using this policy
        #  - or give up once the iteration limit is reached
        #  - NOTE(review): `i` starts at 0 and is checked after the update,
        #    so this allows MAX_ITERS + 1 training steps - confirm intended
        if r_mean >= MAX_REWARD or i == MAX_ITERS:
            break

1000: loss=0.001, r_mean=0.0, r_bound=0.0

### Improved Version
There are several reasons why the training failed on FrozenLake (the mean reward didn't improve at all). All of them point to disadvantages of the Cross-Entropy method:
* FrozenLake has just two rewards - 0 for failure and 1 after reaching the goal state so the episodic reward distribution is far from normal.
* Moreover, because there is no diversity in reward values, our reward boundary does not work (it can easily happen that most episodes end with reward 0)
* Furthermore, the reward is very delayed (credit assignment problem)

What we can do to improve the method is the following:
1. Discount future rewards in the episode (in this case basically just the last one)
1. Keep a small buffer of historical elite episodes and use them for next filtering
1. Increase batch size and decrease the learning rate
1. Longer learning time

In [4]:
# Set RNG state
#  - Actions are sampled with `np.random.choice` and the network weights are
#    initialised by torch, so those generators have to be seeded as well;
#    seeding only Python's `random` does not make the run repeatable
SEED = 12345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hyperparameters
#  - Larger batch, lower percentile and learning rate than the naive version,
#    discounted rewards and a bounded history of elite episodes
MAX_REWARD = 0.8
MAX_ITERS = 10_000
BATCH_SIZE = 100
PERCENTILE = 30
GAMMA = 0.9
LEARNING_RATE = 0.001
HISTORY_SIZE = 500

# Create FrozenLake environment
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))

# Seed the environment's own generator too
#  - legacy gym API, matching the 4-tuple `env.step` used by iterate_batches
env.seed(SEED)

obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# Build the policy net
net = PolicyNet(obs_dim, HIDDEN_UNITS, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)

with SummaryWriter(comment="-frozenlake-improved") as writer:

    # Elite episodes carried over from previous iterations
    elite = []

    i = 0
    for batch in iterate_batches(env, net, BATCH_SIZE, GAMMA):
        optimizer.zero_grad()

        # Monitor the mean reward of the freshly played episodes only
        #  - the mean over the extended batch would be skewed by the
        #    retained elite episodes and could trigger the stop criterion
        #    without the current policy actually being that good
        r_mean = float(np.mean([episode.reward for episode in batch]))

        # Extend the batch with the history of elite episodes
        batch = elite + batch

        # Find the best episodes and collect their observations and actions
        elite, obs, actions, r_bound, _ = filter_batch(batch, PERCENTILE)

        if not elite:
            # Do not apply (and count) this learning step
            continue

        # Limit the history of elite episodes
        #  - keeps the most recent HISTORY_SIZE of them
        elite = elite[-HISTORY_SIZE:]

        # Use the net to compute action scores for episodic observations
        action_scores = net(obs)

        # Compute the cross-entropy loss and gradients
        #  - We use all the episodic actions as targets
        loss = objective(action_scores, actions)
        loss.backward()

        # Make a gradient descent step
        optimizer.step()

        # Log the progress
        print(
            f"{i}: loss={loss.item():.3f}, "
            f"r_mean={r_mean:.1f}, r_bound={r_bound:.1f}",
            end="\r",
        )

        # Record current statistics
        writer.add_scalar("loss", loss.item(), i)
        writer.add_scalar("reward_bound", r_bound, i)
        writer.add_scalar("reward_mean", r_mean, i)

        # Stop if we can expect maximum possible using this policy
        if r_mean >= MAX_REWARD or i == MAX_ITERS:
            break

        # Increment learning step only after model update
        i += 1

10000: loss=0.339, r_mean=0.0, r_bound=0.0

Still not great; this only shows that we might have better luck with other methods...