# Cross-Entropy Method
Let's start with several ways to characterize an RL method:
* **model-free** vs **model-based**: Does the method construct a model of the environment, or does it simply associate observations with appropriate actions?
* **policy-based** vs **value-based**: Does it learn a policy directly, or does it learn a value function and then pick the action that maximizes this value?
* **off-policy** vs **on-policy**: Can it learn from historical data collected by a previous version of the agent, by the same agent several episodes ago, or by a human (off-policy), or does it require fresh observations (on-policy)?

The *Cross-Entropy Method* is a *model-free* (does not build a model of the environment), *policy-based* (approximates the policy), *on-policy* (requires fresh data from the environment) method that, despite being quite simple, works pretty well in simple environments where episodes are expected to be short.

We represent the agent by a NN (or any other classifier) that takes observations as input and outputs class probabilities for each action (the policy). The outline of the method is as follows:
1. We let the agent play N episodes in the environment with the current model
1. Then we compute the total (discounted) reward for each episode and drop those below a reward boundary (e.g. the 70th percentile)
1. Next we train the NN on a batch of the remaining episodes with observations as inputs and corresponding actions as targets (i.e. fit the model to produce actions that lead to high rewards)
1. Go to step 1, or terminate if the policy is good enough or we ran out of time

## CartPole
Let's use the Cross-Entropy method on the CartPole environment.

In [1]:
from dataclasses import dataclass
from typing import Iterable, List, NamedTuple, Optional, Sequence, Tuple

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter

# Maximum total reward in CartPole
#  - the training loop stops once the mean batch reward reaches this value
MAX_REWARD = 200

# Hyperparameters
HIDDEN_UNITS = 128  # width of the policy net's hidden layer
BATCH_SIZE = 16  # number of episodes collected per training batch
PERCENTILE = 70  # episodes with reward below this percentile are dropped


class PolicyNet(nn.Module):
    """
    Two-layer perceptron that scores every action for a given observation.

    The forward pass returns unnormalized action scores, not probabilities;
    apply a softmax to turn them into an action distribution.
    Training is meant to go through `nn.CrossEntropyLoss`, which takes
    the raw scores directly and is numerically more stable.
    """

    def __init__(
        self,
        obs_dim: int,
        n_hidden: int,
        n_actions: int,
    ) -> None:
        super().__init__()

        # observation -> hidden layer (ReLU) -> one raw score per action
        layers = [
            nn.Linear(in_features=obs_dim, out_features=n_hidden),
            nn.ReLU(),
            nn.Linear(in_features=n_hidden, out_features=n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        action_scores = self.net(x)
        return action_scores


class TimeStep(NamedTuple):
    """A single step of an episode: what the agent observed and what it did."""

    # Observation the action was chosen for
    observation: np.ndarray
    # Index of the action the agent took
    action: int


@dataclass
class Episode:
    """Record of one episode: accumulated reward and the steps taken."""

    reward: float
    steps: List[TimeStep]

    @classmethod
    def new(cls) -> "Episode":
        """Create an episode with zero reward and no recorded steps."""
        return cls(0.0, list())

    @property
    def length(self) -> int:
        """Number of steps recorded so far."""
        step_count = len(self.steps)
        return step_count


def iterate_batches(
    env: gym.Env,
    net: PolicyNet,
    batch_size: int,
) -> Iterable[List[Episode]]:
    """
    Play episodes with the current policy forever, yielding them in batches.

    This is an infinite generator: the caller decides when to stop consuming.
    The policy net is queried on every step, so any training the caller does
    between two batches affects the episodes of the following batch.

    Assumes the classic gym API: `env.reset()` returns just the observation
    and `env.step()` returns the 4-tuple `(obs, reward, done, info)`.

    Args:
        env: environment to interact with
        net: policy network producing raw action scores
        batch_size: number of finished episodes per yielded batch (>= 1)

    Yields:
        Lists of exactly `batch_size` finished episodes.

    Raises:
        ValueError: if `batch_size` is smaller than 1 (a batch of that size
            could never fill up, so the generator would never yield).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    def sample_action(obs: np.ndarray) -> int:
        # Sampling is pure inference, so skip building the autograd graph
        with torch.no_grad():
            # Add a leading batch dimension of size 1 for the net
            obs_batch = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
            action_probs = torch.softmax(net(obs_batch), dim=1)[0].numpy()

        # Return a plain Python int rather than a NumPy scalar
        return int(np.random.choice(len(action_probs), p=action_probs))

    # Current batch and episode
    batch = []
    episode = Episode.new()

    # Initialize the environment
    obs = env.reset()

    while True:
        # Sample an action from the policy net
        #  - and apply it to the environment
        action = sample_action(obs)
        next_obs, reward, done, _ = env.step(action)

        # Add new step to current episode
        #  - the stored observation is the one the action was chosen for
        episode.reward += reward
        episode.steps.append(TimeStep(obs, action))

        if done:
            # Record old and start new episode
            batch.append(episode)
            episode = Episode.new()
            next_obs = env.reset()

            # Output new batch if current one is of full size
            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs


def filter_batch(
    batch: Sequence[Episode],
    percentile: float,
    gamma: Optional[float] = None,
) -> Tuple[List[Episode], torch.FloatTensor, torch.LongTensor, float, float]:
    """
    Keep the best episodes of a batch and turn them into training data.

    An episode is kept ("elite") if its reward is at least the given
    percentile of all rewards in the batch.

    Args:
        batch: finished episodes to choose from
        percentile: reward percentile (0-100) used as the cut-off
        gamma: optional factor in (0, 1]; when given, each episode's total
            reward is multiplied by `gamma ** episode.length` before the
            cut-off and mean are computed. `None` or a value outside (0, 1]
            leaves the rewards unscaled.

    Returns:
        A tuple of
         - the elite episodes,
         - a float tensor with the observations of all elite steps,
         - a long tensor with the corresponding actions,
         - the reward cut-off,
         - the mean reward over the whole batch (not just the elite).
    """
    if gamma is not None and 0 < gamma <= 1:
        rewards = [e.reward * (gamma ** e.length) for e in batch]
    else:
        rewards = [e.reward for e in batch]

    # Plain floats rather than NumPy scalars, as the signature promises
    reward_bound = float(np.percentile(rewards, percentile))
    reward_mean = float(np.mean(rewards))

    observations = []
    actions = []
    elite_batch = []

    for episode, total_reward in zip(batch, rewards):
        # Skip episodes that are not good enough
        if total_reward < reward_bound:
            continue

        # Collect observations and actions from the episode
        for observation, action in episode.steps:
            observations.append(observation)
            actions.append(action)

        elite_batch.append(episode)

    # Stack into a single array first: building a tensor directly from
    # a list of arrays converts element by element, which is much slower
    observations = torch.as_tensor(np.asarray(observations, dtype=np.float32))
    actions = torch.LongTensor(actions)
    return elite_batch, observations, actions, reward_bound, reward_mean

In [2]:
# Create CartPole environment
env = gym.make("CartPole-v0")

# Sizes of the net's input and output layers are dictated by the environment
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# Build the policy net
#  - the cross-entropy objective takes the raw action scores of the net
net = PolicyNet(obs_dim, HIDDEN_UNITS, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)

with SummaryWriter(comment="-cartpole") as writer:

    # Each iteration plays BATCH_SIZE episodes with the current policy
    #  - the batch iterator is endless, the loop is left via `break` below
    for i, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        # Clear gradients left over from the previous iteration
        optimizer.zero_grad()

        # Find the best episodes and collect their observations and actions
        _, obs, actions, r_bound, r_mean = filter_batch(batch, PERCENTILE)

        # Use the net to compute action scores for episodic observations
        action_scores = net(obs)

        # Compute the cross-entropy loss and gradients
        #  - We use all the episodic actions as targets
        loss = objective(action_scores, actions)
        loss.backward()

        # Make a gradient descent step
        optimizer.step()

        # Log the progress
        #  - the trailing carriage return lets the next report overwrite this one
        print(
            f"{i}: loss={loss.item():.3f}, "
            f"r_mean={r_mean:.1f}, r_bound={r_bound:.1f}",
            end="\r",
        )

        # Record current statistics
        writer.add_scalar("loss", loss.item(), i)
        writer.add_scalar("reward_bound", r_bound, i)
        writer.add_scalar("reward_mean", r_mean, i)

        # Stop once the mean batch reward reaches the maximum possible
        if r_mean >= MAX_REWARD:
            break

50: loss=0.536, r_mean=200.0, r_bound=200.0