# Exercises

Source: http://web.archive.org/web/20160830014637/https://gym.openai.com/docs/rl#id16

### Algorithm 1: Cross Entropy Method

>Initialize $\mu \in \mathbb{R}^d, \sigma \in \mathbb{R}^d$<br/>
>For iteration = 1, 2, ...<br/>
>
>&nbsp;&nbsp;&nbsp;&nbsp;Collect $n$ samples of $\theta_i \sim N(\mu, diag(\sigma))$<br/>
>&nbsp;&nbsp;&nbsp;&nbsp;Perform a noisy evaluation $f(\theta_i, \zeta_i)$ on each one<br/>
>&nbsp;&nbsp;&nbsp;&nbsp;Select the top $p\%$ of samples (e.g. $p = 20$), which we'll call the "elite set"<br/>
>&nbsp;&nbsp;&nbsp;&nbsp;Fit a Gaussian distribution, with diagonal covariance, to the elite set, obtaining a new $\mu$, $\sigma$.
>
>Return the final $\mu$

In the RL setting, we evaluate $f(\theta_i, \zeta_i)$ by executing the policy parameterized by $\theta_i$ for one or more episodes, and computing the total return.

## 2.1 Exercises

### 1. Apply the cross-entropy method to the CartPole environment

In [11]:
class EvalParams:
    def __init__(self, n_samples=100, elite_percent=0.2, iterations=50, noise_factor=0.1, seed: int | None = None):
        self.n_samples = n_samples
        self.elite_percent = elite_percent
        self.iterations = iterations
        self.noise_factor = noise_factor
        self.seed = seed

In [12]:
import numpy as np
import typing

def cross_entropy_method(dimension: int, evaluator: typing.Callable[[np.ndarray, float], float], params=None) -> np.ndarray:
    """Maximise a noisy objective with the cross-entropy method (CEM).

    Each iteration draws ``n_samples`` parameter vectors from an
    axis-aligned Gaussian, scores them with ``evaluator``, keeps the
    best-scoring fraction (the "elite set") and refits the Gaussian's mean
    and standard deviation to that set. Progress statistics are printed
    after every iteration, followed by a noise-free evaluation of a batch
    drawn from the final distribution.

    Args:
        dimension: Length of the parameter vector being optimised.
        evaluator: Called as ``evaluator(theta, noise_factor)``; must return
            a score where larger is better.
        params: Object exposing ``n_samples``, ``elite_percent``,
            ``iterations``, ``noise_factor`` and ``seed``. ``None`` (the
            default) means a fresh ``EvalParams()``.

    Returns:
        The mean of the Gaussian after the last iteration (all zeros when
        ``iterations`` is 0).
    """
    # Build the default lazily instead of sharing one instance created at
    # function-definition time.
    if params is None:
        params = EvalParams()

    n_samples = params.n_samples
    elite_percent = params.elite_percent
    iterations = params.iterations
    noise_factor = params.noise_factor
    seed = params.seed

    # Compare against None: a plain truthiness test would ignore seed=0.
    if seed is not None:
        np.random.seed(seed)

    # Step 1: Initialize μ and σ
    mu = np.zeros(dimension)  # Initial mean
    sigma = np.ones(dimension)  # Initial per-dimension standard deviation

    # Keep at least one elite: with n_elite == 0 the slice [-0:] below would
    # select every sample instead of none.
    n_elite = max(1, int(elite_percent * n_samples))

    for iteration in range(iterations):
        # Step 2: Collect n samples of θ_i ∼ N(μ, diag(σ²)).
        # σ is refitted with np.std below, i.e. it is a standard deviation,
        # so it is used as the scale rather than as a covariance diagonal.
        samples = np.random.normal(mu, sigma, size=(n_samples, dimension))

        # Step 3: Perform a noisy evaluation f(θ_i, ζ_i) on each one
        evaluations = np.array([evaluator(theta, noise_factor) for theta in samples])

        # Step 4: Select the top p% of samples (elite set)
        elite_indices = evaluations.argsort()[-n_elite:]  # argsort is ascending
        elite_samples = samples[elite_indices]

        # Step 5: Fit a new Gaussian distribution to the elite set (new μ, σ)
        mu = np.mean(elite_samples, axis=0)
        sigma = np.std(elite_samples, axis=0)

        best_evaluation = np.max(evaluations)
        worst_evaluation = np.min(evaluations)
        mean_evaluation = np.mean(evaluations)
        std_evaluation = np.std(evaluations)

        # Print progress
        iter_str = f"Evaluation at iteration {iteration + 1}"
        best_str = f"Best = {best_evaluation}"
        worst_str = f"Worst = {worst_evaluation}"
        mean_str = f"Mean = {mean_evaluation}"
        std_str = f"Std = {std_evaluation}"
        print(f"> {iter_str}: {best_str} | {worst_str} | {mean_str} | {std_str}")

    # Final evaluation: score a fresh batch drawn from the *final* (μ, σ)
    # without noise. Re-sampling makes the report describe the returned
    # distribution and also works when no iteration was run.
    samples = np.random.normal(mu, sigma, size=(n_samples, dimension))
    evaluations = np.array([evaluator(theta, 0) for theta in samples])

    best_str = f"Best = {np.max(evaluations)}"
    worst_str = f"Worst = {np.min(evaluations)}"
    mean_str = f"Mean = {np.mean(evaluations)}"
    std_str = f"Std = {np.std(evaluations)}"
    print(f"Final evaluation (no noise): {best_str} | {worst_str} | {mean_str} | {std_str}")

    return mu

In [None]:
# Example usage

# Noisy evaluation function (replace this with your real function)
def evaluate(theta, noise_factor=0.1):
    """Toy objective: negated sum of squares of ``theta`` plus Gaussian noise.

    One standard-normal draw from NumPy's global RNG is scaled by
    ``noise_factor`` and added to the score.
    """
    sum_of_squares = np.sum(theta ** 2)
    jitter = noise_factor * np.random.randn()
    return jitter - sum_of_squares

# Run CEM on the toy objective above. That objective is largest at the zero
# vector, so the printed mean should end up close to zero.
dimension = 5  # Dimensionality of the problem
# seed=42 is handed to np.random.seed inside cross_entropy_method, which also
# fixes the noise drawn by evaluate() through the same global RNG.
final_mu = cross_entropy_method(dimension, evaluate, params=EvalParams(seed=42))
print("Final μ:", final_mu)

In [14]:
from gymnasium import Env

class EvalParams:
    def __init__(self, n_samples=100, elite_percent=0.2, iterations=50, noise_factor=0.1, seed: int | None = None):
        self.n_samples = n_samples
        self.elite_percent = elite_percent
        self.iterations = iterations
        self.noise_factor = noise_factor
        self.seed = seed

def evaluate_cem(
    env: Env,
    action_selector: typing.Callable[[typing.Any, np.ndarray], typing.Any],
    params=None,
):
    """Optimise a policy for ``env`` with the cross-entropy method.

    A candidate parameter vector ``theta`` is scored by playing one full
    episode in which every action is ``action_selector(state, theta)``. The
    episode's total reward is then perturbed multiplicatively by Gaussian
    noise scaled with ``noise_factor``.

    Args:
        env: Environment to roll out in. It is not closed here; the caller
            keeps ownership.
        action_selector: Maps ``(state, theta)`` to an action accepted by
            ``env.step``.
        params: CEM settings; ``None`` (the default) means ``EvalParams()``.

    Returns:
        The final mean parameter vector found by ``cross_entropy_method``
        (it is also printed).
    """
    # Build the default lazily instead of sharing one instance created at
    # function-definition time.
    if params is None:
        params = EvalParams()

    # Seed the environment's own RNG once so that a fixed params.seed makes
    # the whole run repeatable; the later un-seeded resets continue from
    # this generator.
    if params.seed is not None:
        env.reset(seed=params.seed)

    def evaluate_episode(theta: np.ndarray, noise_factor: float):
        total_reward = 0
        state, _ = env.reset()
        done = False

        while not done:
            action = action_selector(state, theta)
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward

        # Add noise to the total reward (multiplicative, so noise_factor=0
        # returns the exact episode return)
        noise = np.random.randn() * noise_factor
        return total_reward * (1 + noise)

    # One policy weight per observation component.
    dimension = env.observation_space.shape[0]  # Dimensionality of the problem
    final_mu = cross_entropy_method(dimension, evaluate_episode, params=params)
    print("Final μ:", final_mu)
    return final_mu

In [None]:
import gymnasium as gym

def evaluate_cem_cartpole(params=None):
    """Run the cross-entropy method on ``CartPole-v1`` with a linear policy.

    The policy picks action 1 when the dot product of ``theta`` and the
    state is non-negative, and action 0 otherwise.

    Args:
        params: CEM settings; ``None`` (the default) means ``EvalParams()``.

    Returns:
        Whatever ``evaluate_cem`` returns for this environment.
    """
    if params is None:
        params = EvalParams()

    env = gym.make("CartPole-v1")

    def action_selector(state, theta):
        return 0 if np.dot(theta, state) < 0 else 1

    try:
        return evaluate_cem(env=env, action_selector=action_selector, params=params)
    finally:
        # Release the environment even if the optimisation raises.
        env.close()

# Run CEM on CartPole with a fixed seed so the NumPy-driven sampling and
# evaluation noise are repeatable.
evaluate_cem_cartpole(EvalParams(seed=42))

### 2. (Practice *) Apply it to the Swimmer environment, which has a continuous action space. Try artificially increasing the variance and gradually lowering this noise to zero.

In [16]:
def is_windows():
    """Return True when ``platform.system()`` reports "Windows"."""
    from platform import system

    host_os = system()
    return host_os == "Windows"

def evaluate_cem_swimmer(params=None):
    """Run the cross-entropy method on the Swimmer environment.

    The policy is linear: the dot product of ``theta`` and the state is
    clipped to [-1, 1] and that single value is applied to every component
    of the action. Broadcasting is required because Swimmer's action space
    is a multi-element Box, so a bare scalar is not a valid action.

    Args:
        params: CEM settings; ``None`` (the default) means ``EvalParams()``.

    Returns:
        Whatever ``evaluate_cem`` returns for this environment.
    """
    if params is None:
        params = EvalParams()

    # NOTE(review): "Swimmer-v3" is an older revision of the environment;
    # confirm that the installed Gymnasium still registers it.
    env = gym.make("Swimmer-v3")
    action_shape = env.action_space.shape
    action_dtype = env.action_space.dtype

    def action_selector(state, theta):
        torque = np.clip(np.dot(theta, state), -1, 1)
        # theta has one weight per observation component, so it yields a
        # single number; repeat it for every actuator.
        return np.full(action_shape, torque, dtype=action_dtype)

    try:
        return evaluate_cem(env=env, action_selector=action_selector, params=params)
    finally:
        # Release the simulator even if the optimisation raises.
        env.close()

# Skipped on Windows — presumably because the Swimmer simulator is not
# available there; TODO confirm.
if not is_windows():
    # Run CEM on Swimmer with a fixed seed.
    evaluate_cem_swimmer(EvalParams(seed=42))

## 3.5 Exercises

### 1. Implement a policy gradient algorithm and apply it to the CartPole environment. Compare the following variants:

$\quad a.\text{ }\widehat{A}_t = R$

$\quad b.\text{ }\widehat{A}_t = r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + ... - V(s_t)$, with and without the discount and baseline (4 variants total).

In [17]:
import torch
import torch.nn.functional as F

# Define a simple policy network
class GradientPolicy(torch.nn.Module):
    """Two-layer policy network producing action probabilities.

    Pipeline: linear -> dropout -> ReLU -> linear, followed by a softmax
    over the last dimension when there is more than one output, or a
    sigmoid when there is exactly one.
    """

    def __init__(self, n_features: int, n_actions: int, hidden_dim: int = 128, p_dropout=0.6):
        super().__init__()
        # Remembered so forward() can choose the output activation.
        self.output_dim = n_actions
        self.linear1 = torch.nn.Linear(n_features, hidden_dim)
        self.dropout = torch.nn.Dropout(p_dropout)
        self.linear2 = torch.nn.Linear(hidden_dim, n_actions)

    def forward(self, x):
        """Map a state tensor to action probabilities."""
        hidden = self.linear1(x)
        hidden = self.dropout(hidden)
        hidden = F.relu(hidden)
        scores = self.linear2(hidden)

        if self.output_dim > 1:
            return F.softmax(scores, dim=-1)
        return F.sigmoid(scores)

# This network is used to calculate the state-value and the baseline
class StateValueNetwork(torch.nn.Module):
    """Two-layer network that maps a state to one scalar value estimate."""

    def __init__(self, n_states: int, hidden_dim: int = 128):
        super().__init__()
        self.linear1 = torch.nn.Linear(n_states, hidden_dim)
        self.linear2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the value estimate; the last dimension has size 1."""
        hidden = torch.relu(self.linear1(x))
        return self.linear2(hidden)

class History:
    """Per-step record of one episode kept as three parallel lists."""

    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []

    def add(self, state, action, reward):
        """Append one step's state, action and reward."""
        columns = (self.states, self.actions, self.rewards)
        for column, value in zip(columns, (state, action, reward)):
            column.append(value)

    def clear(self):
        """Empty all three lists in place (the list objects are kept)."""
        for column in (self.states, self.actions, self.rewards):
            column.clear()

def get_default_optimizer(model: torch.nn.Module, lr: float):
    """Build an Adam optimizer over all parameters of ``model``."""
    trainable = model.parameters()
    return torch.optim.Adam(trainable, lr=lr)

class Manager:
    """Policy-gradient trainer for one environment and one policy network.

    Each call to ``run_episode`` plays an episode by sampling actions from
    the policy, computes the discounted reward-to-go for every step,
    optionally subtracts a learned state-value baseline, and takes one
    optimizer step on the policy (and, on request, on the value network).
    """

    def __init__(
        self,
        env: Env,
        policy: GradientPolicy,
        v_net: StateValueNetwork | None,
        discount=1.0,
        policy_alpha=0.001,
        v_alpha=0.001,
        debug=False,
        get_policy_optimizer=get_default_optimizer,
        get_v_optimizer=get_default_optimizer,
    ):
        """Store the collaborators and build the optimizers.

        Args:
            env: Environment to interact with.
            policy: Network mapping a state to action probabilities.
            v_net: Optional state-value network used as the baseline;
                ``None`` disables the baseline.
            discount: Discount factor applied to future rewards.
            policy_alpha: Learning rate for the policy optimizer.
            v_alpha: Learning rate for the value-network optimizer.
            debug: When true, ``run_episode`` prints per-step details.
            get_policy_optimizer: Factory called as ``f(model, lr=...)``.
            get_v_optimizer: Same, for the value network.
        """
        self.env = env
        self.policy = policy
        self.v_net = v_net
        self.discount = discount
        self.debug = debug

        self.policy_optimizer = get_policy_optimizer(policy, lr=policy_alpha)
        self.v_optimizer = (
            get_v_optimizer(v_net, lr=v_alpha)
            if v_net is not None
            else None)

    def run_episode(self, update_v: bool):
        """Play one episode and apply one policy-gradient update.

        Args:
            update_v: Whether to also update the value network this episode
                (ignored when there is no value network).

        Returns:
            The undiscounted sum of the episode's rewards.
        """
        env = self.env
        policy = self.policy
        v_net = self.v_net
        discount = self.discount
        debug = self.debug
        policy_optimizer = self.policy_optimizer
        v_optimizer = self.v_optimizer

        policy_optimizer.zero_grad()
        if v_optimizer is not None:
            v_optimizer.zero_grad()

        state, _ = env.reset()
        done = False
        log_actions: list[torch.Tensor] = []
        # Keep the per-step state tensors so the batch for the value network
        # can be stacked directly; building a tensor from a list of NumPy
        # arrays is slow and triggers a PyTorch warning.
        state_tensors: list[torch.Tensor] = []
        history = History()

        if debug:
            print('actions: ', end='')

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32)
            state_tensors.append(state_tensor)
            action_probs = policy(state_tensor)
            action = torch.multinomial(action_probs, 1).item()
            log_actions.append(torch.log(action_probs[action]))

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            history.add(state, action, reward)

            state = next_state

            if debug:
                print(f'{action} ({action_probs[action]}), ', end='')

        if debug:
            print('')

        # Baseline V(s_t) for every visited state, or zeros without a value net.
        baselines = (
            v_net(torch.stack(state_tensors)).squeeze(-1)
            if v_net is not None
            else torch.zeros(len(state_tensors))
        )

        # Discounted reward-to-go, accumulated from the last step backwards.
        R = 0
        cum_rewards: list[float] = []
        for step_reward in reversed(history.rewards):
            R = step_reward + discount * R
            cum_rewards.append(R)
        cum_rewards.reverse()
        returns = torch.tensor(cum_rewards, dtype=torch.float32)

        # The baseline is detached so the policy loss never back-propagates
        # into the value network.
        A = returns - baselines.detach()

        log_pi = torch.stack(log_actions).squeeze(-1)
        loss_policy = -torch.sum(log_pi * A)

        if debug:
            print('steps', len(A))
            print('actions', history.actions)
            print('loss', type(loss_policy), loss_policy)
        loss_policy.backward()

        policy_optimizer.step()
        if update_v and v_optimizer is not None:
            # Regress the value estimates onto the observed returns.
            loss_v = F.mse_loss(baselines, returns)
            loss_v.backward()
            v_optimizer.step()

        return sum(history.rewards)

    def train(self, title, n_episodes: int, update_v_every=10):
        """Run ``n_episodes`` training episodes and plot the reward curves.

        The value network is updated on episodes whose zero-based index is a
        multiple of ``update_v_every``. Progress is printed roughly 20 times
        over the run, then two matplotlib figures are created: the raw
        per-episode reward and its 10-episode moving average.

        Args:
            title: Prefix used in both figure titles.
            n_episodes: Number of episodes to run.
            update_v_every: Interval, in episodes, between value updates.
        """
        all_rewards: list[float] = []
        for i in range(n_episodes):
            rewards = self.run_episode(update_v=(i % update_v_every == 0))
            all_rewards.append(rewards)

            if (i+1) % max(n_episodes // 20, 1) == 0:
                print(f"Episode {i + 1}: {rewards}")

        # Imported here so matplotlib is only needed when training is run.
        import matplotlib.pyplot as plt

        plt.figure(figsize=(10, 5))
        plt.plot(all_rewards)
        plt.xlabel("Episode")
        plt.ylabel("Total reward")
        plt.title(f"{title} - Training progress")

        # plot with a moving avg of 10 episodes (via running prefix sums)
        n = 10
        cumsum, moving_aves = [0], []
        for i, x in enumerate(all_rewards, 1):
            cumsum.append(cumsum[i-1] + x)
            if i >= n:
                moving_ave = (cumsum[i] - cumsum[i-n]) / n
                moving_aves.append(moving_ave)
        plt.figure(figsize=(10, 5))
        plt.plot(moving_aves)
        plt.xlabel("Episode")
        plt.ylabel(f"Average reward over {n} episodes")
        plt.title(f"{title} - Average reward progress")

    def predict(self, state):
        """Return the index of the most probable action for ``state``.

        The policy is switched to evaluation mode for the forward pass and
        restored afterwards. ``torch.no_grad()`` alone does not disable
        dropout, so without this the "greedy" action would depend on a
        random dropout mask.
        """
        policy = self.policy
        was_training = policy.training
        policy.eval()
        try:
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32)
                action_probs = policy(state_tensor)
                return torch.argmax(action_probs).item()
        finally:
            # Put the policy back in whatever mode the caller left it in.
            policy.train(was_training)

In [None]:
# Module-level switch read by run() and forwarded to Manager(debug=...);
# when true, every episode prints its actions, step count and loss.
debug = False

def run(
    title: str,
    env_name: str,
    n_episodes = 1000,
    use_v_net: bool = False,
    discount = 1.0,
    seed = 42,
    get_optimizer = get_default_optimizer,
):
    """Train one policy-gradient configuration and plot its progress.

    Seeds NumPy, PyTorch and the environment, builds a fresh policy (and,
    optionally, a value network used as baseline), trains it with
    ``Manager.train`` and prints a banner before and after.

    Args:
        title: Label used in the banners and figure titles.
        env_name: Gymnasium environment id; its action space must expose
            ``n``.
        n_episodes: Number of training episodes.
        use_v_net: Whether to learn a state-value baseline.
        discount: Discount factor passed to the manager.
        seed: Seed applied to NumPy, PyTorch and the environment.
        get_optimizer: Optimizer factory used for both networks.
    """
    print('=' * 80)
    print(f'[Start] {title}')
    print('-' * 80)
    np.random.seed(seed)
    torch.manual_seed(seed)
    env = gym.make(env_name)
    try:
        # Seed the environment's own RNG too; otherwise the initial states
        # differ between runs and `seed` does not make results repeatable.
        # Later un-seeded resets continue from this generator.
        env.reset(seed=seed)
        n_states = env.observation_space.shape[0]
        n_actions = env.action_space.n
        policy = GradientPolicy(n_states, n_actions)
        v_net = StateValueNetwork(n_states) if use_v_net else None
        manager = Manager(
            env=env,
            policy=policy,
            v_net=v_net,
            discount=discount,
            debug=debug,
            get_policy_optimizer=get_optimizer,
            get_v_optimizer=get_optimizer)
        manager.train(title=title, n_episodes=n_episodes)
    finally:
        # Release the environment even if training raises.
        env.close()
    print('-' * 80)
    print(f'[End] {title}')
    print('=' * 80)

def run_cases(number: int, env_name: str, n_episodes = 1000):
    """Train all four advantage variants under each optimizer choice.

    For every optimizer entry, the four combinations of discounting
    (1.0 or 0.9) and baseline (with or without a value network) are each
    trained by a separate ``run`` call.

    Args:
        number: Exercise number embedded in each run's title.
        env_name: Gymnasium environment id passed to ``run``.
        n_episodes: Number of training episodes per run.
    """
    optimizers_fn = [
        (None, get_default_optimizer),
        ('SGD with Momentum', lambda model, lr: torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)),
        ('RMSProp', lambda model, lr: torch.optim.RMSprop(model.parameters(), lr=lr)),
        ('Adam', lambda model, lr: torch.optim.Adam(model.parameters(), lr=lr)),
    ]

    # (title suffix, use a value-network baseline, discount factor)
    variants = [
        ('without discount and without baseline (A_t = R)', False, 1.0),
        ('with discount and without baseline', False, 0.9),
        ('without discount and with baseline', True, 1.0),
        ('with discount and with baseline', True, 0.9),
    ]

    for optimizer_name, get_optimizer in optimizers_fn:
        optimizer_info = '' if optimizer_name is None else f'[optimizer: {optimizer_name}]'
        # The pieces carry no leading dot or space of their own: the
        # previous format produced titles such as "3.5..3.1." and a double
        # space before the description.
        # NOTE(review): the default-optimizer runs get the "3.<number>"
        # label and the named optimizers the plain "<number>" label —
        # confirm this mapping is not inverted.
        number_suffix = f'3.{number}' if optimizer_name is None else f'{number}'
        prefix = f'3.5.{number_suffix}.'

        for description, use_v_net, discount in variants:
            title = ' '.join(part for part in (prefix, optimizer_info, description) if part)
            run(
                title,
                env_name=env_name,
                n_episodes=n_episodes,
                use_v_net=use_v_net,
                discount=discount,
                get_optimizer=get_optimizer)

# Exercise 1 on CartPole: 4 optimizer entries x 4 advantage variants, i.e.
# 16 training runs of 1000 episodes each.
run_cases(number=1, env_name='CartPole-v1', n_episodes = 1000)