# Exercises

Source: http://web.archive.org/web/20160830014637/https://gym.openai.com/docs/rl#id16

### Algorithm 1: Cross Entropy Method

>Initialize $\mu \in \mathbb{R}^d, \sigma \in \mathbb{R}^d<br/>
>For iteration = 1, 2, ...<br/>
>
>&nbsp;&nbsp;&nbsp;&nbsp;Collect $n$ samples of $\theta_i \sim N(\mu, diag(\sigma))$<br/>
>&nbsp;&nbsp;&nbsp;&nbsp;Perform a noise evaluation $f(\theta_i, \zeta_i)$ on each one<br/>
>&nbsp;&nbsp;&nbsp;&nbsp;Select the top $p%$ of samples $(e.g. p = 20)$, which we'll call the "elite set"<br/>
>&nbsp;&nbsp;&nbsp;&nbsp;Fit a Gaussian distribution, with diagonal covariance, to the elite set, obtaining a new $\mu$, $\sigma$.
>
>Return the final $\mu$

In the RL setting, we evaluate $f(θ_i, ζ_i)$ by executing the policy parameterized by $θ_i$ for one or more episodes, and computing the total return.

## 2.1 Exercises

### 1. Apply the cross-entropy method to the CartPole environment

In [8]:
class EvalParams:
    def __init__(self, n_samples=100, elite_percent=0.2, iterations=50, noise_factor=0.1, seed: int | None = None):
        self.n_samples = n_samples
        self.elite_percent = elite_percent
        self.iterations = iterations
        self.noise_factor = noise_factor
        self.seed = seed

In [9]:
import numpy as np
import typing

def cross_entropy_method(dimension: int, evaluator: typing.Callable[[np.ndarray, float], float], params=EvalParams()) -> np.ndarray:
    n_samples = params.n_samples
    elite_percent = params.elite_percent
    iterations = params.iterations
    noise_factor = params.noise_factor
    seed = params.seed

    if seed:
        np.random.seed(seed)

    # Step 1: Initialize μ and σ
    mu = np.zeros(dimension)  # Initial mean
    sigma = np.ones(dimension)  # Initial standard deviation (diagonal of covariance)

    n_elite = int(elite_percent * n_samples)  # Number of elite samples

    for iteration in range(iterations):
        # Step 2: Collect n samples of θ_i ∼ N(μ, diag(σ))
        samples = np.random.multivariate_normal(mu, np.diag(sigma), n_samples)

        # Step 3: Perform a noisy evaluation f(θ_i, ζ_i) on each one
        evaluations = np.array([evaluator(theta, noise_factor) for theta in samples])

        # Step 4: Select the top p% of samples (elite set)
        elite_indices = evaluations.argsort()[-n_elite:]  # Indices of top p% evaluations
        elite_samples = samples[elite_indices]

        # Step 5: Fit a new Gaussian distribution to the elite set (new μ, σ)
        mu = np.mean(elite_samples, axis=0)
        sigma = np.std(elite_samples, axis=0)

        best_evaluation = np.max(evaluations)
        worst_evaluation = np.min(evaluations)
        mean_evaluation = np.mean(evaluations)
        std_evaluation = np.std(evaluations)

        # Print progress
        iter_str = f"Evaluation at iteration {iteration + 1}"
        best_str = f"Best = {best_evaluation}"
        worst_str = f"Worst = {worst_evaluation}"
        mean_str = f"Mean = {mean_evaluation}"
        std_str = f"Std = {std_evaluation}"
        print(f"> {iter_str}: {best_str} | {worst_str} | {mean_str} | {std_str}")

    # Final evaluation
    evaluations = np.array([evaluator(theta, 0) for theta in samples])

    best_str = f"Best = {np.max(evaluations)}"
    worst_str = f"Worst = {np.min(evaluations)}"
    mean_str = f"Mean = {np.mean(evaluations)}"
    std_str = f"Std = {np.std(evaluations)}"
    print(f"Final evaluation (no noise): {best_str} | {worst_str} | {mean_str} | {std_str}")

    return mu

In [10]:
# Example usage

# Noisy evaluation function (replace this with your real function)
def evaluate(theta, noise_factor=0.1):
    # Example evaluation: simple quadratic function with noise
    noise = np.random.randn() * noise_factor
    return -np.sum(theta**2) + noise

dimension = 5  # Dimensionality of the problem
final_mu = cross_entropy_method(dimension, evaluate, params=EvalParams(seed=42))
print("Final μ:", final_mu)

> Evaluation at iteration 1: Best = -0.7163898685709817 | Worst = -16.630537879032783 | Mean = -4.81642544861259 | Std = 3.0088976229443496
> Evaluation at iteration 2: Best = -0.030286620483872893 | Worst = -8.395106551733688 | Mean = -2.695192797417966 | Std = 1.7424096212903402
> Evaluation at iteration 3: Best = -0.183146818144696 | Worst = -5.784244448889563 | Mean = -1.8918651297640066 | Std = 1.142651509511421
> Evaluation at iteration 4: Best = -0.01041125216644051 | Worst = -5.965552672451809 | Mean = -1.7013528648158394 | Std = 1.1336715866212825
> Evaluation at iteration 5: Best = -0.09221563865514357 | Worst = -8.175666769391277 | Mean = -1.4474574674884966 | Std = 1.0886044912391468
> Evaluation at iteration 6: Best = -0.14374786467820738 | Worst = -4.537120205517496 | Mean = -1.7124577548664752 | Std = 0.9337543613322696
> Evaluation at iteration 7: Best = -0.016785042731554983 | Worst = -5.338396521998412 | Mean = -1.6470026657881511 | Std = 1.0951476764484096
> Evaluati

In [11]:
from gymnasium import Env

class EvalParams:
    def __init__(self, n_samples=100, elite_percent=0.2, iterations=50, noise_factor=0.1, seed: int | None = None):
        self.n_samples = n_samples
        self.elite_percent = elite_percent
        self.iterations = iterations
        self.noise_factor = noise_factor
        self.seed = seed

def evaluate_cem(
    env: Env,
    action_selector: typing.Callable[[typing.Any, np.ndarray], typing.Any],
    params=EvalParams(),
):
    def evaluate_episode(theta: np.ndarray, noise_factor: float):
        total_reward = 0
        state, _ = env.reset()
        done = False

        while not done:
            action = action_selector(state, theta)
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward

        # Add noise to the total reward
        noise = np.random.randn() * noise_factor
        return total_reward + noise

    dimension = env.observation_space.shape[0]  # Dimensionality of the problem
    final_mu = cross_entropy_method(dimension, evaluate_episode, params=params)
    print("Final μ:", final_mu)

In [12]:
import gymnasium as gym

def evaluate_cem_cartpole(params=EvalParams()):
    env = gym.make("CartPole-v1")

    def action_selector(state, theta):
        return 0 if np.dot(theta, state) < 0 else 1

    evaluate_cem(env=env, action_selector=action_selector, params=params)

# Test the function
evaluate_cem_cartpole(EvalParams(seed=42))

> Evaluation at iteration 1: Best = 499.99373209027266 | Worst = 7.793255789996012 | Mean = 61.86439955294896 | Std = 99.04554906229103
> Evaluation at iteration 2: Best = 500.15033983017673 | Worst = 8.027902152577035 | Mean = 187.1227853824887 | Std = 158.3693825217052
> Evaluation at iteration 3: Best = 500.26016831141806 | Worst = 8.858163426706696 | Mean = 309.9864407630564 | Std = 168.1395492628534
> Evaluation at iteration 4: Best = 500.313774853366 | Worst = 9.065590077652976 | Mean = 335.8186637306612 | Std = 178.89979421950864
> Evaluation at iteration 5: Best = 500.2363872494619 | Worst = 7.846889249744428 | Mean = 388.718769490725 | Std = 185.68580809325368
> Evaluation at iteration 6: Best = 500.2411676675578 | Worst = 7.886993066095965 | Mean = 389.64420595602815 | Std = 166.70803510157654
> Evaluation at iteration 7: Best = 500.2272434720447 | Worst = 7.978654322064957 | Mean = 335.74979201640286 | Std = 194.45285755128398
> Evaluation at iteration 8: Best = 500.31520567

### 2. (Practice *) Apply it to the Swimmer environment, which has a continuous action space. Try artificially increasing the variance and gradually lowering this noise to zero.

In [13]:
def is_windows():
    import platform
    return platform.system() == "Windows"

def evaluate_cem_swimmer(params=EvalParams()):
    env = gym.make("Swimmer-v3")

    def action_selector(state, theta):
        return np.clip(np.dot(theta, state), -1, 1)

    evaluate_cem(env=env, action_selector=action_selector, params=params)

if not is_windows():
    # Test the function
    evaluate_cem_swimmer(EvalParams(seed=42))