**Goal**: Implement REINFORCE and use it to solve Gymnasium CartPole-v1

### Import

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from torch.distributions.categorical import Categorical
from tqdm import tqdm

## REINFORCE

### Approximate the policy with a NN

In [2]:
# Specify the RL environment (no render mode, so stepping it opens no window).
# This module-level `env` is the one `collect_experience` reads during training.
env = gym.make("CartPole-v1")

In [3]:
# Build a standard feed-forward neural network (multi-layer perceptron)
def mlp(sizes: list, activation=nn.Tanh, output_activation=nn.Identity):
    """Stack `nn.Linear` layers, each followed by an activation, into an `nn.Sequential`.

    Args:
        sizes: Layer widths, input first and output last; consecutive pairs
            `(sizes[i], sizes[i + 1])` define one linear layer each.
        activation: Module class instantiated after every linear layer except the last.
        output_activation: Module class instantiated after the last linear layer.

    Returns:
        An `nn.Sequential` alternating linear layers and activation instances.
    """
    last_index = len(sizes) - 2  # index of the final linear layer
    modules = []
    for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        act_cls = output_activation if idx == last_index else activation
        modules.append(nn.Linear(n_in, n_out))
        modules.append(act_cls())
    return nn.Sequential(*modules)


class NeuralNetwork(nn.Module):
    """`nn.Module` wrapper around the feed-forward network built by `mlp`."""

    def __init__(self, sizes: list, activation=nn.Tanh, output_activation=nn.Identity):
        """Build the underlying network.

        Args:
            sizes: Layer widths, forwarded to `mlp`.
            activation: Activation class for all but the last layer, forwarded to `mlp`.
            output_activation: Activation class for the last layer, forwarded to `mlp`.
        """
        super().__init__()
        # The attribute name `NN` is kept as-is: it is part of the module's parameter names.
        self.NN = mlp(
            sizes,
            activation=activation,
            output_activation=output_activation,
        )

    def forward(self, x):
        """Apply the wrapped network to `x` and return its output."""
        output = self.NN(x)
        return output

In [4]:
# Policy network: maps an observation vector to one logit per discrete action,
# with a single hidden layer of 32 units in between.
nn_policy = NeuralNetwork(
    sizes=[env.observation_space.shape[0], 32, env.action_space.n]
)


def policy(observation):
    """Return the action distribution of the current policy for `observation`.

    The observation is converted to a float32 tensor, passed through the
    module-level `nn_policy`, and the resulting logits parameterize a
    `Categorical` distribution over actions.
    """
    obs_tensor = torch.as_tensor(observation, dtype=torch.float32)
    return Categorical(logits=nn_policy(obs_tensor))

### Test - Untrained policy

In [5]:
# Untrained policy: play a few rendered episodes to get a baseline score.
env_test = gym.make("CartPole-v1", render_mode="human")
episodes = 5
episode_rewards = []
try:
    for episode in range(1, episodes + 1):
        observation, _ = env_test.reset()
        score = 0
        terminated = False
        truncated = False

        # Stop on EITHER termination or truncation. The parentheses matter:
        # `not terminated or truncated` parses as `(not terminated) or truncated`,
        # which keeps stepping an episode that has already been truncated.
        while not (terminated or truncated):
            # Only an action is needed here, so skip building an autograd graph.
            with torch.no_grad():
                action = policy(observation).sample().item()
            observation, reward, terminated, truncated, info = env_test.step(action)
            score += reward
        episode_rewards.append(score)
        print(f"Episode {episode}: {score} ")
finally:
    # Always release the render window, even if an episode raised.
    env_test.close()

Episode 1: 16.0 
Episode 2: 17.0 
Episode 3: 22.0 
Episode 4: 10.0 
Episode 5: 21.0 


### Identify the policy objective function

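Our policy objective function is the expected return obtained by following the policy, starting from the initial state. However, by the policy gradient theorem, the gradient of this objective equals the gradient of the following surrogate (with $Q^{\pi_\theta}$ treated as a constant weight), so we can use the surrogate as our objective function instead: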
$$
J = \mathbb{E}_{\pi_\theta} \left[ \log \pi_\theta(s,a) Q^{\pi_\theta}(s,a) \right] ~.
$$
We can approximate this by the actual return following the policy (Monte-Carlo sampling). I.e., we have
$$
J \approx \log \pi_\theta(s_t,a_t) G_t ~.
$$
We will simplify things further by setting $G_t = G_1$ for all steps in an episode. I.e., we have
$$
J \approx \log \pi_\theta(s_t,a_t) G_1 ~.
$$
For simplicity, we further set the discount factor to be 1. I.e., we don't discount anything, and $G_1$ is the sum of rewards obtained throughout the episode. In CartPole, this is the same as the episode length.

In [6]:
# Roll out whole episodes until more than `batch_size` steps have been gathered.
# The returned observations, actions and per-step weights are the training data for the loss.
def collect_experience(batch_size=5000):
    """Collect on-policy transitions from the module-level `env` using `policy`.

    Episodes are always run to completion, so the batch normally holds somewhat
    more than `batch_size` steps. Only (observation, action) pairs that were
    actually executed in the environment are recorded; the observation returned
    by the final step of an episode is not stored, because no action is taken
    from it.

    Args:
        batch_size: Collection stops after the first episode that brings the
            number of recorded steps above this value.

    Returns:
        A tuple `(observations, actions, weights)`:
        - observations: `np.ndarray` with one row per recorded step.
        - actions: list with the action sampled at each recorded step.
        - weights: list with, for each recorded step, the number of steps in
          the episode that step belongs to.
    """
    batch_observations = []
    batch_actions = []
    batch_weights = []

    while True:
        observation, _ = env.reset()
        terminated = False
        truncated = False
        episode_length = 0

        while not (terminated or truncated):
            # Sampling needs no gradients; log-probs are recomputed in the loss.
            with torch.no_grad():
                action = policy(observation).sample().item()
            batch_observations.append(observation)
            batch_actions.append(action)
            observation, reward, terminated, truncated, info = env.step(action)
            episode_length += 1  # counts executed steps only

        # Every step of the episode gets the same weight: the episode length.
        batch_weights += [episode_length] * episode_length
        if len(batch_observations) > batch_size:
            break

    return np.array(batch_observations), batch_actions, batch_weights

In [7]:
# Loss function (defined as -1 times the objective function)
def compute_loss(obserations, actions, weights):
    """Return the negated mean of weighted action log-probabilities.

    Args:
        obserations: Batch of observations passed to `policy`.
        actions: Actions whose log-probability under the policy is evaluated.
        weights: Per-step weights multiplied element-wise with the log-probabilities.

    Returns:
        A scalar tensor: minus the mean of `log_prob(actions) * weights`.
    """
    # log pi(s_t, a_t); `log_prob` is a method of the Categorical distribution returned by `policy`.
    action_dist = policy(obserations)
    log_probs = action_dist.log_prob(actions)
    weighted = log_probs * weights
    # Negated because torch optimisers minimise (descend), while the objective is to be maximised.
    return -weighted.mean()

### Optimise policy objective function

We shall not be using vanilla stochastic gradient descent but batch ADAM. I.e., we will collect around 5000 data points before doing 1 update instead of updating at every data point. The optimiser is ADAM instead of vanilla stochastic gradient descent.

In [8]:
# Set up an optimizer
optimizer = torch.optim.Adam(params=nn_policy.parameters(), lr=0.01)

# Train with 300 batches of about 5000 data points each
# (300 policy updates, each informed by roughly 5000 time steps).
num_epoch = 300
batch_size = 5000
for epoch in tqdm(range(num_epoch)):
    # Pass batch_size explicitly so the setting above actually controls collection.
    observations, actions, weights = collect_experience(batch_size)
    loss = compute_loss(
        torch.as_tensor(observations, dtype=torch.float32),
        # Actions are discrete category indices, so keep them as integers.
        torch.as_tensor(actions, dtype=torch.int64),
        torch.as_tensor(weights, dtype=torch.float32),
    )
    optimizer.zero_grad()  # Clear gradients accumulated on the parameters by the previous update
    loss.backward()  # Compute partial derivatives of the loss w.r.t. the model parameters
    optimizer.step()  # Perform the parameter update

100%|██████████| 300/300 [08:45<00:00,  1.75s/it]


### Test - trained policy

In [9]:
# Test model: watch the trained policy play one rendered episode.
# A dedicated variable is used so the module-level `env` that `collect_experience`
# reads is not replaced by a rendering environment.
env_render = gym.make("CartPole-v1", render_mode="human")
try:
    state, _ = env_render.reset()
    score = 0

    while True:
        # No explicit render call: with render_mode="human" the environment
        # draws itself as part of reset() and step().
        with torch.no_grad():  # only an action is needed, no gradients
            action = policy(state).sample().item()
        # Perform the chosen action and observe the next state and reward
        state, reward, terminated, truncated, info = env_render.step(action)
        score += reward
        if terminated or truncated:
            break
    print(score)
finally:
    # Always release the render window, even if the episode raised.
    env_render.close()

184.0


In [10]:
# Evaluate the trained policy over many episodes (no rendering) and summarise the scores.
env = gym.make("CartPole-v1")
episodes = 1000
total_rewards = []
for episode in tqdm(range(episodes)):
    state, _ = env.reset()
    score = 0
    terminated = False
    truncated = False

    # Play until the episode either terminates or is truncated.
    while not (terminated or truncated):
        # Perform the sampled action and observe the next state and reward
        action = policy(state).sample().item()
        state, reward, terminated, truncated, info = env.step(action)
        score += reward
    total_rewards.append(score)
total_rewards = np.array(total_rewards)
env.close()

print(
    f"The mean reward is {np.mean(total_rewards)} and the standard deviation is {np.std(total_rewards)}."
)

100%|██████████| 1000/1000 [00:58<00:00, 17.12it/s]

The mean reward is 167.413 and the standard deviation is 12.069317752052102.



