# REINFORCE Algorithm Implementation for CartPole

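In this project, I will implement the REINFORCE algorithm to train an agent to balance a pole on a cart using the CartPole environment from Gymnasium (the maintained successor to OpenAI Gym). The objective is to understand the core concepts of the REINFORCE algorithm and gain hands-on experience in reinforcement learning. Note that the training cell below currently creates the `LunarLander-v2` environment rather than CartPole; change the environment ID there (e.g. to `CartPole-v1`) to match this description.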

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import gymnasium as gym

In [None]:
class PolicyNetwork(nn.Module):
    """
    Fully connected network that maps an input tensor to policy outputs.

    The network is a stack of ``nn.Linear`` layers, each followed by an
    activation module. Every layer except the last uses ``activation``; the
    last layer uses ``output_activation``.

    Args:
        layer_sizes (list[int]): Width of every layer, input size first and output size last.
        activation (nn.Module): Activation class instantiated after each intermediate layer (default: nn.Tanh).
        output_activation (nn.Module): Activation class instantiated after the final layer (default: nn.Identity).
    """
    def __init__(self, layer_sizes: list[int], activation=nn.Tanh, output_activation=nn.Identity):
        super().__init__()
        # Index of the final Linear layer; it and only it gets the output activation.
        final_index = len(layer_sizes) - 2
        modules = []
        for index, (fan_in, fan_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
            modules.append(nn.Linear(fan_in, fan_out))
            make_activation = output_activation if index >= final_index else activation
            modules.append(make_activation())
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """
        Run the input through every layer of the network in order.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output of the final layer after the output activation.
        """
        output = self.model(x)
        return output


In [None]:

from statistics import mean


class Agent:
    """
    A policy-gradient (REINFORCE-style) agent for discrete action spaces.

    Holds a ``PolicyNetwork`` that maps observations to action logits and an
    Adam optimizer over that network's parameters.

    Args:
        obs_space (int): The size of the observation space.
        hidden_sizes (list[int]): The sizes of the hidden layers in the policy network.
        n_act (int): The number of actions in the action space.
        learning_rate (float): The learning rate for the optimizer (default: 1e-3).
    """
    def __init__(self, obs_space, hidden_sizes, n_act, learning_rate=1e-3):
        self.policy_network = PolicyNetwork([obs_space, *hidden_sizes, n_act])
        self.optimizer = Adam(self.policy_network.parameters(), lr=learning_rate)

    def get_policy(self, obs):
        """
        Get the policy distribution given an observation.

        Args:
            obs (torch.Tensor): The observation tensor fed to the policy network.

        Returns:
            Categorical: A categorical distribution built from the network's logits.
        """
        logits = self.policy_network(obs)
        return Categorical(logits=logits)

    def sample_action(self, obs):
        """
        Sample an action from the policy distribution given an observation.

        Args:
            obs (torch.Tensor): The observation tensor. It must yield exactly one
                sampled action, because the sample is converted with ``.item()``.

        Returns:
            int: The sampled action.
        """
        # Acting does not need gradients: compute_loss re-evaluates the policy
        # on the stored observations, so skip building an autograd graph here.
        with torch.no_grad():
            return self.get_policy(obs).sample().item()

    def compute_loss(self, observations, actions, weights):
        """
        Compute the policy gradient loss.

        The loss is the negated mean of ``log_prob(action) * weight`` over all
        provided steps, so minimizing it raises the log-probability of actions
        with positive weight.

        Args:
            observations (torch.Tensor): Batch of observations.
            actions (torch.Tensor): The action taken for each observation.
            weights (torch.Tensor): Per-step weight for each log-probability. ``train``
                passes the total reward of the episode each step belongs to.

        Returns:
            torch.Tensor: The scalar policy gradient loss.
        """
        logp = self.get_policy(observations).log_prob(actions)
        return -(logp * weights).mean()

    def store(self, filepath):
        """
        Store the agent's policy network parameters to a file.

        Args:
            filepath (str): The file path where the model parameters will be saved.
        """
        torch.save(self.policy_network.state_dict(), filepath)
        print(f'Model saved to {filepath}')

    def load(self, filepath):
        """
        Load the agent's policy network parameters from a file.

        After loading, the network is switched to evaluation mode.

        Args:
            filepath (str): The file path where the model parameters are stored.
        """
        # NOTE(security): torch.load deserializes with pickle; only load files
        # from a trusted source.
        self.policy_network.load_state_dict(torch.load(filepath))
        self.policy_network.eval()
        print(f'Model loaded from {filepath}')

    def train(self, env, epochs=100, episodes=500):
        """
        Train the agent in the given environment.

        Each epoch collects ``episodes`` full episodes with the current policy,
        then performs a single optimizer step on the loss over all collected
        steps. Every step is weighted by the total (undiscounted) reward of the
        episode it belongs to.

        Args:
            env (Environment): The environment to train in. ``env.reset()`` must return
                ``(obs, info)`` and ``env.step(action)`` must return
                ``(obs, reward, terminated, truncated, info)``.
            epochs (int): The number of policy updates to perform (default: 100).
            episodes (int): The number of episodes collected per epoch (default: 500).

        Returns:
            np.ndarray: Array of length ``epochs`` with the mean episode return of each epoch.

        Raises:
            ValueError: If ``episodes`` is less than 1.
        """
        if episodes < 1:
            # Without any episode there is no data for an update or a mean return.
            raise ValueError("episodes must be at least 1")

        total_returns = np.empty(epochs)
        for epoch in range(epochs):
            # Plain lists are appended to in O(1); they are converted to tensors
            # once per epoch instead of re-stacking an array on every step.
            observations, actions, weights = [], [], []
            returns = np.zeros(episodes)

            for episode in range(episodes):
                obs, _ = env.reset()
                terminated = truncated = False
                rewards = []

                while not (terminated or truncated):
                    observations.append(obs)
                    obs_tensor = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)
                    action = self.sample_action(obs_tensor)
                    obs, reward, terminated, truncated, _ = env.step(action)
                    actions.append(action)
                    rewards.append(reward)

                episode_return = sum(rewards)
                returns[episode] = episode_return
                # Every step of the episode shares the episode's total return as its weight.
                weights.extend([episode_return] * len(rewards))

            self.optimizer.zero_grad()
            loss = self.compute_loss(
                torch.as_tensor(np.asarray(observations, dtype=np.float32)),
                torch.as_tensor(np.asarray(actions, dtype=np.int32)),
                torch.as_tensor(np.asarray(weights, dtype=np.float32)),
            )
            loss.backward()
            self.optimizer.step()

            epoch_return = returns.mean()
            total_returns[epoch] = epoch_return
            print(f"Epoch: {epoch}, Return: {epoch_return}")
        return total_returns

In [None]:
# Build the environment, size the agent from its observation/action spaces,
# train it, and save the resulting policy parameters.
# NOTE(review): the notebook introduction describes CartPole, but this cell
# trains on LunarLander-v2 — confirm which environment is intended.
env = gym.make("LunarLander-v2")
try:
    obs_space = env.observation_space.shape[0]
    hidden_sizes = [64, 64, 64, 64]
    n_acts = env.action_space.n
    agent = Agent(obs_space=obs_space, hidden_sizes=hidden_sizes, n_act=n_acts)
    epochs = 50
    episodes = 100
    total_returns = agent.train(env, epochs, episodes)
finally:
    # Release the environment's resources even if training fails or is interrupted.
    env.close()
agent.store("model.pth")

In [None]:
# Plot the per-epoch values returned by agent.train() against the epoch index.
epoch_indices = range(len(total_returns))
plt.title("Returns")
plt.plot(epoch_indices, total_returns)