# REINFORCE Algorithm Implementation for CartPole

In this project, I implement the REINFORCE algorithm to train an agent to balance a pole on a cart, using the CartPole environment from Gymnasium (the `gymnasium` package, the maintained successor to OpenAI Gym). The objective is to understand the core concepts of the REINFORCE algorithm and to gain hands-on experience in reinforcement learning.

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import gymnasium as gym

In [5]:
class Policy_Network(nn.Module):
    """
    Fully connected network that maps an observation to one output per action.

    Consecutive entries of ``layer_sizes`` define one ``nn.Linear`` layer each.
    Every linear layer is followed by an activation module: ``activation`` for
    all but the last layer, ``output_activation`` for the last one.

    Args:
        layer_sizes (list[int]): Width of every layer, input and output included.
        activation (nn.Module): Activation class instantiated after each
            intermediate layer (default: nn.Tanh).
        output_activation (nn.Module): Activation class instantiated after the
            final layer (default: nn.Identity).
    """
    def __init__(self, layer_sizes: list[int], activation=nn.Tanh, output_activation=nn.Identity):
        super().__init__()
        # Index of the last linear layer; that one gets the output activation.
        final_idx = len(layer_sizes) - 2
        modules = []
        for idx, (fan_in, fan_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
            modules.append(nn.Linear(fan_in, fan_out))
            act_cls = output_activation if idx >= final_idx else activation
            modules.append(act_cls())
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """
        Run the input through every layer of the network in order.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output of the final activation.
        """
        out = self.model(x)
        return out


In [6]:

class Agent:
    """
    A REINFORCE (vanilla policy gradient) agent for discrete action spaces.

    The agent owns a policy network that maps observations to action logits
    and an Adam optimizer that updates the network's parameters.

    Args:
        obs_space (int): The size of the observation space.
        hidden_sizes (list[int]): The sizes of the hidden layers in the policy network.
        n_act (int): The number of actions in the action space.
        learning_rate (float): The learning rate for the optimizer (default: 1e-3).
    """
    def __init__(self, obs_space, hidden_sizes, n_act, learning_rate=1e-3):
        layer_sizes = [obs_space, *hidden_sizes, n_act]
        self.policy_network = Policy_Network(layer_sizes)
        self.optimizer = Adam(self.policy_network.parameters(), lr=learning_rate)

    @staticmethod
    def _to_batch(obs):
        """
        Convert a single observation into a float32 tensor with a leading batch axis.

        Forcing float32 keeps the input dtype aligned with the network weights
        even when the environment emits observations in another dtype.
        """
        return torch.as_tensor(np.array([obs], dtype=np.float32))

    def get_policy(self, obs):
        """
        Get the policy distribution given an observation.

        Args:
            obs (torch.Tensor): The observation tensor.

        Returns:
            Categorical: The policy distribution built from the network's logits.
        """
        # Call the module itself (not .forward) so registered hooks still run.
        logits = self.policy_network(obs)
        return Categorical(logits=logits)

    def sample_action(self, obs):
        """
        Sample an action from the policy distribution given an observation.

        Args:
            obs (torch.Tensor): The observation tensor (a batch holding one observation).

        Returns:
            int: The sampled action.
        """
        # Sampling is never differentiated through, so skip building a graph.
        with torch.no_grad():
            return self.get_policy(obs).sample().item()

    def compute_loss(self, obs, act, weights):
        """
        Compute the policy gradient loss.

        The loss is the negated mean of ``log_prob(act) * weights``; minimising
        it raises the probability of actions that carry larger weights.

        Args:
            obs (torch.Tensor): The observation tensor.
            act (torch.Tensor): The action tensor.
            weights (torch.Tensor): The per-step weights tensor.

        Returns:
            torch.Tensor: The scalar policy gradient loss.
        """
        logp = self.get_policy(obs).log_prob(act)
        return -(logp * weights).mean()

    def store(self, filepath):
        """
        Store the agent's policy network parameters to a file.

        Args:
            filepath (str): The file path where the model parameters will be saved.
        """
        torch.save(self.policy_network.state_dict(), filepath)
        print(f'Model saved to {filepath}')

    def load(self, filepath):
        """
        Load the agent's policy network parameters from a file.

        Args:
            filepath (str): The file path where the model parameters are stored.
        """
        # A state dict holds only tensors; weights_only=True stops torch.load
        # from unpickling arbitrary objects out of an untrusted file.
        state_dict = torch.load(filepath, weights_only=True)
        self.policy_network.load_state_dict(state_dict)
        self.policy_network.eval()
        print(f'Model loaded from {filepath}')

    def evaluate_agent(self, env, num_episodes=100):
        """
        Run the current policy in the given environment and print each episode's total reward.

        Actions are sampled from the policy distribution, exactly as during training.

        Args:
            env (Environment): The environment in which the agent will be evaluated.
            num_episodes (int): The number of episodes to run the evaluation (default: 100).
        """
        for episode in range(num_episodes):
            obs, info = env.reset()
            terminated = False
            truncated = False
            total_reward = 0

            while not (terminated or truncated):
                act = self.sample_action(self._to_batch(obs))
                obs, reward, terminated, truncated, info = env.step(act)
                total_reward += reward

            print(f'Episode: {episode}, Total Reward: {total_reward}')

    def train(self, env, episodes=500, render=False):
        """
        Train the agent in the given environment.

        Each episode is rolled out with the current policy and followed by a
        single gradient step. Every step of an episode is weighted by that
        episode's total (undiscounted) return.

        Args:
            env (Environment): The environment in which the agent will be trained.
            episodes (int): The number of episodes to run the training (default: 500).
            render (bool): Accepted for backward compatibility but has no
                effect; rendering depends on how ``env`` was created
                (e.g. ``render_mode`` passed to ``gym.make``).
        """
        for episode in range(episodes):
            obs, info = env.reset()
            terminated = False
            truncated = False
            ep_obs, ep_acts, ep_rews = [], [], []

            while not (terminated or truncated):
                # Copy the observation in case the environment reuses its buffer.
                ep_obs.append(np.array(obs, dtype=np.float32))
                act = self.sample_action(self._to_batch(obs))
                ep_acts.append(act)
                obs, reward, terminated, truncated, info = env.step(act)
                ep_rews.append(reward)

            ep_ret, ep_len = sum(ep_rews), len(ep_rews)

            # take a single policy gradient update step
            self.optimizer.zero_grad()
            episode_loss = self.compute_loss(
                obs=torch.as_tensor(np.array(ep_obs), dtype=torch.float32),
                act=torch.as_tensor(ep_acts, dtype=torch.int64),
                weights=torch.full((ep_len,), float(ep_ret), dtype=torch.float32),
            )
            episode_loss.backward()
            self.optimizer.step()

            print(f'epoch: {episode}, loss: {episode_loss.item()}, return: {float(ep_ret)}, ep_len: {float(ep_len)}')

In [7]:
# Build the CartPole environment and read the network dimensions from its spaces.
env = gym.make("CartPole-v1")
obs_space, n_acts = env.observation_space.shape[0], env.action_space.n
hidden_sizes = [32] * 3
episodes = 5000

# Train a fresh agent, then save the learned policy parameters to disk.
agent = Agent(obs_space, hidden_sizes, n_acts)
agent.train(env, episodes=episodes, render=True)
agent.store(filepath="model.pth")

epoch: 0, loss: 10.143739700317383, return: 15.0, ep_len: 15.0
epoch: 1, loss: 9.922320365905762, return: 14.0, ep_len: 14.0
epoch: 2, loss: 14.675436973571777, return: 21.0, ep_len: 21.0
epoch: 3, loss: 9.3435697555542, return: 13.0, ep_len: 13.0
epoch: 4, loss: 17.64455795288086, return: 25.0, ep_len: 25.0
epoch: 5, loss: 11.702350616455078, return: 17.0, ep_len: 17.0
epoch: 6, loss: 24.76561737060547, return: 36.0, ep_len: 36.0
epoch: 7, loss: 10.776822090148926, return: 16.0, ep_len: 16.0
epoch: 8, loss: 10.773527145385742, return: 16.0, ep_len: 16.0
epoch: 9, loss: 7.13240385055542, return: 11.0, ep_len: 11.0
epoch: 10, loss: 29.818744659423828, return: 43.0, ep_len: 43.0
epoch: 11, loss: 9.254685401916504, return: 14.0, ep_len: 14.0
epoch: 12, loss: 8.373499870300293, return: 13.0, ep_len: 13.0
epoch: 13, loss: 7.1780104637146, return: 11.0, ep_len: 11.0
epoch: 14, loss: 66.8125991821289, return: 95.0, ep_len: 95.0
epoch: 15, loss: 22.891077041625977, return: 34.0, ep_len: 34.0
e

KeyboardInterrupt: 

In [None]:
# Restore the policy parameters from "model.pth" into the existing agent.
agent.load(filepath="model.pth")
# Re-create the environment with on-screen rendering and run the evaluation loop.
env = gym.make("CartPole-v1", render_mode="human")
agent.evaluate_agent(env=env)

Model loaded from model.pth
Episode: 0, Total Reward: 13.0
Episode: 1, Total Reward: 20.0
Episode: 2, Total Reward: 15.0
Episode: 3, Total Reward: 19.0
Episode: 4, Total Reward: 12.0
Episode: 5, Total Reward: 23.0
Episode: 6, Total Reward: 44.0
Episode: 7, Total Reward: 19.0
Episode: 8, Total Reward: 12.0
Episode: 9, Total Reward: 25.0
Episode: 10, Total Reward: 17.0
Episode: 11, Total Reward: 23.0
Episode: 12, Total Reward: 25.0
Episode: 13, Total Reward: 22.0
Episode: 14, Total Reward: 25.0
Episode: 15, Total Reward: 9.0
Episode: 16, Total Reward: 21.0
Episode: 17, Total Reward: 14.0
Episode: 18, Total Reward: 18.0
Episode: 19, Total Reward: 13.0
Episode: 20, Total Reward: 17.0
Episode: 21, Total Reward: 15.0
Episode: 22, Total Reward: 15.0
Episode: 23, Total Reward: 11.0
Episode: 24, Total Reward: 12.0
Episode: 25, Total Reward: 17.0
Episode: 26, Total Reward: 11.0
Episode: 27, Total Reward: 18.0
Episode: 28, Total Reward: 17.0
Episode: 29, Total Reward: 13.0
Episode: 30, Total Rewa