# REINFORCE Algorithm Implementation for CartPole

In this project, I implement the REINFORCE policy-gradient algorithm to train an agent to balance a pole on a cart in the CartPole-v1 environment from Gymnasium (the maintained successor to OpenAI Gym). The objective is to understand the core concepts of REINFORCE and to gain hands-on experience with reinforcement learning.

In [31]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import gymnasium as gym

In [32]:
class Policy_Network(nn.Module):
    """Fully connected network mapping an input vector to one output per unit.

    ``layer_sizes`` gives the width of every layer, input first and output
    last. Each consecutive pair of sizes becomes one ``nn.Linear`` followed
    by an activation module: ``activation`` after every hidden layer and
    ``output_activation`` after the final one.
    """

    def __init__(self, layer_sizes: list[int], activation=nn.Tanh, output_activation=nn.Identity):
        super().__init__()
        # Index of the last Linear layer; it is followed by the output activation.
        final_idx = len(layer_sizes) - 2
        modules = []
        for idx, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
            modules.append(nn.Linear(n_in, n_out))
            nonlinearity = output_activation if idx >= final_idx else activation
            modules.append(nonlinearity())
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the raw output."""
        return self.model(x)

In [33]:
class Agent:
    """Policy-gradient (REINFORCE-style) agent for discrete action spaces.

    The agent owns a ``Policy_Network`` that maps observations to action
    logits and an Adam optimizer over that network's parameters.
    """

    def __init__(self, obs_space, hidden_sizes, n_act, learning_rate=1e-3):
        """Build the policy network and its optimizer.

        Args:
            obs_space: Number of features in one observation.
            hidden_sizes: Iterable with the width of each hidden layer.
            n_act: Number of discrete actions (size of the logits vector).
            learning_rate: Step size passed to Adam.
        """
        # Unpacking accepts any iterable of hidden sizes, not only lists.
        layer_sizes = [obs_space, *hidden_sizes, n_act]
        self.policy_network = Policy_Network(layer_sizes)
        self.optimizer = Adam(self.policy_network.parameters(), lr=learning_rate)

    def get_policy(self, obs):
        """Return the categorical action distribution for ``obs``."""
        # Call the module itself (not .forward) so registered hooks still run.
        logits = self.policy_network(obs)
        return Categorical(logits=logits)

    def sample_action(self, obs):
        """Sample one action for a single observation and return it as an int.

        ``obs`` must yield exactly one sampled action (e.g. a batch of one),
        because the sample is converted with ``.item()``.
        """
        # Sampling is never differentiated; compute_loss re-evaluates the
        # policy with gradients enabled, so skip building a graph here.
        with torch.no_grad():
            return self.get_policy(obs).sample().item()

    def compute_loss(self, obs, act, weights):
        """Return the policy-gradient surrogate loss.

        Computes ``-mean(log_prob(act | obs) * weights)``.
        """
        logp = self.get_policy(obs).log_prob(act)
        return -(logp * weights).mean()

    def store(self, filepath):
        """Save the policy network's ``state_dict`` to ``filepath``."""
        torch.save(self.policy_network.state_dict(), filepath)
        print(f'Model saved to {filepath}')

    def load(self, filepath):
        """Load a ``state_dict`` from ``filepath`` and switch to eval mode."""
        # SECURITY: torch.load unpickles the file; only load checkpoints from
        # a trusted source (or pass weights_only=True where the installed
        # torch version supports it).
        self.policy_network.load_state_dict(torch.load(filepath))
        self.policy_network.eval()
        print(f'Model loaded from {filepath}')

    def evaluate_agent(self, env, num_episodes=100):
        """Roll out the current policy and report each episode's total reward.

        Actions are sampled from the policy (not taken greedily).

        Args:
            env: Environment exposing ``reset()`` and ``step(action)`` with
                the five-value step result used below.
            num_episodes: Number of episodes to run.

        Returns:
            List with the total reward of every episode, in order.
        """
        episode_rewards = []
        for episode in range(num_episodes):
            obs, info = env.reset()
            terminated = False
            truncated = False
            total_reward = 0

            while not terminated and not truncated:
                # Batch of one observation, cast to the network's float32.
                obs_tensor = torch.as_tensor(np.array([obs]), dtype=torch.float32)
                act = self.sample_action(obs_tensor)

                obs, reward, terminated, truncated, info = env.step(act)
                total_reward += reward

            episode_rewards.append(total_reward)
            print(f'Episode: {episode}, Total Reward: {total_reward}')
        return episode_rewards

    def train(self, env, episodes=500, render=False):
        """Train the policy with one gradient step per episode.

        Every step of an episode is weighted by that episode's total
        (undiscounted) return.

        Args:
            env: Environment exposing ``reset()`` and ``step(action)``.
            episodes: Number of episodes (and optimizer steps) to run.
            render: Accepted for backward compatibility; currently unused.

        Returns:
            List with the return of every training episode, in order.
        """
        # load() leaves the network in eval mode; make sure we train in
        # training mode.
        self.policy_network.train()
        episode_returns = []

        for episode in range(episodes):
            obs, info = env.reset()
            terminated = False
            truncated = False
            ep_obs, ep_acts, ep_rews = [], [], []

            while not terminated and not truncated:
                # Copy and cast to float32 so float64 observations do not
                # trigger a dtype mismatch inside the Linear layers.
                obs_arr = np.array(obs, dtype=np.float32)
                ep_obs.append(obs_arr)
                act = self.sample_action(torch.from_numpy(obs_arr).unsqueeze(0))
                ep_acts.append(act)
                obs, reward, terminated, truncated, info = env.step(act)
                ep_rews.append(reward)

            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            episode_returns.append(ep_ret)
            # Same weight (the episode return) for every step of the episode.
            ep_weights = [ep_ret] * ep_len

            # take a single policy gradient update step
            self.optimizer.zero_grad()
            episode_loss = self.compute_loss(
                obs=torch.as_tensor(np.array(ep_obs), dtype=torch.float32),
                act=torch.as_tensor(ep_acts, dtype=torch.int32),
                weights=torch.as_tensor(ep_weights, dtype=torch.float32),
            )
            episode_loss.backward()
            self.optimizer.step()

            print(f'epoch: {episode}, loss: {episode_loss.item()}, return: {float(ep_ret)}, ep_len: {float(ep_len)}')

        return episode_returns


In [34]:
# Train a fresh agent on CartPole-v1 and save the learned weights.
env = gym.make("CartPole-v1")
try:
    obs_space = env.observation_space.shape[0]
    hidden_sizes = [32, 32, 32, 32, 32, 32]
    n_acts = env.action_space.n
    agent = Agent(obs_space=obs_space, hidden_sizes=hidden_sizes, n_act=n_acts)
    episodes = 5000
    agent.train(env, episodes, True)
finally:
    # Release the environment's resources even if training is interrupted.
    env.close()
agent.store("model.pth")

epoch: 0, loss: 12.736475944519043, return: 19.0, ep_len: 19.0
epoch: 1, loss: 21.305572509765625, return: 30.0, ep_len: 30.0
epoch: 2, loss: 13.636045455932617, return: 21.0, ep_len: 21.0
epoch: 3, loss: 22.470333099365234, return: 31.0, ep_len: 31.0
epoch: 4, loss: 9.756717681884766, return: 13.0, ep_len: 13.0
epoch: 5, loss: 8.78268814086914, return: 13.0, ep_len: 13.0
epoch: 6, loss: 9.676324844360352, return: 13.0, ep_len: 13.0
epoch: 7, loss: 13.604188919067383, return: 20.0, ep_len: 20.0
epoch: 8, loss: 5.62244987487793, return: 9.0, ep_len: 9.0
epoch: 9, loss: 26.98796272277832, return: 39.0, ep_len: 39.0
epoch: 10, loss: 19.915212631225586, return: 30.0, ep_len: 30.0
epoch: 11, loss: 15.569891929626465, return: 22.0, ep_len: 22.0
epoch: 12, loss: 14.732205390930176, return: 21.0, ep_len: 21.0
epoch: 13, loss: 12.372334480285645, return: 19.0, ep_len: 19.0
epoch: 14, loss: 32.92118835449219, return: 46.0, ep_len: 46.0
epoch: 15, loss: 9.260339736938477, return: 13.0, ep_len: 13

In [35]:
# Restore the saved weights and run the policy in a rendered environment.
agent.load("model.pth")
env = gym.make("CartPole-v1", render_mode="human")
try:
    agent.evaluate_agent(env)
finally:
    # Close the environment so the render window is shut down when evaluation
    # finishes or is interrupted.
    env.close()

Model loaded from model.pth
Episode: 0, Total Reward: 13.0
Episode: 1, Total Reward: 20.0
Episode: 2, Total Reward: 15.0
Episode: 3, Total Reward: 19.0
Episode: 4, Total Reward: 12.0
Episode: 5, Total Reward: 23.0
Episode: 6, Total Reward: 44.0
Episode: 7, Total Reward: 19.0
Episode: 8, Total Reward: 12.0
Episode: 9, Total Reward: 25.0
Episode: 10, Total Reward: 17.0
Episode: 11, Total Reward: 23.0
Episode: 12, Total Reward: 25.0
Episode: 13, Total Reward: 22.0
Episode: 14, Total Reward: 25.0
Episode: 15, Total Reward: 9.0
Episode: 16, Total Reward: 21.0
Episode: 17, Total Reward: 14.0
Episode: 18, Total Reward: 18.0
Episode: 19, Total Reward: 13.0
Episode: 20, Total Reward: 17.0
Episode: 21, Total Reward: 15.0
Episode: 22, Total Reward: 15.0
Episode: 23, Total Reward: 11.0
Episode: 24, Total Reward: 12.0
Episode: 25, Total Reward: 17.0
Episode: 26, Total Reward: 11.0
Episode: 27, Total Reward: 18.0
Episode: 28, Total Reward: 17.0
Episode: 29, Total Reward: 13.0
Episode: 30, Total Rewa