In [None]:

# Mount google drive
# from google.colab import drive
# drive.mount('/content/drive')

# # Install gym
# !pip install gym==0.25

# # Downgrade to lower numpy
# !pip install numpy==1.23.5

# Make the directory
!mkdir -p video

# REINFORCE on CartPole-v1

The algorithm is given on slide 53:

**REINFORCE** is an acronym for "**RE**ward **I**ncrement = **N**onnegative **F**actor * **O**ffset **R**einforcement * **C**haracteristic **E**ligibility".

The algorithm ([Williams, 1992](https://link.springer.com/content/pdf/10.1007/BF00992696.pdf)) is a Monte Carlo variant of the policy gradient method in RL. The agent collects the trajectory of a complete episode by following the current policy. This policy depends on the policy parameters, which are denoted as $\theta$.

The tools that are used for this demo are:

1. Google Colab
2. PyTorch
3. OpenAI Gym CartPole-v1




In [None]:
# Import required methods

import gym  # OpenAI Gym: toolkit of reinforcement-learning environments
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
torch.manual_seed(0)

import base64, io, os

# For visualization
from gym.wrappers import RecordVideo
from IPython.display import HTML
from IPython import display
import glob

In [None]:
# Select the compute device: the first CUDA GPU when PyTorch reports one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression on the last line so the notebook displays which device was selected.
device

### Instantiate the Environment and Agent

The CartPole environment is very simple: it has a discrete action space with 2 actions and a 4-dimensional state space.

In [None]:
# Create the CartPole-v1 environment; later cells read this module-level `env`.
env = gym.make('CartPole-v1') # Reference: https://gymnasium.farama.org/environments/classic_control/cart_pole/
# Seed the environment with a fixed value.
# NOTE(review): `env.seed` looks like the older gym seeding API — confirm it is still supported by the installed gym version.
env.seed(0)

# Change the default maximum steps
# This writes a private attribute on whatever object gym.make returned.
# NOTE(review): presumably that object is the time-limit wrapper — verify for the installed gym version.
env._max_episode_steps = 1000

# Show the observation and action spaces for reference.
print('observation space:', env.observation_space)
print('action space:', env.action_space)

### Define Policy
Unlike value-based methods, a policy-based method outputs the probability of each action, i.e. the policy itself. The activation function of the output layer is therefore softmax, not ReLU.

In [None]:
class Policy(nn.Module):
    """Stochastic policy: a two-layer MLP mapping a state to action probabilities.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        hidden_size: Width of the single hidden layer.
    """

    def __init__(self, state_size=4, action_size=2, hidden_size=32):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        """Return the action probabilities for `state`.

        The softmax is taken over the last axis, so both a single state of
        shape (state_size,) and a batch of shape (batch, state_size) work.
        """
        hidden = F.relu(self.fc1(state))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=-1)

    def act(self, state):
        """Sample an action for a single state.

        Args:
            state: One state as a NumPy array (or anything `torch.as_tensor`
                accepts) with `state_size` elements.

        Returns:
            (action, log_prob): the sampled action as a Python int and its
            log-probability as a tensor of shape (1,) that still carries the
            autograd graph needed for the policy-gradient update.
        """
        # Follow the network's own device instead of a notebook-level global,
        # so the method works wherever the module has been moved.
        param_device = next(self.parameters()).device
        # Add a leading batch axis of size 1.
        state = torch.as_tensor(state, dtype=torch.float32, device=param_device).unsqueeze(0)
        probs = self.forward(state).cpu()
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

### REINFORCE

In [None]:
def _discounted_returns(rewards, gamma):
    """Return the reward-to-go G_t = sum_k gamma**k * r_(t+k) for every step t."""
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    # Walk backwards so each return reuses the next one: G_t = r_t + gamma * G_(t+1).
    # Unlike dividing a cumulative sum by gamma**t, this stays finite for gamma == 0.
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def reinforce(policy, optimizer, n_episodes=1000, max_t=1000, gamma=0.99, print_every=100, pass_score=200):
    """Train `policy` with the REINFORCE (Monte Carlo policy gradient) algorithm.

    Episodes are rolled out on the module-level `env`, which must already exist
    and use the 4-tuple step API `(state, reward, done, info)`.

    Args:
        policy: Object whose `act(state)` returns `(action, log_prob)`, with
            `log_prob` a 1-element tensor attached to the autograd graph.
        optimizer: Optimizer over the policy parameters (`zero_grad` / `step`).
        n_episodes: Maximum number of training episodes (inclusive).
        max_t: Maximum number of environment steps per episode.
        gamma: Discount factor applied to future rewards.
        print_every: Print the windowed average score every this many episodes.
        pass_score: Training stops early once every score in a full window of
            the most recent 20 episodes exceeds this value.

    Returns:
        (scores_by_eps, scores_smooth): the total reward of each episode and
        the running mean of the most recent (up to 20) episode scores.
    """
    scores_window = deque(maxlen=20)
    scores_by_eps = []
    scores_smooth = []

    # Episodes are numbered 1..n_episodes inclusive.
    for e in range(1, n_episodes + 1):

        # Per-episode logs of action log-probabilities and rewards.
        lnpis_buf = []
        rewards_buf = []

        # Get state at time 0
        state = env.reset()

        # Collect one trajectory by sampling actions from the current policy.
        for t in range(max_t):
            action, lnpi = policy.act(state)
            lnpis_buf.append(lnpi)
            state, reward, done, _ = env.step(action)
            rewards_buf.append(reward)
            if done:
                break

        # Record the undiscounted episode score and its running average.
        episode_score = sum(rewards_buf)
        scores_window.append(episode_score)
        scores_by_eps.append(episode_score)
        scores_smooth.append(np.mean(scores_window))

        # An empty trajectory (max_t == 0) carries no gradient signal.
        if lnpis_buf:
            returns_buf = _discounted_returns(rewards_buf, gamma)

            # The optimizer minimises, so negate the objective sum_t log pi(a_t|s_t) * G_t
            # to perform gradient ascent on it.
            log_probs = torch.cat(lnpis_buf)
            returns_t = torch.as_tensor(returns_buf, dtype=log_probs.dtype, device=log_probs.device)
            policy_loss = -(log_probs * returns_t).sum()

            # Backpropagation
            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if e % print_every == 0:
            print(f'Episode {e:5d}. Average Score: {np.mean(scores_window):.2f}')

        # Only a completely filled window counts as solved; otherwise a single
        # lucky episode would end training.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and all(num > pass_score for num in scores_window):
            # Report the number of episodes that preceded the passing window.
            print(f'Environment solved in {e - scores_window.maxlen:d} episodes!\tAverage Score: {np.mean(scores_window):.2f}')
            break

    return scores_by_eps, scores_smooth

### Run

In [None]:
# Score threshold handed to reinforce() as its early-stopping criterion.
pass_score = 400

# Fresh policy network on the selected device, optimised with Adam (learning rate 1e-2).
policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
# Train with n_episodes=2000 and at most 1000 steps per episode; keep the raw
# and smoothed per-episode scores for the plot below.
scores, scores_smooth = reinforce(policy, optimizer, max_t=1000, n_episodes=2000, pass_score=pass_score)

### Plot the learning progress

In [None]:
# Plot the per-episode scores together with their running average.
plt.rcParams.update({"font.size": 18})  # Applies to all text elements

# Use the explicit Axes interface so every call targets this figure.
fig, ax = plt.subplots()
ax.plot(scores, linewidth=1, color='b', label='Episode score')
ax.plot(scores_smooth, linewidth=2, color='r', label='Running average')
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
ax.set_title('REINFORCE learning progress')
ax.grid(True)
ax.legend()
plt.show()

### Animate it with Video

In [None]:
import warnings
# Silence every Python warning from this point on.
# NOTE(review): this is a blanket filter, so it also hides warnings unrelated to video recording.
warnings.filterwarnings("ignore")

# Upper bound on the number of environment steps in the recorded playback episode.
max_play_t = 1000

def show_video(env_name):
    """Embed the most recent recorded MP4 for `env_name` in the notebook output.

    Looks under `video/<env_name>/` first and falls back to any MP4 below
    `video/`. When several recordings exist, the most recently modified one is
    shown. Prints a message when no video is found.

    Args:
        env_name: Environment id whose recording directory is searched first.
    """
    env_pattern = os.path.join('video', glob.escape(env_name), '**', '*.mp4')
    mp4list = glob.glob(env_pattern, recursive=True)
    if not mp4list:
        # Fall back to any recording so videos stored elsewhere under video/ still show.
        mp4list = glob.glob('video/**/*.mp4', recursive=True)

    if len(mp4list) > 0:
        # Pick the newest file rather than an arbitrary (possibly stale) one.
        mp4 = max(mp4list, key=os.path.getmtime)
        # Read-only access is enough; the context manager closes the handle.
        with open(mp4, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        display.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

def play_model(policy, env_name, max_play_t):
    """Run one episode with `policy` and record it to `video/<env_name>`.

    Args:
        policy: Object whose `act(state)` returns `(action, log_prob)`.
        env_name: Gym environment id to create and record.
        max_play_t: Maximum number of environment steps to play.

    The environment is always closed, even when playback raises, so the video
    recorder gets the chance to finish writing its file.
    """
    env = gym.make(env_name)
    try:
        env._max_episode_steps = max_play_t
        env = RecordVideo(env, f"video/{env_name}", new_step_api=True)
        state = env.reset()
        terminated = False
        truncated = False
        # Playback only needs sampled actions, so skip building autograd graphs.
        with torch.no_grad():
            for t in range(max_play_t):
                action, _ = policy.act(state)
                state, _, terminated, truncated, _ = env.step(action)
                if terminated or truncated:
                    print("Done by termination or truncation:", terminated, truncated, t)
                    break
            else:
                # Reached only when the loop never hit `break`, so the ending is
                # reported exactly once.
                print("Stopped after max_play_t steps without termination or truncation:", max_play_t)
    finally:
        env.close()
    
# Play the trained policy on CartPole-v1 with video recording, then embed a recorded video in the notebook output.
play_model(policy, 'CartPole-v1', max_play_t)
show_video('CartPole-v1')