<a href="https://colab.research.google.com/github/murphybrendan/ml-courses/blob/main/huggingface/deep-rl/unit4/reinforce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [1]:
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git git+https://github.com/simoninithomas/gym-games huggingface_hub imageio-ffmpeg

Collecting git+https://github.com/ntasfi/PyGame-Learning-Environment.git
  Cloning https://github.com/ntasfi/PyGame-Learning-Environment.git to /tmp/pip-req-build-cedo2lfb
  Running command git clone --filter=blob:none --quiet https://github.com/ntasfi/PyGame-Learning-Environment.git /tmp/pip-req-build-cedo2lfb
  Resolved https://github.com/ntasfi/PyGame-Learning-Environment.git to commit 3dbe79dc0c35559bb441b9359948aabf9bb3d331
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting git+https://github.com/simoninithomas/gym-games
  Cloning https://github.com/simoninithomas/gym-games to /tmp/pip-req-build-oe5l4_8q
  Running command git clone --filter=blob:none --quiet https://github.com/simoninithomas/gym-games /tmp/pip-req-build-oe5l4_8q
  Resolved https://github.com/simoninithomas/gym-games to commit f31695e4ba028400628dc054ee8a436f28193f0b
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ple, gym-games
  Building wheel for ple (set

In [2]:
import numpy as np

from collections import deque

import matplotlib.pyplot as plt
%matplotlib inline

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym
import gym
import gym_pygame

# Hugging Face Hub
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.
import imageio

In [3]:
# Run on the first CUDA GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [15]:
from dataclasses import dataclass, field

@dataclass
class TrainingArgs:
    """Hyperparameters and settings for one REINFORCE training run."""

    # Number of episodes `ReinforceAgent.train` iterates over.
    n_training_episodes: int
    # Upper bound on the number of environment steps taken in a single episode.
    max_t: int
    # Discount factor applied when accumulating rewards into returns.
    gamma: float
    # Learning rate handed to the Adam optimizer.
    lr: float
    # Environment id passed to `gym.make`.
    env_id: str
    # Hidden layer widths forwarded to `ReinforcePolicy`; empty by default.
    hidden_layers: list[int] = field(default_factory=list)
    # Update the progress-bar description every this many episodes; values <= 0 disable it.
    reporting_interval: int = -1

  and should_run_async(code)


# Implement Reinforce

## The Network

In [6]:
class ReinforcePolicy(nn.Module):
    """Stochastic policy for REINFORCE: an MLP mapping a state to action probabilities.

    Args:
        state_dim: Length of the flat state vector fed to the first layer.
        action_dim: Number of discrete actions (size of the output distribution).
        hidden_layers: Widths of the hidden ``Linear`` + ``ReLU`` layers, in order.
            An empty sequence (or ``None``) yields a single linear layer from
            state to action logits.
    """

    def __init__(self, state_dim, action_dim, hidden_layers=(64, 128)):
        # nn.Module must be initialised before any sub-module can be assigned to it.
        super().__init__()
        # Build a fresh list so neither the default nor a caller's sequence is shared.
        widths = [state_dim, *(hidden_layers or ())]
        modules = []
        # Pair each width with its successor; pairing avoids indexing past the end.
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(in_features, out_features))
            modules.append(nn.ReLU())
        # Output layer produces one logit per action.
        modules.append(nn.Linear(widths[-1], action_dim))
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Return action probabilities: softmax over the last dimension of the logits."""
        logits = self.layers(x)
        return F.softmax(logits, dim=-1)

    def act(self, state: np.ndarray) -> tuple[int, torch.Tensor]:
        """Sample an action for a single state.

        Args:
            state: One unbatched state; a batch dimension of size 1 is added here.

        Returns:
            ``(action, log_prob)`` where ``action`` is the sampled action index and
            ``log_prob`` is a shape ``(1,)`` tensor holding its log-probability,
            still attached to the autograd graph so a loss can be built from it.
        """
        # Follow the module's own parameters rather than a notebook-level global,
        # so the input always lands on the same device as the weights.
        param_device = next(self.parameters()).device
        state = torch.from_numpy(np.asarray(state)).float().unsqueeze(0).to(param_device)
        action_probs = self.forward(state)
        m = Categorical(action_probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

  and should_run_async(code)


## The Agent

In [17]:
from tqdm.notebook import trange

class ReinforceAgent:
    """Trains a ``ReinforcePolicy`` on one Gym environment with Monte-Carlo REINFORCE.

    Args:
        env: Environment to interact with. If ``None``, one is created with
            ``gym.make(args.env_id)``.
        args: Hyperparameters for the run.
    """

    def __init__(self, env: gym.Env, args: TrainingArgs):
        self.args = args

        # Size the network from the same environment that is stepped during training,
        # so the observation/action dimensions cannot disagree with the rollouts.
        self.env = env if env is not None else gym.make(args.env_id)
        self.policy = ReinforcePolicy(
            self.env.observation_space.shape[0],
            self.env.action_space.n,
            args.hidden_layers,
        ).to(device)
        self.optim = torch.optim.Adam(self.policy.parameters(), lr=args.lr)

        # Per-episode buffers and the running score history; created here so
        # generate_episode() / optimization_step() also work outside train().
        self._rewards = []
        self._log_probs = []
        self._scores = []

    def generate_episode(self):
        """Roll out one episode (at most ``args.max_t`` steps) with the current policy.

        Stores the per-step rewards and action log-probabilities on the instance
        and appends the undiscounted episode score to ``self._scores``.
        """
        self._rewards = []
        self._log_probs = []

        reset_out = self.env.reset()
        # Newer Gym releases return (observation, info) from reset(); older ones
        # return only the observation. Assumes observations are not tuples themselves.
        state = reset_out[0] if isinstance(reset_out, tuple) else reset_out

        for _ in range(self.args.max_t):
            # act() samples an action and returns its log-probability; calling the
            # module directly would only return the probability vector.
            action, log_prob = self.policy.act(state)
            self._log_probs.append(log_prob)

            step_out = self.env.step(action)
            if len(step_out) == 5:
                # Newer Gym API: (obs, reward, terminated, truncated, info).
                state, reward, terminated, truncated, _ = step_out
                done = terminated or truncated
            else:
                # Older Gym API: (obs, reward, done, info).
                state, reward, done, _ = step_out

            self._rewards.append(reward)
            if done:
                break
        self._scores.append(sum(self._rewards))

    def optimization_step(self):
        """Take one policy-gradient step using the most recently generated episode."""
        if not self._log_probs:
            # Nothing was collected (e.g. max_t == 0); there is no gradient to take.
            return

        # Discounted returns G_t = r_t + gamma * G_{t+1}, accumulated from the last step back.
        returns = []
        running_return = 0.0
        for reward in reversed(self._rewards):
            running_return = reward + self.args.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        assert len(returns) == len(self._log_probs)

        # Force float32 so integer rewards do not produce an integer tensor
        # (mean/std are undefined for those), and match the log-probs' device.
        returns = torch.tensor(
            returns, dtype=torch.float32, device=self._log_probs[0].device
        )

        # Normalize the returns. The sample std of a single value is NaN, which would
        # poison the weights, so one-step episodes keep their raw return.
        if returns.numel() > 1:
            eps = np.finfo(np.float32).eps.item()
            returns = (returns - returns.mean()) / (returns.std() + eps)

        # REINFORCE loss: sum over steps of -log pi(a_t | s_t) * G_t.
        log_probs = torch.stack(self._log_probs).reshape(-1)
        loss = -(log_probs * returns).sum()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def train(self):
        """Run ``args.n_training_episodes`` episodes, optimizing after each one."""
        self._scores = []
        progress_bar = trange(self.args.n_training_episodes)
        for i in progress_bar:
            self.generate_episode()
            self.optimization_step()

            if self.args.reporting_interval > 0 and i % self.args.reporting_interval == 0:
                progress_bar.set_description(f"Episode {i} Average Score: {np.mean(self._scores):.2f}")

# Run environments

## Cartpole

In [None]:
# Hyperparameters for the CartPole-v1 run.
cartpole_args = TrainingArgs(
    env_id="CartPole-v1",
    n_training_episodes=1000,
    max_t=1000,
    gamma=1.0,
    lr=1e-2,
    hidden_layers=[16, 32],
    reporting_interval=100
)

# ReinforceAgent's constructor takes (env, args); passing only the args raises a TypeError.
agent = ReinforceAgent(gym.make(cartpole_args.env_id), cartpole_args)
agent.train()

## PixelCopter

In [None]:
# Hyperparameters for the Pixelcopter run, built the same way as the CartPole ones.
# (A `{name = value}` literal is not valid Python.)
pixelcopter_args = TrainingArgs(
    env_id="Pixelcopter-PLE-v0",
    n_training_episodes=50000,
    max_t=10000,
    gamma=0.99,
    lr=1e-4,
    hidden_layers=[64, 128],
    reporting_interval=1000,
)