# Ways to Speed up RL

In [1]:
%%bash
# Leave early unless the Google Colab package directory exists on this machine
!(stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1) && exit
echo "Running on Google Colab, therefore installing dependencies..."

# The requirement must be quoted: an unquoted `>=0.7` is parsed by the shell
# as a redirection of stdout into a file named "=0.7", which silently drops
# the version constraint and hides pip's output
pip install "ptan>=0.7" pytorch-ignite

## Imports and Hyperparameters

In [2]:
# flake8: noqa: E402,I001

import random
import warnings
from datetime import datetime, timedelta
from types import SimpleNamespace
from typing import Any, Collection, Dict, Iterable, List, NamedTuple, Union

import gym
import numpy as np
import ptan
import torch
import torch.nn as nn
from ignite.contrib.handlers import tensorboard_logger as tb_logger
from ignite.engine import Engine
from ignite.metrics import RunningAverage
from ptan.actions import EpsilonGreedyActionSelector
from ptan.agent import DQNAgent, TargetNet
from ptan.experience import (
    ExperienceFirstLast,
    ExperienceReplayBuffer,
    ExperienceSource,
    ExperienceSourceFirstLast,
    ExperienceSourceRollouts,
)
from ptan.ignite import (
    EndOfEpisodeHandler,
    EpisodeEvents,
    EpisodeFPSHandler,
    PeriodEvents,
    PeriodicEvents,
)

# Type alias covering the PTAN experience source classes imported above;
# used to annotate the `exp_source` argument of `setup_ignite`
ExpSource = Union[
    ExperienceSource,
    ExperienceSourceFirstLast,
    ExperienceSourceRollouts,
]


# Seed for the random number generators
SEED = 123


# Training configuration per Atari game, keyed by a short game name
HYPERPARAMS = {
    "pong": SimpleNamespace(
        env_name="PongNoFrameskip-v4",
        # Average reward bound at which training is considered solved
        stop_reward=18.0,
        # Becomes part of the TensorBoard log directory name
        run_name="pong",
        # Progress is printed once every `log_period` episodes
        log_period=10,
        replay_size=100_000,
        replay_initial=10_000,
        target_net_sync=1_000,
        # Epsilon decays linearly by 1 / epsilon_frames per iteration
        epsilon_frames=100_000,
        epsilon_start=1.0,
        epsilon_final=0.02,
        learning_rate=1e-4,
        gamma=0.99,
        batch_size=32,
    ),
}

## Experience Batches

In [3]:
def batch_stream(
    buffer: ExperienceReplayBuffer,
    initial: int,
    batch_size: int,
) -> Iterable[List[ExperienceFirstLast]]:
    """
    Endless generator of training batches sampled from a replay buffer.

    The buffer is first populated with `initial` experiences. After that,
    every iteration adds one more experience to the buffer and yields a
    sample of `batch_size` experiences drawn from it.
    """
    warmup_steps, steps_per_batch = initial, 1

    # Warm-up phase, runs once when the generator is first advanced
    buffer.populate(warmup_steps)

    while True:
        buffer.populate(steps_per_batch)
        sampled = buffer.sample(batch_size)
        yield sampled


class ExperienceBatch(NamedTuple):
    """
    Batch of transitions split into one array per component.

    Instances are assembled by `unpack_batch`; entry `i` of every field
    belongs to the same transition.
    """

    # States the transitions started from
    states: np.ndarray
    # Actions taken in `states`
    actions: np.ndarray
    # Rewards of the transitions (float32 when built by `unpack_batch`)
    rewards: np.ndarray
    # States the transitions ended in; `unpack_batch` stores the start state
    # as a placeholder for transitions that have no last state
    last_states: np.ndarray
    # Flags (uint8 when built by `unpack_batch`) marking transitions that
    # have no last state
    dones: np.ndarray


def unpack_batch(batch: Collection[ExperienceFirstLast]) -> ExperienceBatch:
    """
    Unzip a batch of transitions into one numpy array per component.

    A transition whose `last_state` is `None` is flagged in `dones`. Its own
    `state` is stored in `last_states` as a placeholder so that all rows have
    the same shape; consumers are expected to mask those rows via `dones`
    (as `dqn_loss` does).
    """
    states, actions, rewards, last_states, dones = [], [], [], [], []

    # Unzip batch experiences into components
    #  - When an episode ends, last_state is replaced by the initial state
    #  - This is ok because the result will be masked anyway
    for exp in batch:
        done = exp.last_state is None
        state = np.array(exp.state)
        last_state = state if done else np.array(exp.last_state)

        states.append(state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        last_states.append(last_state)
        dones.append(done)

    # Wrap all components into numpy and a named tuple
    #  - `np.asarray` is used instead of `np.array(..., copy=False)`: since
    #    NumPy 2.0 the latter raises a ValueError whenever a copy cannot be
    #    avoided, which is always the case when stacking a list of arrays
    return ExperienceBatch(
        states=np.asarray(states),
        actions=np.array(actions),
        rewards=np.array(rewards, dtype=np.float32),
        last_states=np.asarray(last_states),
        dones=np.array(dones, dtype=np.uint8),
    )

## Loss function

In [4]:
def dqn_loss(
    batch: Collection[ExperienceFirstLast],
    net: nn.Module,
    target_net: TargetNet,
    gamma: float,
    device: str = "cpu",
) -> torch.Tensor:
    """
    Compute the one-step TD loss of a DQN on a batch of transitions.

    Args:
        batch: Transitions to train on.
        net: Network being trained; maps states to Q values per action.
        target_net: Source of the bootstrap values. Either a PTAN
            `TargetNet` wrapper, in which case its `target_model` is
            evaluated, or a network that can be called directly.
        gamma: Discount factor applied to the bootstrap values.
        device: Device the batch tensors are moved to.

    Returns:
        Scalar tensor with the mean squared error between the Q values of
        the played actions and their TD targets.
    """
    # Unwrap experience batch into components
    exp_batch = unpack_batch(batch)

    # Turn them into tensors
    states = torch.tensor(exp_batch.states).to(device)
    next_states = torch.tensor(exp_batch.last_states).to(device)
    actions = torch.tensor(exp_batch.actions).to(device)
    rewards = torch.tensor(exp_batch.rewards).to(device)
    done_mask = torch.BoolTensor(exp_batch.dones).to(device)

    # PTAN's `TargetNet` wrapper is not callable itself, it keeps the frozen
    # copy of the network in `target_model`; anything without that attribute
    # is assumed to be directly callable and is used as is
    target_model = getattr(target_net, "target_model", target_net)

    # Compute Q values for all actions played in batch states
    q_values = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    # Do not add these operations to the computation graph for autograd
    with torch.no_grad():
        # Compute fixed values of future states using the target DQN
        #  - And zero out terminal states
        future_values = target_model(next_states).max(1)[0]
        future_values[done_mask] = 0.0

    # Compute TD targets and their MSE to current Q values
    #  - `future_values` was created under `no_grad`, so it is already
    #    detached from the autograd graph
    td_targets = rewards + gamma * future_values
    return nn.MSELoss()(q_values, td_targets)


@torch.no_grad()
def compute_state_values(
    states: np.ndarray,
    net: nn.Module,
    device: str = "cpu",
) -> float:
    """
    Estimate the average value of the given states under `net`.

    The states are split into 64 chunks along the first axis. For every
    chunk the best Q value of each state is averaged, and the mean of those
    chunk averages is returned.

    Args:
        states: Array of states, stacked along the first axis.
        net: Network mapping a batch of states to Q values per action.
        device: Device the chunks are moved to before evaluation.

    Returns:
        Mean of the per-chunk average best Q values, or NaN when `states`
        is empty.
    """
    mean_values = []

    for chunk in np.array_split(states, 64):
        # `array_split` pads with empty chunks when there are fewer than 64
        # states; the mean over an empty tensor is NaN and would turn the
        # whole result into NaN, so those chunks are skipped
        if len(chunk) == 0:
            continue

        q_values = net(torch.tensor(chunk).to(device))
        best_q_values = q_values.max(1)[0]
        mean_values.append(best_q_values.mean().item())

    # Nothing to average over: report NaN without numpy's empty-mean warning
    if not mean_values:
        return np.float64(np.nan)

    return np.mean(mean_values)

## Epsilon Schedule

In [5]:
class EpsilonScheduler:
    """
    Linear decay schedule for the epsilon of an epsilon-greedy selector.

    Epsilon starts at `params.epsilon_start`, decreases by
    `1 / params.epsilon_frames` per iteration and never drops below
    `params.epsilon_final`.
    """

    def __init__(
        self,
        selector: EpsilonGreedyActionSelector,
        params: SimpleNamespace,
    ) -> None:
        self.selector = selector
        self.params = params
        # Put the selector into the state of the very first iteration
        self.set_epsilon(t=0)

    def set_epsilon(self, t: int) -> None:
        """
        Write the scheduled epsilon of iteration t into the selector.
        """
        schedule = self.params
        decayed = schedule.epsilon_start - t / schedule.epsilon_frames
        self.selector.epsilon = max(schedule.epsilon_final, decayed)

## Ignite Handlers

In [6]:
def setup_ignite(
    engine: Engine,
    params: SimpleNamespace,
    exp_source: ExpSource,
    run_name: str,
    extra_metrics: Iterable[str] = (),
) -> None:
    """
    Attach episode tracking, console logging and TensorBoard logging
    to an Ignite engine.

    Args:
        engine: Engine the handlers and metrics are attached to.
        params: Hyperparameters; `stop_reward`, `log_period` and `run_name`
            are read here.
        exp_source: Experience source handed to `EndOfEpisodeHandler`.
        run_name: Suffix of the TensorBoard log directory, which is named
            `runs/<timestamp>-<params.run_name>-<run_name>`.
        extra_metrics: Additional metric names written under the "train"
            tag next to "avg_loss" and "avg_fps".

    Side effects:
        Installs a process-wide filter that ignores every `UserWarning`
        and creates a new TensorBoard logger on each call.
    """
    # Get rid of missing metrics warning
    #  - NOTE: this silences all UserWarnings globally, not just Ignite's
    warnings.simplefilter("ignore", category=UserWarning)

    # Register some PTAN handlers to the Ignite engine
    handler = EndOfEpisodeHandler(
        exp_source, bound_avg_reward=params.stop_reward
    )
    handler.attach(engine)
    EpisodeFPSHandler().attach(engine)

    # Log training progress once every `params.log_period` episodes
    @engine.on(EpisodeEvents.EPISODE_COMPLETED(every=params.log_period))
    def episode_completed(trainer: Engine) -> None:
        # Metrics fall back to 0 when they have not been published yet
        #  - presumably provided by EpisodeFPSHandler - TODO confirm
        passed = trainer.state.metrics.get("time_passed", 0)
        elapsed = timedelta(seconds=int(passed))
        avg_fps = trainer.state.metrics.get("avg_fps", 0)
        print(
            f"Episode {trainer.state.episode}: "
            f"reward={trainer.state.episode_reward:.0f}, "
            f"steps={trainer.state.episode_steps}, "
            f"speed={avg_fps:.1f} fps, "
            f"elapsed={elapsed}"
        )

    # Log and terminate when good enough solution has been found
    @engine.on(EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine) -> None:
        # Unlike above, a missing "time_passed" metric raises a KeyError here
        passed = trainer.state.metrics["time_passed"]
        elapsed = timedelta(seconds=int(passed))
        print(
            f"Game solved in {elapsed}, "
            f"after {trainer.state.episode} episodes "
            f"and {trainer.state.iteration} iterations!"
        )
        # Ask the engine to stop the run
        trainer.should_terminate = True

    # Setup TensorBoard logger with fresh logging directory
    #  - Colons are stripped from the minute-resolution timestamp
    now = datetime.now().isoformat(timespec="minutes").replace(":", "")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)

    # Register smoothened loss as a metric
    #  - Expects the engine's step output to be a mapping with a "loss" key
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    # Record following metrics at the end of each episode
    metrics = ["reward", "steps", "avg_reward"]
    tb.attach(
        engine=engine,
        log_handler=tb_logger.OutputHandler(
            tag="episodes", metric_names=metrics
        ),
        event_name=EpisodeEvents.EPISODE_COMPLETED,
    )

    # Write other metrics to TensorBoard every 100 iterations
    PeriodicEvents().attach(engine)

    # The identity transform forwards the engine's output to the logger as is
    handler = tb_logger.OutputHandler(
        tag="train",
        metric_names=["avg_loss", "avg_fps"] + list(extra_metrics),
        output_transform=lambda a: a,
    )

    tb.attach(
        engine=engine,
        log_handler=handler,
        event_name=PeriodEvents.ITERS_100_COMPLETED,
    )

## Environment

In [9]:
# Prefer the GPU whenever PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} to run computations.")

# Pick the hyperparameter set registered for Atari Pong
params = HYPERPARAMS["pong"]
print(f"The Atari environment is '{params.env_name}'.")

Using cpu to run computations.
The Atari environment is 'PongNoFrameskip-v4'.


## DQN

In [10]:
from typing import Tuple  # noqa


class DQN(nn.Module):
    """
    Convolutional Q network for image observations.

    Three convolutional layers extract features from the input frames and
    two fully connected layers turn them into one Q value per action.
    """

    def __init__(self, input_shape: Tuple[int, ...], n_actions: int) -> None:
        """
        Build the network.

        Args:
            input_shape: Shape of a single observation; its first entry is
                used as the number of input channels.
            n_actions: Number of Q values produced per observation.
        """
        super().__init__()

        in_channels = input_shape[0]

        # Feature extractor: 2D convolutions, each followed by a ReLU
        feature_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # The size of the flattened features depends on the input shape
        flat_features = self._conv_output_dim(input_shape)

        # Regression head: emits Q(., a) for every action a
        head_layers = [
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def _conv_output_dim(self, shape: Tuple[int, ...]) -> int:
        """
        Count the values the convolutional stack emits for one observation
        of the given shape, by pushing an all-zero probe through it.
        """
        probe = torch.zeros((1,) + tuple(shape), dtype=torch.float32)
        return int(np.prod(self.conv(probe).size()))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Map a batch of observations to Q values of shape (batch, n_actions).
        """
        # Inputs are converted to float and divided by 256
        scaled = x.float() / 256
        features = self.conv(scaled)
        # Keep the batch dimension and flatten all feature dimensions
        flat = features.view(scaled.size()[0], -1)
        return self.fc(flat)

## TensorBoard

## Training Loop