# Ways to Speed up RL

In [2]:
%%bash
# Install the notebook's dependencies, but only when running on Google Colab.
# Colab is detected by the presence of the google/colab package directory;
# anywhere else the cell exits successfully without installing anything.
if ! stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1; then
    exit 0
fi
echo "Running on Google Colab, therefore installing dependencies..."

# The requirement specifier must be quoted: unquoted, the shell parses ">=0.7"
# as an output redirection, which drops the version bound and sends pip's
# output to a file named "=0.7".
pip install "ptan>=0.7" pytorch-ignite

Running on Google Colab, therefore installing dependencies...


## Imports and Hyperparameters

In [6]:
# flake8: noqa: E402,I001

import random
import warnings
from datetime import datetime, timedelta
from types import SimpleNamespace
from typing import Collection, Iterable, List, NamedTuple, Tuple

import gym
import numpy as np
import ptan
import torch
import torch.multiprocessing as mp
import torch.nn as nn
from ignite.contrib.handlers import tensorboard_logger as tb_logger
from ignite.engine import Engine
from ignite.metrics import RunningAverage
from ptan.actions import EpsilonGreedyActionSelector
from ptan.agent import DQNAgent, TargetNet
from ptan.experience import (
    ExperienceFirstLast,
    ExperienceReplayBuffer,
    ExperienceSourceFirstLast,
)
from ptan.ignite import (
    EndOfEpisodeHandler,
    EpisodeEvents,
    EpisodeFPSHandler,
    PeriodEvents,
    PeriodicEvents,
)

# Silence UserWarnings (used here to hide the missing-metrics warning)
warnings.simplefilter("ignore", category=UserWarning)

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE} to run computations.")

# RNG seed
SEED = 123

# Hyperparameters, exposed as attributes of a single namespace object
PARAMS = SimpleNamespace(
    env_name="PongNoFrameskip-v4",
    stop_reward=18.0,
    run_name="pong",
    log_period=10,
    replay_size=100_000,
    replay_initial=10_000,
    target_net_sync=1_000,
    epsilon_frames=100_000,
    epsilon_start=1.0,
    epsilon_final=0.02,
    learning_rate=1e-4,
    gamma=0.99,
    batch_size=32,
    batch_mul=4,
)

Using cuda to run computations.


## DQN & Loss Function
To demonstrate the performance optimization techniques, it is sufficient to use the baseline DQN from the previous chapter.

In [4]:
class DQN(nn.Module):
    """
    Convolutional Q-network: three Conv2d + ReLU layers followed by two dense layers.

    Args:
        input_shape: Shape of a single observation without the batch dimension.
            ``input_shape[0]`` is used as the number of input channels of the
            first convolution.
        n_actions: Number of outputs of the final linear layer.
    """

    def __init__(self, input_shape: Tuple[int, ...], n_actions: int) -> None:
        super().__init__()

        n_conv_inputs = input_shape[0]

        # 2D convolutional layers
        self.conv = nn.Sequential(
            nn.Conv2d(n_conv_inputs, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # The size of the flattened conv output depends on the input shape,
        # so it is measured rather than hard-coded
        n_fc_inputs = self._conv_output_dim(input_shape)

        # Dense layers
        self.fc = nn.Sequential(
            nn.Linear(n_fc_inputs, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _conv_output_dim(self, shape: Tuple[int, ...]) -> int:
        """
        Return the number of features the conv stack produces for one input of ``shape``.
        """
        # Throwaway probe with a batch of one: no autograd graph is needed, and
        # plain tensors replace the deprecated torch.autograd.Variable wrapper
        with torch.no_grad():
            dummy_conv_output = self.conv(torch.zeros(1, *shape, dtype=torch.float32))
        return int(dummy_conv_output.numel())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the network output for a batch of observations.

        The input is cast to float and divided by 256 before the convolutions
        (presumably byte-valued image frames -- confirm against the env wrappers).
        """
        inputs = x.float() / 256
        # Keep the batch dimension, flatten everything else for the dense layers
        conv_out = self.conv(inputs).flatten(start_dim=1)
        return self.fc(conv_out)


def unpack(batch: Collection[ExperienceFirstLast]) -> Tuple[np.ndarray, ...]:
    """
    Split a batch of experiences into per-field numpy arrays.

    Args:
        batch: Experiences exposing ``state``, ``action``, ``reward`` and
            ``last_state``; ``last_state is None`` marks a finished episode.

    Returns:
        A tuple ``(states, actions, rewards, last_states, dones)``. ``rewards``
        is float32 and ``dones`` is uint8; the other dtypes are inferred by numpy.
    """
    states, actions, rewards, last_states, dones = [], [], [], [], []

    # Unzip batch experiences into components
    #  - When an episode ends, the initial state stands in for the missing last state
    #  - This is ok because the result will be masked anyway
    for exp in batch:

        done = exp.last_state is None
        state = np.array(exp.state)
        last_state = state if done else np.array(exp.last_state)

        states.append(state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        last_states.append(last_state)
        dones.append(done)

    # Wrap all components into numpy and return a tuple
    #  - np.asarray is used instead of np.array(..., copy=False): on NumPy >= 2.0
    #    copy=False raises ValueError whenever a copy is required, which is
    #    always the case when building an array from a Python list
    return (
        np.asarray(states),
        np.array(actions),
        np.array(rewards, dtype=np.float32),
        np.asarray(last_states),
        np.array(dones, dtype=np.uint8),
    )


def dqn_loss(
    batch: Collection[ExperienceFirstLast],
    net: nn.Module,
    target_net: TargetNet,
    gamma: float,
    device: str = "cpu",
) -> torch.Tensor:
    """
    Mean squared TD error of ``net`` on a batch of experiences.

    Args:
        batch: Experiences to learn from, as accepted by ``unpack``.
        net: Network being trained; provides the Q values of the played actions.
        target_net: Provider of the fixed next-state values. Either a callable
            module, or a wrapper exposing the module as ``target_model``.
        gamma: Factor applied to the next-state values in the TD target.
        device: Device the batch tensors are moved to.

    Returns:
        Scalar loss tensor; gradients flow only through ``net``.
    """
    # Unwrap experience batch into components
    states, actions, rewards, next_states, dones = unpack(batch)

    # Turn them into tensors
    #  - gather() requires int64 indices, so the action dtype is pinned
    states = torch.tensor(states).to(device)
    next_states = torch.tensor(next_states).to(device)
    actions = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # NOTE(review): ptan's TargetNet wrapper appears to expose its network copy
    # as `.target_model` and not to be callable itself -- confirm against the
    # installed ptan version. Plain callables are used as they are.
    target_model = getattr(target_net, "target_model", target_net)

    # Compute Q values for all actions played in batch states
    q_values = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    # Do not add these operations to the computation graph for autograd
    with torch.no_grad():
        # Compute fixed values of future states using the target DQN
        #  - And zero out terminal states
        future_values = target_model(next_states).max(1)[0]
        future_values[done_mask] = 0.0

    # Compute TD targets and their MSE to current Q values
    td_targets = rewards + gamma * future_values
    return nn.functional.mse_loss(q_values, td_targets)

## Epsilon Schedule

In [5]:
class EpsilonScheduler:
    """
    Drives the ``epsilon`` attribute of an action selector along a linear decay.

    The value written to the selector is
    ``max(epsilon_final, epsilon_start - step / epsilon_frames)``, with the three
    constants read from ``params`` once at construction time.
    """

    def __init__(self, selector: EpsilonGreedyActionSelector, params: SimpleNamespace) -> None:
        self.selector = selector
        self.epsilon_start = params.epsilon_start
        self.epsilon_final = params.epsilon_final
        self.epsilon_frames = params.epsilon_frames

        # Put the selector at the step-0 value right away
        self.set_epsilon(step=0)

    def set_epsilon(self, step: int) -> None:
        """
        Write the epsilon that corresponds to ``step`` into the associated selector.
        """
        decayed = self.epsilon_start - step / self.epsilon_frames

        # Never drop below the configured floor
        if decayed > self.epsilon_final:
            self.selector.epsilon = decayed
        else:
            self.selector.epsilon = self.epsilon_final

## Play Process



In [7]:
class EpisodeEnded(NamedTuple):
    """
    Statistics record published to the experience queue when an episode finishes.
    """

    # Episode reward, as reported by the experience source
    reward: float
    # Episode step count, as reported by the experience source
    steps: int
    # Epsilon of the action selector at the moment the record was created
    epsilon: float


def play(
    net: DQN,
    exp_queue: mp.Queue,
    params: SimpleNamespace,
    device: str = DEVICE,
    seed: int = SEED,
) -> None:
    """
    Play the environment with ``net`` and stream the results into ``exp_queue``.

    Every experience yielded by the experience source is put on the queue,
    followed by one ``EpisodeEnded`` record for each episode the source reports
    as finished.

    Args:
        net: Network the DQN agent uses to choose actions.
        exp_queue: Queue receiving experiences and ``EpisodeEnded`` records.
        params: Hyperparameters; reads ``env_name``, ``epsilon_start``,
            ``gamma`` and ``batch_mul`` (plus whatever ``EpsilonScheduler`` reads).
        device: Device handed to the agent.
        seed: Value passed to ``env.seed``.
    """
    # TODO: make_atari and wrap_deepmind
    # Build the wrapped environment and seed it
    base_env = make_atari(params.env_name, skip_noop=True, skip_maxskip=True)
    env = wrap_deepmind(
        env=base_env,
        pytorch_img=True,
        frame_stack=True,
        frame_stack_count=2,
    )
    env.seed(seed)

    torch_device = torch.device(device)

    # Epsilon-greedy DQN agent feeding a first/last experience source
    selector = EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    scheduler = EpsilonScheduler(selector, params)
    dqn_agent = DQNAgent(net, selector, device=torch_device)
    exp_source = ExperienceSourceFirstLast(env, dqn_agent, gamma=params.gamma)

    for frame_idx, experience in enumerate(exp_source):

        # The epsilon schedule advances by 1 / batch_mul per experience
        scheduler.set_epsilon(step=frame_idx / params.batch_mul)

        # Hand the experience over to the consumer
        exp_queue.put(experience)

        # Then report every episode that has finished since the last check
        for finished in exp_source.pop_rewards_steps():
            reward, steps = finished
            exp_queue.put(EpisodeEnded(reward, steps, selector.epsilon))

## Training Loop