# Train CNN-based DQN with History Buffer
Largely inspired by DeepMind's Atari paper, I want to build a deep Q-network that uses a CNN to encode each frame and feeds the resulting fixed-length encodings to an RNN.

In [1]:
import gfootball.env as football_env
import matplotlib.pyplot as plt
import torch
from torch import nn
from collections import deque
import random
from tqdm.notebook import tqdm_notebook
import copy
import numpy as np

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [3]:
# Record the GPU model, driver/CUDA versions and current memory usage for this run.
!nvidia-smi

Thu Oct 29 04:23:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    92W / 149W |     11MiB / 11441MiB |     55%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

Import custom modules from src

In [4]:
import sys
sys.path.append('../src')

%load_ext autoreload
%autoreload 2
import dqn_utils as dq
import training_ground as tg

#### History buffer
The `HistoryBuffer` class is already set up to handle the `(72, 96, 4)` pixel representation of the pitch. It takes care of normalization and conversion to `float`. Here I pick an arbitrary number of frames (10) to use for the agent's memory.

In [5]:
# Number of frames kept as the agent's memory (an arbitrary choice).
history_length = 10
history = dq.HistoryBuffer(history_length)

In [6]:
# Shape check on the buffered frames: expect (history length, C, L, W).
history.get_tensor().size()

torch.Size([10, 4, 96, 72])

#### Training strategy
The `TrainingPlan` class schedules a series of progressively more difficult training scenarios. We just tell it how many of each difficulty to use.

In [7]:
# How many rounds to schedule at each difficulty level.
rounds_per_difficulty = {
    'basic_rounds': 5,
    'easy_rounds': 10,
    'medium_rounds': 10,
    'hard_rounds': 10,
    'full_match_rounds': 10,
}
training = tg.TrainingPlan(**rounds_per_difficulty)

#### Architecture Specification
Now we want to set up the DQN architecture, starting with an encoder that is applied to every frame of an observation (i.e. each of the $n$ most recent frames). This is where the Atari model inspiration really begins.

* Inputs: $(n, C, L, W)$ tensor where $n$ is the history, $C$ the number of channels (4 here), and $L, W$ representing pitch dimensions
* Output: $(n, d)$ tensor where $d$ is the dimensionality of the encoding

This doesn't handle batched observations (an extra leading batch dimension) on its own, but that is easy enough to handle downstream of this class.

In [8]:
class CNNEncoder(nn.Module):
    """Encode every frame of an observation into a fixed-length vector.

    Four strided convolutions followed by a linear projection, with a
    LeakyReLU after every layer (including the projection).

    Args:
        out_size: Dimensionality ``d`` of the encoding produced per frame.
        in_channels: Number of channels per frame (default 4).
        input_size: ``(L, W)`` spatial size of a frame (default ``(96, 72)``).
            Only used to size the linear layer; with the defaults the
            flattened convolution output has 1536 features.

    Shape:
        Input ``(n, C, L, W)`` -> output ``(n, out_size)``.
    """

    def __init__(self, out_size, in_channels=4, input_size=(96, 72)):
        super().__init__()
        self.c1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4, padding=1)
        self.c2 = nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=1)
        self.c3 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1)
        self.c4 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.LeakyReLU()

        # Derive the flattened feature count from a throw-away forward pass
        # instead of hard-coding it, so other frame sizes work as well.
        with torch.no_grad():
            probe = torch.zeros(1, in_channels, *input_size)
            flat_features = self._conv_features(probe).flatten(-3).shape[-1]
        self.linear = nn.Linear(flat_features, out_size)

    def _conv_features(self, x):
        """Run the convolutional stack and return the un-flattened feature maps."""
        h = self.relu(self.c1(x))
        h = self.relu(self.c2(h))
        h = self.relu(self.c3(h))
        return self.relu(self.c4(h))

    def forward(self, x):
        """Map frames ``(n, C, L, W)`` to encodings ``(n, out_size)``."""
        # Collapse (channels, height, width) into one feature axis per frame.
        flattened = self._conv_features(x).flatten(-3)
        return self.relu(self.linear(flattened))

In [9]:
# Stand-alone encoder with a 512-dimensional output, for checking output shapes.
enc = CNNEncoder(out_size=512)

In [10]:
# Each buffered frame should map to one 512-dimensional vector.
frame_encodings = enc(history.get_tensor())
frame_encodings.shape

torch.Size([10, 512])

Now for the class that applies this encoder to every frame of each observation.

In [11]:
class HistoryConvAgent(nn.Module):
    """Network over a history of frames: CNN encoder -> GRU -> two-layer head.

    Every frame is encoded to a 256-d vector, the sequence of encodings is fed
    through a single-layer GRU, and the final hidden state (after dropout) is
    passed through two linear layers.

    Args:
        dropout_p: Dropout probability applied to the final GRU hidden state.
        n_actions: Number of outputs produced by the head (default 18).
            NOTE(review): confirm this matches the environment's action count.

    Shape:
        Unbatched input ``(n, C, L, W)``   -> output ``(n_actions,)``.
        Batched input ``(B, n, C, L, W)``  -> output ``(B, n_actions)``,
        including ``B == 1``.
    """

    def __init__(self, dropout_p=0.1, n_actions=18):
        super().__init__()
        self.encoder = CNNEncoder(out_size=256)
        self.gru = nn.GRU(256, 256, num_layers=1, bidirectional=False, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, n_actions)
        self.activation = nn.LeakyReLU()

    def forward(self, x):
        """Compute the head outputs for one history or a batch of histories."""
        if x.ndim not in (4, 5):
            raise ValueError(
                "expected a (n, C, L, W) or (B, n, C, L, W) tensor, got shape "
                + str(tuple(x.shape))
            )

        is_batched = x.ndim == 5
        if not is_batched:
            # Treat a single history as a batch of one; undone before returning.
            x = x.unsqueeze(0)

        batch_size, n_frames = x.shape[0], x.shape[1]
        # Fold batch and time together so the encoder runs once over all frames
        # instead of once per batch element, then restore (B, n, d).
        encoded = self.encoder(x.flatten(0, 1)).reshape(batch_size, n_frames, -1)

        # h_n has shape (num_layers, B, hidden); take the last layer explicitly
        # rather than squeeze(), which would also drop a batch dimension of 1.
        _, h_n = self.gru(encoded)
        summary = self.dropout(h_n[-1])

        fc1_out = self.activation(self.fc1(summary))
        # NOTE(review): the LeakyReLU on the output layer scales negative
        # outputs down; kept as-is to preserve existing behaviour.
        fc2_out = self.activation(self.fc2(fc1_out))
        return fc2_out if is_batched else fc2_out.squeeze(0)
    
# Instantiate the agent network with the default 10% dropout.
model = HistoryConvAgent(dropout_p=0.1)

Non-batched

In [12]:
# Sanity check on a single (unbatched) history: expect a flat vector of 18 outputs.
single_history_out = model(history.get_tensor())
single_history_out

tensor([ 4.4607e-02,  1.6296e-02,  3.2347e-02, -1.8517e-04, -5.3473e-04,
         4.5788e-02,  1.8718e-02, -1.8510e-04,  6.2336e-02, -1.4492e-04,
        -3.7471e-04,  2.1265e-03, -5.7802e-05, -6.5788e-04,  3.8532e-02,
         4.2193e-02,  6.1999e-03,  1.1429e-02], grad_fn=<LeakyReluBackward0>)

Batched

In [13]:
# Two stacked histories exercise the batched (5-D) path: expect one row of outputs per history.
history_batch = torch.stack([history.get_tensor(), history.get_tensor() * 0.99])
model(history_batch).shape

torch.Size([2, 18])

Awesome - how many params are we working with?

In [14]:
# Total number of scalar parameters. numel() gives each tensor's element count
# directly, so the result is a plain int rather than a NumPy scalar.
sum(p.numel() for p in model.parameters())

991986

Just under a cool 1M

### Training Loop

In [15]:
# Independent deep copy of the freshly initialised model; it is passed to the
# training helper below as `target_network`.
# NOTE(review): the copy inherits the model's training mode, so its Dropout
# layer stays active unless the helper switches it to eval() - confirm against
# dq.play_round_with_history.
target_net = copy.deepcopy(model)

In [16]:
# --- Exploration and discounting ---
EPSILON = 0.2      # initial exploration rate handed to the training helper
EPS_DECAY = 0.999  # multiplicative per-round decay of the exploration rate
GAMMA = 0.99       # passed to the training helper as `gamma`

# --- Experience replay ---
BATCH_SIZE = 32
BUFFER_SIZE = 256  # deque capacity; the oldest entries are dropped once it is full
replay_buffer = deque(maxlen=BUFFER_SIZE)

# --- Optimisation ---
loss_fn = nn.MSELoss()
optim = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [17]:
# Move both networks onto the selected device. nn.Module.to() relocates the
# parameters in place and returns the module itself.
model, target_net = model.to(device), target_net.to(device)

In [24]:
# Per-round metrics; reset every time this cell is run.
epoch_losses = []
epoch_rewards = []

# Decay a local copy of the exploration rate so the EPSILON constant is left
# untouched and re-running this cell (e.g. after an interrupt) starts from the
# same value instead of an already-decayed one.
epsilon = EPSILON

progress = tqdm_notebook(range(len(training.training_plan)))
for _ in progress:
    # One round per entry in the training plan; the plan hands out the next environment.
    env = training.get_next()
    progress.set_description(training.current_scenario_name)
    performance = dq.play_round_with_history(
        env,
        model=model,
        target_network=target_net,
        device=device,
        loss_fn=loss_fn,
        optimizer=optim,
        sync_freq=10,
        replay_buffer=replay_buffer,
        history_length=10,
        batch_size=BATCH_SIZE,
        epsilon=epsilon,
        gamma=GAMMA
    )
    epoch_rewards.append(performance['reward'])
    round_losses = performance['losses']
    # np.mean of an empty sequence warns and yields nan; record nan explicitly
    # for a round that reported no losses.
    epoch_losses.append(np.mean(round_losses) if len(round_losses) else float('nan'))
    epsilon *= EPS_DECAY

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




KeyboardInterrupt: 