# Prepare Torchbeast model for submission
I need to unravel the code necessary to pass pixel observations through a frame buffer and into the model, so that I can submit an agent trained with `torchbeast`.

In [3]:
import torch
from torch import nn
from torch.nn import functional as F
import gfootball.env as football_env
import numpy as np
import os

In [4]:
# Directory that holds the torchbeast checkpoint (`model.tar`).
# Set the TORCHBEAST_MODEL_PATH environment variable to point at another
# location instead of editing this cell; the default is the original path.
MODEL_PATH = os.environ.get(
    'TORCHBEAST_MODEL_PATH', '/home/jupyter/logs/torchbeast/empty_goal/'
)

The easy part: the model class

In [5]:
class AtariNet(nn.Module):
    """Convolutional actor-critic network with an optional LSTM core.

    Three conv layers and one fully connected layer turn a stack of frames
    into a 512-dim feature vector. That vector is concatenated with the
    clipped last reward and a one-hot of the last action, optionally passed
    through a 2-layer LSTM, and finally fed to a policy head and a baseline
    (value) head.

    Args:
        observation_shape: (C, H, W) of one model input. C sets the number of
            input channels of the first conv layer; H and W set the input
            size of the fully connected layer.
        num_actions: size of the discrete action space.
        use_lstm: when True, insert the LSTM core before the output heads.
    """

    def __init__(self, observation_shape, num_actions, use_lstm=False):
        super(AtariNet, self).__init__()
        self.observation_shape = observation_shape
        self.num_actions = num_actions

        # Feature extraction.
        self.conv1 = nn.Conv2d(
            in_channels=self.observation_shape[0],
            out_channels=32,
            kernel_size=8,
            stride=4,
        )
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Fully connected layer. Its input size follows from the observation
        # size (3136 for 84x84 inputs) instead of being hard-coded.
        self.fc = nn.Linear(self._conv_output_size(), 512)

        # FC output size + last reward + one-hot of last action.
        core_output_size = self.fc.out_features + num_actions + 1

        self.use_lstm = use_lstm
        if use_lstm:
            self.core = nn.LSTM(core_output_size, core_output_size, 2)

        self.policy = nn.Linear(core_output_size, self.num_actions)
        self.baseline = nn.Linear(core_output_size, 1)

    def _conv_output_size(self):
        """Return the flattened feature count produced by the conv stack."""
        if len(self.observation_shape) < 3:
            # No spatial dims given: keep the historical 84x84 assumption.
            return 3136
        height, width = self.observation_shape[-2], self.observation_shape[-1]
        for conv in (self.conv1, self.conv2, self.conv3):
            # Unpadded, undilated convolution: out = floor((in - k) / s) + 1.
            height = (height - conv.kernel_size[0]) // conv.stride[0] + 1
            width = (width - conv.kernel_size[1]) // conv.stride[1] + 1
        return self.conv3.out_channels * height * width

    def initial_state(self, batch_size):
        """Return a zero (h, c) LSTM state, or an empty tuple without LSTM."""
        if not self.use_lstm:
            return tuple()
        return tuple(
            torch.zeros(self.core.num_layers, batch_size, self.core.hidden_size)
            for _ in range(2)
        )

    def forward(self, inputs, core_state=()):
        """Run the network over a [T, B] batch of steps.

        Args:
            inputs: dict with
                "frame": tensor shaped [T, B, C, H, W];
                "last_action": integer tensor with T * B elements;
                "reward": tensor with T * B elements;
                "done": bool tensor shaped [T, B] (only read with LSTM).
            core_state: (h, c) tuple as returned by `initial_state` or by a
                previous call. When empty and the LSTM is enabled, a zero
                state is used.

        Returns:
            (outputs, core_state) where outputs holds "policy_logits"
            [T, B, num_actions], "baseline" [T, B] and "action" [T, B].
            Actions are sampled in training mode and greedy in eval mode.
        """
        x = inputs["frame"]  # [T, B, C, H, W].
        T, B, *_ = x.shape
        x = torch.flatten(x, 0, 1)  # Merge time and batch.
        x = x.float() / 255.0
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(T * B, -1)
        x = F.relu(self.fc(x))

        one_hot_last_action = F.one_hot(
            inputs["last_action"].view(T * B), self.num_actions
        ).float()
        # Cast so integer (or float64) rewards concatenate cleanly with the
        # float32 features instead of relying on dtype promotion in torch.cat.
        clipped_reward = (
            torch.clamp(inputs["reward"], -1, 1).view(T * B, 1).to(x.dtype)
        )
        # The concatenation order must stay fixed: trained weights depend on it.
        core_input = torch.cat([x, clipped_reward, one_hot_last_action], dim=-1)

        if self.use_lstm:
            if core_state is None or len(core_state) == 0:
                # No state supplied: start from zeros on the input's device.
                core_state = tuple(
                    s.to(core_input.device) for s in self.initial_state(B)
                )
            core_input = core_input.view(T, B, -1)
            core_output_list = []
            notdone = (~inputs["done"]).float()
            for step_input, nd in zip(core_input.unbind(), notdone.unbind()):
                # Reset core state to zero whenever an episode ended.
                # Make `done` broadcastable with (num_layers, B, hidden_size)
                # states:
                nd = nd.view(1, -1, 1)
                core_state = tuple(nd * s for s in core_state)
                output, core_state = self.core(step_input.unsqueeze(0), core_state)
                core_output_list.append(output)
            core_output = torch.flatten(torch.cat(core_output_list), 0, 1)
        else:
            core_output = core_input
            core_state = tuple()

        policy_logits = self.policy(core_output)
        baseline = self.baseline(core_output)

        if self.training:
            action = torch.multinomial(F.softmax(policy_logits, dim=1), num_samples=1)
        else:
            # Don't sample when testing.
            action = torch.argmax(policy_logits, dim=1)

        policy_logits = policy_logits.view(T, B, self.num_actions)
        baseline = baseline.view(T, B)
        action = action.view(T, B)

        return (
            dict(policy_logits=policy_logits, baseline=baseline, action=action),
            core_state,
        )


The frame stack is why the model expects 16 input channels: 4 stacked frames × 4 channels per observation.

In [6]:
# 16 input channels, 84x84 frames, 19 discrete actions, recurrent core enabled.
model = AtariNet(
    observation_shape=(16, 84, 84),
    num_actions=19,
    use_lstm=True,
)

In [7]:
# Read `model.tar` from MODEL_PATH, mapping stored tensors onto the CPU.
checkpoint = torch.load(
    os.path.join(MODEL_PATH, 'model.tar'),
    map_location='cpu',
)

In [8]:
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

### observation processing
* scale=True
* frame_stack=True

Repurposing my old history buffer

In [9]:
from collections import deque

In [67]:
# my history buffer class
class HistoryBuffer():
    '''
    Fixed-length frame stack for pixel observations.

    Keeps the `n` most recent normalized observations in a deque; once the
    deque is full, appending a new observation drops the oldest one.
    '''

    def __init__(self, n, obs_shape=(4, 84, 84), fill_value=0., device=None):
        '''
        Store the configuration and pre-fill the buffer.

        Inputs:
          - n: number of observations to retain
          - obs_shape: tuple giving the shape of a single normalized observation
          - fill_value: value used to fill the placeholder tensors made on reset
          - device: optional device that `get_tensor` moves its result to
        '''
        self.n = n
        self.obs_shape = obs_shape
        self.fill_value = fill_value
        self.device = device
        self.reset()

    def reset(self):
        'Replace the buffer contents with `n` constant placeholder tensors.'
        placeholders = (
            torch.full(self.obs_shape, fill_value=self.fill_value)
            for _ in range(self.n)
        )
        self.buffer = deque(placeholders, maxlen=self.n)

    def append(self, obs):
        'Normalize a raw observation and push it onto the end of the buffer.'
        self.buffer.append(self.normalize_obs(obs))

    def get_tensor(self):
        '''
        Return a single tensor containing the observations in the buffer.
        Uses torch.cat on the tensors within the deque, so they are joined
        along their first dimension with the most recent observation last.
        The result is moved to `self.device` when one was given.

        Returns: ((n * Channels) x Height x Width) when every entry has
        shape `obs_shape`
        '''
        stacked = torch.cat(list(self.buffer))
        if self.device is None:
            return stacked
        return stacked.to(self.device)

    def normalize_obs(self, obs):
        'Divide the observation by 255 and return it channels-first as a float32 tensor'
        scaled = obs / 255
        channels_first = scaled.transpose(2, 0, 1)
        return torch.from_numpy(channels_first).float()

In [49]:
env = football_env.create_environment('11_vs_11_kaggle', channel_dimensions=(84, 84))

In [50]:
raw_obs = env.reset()

In [51]:
raw_obs.shape

(84, 84, 4)

In [52]:
buff = HistoryBuffer(4)

In [53]:
for _ in range(5): buff.append(raw_obs)

In [54]:
model_obs = buff.get_tensor()

In [55]:
# Single-step, single-environment input: `frame` gets leading [T=1, B=1] dims.
model_in = {
    'frame':model_obs.unsqueeze(0).unsqueeze(0),
    'last_action':torch.tensor(1),
    # float32 reward: AtariNet.forward concatenates it with float32 features,
    # so an int64 tensor here would depend on dtype promotion in torch.cat.
    'reward':torch.zeros(1, 1),
    'done':torch.tensor(False).view(1, 1)
}

In [56]:
initial_state = model.initial_state(1)

In [57]:
model(model_in, initial_state)

({'policy_logits': tensor([[[-0.1676,  0.4421,  0.3763, -0.1808,  0.7571,  3.9786,  0.8066,
             0.2107, -0.2174, -1.9995, -1.9141, -2.4971, -0.2875, -0.8430,
            -0.0868,  0.4746, -0.9659, -0.4161,  1.1495]]],
         grad_fn=<ViewBackward>),
  'baseline': tensor([[0.9666]], grad_fn=<ViewBackward>),
  'action': tensor([[5]])},
 (tensor([[[-0.3732, -0.0995, -0.0600,  ...,  0.2822,  0.4375,  0.3881]],
  
          [[-0.2363, -0.1951,  0.0311,  ...,  0.2163,  0.0717, -0.2254]]],
         grad_fn=<StackBackward>),
  tensor([[[-0.6079, -0.1861, -0.1225,  ...,  0.4782,  0.6633,  0.6863]],
  
          [[-0.4371, -0.4068,  0.0704,  ...,  0.4171,  0.1392, -0.3970]]],
         grad_fn=<StackBackward>)))

In [58]:
agent_output, unused_state = model(model_in, initial_state)

In [59]:
int(agent_output['action'].item())

5

Now put it into a loop

In [85]:
base_env = football_env.create_environment('academy_empty_goal', channel_dimensions=(84, 84))

In [86]:
model.eval()

AtariNet(
  (conv1): Conv2d(16, 32, kernel_size=(8, 8), stride=(4, 4))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (fc): Linear(in_features=3136, out_features=512, bias=True)
  (core): LSTM(532, 532, num_layers=2)
  (policy): Linear(in_features=532, out_features=19, bias=True)
  (baseline): Linear(in_features=532, out_features=1, bias=True)
)

In [87]:
buff = HistoryBuffer(4)
torch.cat(list(buff.buffer)).shape

torch.Size([16, 84, 84])

In [93]:
EPISODES = 25
rewards = []
for _episode in range(EPISODES):
    done = False
    obs = base_env.reset()
    last_action = 0
    reward = 0
    # Fresh recurrent state and frame stack at the start of every episode.
    agent_state = model.initial_state(1)
    episode_reward = 0
    buff = HistoryBuffer(4)

    while not done:
        # prepare observation
        buff.append(obs)
        clean_obs = buff.get_tensor().unsqueeze(0).unsqueeze(0)
        model_in = {
            'frame':clean_obs,
            'last_action':torch.tensor(last_action),
            # Pass the reward from the previous step (0 on the first step)
            # as float32 rather than a constant integer zero.
            'reward':torch.tensor(float(reward)).view(1, 1),
            'done':torch.tensor(False).view(1, 1)
        }
        # Inference only, so skip autograd. Keep the returned LSTM state:
        # discarding it would run every step from the initial state.
        with torch.no_grad():
            agent_output, agent_state = model(model_in, agent_state)
        action = agent_output['action'].item()

        obs, reward, done, _info = base_env.step(action)
        last_action = action
        episode_reward += reward

    rewards.append(episode_reward)

print(f"Average reward: {np.mean(rewards)}")

Average reward: 1.0


Raw environment: derive rewards from the observation's `score` field when we can't call `.step` ourselves

In [89]:
# Same scenario as `base_env`, but created with representation='raw'.
# NOTE: this rebinds `raw_obs`, which earlier held the (84, 84, 4) pixel array
# returned by `env.reset()`. From here on it is the raw observation, which the
# next cell indexes as `raw_obs[0]['score']`.
raw_env = football_env.create_environment('academy_empty_goal', channel_dimensions=(84, 84), representation='raw')
raw_obs = raw_env.reset()

In [92]:
my_score, their_score = raw_obs[0]['score']

Old

In [47]:
def scale_frame(obs):
    """Convert `obs` to a float32 NumPy array and divide every value by 255."""
    as_float32 = np.array(obs).astype(np.float32)
    return as_float32 / 255.0

In [48]:
frames = deque([], maxlen=4)

In [141]:
# `raw_obs` was rebound above to the raw-representation observation (indexed
# as a list of dicts), which has no `.transpose`. Take a fresh pixel
# observation from the pixel environment so this cell runs top-to-bottom.
pixel_obs = env.reset()
clean_obs = torch.from_numpy(scale_frame(pixel_obs.transpose(2, 0, 1)))

In [142]:
clean_obs.shape

torch.Size([4, 84, 84])

In [143]:
for _ in range(5): frames.append(clean_obs)

In [97]:
model_obs = torch.cat(list(frames))

In [98]:
model_obs.shape

torch.Size([16, 84, 84])

In [99]:
state = model.initial_state(1)

In [121]:
# Single-step, single-environment input: `frame` gets leading [T=1, B=1] dims.
model_in = {
    'frame':model_obs.unsqueeze(0).unsqueeze(0),
    'last_action':torch.tensor(1),
    # float32 reward: AtariNet.forward concatenates it with float32 features,
    # so an int64 tensor here would depend on dtype promotion in torch.cat.
    'reward':torch.zeros(1, 1),
    'done':torch.tensor(False).view(1, 1)
}