# PyTorch Lightning Training Loop: CNN-GRU DQN
First developed in a Colab notebook, based on [this Medium post](https://towardsdatascience.com/en-lightning-reinforcement-learning-a155c217c3de), which is also included in the Lightning repo examples.

In [1]:
import gfootball.env as football_env
import torch
from torch import nn
import pytorch_lightning as pl
from collections import deque, OrderedDict
import random 
from copy import deepcopy
import numpy as np

from dqn_utils import HistoryBuffer

Model class

In [2]:
# build the model
# encoder for pixel images
class CNNEncoder(nn.Module):
    """Convolutional encoder that maps 4-channel images to feature vectors.

    Four convolutions, each followed by a LeakyReLU, reduce the spatial
    resolution; the result is flattened over its last three dimensions and
    projected by a linear layer. The projection is hard-wired to 1536 input
    features (64 channels x 24 spatial positions), which is what e.g. a
    72x96 input produces with the kernel/stride/padding settings below.

    Args:
        out_size: Width of the feature vector produced per image.
    """

    def __init__(self, out_size):
        super().__init__()
        self.c1 = nn.Conv2d(4, 32, kernel_size=8, stride=4, padding=1)
        self.c2 = nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=1)
        self.c3 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1)
        self.c4 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.LeakyReLU()
        self.linear = nn.Linear(1536, out_size)

    def forward(self, x):
        """Encode ``x`` and return the activated linear projection."""
        features = x
        for conv in (self.c1, self.c2, self.c3, self.c4):
            features = self.relu(conv(features))
        # Collapse (channels, height, width) into one axis; any leading
        # dimensions are left untouched.
        features = features.flatten(start_dim=-3)
        return self.relu(self.linear(features))

# agent class
class HistoryConvAgent(nn.Module):
    """Recurrent Q-network: per-frame CNN encoding followed by a GRU.

    Every frame of an observation history is encoded by ``CNNEncoder``. The
    sequence of encodings is fed through a single-layer, unidirectional GRU
    and the final hidden state is mapped to one value per action by two
    linear layers.

    Args:
        dropout_p: Dropout probability applied to the GRU's final hidden state.
        action_size: Number of outputs (one value per discrete action).
    """

    def __init__(self, dropout_p = 0.1, action_size=18):
        super().__init__()
        self.encoder = CNNEncoder(out_size=256)
        self.gru = nn.GRU(256, 256, num_layers=1, bidirectional=False, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, action_size)
        self.activation = nn.LeakyReLU()

    def forward(self, x):
        """Compute action values for one history or for a batch of histories.

        Args:
            x: A 4-D tensor holding a single history (the first axis is
                treated as time), or a 5-D tensor whose first two axes are
                (batch, time).

        Returns:
            A tensor of shape ``(action_size,)`` for a 4-D input, or
            ``(batch, action_size)`` for a batched input. The batch axis is
            kept even when the batch holds a single history.
        """
        if x.ndim == 4:
            # Single history: add a batch axis of size one for the GRU.
            encoded = self.encoder(x).unsqueeze(0)
            _, hidden = self.gru(encoded)
            # hidden is (num_layers=1, batch=1, hidden); drop both unit axes.
            hidden = hidden[0, 0]
        else:
            # Fold (batch, time) into one axis so that all frames go through
            # the encoder in a single call, then restore the two axes.
            batch, steps = x.shape[:2]
            frames = x.reshape(batch * steps, *x.shape[2:])
            encoded = self.encoder(frames).reshape(batch, steps, -1)
            _, hidden = self.gru(encoded)
            # Drop only the layer axis. A bare squeeze() would also remove
            # the batch axis when batch == 1 and break downstream gather(1, ...).
            hidden = hidden[0]
        hidden = self.dropout(hidden)
        fc1_out = self.activation(self.fc1(hidden))
        # NOTE(review): the output layer is also passed through LeakyReLU, so
        # negative action values are scaled down -- confirm this is intended.
        fc2_out = self.activation(self.fc2(fc1_out))
        return fc2_out

Give the replay buffer its own class

In [3]:
class ReplayBuffer:
    """Bounded FIFO store of experience tuples.

    Experiences live in a ``deque`` created with ``maxlen=capacity``, so
    appending to a full buffer discards the oldest entry.

    Args:
        capacity: Maximum number of experiences retained.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Store one ``(state, action, reward, done, next_state)`` tuple."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct experiences uniformly at random.

        Returns:
            Five tuples ``(states, actions, rewards, dones, next_states)``,
            each of length ``batch_size`` and aligned by position.
        """
        drawn = random.sample(self.buffer, batch_size)
        # Transpose the list of 5-field experiences into 5 per-field tuples.
        transposed = zip(*drawn)
        states, actions, rewards, dones, next_states = transposed
        return states, actions, rewards, dones, next_states

class RLDataset(torch.utils.data.IterableDataset):
    """Iterable dataset that serves experiences sampled from a replay buffer.

    Each iteration draws one sample of ``sample_size`` experiences from the
    buffer and yields them one at a time.

    Args:
        buffer: Object exposing ``sample(n)`` that returns five aligned
            sequences (states, actions, rewards, dones, next_states).
        sample_size: Number of experiences drawn per iteration.
    """

    def __init__(self, buffer, sample_size=64):
        self.buffer = buffer
        self.sample_size = sample_size

    def __iter__(self):
        states, actions, rewards, dones, new_states = self.buffer.sample(self.sample_size)
        # Re-assemble the per-field sequences into one tuple per experience.
        yield from zip(states, actions, rewards, dones, new_states)

Generic environment getter - including the checkpoint rewards

In [4]:
def get_env(scenario='11_vs_11_kaggle'):
    """Create a Google Research Football environment for ``scenario``.

    The environment uses the unstacked 'extracted' representation, combines
    the 'scoring' and 'checkpoints' rewards, and has rendering and all dump
    writing switched off.

    Args:
        scenario: Name of the scenario passed as ``env_name``.

    Returns:
        The environment object returned by ``football_env.create_environment``.
    """
    env_options = {
        'env_name': scenario,
        'stacked': False,
        'representation': 'extracted',
        'write_goal_dumps': False,
        'write_full_episode_dumps': False,
        'rewards': 'scoring,checkpoints',
        'render': False,
    }
    return football_env.create_environment(**env_options)

Create a class to interact with the environment: handle buffer creation, stepping, resetting, etc.

In [5]:
class Agent:
    """Drives an environment and records transitions in a replay buffer.

    The agent keeps a ``HistoryBuffer`` of recent observations; the tensor
    obtained from it is the state handed to the network and stored in the
    replay buffer.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        replay_buffer: Container with an ``append`` method for experiences.
        history_length: Size passed to ``HistoryBuffer``.
    """

    def __init__(self, env, replay_buffer, history_length=5):
        self.env = env
        self.replay_buffer = replay_buffer
        self.history_length = history_length
        self.reset()

    def reset(self):
        """Reset the environment and start a fresh observation history."""
        first_obs = self.env.reset()
        self.history_buffer = HistoryBuffer(self.history_length)
        self.history_buffer.append(first_obs)
        self.state = self.history_buffer.get_tensor()

    def get_action(self, net, epsilon, device):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action in 0..17 is
        returned; otherwise the argmax of ``net`` applied to the current
        state. The state is moved to ``device`` unless it is ``'cpu'``.
        """
        explore = random.random() < epsilon
        if explore:
            return int(random.randint(0, 17))

        net_input = self.state if device == 'cpu' else self.state.cuda(device)
        best = net(net_input).argmax().item()
        return int(best)

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device='cpu'):
        """Take one environment step and store the resulting transition.

        Returns:
            ``(reward, done)`` as reported by the environment. When ``done``
            is true the agent has already been reset for the next episode.
        """
        action = self.get_action(net, epsilon, device)
        obs, reward, done, _ = self.env.step(action)
        self.history_buffer.append(obs)

        # Swap in the new state while keeping the old one for the transition.
        previous_state, self.state = self.state, self.history_buffer.get_tensor()
        self.replay_buffer.append((previous_state, action, reward, done, self.state))

        if done:
            self.reset()
        return reward, done

Model and params

In [6]:
# Hyperparameters consumed by AgentLightning.
hparams = {
    # Scenario name handed to get_env().
    'scenario': '11_vs_11_kaggle',
    # NOTE(review): not read by any code in this file.
    'total_steps': 1_000_000,
    # Epsilon-greedy schedule: linear decay from eps_start, floored at eps_end.
    'eps_start': 1.0,
    'eps_end': 0.01,
    # Global step at which the linear decay term reaches zero.
    'eps_last_frame': 600_000,
    # Discount factor applied to the next-state value.
    'gamma': 0.99,
    # Replay buffer capacity. This is smaller than warm_start_steps, so only
    # the most recent warm-up transitions are kept.
    'replay_size': 256,
    # Length of the observation history kept by the agent.
    'history_len': 5,
    # The target network is synchronised every sync_rate global steps.
    'sync_rate': 150,
    # Number of random steps played before training to fill the buffer.
    'warm_start_steps': 1_000,
    # Adam learning rate.
    'lr': 1e-4,
    # Used both as the dataset sample size and the DataLoader batch size.
    'batch_size': 24,
}

class AgentLightning(pl.LightningModule):
    """DQN training module with an online network and a target network.

    On construction the module builds the environment, the replay buffer,
    the Q-network (``net``) and a deep copy of it (``target_net``), then
    plays ``warm_start_steps`` random steps to pre-fill the replay buffer.
    Each training step plays one more environment step and optimises the
    online network on a batch drawn from the buffer; only ``net`` is handed
    to the optimiser.

    Args:
        hparams: Dictionary with the keys 'scenario', 'replay_size',
            'history_len', 'warm_start_steps', 'eps_start', 'eps_end',
            'eps_last_frame', 'gamma', 'sync_rate', 'lr' and 'batch_size'.
    """

    def __init__(self, hparams: dict):
        super().__init__()
        # Register the hyperparameters through Lightning's API instead of
        # assigning to the ``hparams`` property; ``self.hparams[...]`` lookups
        # keep working on the stored mapping.
        self.save_hyperparameters(hparams)
        self.env = get_env(self.hparams['scenario'])
        self.replay_buffer = ReplayBuffer(self.hparams['replay_size'])
        self.net = HistoryConvAgent()
        self.target_net = deepcopy(self.net)
        self.agent = Agent(self.env, self.replay_buffer, self.hparams['history_len'])
        self.total_reward = 0
        self.episode_reward = 0
        self.episodes_total = 0
        # Episode returns of the most recent (at most 100) finished episodes.
        self.rolling_rewards = deque(maxlen=100)
        self.populate(self.hparams['warm_start_steps'])

    def populate(self, steps=1000):
        """Warm up with random moves to populate the replay buffer."""
        for _ in range(steps):
            self.agent.play_step(self.net, epsilon=1.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the online network's action values for ``x``."""
        return self.net(x)

    def dqn_mse_loss(self, batch):
        """Compute the one-step TD mean-squared error for a batch.

        Args:
            batch: ``(states, actions, rewards, dones, next_states)`` tensors
                as produced by the training DataLoader.

        Returns:
            Scalar MSE between the online network's value of the taken
            actions and ``reward + gamma * max_a target_net(next_state)``,
            where the bootstrap term is zero for terminal transitions.
        """
        states, actions, rewards, dones, next_states = batch
        q_values = self.net(states)
        state_action_values = q_values.gather(1, actions.unsqueeze(-1)).squeeze(-1)

        with torch.no_grad():
            next_state_values = self.target_net(next_states).max(1)[0]
            # Terminal transitions have no future return. Casting to bool
            # keeps the mask valid whatever dtype the collate step produced.
            next_state_values = next_state_values.masked_fill(dones.bool(), 0.0)
            # Python-float rewards are collated as float64 by the default
            # collate function; bring them to float32 so the target does not
            # get promoted to double.
            rewards = rewards.float()
            expected_state_action_values = next_state_values * self.hparams['gamma'] + rewards

        return nn.MSELoss()(state_action_values, expected_state_action_values)

    def configure_optimizers(self):
        """Optimise only the online network; the target network is synced by copy."""
        optimizer = torch.optim.Adam(self.net.parameters(), lr=self.hparams['lr'])
        return [optimizer]

    def train_dataloader(self):
        """Serve batches sampled from the replay buffer."""
        dataset = RLDataset(self.replay_buffer, self.hparams['batch_size'])
        dataloader = torch.utils.data.DataLoader(dataset, self.hparams['batch_size'], sampler=None)
        return dataloader

    def get_device(self, batch):
        """Return the CUDA index of the batch's device, or 'cpu' when not on GPU."""
        return batch[0].device.index if self.on_gpu else 'cpu'

    def training_step(self, batch, nb_batch):
        """Play one environment step, then compute the loss on ``batch``.

        Also maintains the episode statistics, copies the online weights
        into the target network every ``sync_rate`` global steps, and logs
        progress metrics.
        """
        device = self.get_device(batch)
        # Linear decay from eps_start, floored at eps_end.
        decay = 1 - self.global_step / self.hparams['eps_last_frame']
        epsilon = max(self.hparams['eps_end'], self.hparams['eps_start'] * decay)

        reward, done = self.agent.play_step(self.net, epsilon, device)
        self.episode_reward += reward

        loss = self.dqn_mse_loss(batch)

        if self.trainer.use_dp or self.trainer.use_ddp2:
            loss = loss.unsqueeze(0)

        if done:
            self.total_reward += self.episode_reward
            self.rolling_rewards.append(self.episode_reward)
            self.episode_reward = 0
            self.episodes_total += 1

        if self.global_step % self.hparams['sync_rate'] == 0:
            self.target_net.load_state_dict(self.net.state_dict())

        # Only report an average once more than 30 episodes have finished.
        if len(self.rolling_rewards) > 30:
            avg_reward = np.mean(self.rolling_rewards)
        else:
            avg_reward = 0

        log = {
            'total_reward': torch.tensor(self.total_reward).to(device),
            'episodes_total': self.episodes_total,
            'average_reward': avg_reward,
            'epsilon': epsilon,
        }
        self.log_dict(log, prog_bar=True, logger=True, on_epoch=True)
        return loss


Actually do training

In [7]:
# Build the Lightning module. Per AgentLightning.__init__ this also creates
# the environment and plays the warm-up steps that pre-fill the replay buffer.
model = AgentLightning(hparams)

In [8]:
# Check that a CUDA device is visible before requesting a GPU trainer below.
torch.cuda.is_available()

True

In [9]:
# Single-GPU trainer running in 16-bit precision.
# NOTE(review): with the hparams in this file RLDataset yields exactly one
# batch per epoch (its sample size equals the DataLoader batch size), so
# max_epochs effectively bounds the number of optimisation steps -- confirm
# this is intended.
# NOTE(review): no validation step or validation dataloader is defined in
# this file, so val_check_interval presumably has no effect -- confirm.
trainer = pl.Trainer(gpus=1, 
                     max_epochs=1_000_000, 
                     val_check_interval=10_000, 
                     precision=16)

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: False, using: 0 TPU cores
INFO:lightning:TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.
INFO:lightning:Using native 16bit precision.


In [None]:
# Start training; batches come from model.train_dataloader().
trainer.fit(model)


  | Name       | Type             | Params
------------------------------------------------
0 | net        | HistoryConvAgent | 991 K 
1 | target_net | HistoryConvAgent | 991 K 
INFO:lightning:
  | Name       | Type             | Params
------------------------------------------------
0 | net        | HistoryConvAgent | 991 K 
1 | target_net | HistoryConvAgent | 991 K 


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

Play a full match with the trained network, always taking the greedy (argmax) action, and accumulate the total reward

In [41]:
# cr = 0
# env = get_env('11_vs_11_hard_stochastic')
# obs = env.reset()
# history = HistoryBuffer(5)
# history.append(obs)
# done = False
# with torch.no_grad():
#     while not done:
#         q = model.net(history.get_tensor())
#         action = int(q.argmax().item())
#         obs, reward, done, _ = env.step(action)
#         history.append(obs)
#         cr += reward