# Training an encoder-only Transformer network for word reconstruction

In this notebook, we implement the pre-training step, in which we train the transformer neural network located in the `model` directory to *reconstruct* an incomplete word (`_ppl_`) into the complete word (`apple`). This is framed as a supervised learning task where the input is the *masked* word (masked according to the rules of hangman, i.e. every occurrence of a given letter is either masked or shown, never a mix of both) and the target is the complete word.

`MyMasker` is a class under `utils` that implements the desired masking strategy.

### Importing the libraries

In [1]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

### Loading the data
We load the text data, i.e. the list of words, and split it randomly into training and validation sets.

In [3]:
from utils.utils import MyTokenizer, MyMasker, TextDataset
from torch.utils.data import Dataset, DataLoader, random_split

# Build the word dataset from the training word list
dataset = TextDataset('./data/words_250000_train.txt')

# 99% of the words go to training, the remainder to validation
train_split_percent = 0.99
train_size = int(len(dataset) * train_split_percent)
test_size = len(dataset) - train_size

print(f'Training size : {train_size}, \nValidation size : {test_size}')

# A dedicated, seeded generator keeps the random split reproducible
split_generator = torch.Generator()
split_generator.manual_seed(0)
train_dataset, val_dataset = random_split(dataset, [train_size, test_size], generator=split_generator)

# Both loaders share the same batching options
batch_size = 128
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=0)
trainloader = DataLoader(dataset=train_dataset, **loader_options)
valloader = DataLoader(dataset=val_dataset, **loader_options)

Training size : 225027, 
Validation size : 2273


### Load transformer network
1. We choose a `max_word_length` of $32$, assuming that the length of the longest word is $\le 32$.
2. The `src_vocab` is $26 + 1 + 1 = 28$, i.e. the number of letters in the English alphabet, plus a special token for the blank space `_`, plus a padding token `0`, respectively.
3. The other hyperparameters were chosen according to convention.

In [4]:
from model.Models import Transformer

# Build a fresh (untrained) Transformer model
max_word_length = 32
model = Transformer(
    src_vocab=28,
    d_model=128,
    max_seq_len=max_word_length,
    N=12,
    heads=8,
    dropout=0.1,
)

# Xavier-uniform initialization for every parameter with more than one dimension
for weight in model.parameters():
    if weight.dim() <= 1:
        continue
    nn.init.xavier_uniform_(weight)

# Adam optimizer over all model parameters
adam_settings = dict(lr=0.001, betas=(0.9, 0.98), eps=1e-9)
optim = torch.optim.Adam(model.parameters(), **adam_settings)

### Training the model
Before defining the training loop, we showcase how the `MyMasker` and `MyTokenizer` utility classes work.

In [5]:
# Masker that produces the incomplete (hangman-style) version of each word
masker = MyMasker()
# Tokenizer configured with the maximum word length chosen above
tokenizer = MyTokenizer(max_word_length)

In [6]:
# Two sample words used to demonstrate masking and tokenization in the next cells
example = ['hello', 'world']
print(example)

['hello', 'world']


In [7]:
# Mask the sample words; hidden letters are shown as '_' in the printed result
masked_example = masker.mask(example, percentage=.5)  # run this multiple times with different percentage values
print(masked_example)

['h_ll_', 'wo_l_']


In [8]:
# Show each masked word next to its list of token ids
encoded_rows = tokenizer.encode(masked_example).tolist()
for masked_word, token_ids in zip(masked_example, encoded_rows):
    print(f'{masked_word}\n{token_ids}\n')

h_ll_
[8, 27, 12, 12, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

wo_l_
[23, 15, 27, 12, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



Now we define the training loop below ...

In [11]:
def create_masks(x):
    '''Build a boolean padding mask for a tensor of token ids.

    Entries that differ from 0 (the pad id) become True and entries equal
    to 0 become False. A singleton axis is then inserted just before the
    last dimension of the result.
    '''
    is_real_token = x.ne(0)
    return is_real_token.unsqueeze(-2)

def _progress_line(start, epoch, percent, loss):
    '''Format one carriage-return progress line: elapsed minutes, 1-based epoch, 20-slot bar, percent and loss.'''
    filled = percent // 5
    return "\r   %dm: epoch %d [%s%s]  %d%%  loss = %.3f" % (
        (time.time() - start) // 60, epoch + 1, '#' * filled, ' ' * (20 - filled), percent, loss)


def train_model(model, epochs, printevery=1, checkpoint_every=10):
    '''Train `model` to reconstruct masked words, validating after every epoch.

    Relies on the notebook-level objects `trainloader`, `valloader`, `masker`,
    `tokenizer` and `optim`. Training only runs when a CUDA device is
    available; otherwise a message is printed and the function returns
    immediately.

    Args:
        model: the network to train; it is called as `model(src)` and its
            output is treated as per-position logits.
        epochs: number of passes over `trainloader`.
        printevery: report progress (and the running loss) every this many
            batches.
        checkpoint_every: save the weights every this many training batches
            within an epoch; 0 or None disables mid-epoch checkpoints. The
            weights are always saved at the end of each epoch.

    Returns:
        None. Weights are written to
        `./weights/pretrained_model_weights_epoch{epoch}` (0-based epoch).
    '''
    import os

    start = time.time()
    if torch.cuda.is_available():
        device = 'cuda'
        print('CUDA supported GPU detected! Training now ...')
    else:
        device = 'cpu'
        print('No CUDA supported GPU detected! Exiting training...')
        return

    # torch.save does not create missing parent directories
    os.makedirs('./weights', exist_ok=True)

    # Main loop
    model.to(device)
    for epoch in range(epochs):
        checkpoint_path = f'./weights/pretrained_model_weights_epoch{epoch}'

        # ---- training ----
        model.train()
        window_loss, window_steps = 0.0, 0  # loss accumulated since the last report
        avg_loss = None
        for i, trg in enumerate(trainloader):
            # src is the masked, incomplete word; trg is the complete word
            src = tokenizer.encode(masker.mask(trg, None)).to(device)
            trg = tokenizer.encode(trg).to(device)

            # NOTE(review): the model is called with src only, so no padding
            # mask is built here; pad positions (id 0) are excluded from the
            # loss through ignore_index=0.
            preds = model(src)

            optim.zero_grad()
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), trg.contiguous().view(-1), ignore_index=0)
            loss.backward()
            optim.step()

            window_loss += loss.item()
            window_steps += 1

            if (i + 1) % printevery == 0:
                # clamp: the last partial batch would otherwise push this past 100
                p = min(100, int(100 * (i + 1) / len(trainloader.dataset) * trainloader.batch_size))
                avg_loss = window_loss / window_steps
                print(_progress_line(start, epoch, p, avg_loss), end='')
                window_loss, window_steps = 0.0, 0

            if checkpoint_every and (i + 1) % checkpoint_every == 0:  # checkpoint saving
                torch.save(model.state_dict(), checkpoint_path)

        if avg_loss is None:
            # printevery was larger than the number of batches: fall back to
            # the average over whatever was seen instead of failing below
            avg_loss = window_loss / window_steps if window_steps else float('nan')

        # Save model after each epoch
        torch.save(model.state_dict(), checkpoint_path)

        # ---- validation after each epoch ----
        model.eval()
        total_val_loss, sims = 0.0, 0
        with torch.no_grad():  # no gradients are needed for evaluation
            for i, val in enumerate(valloader):
                src = tokenizer.encode(masker.mask(val, None)).to(device)
                val = tokenizer.encode(val).to(device)

                preds = model(src)
                loss = F.cross_entropy(preds.view(-1, preds.size(-1)), val.contiguous().view(-1), ignore_index=0)

                total_val_loss += loss.item()
                sims += 1
                if (i + 1) % printevery == 0:
                    p = min(100, int(100 * (i + 1) / len(valloader.dataset) * valloader.batch_size))
                    print(_progress_line(start, epoch, p, total_val_loss / sims), end='')

        # average over the whole validation set, independent of printevery
        avg_val_loss = total_val_loss / sims if sims else float('nan')

        print(_progress_line(start, epoch, 100, avg_loss)
              + "\nepoch %d complete, val loss = %.03f" % (epoch + 1, avg_val_loss))

Calling the train function ...

In [12]:
# Run pre-training for a single epoch with the default reporting interval
train_model(model, epochs=1)

CUDA supported GPU detected! Training now ...
   0m: epoch 1 [                    ]  2%  loss = 1.084


KeyboardInterrupt



### Next steps
After successfully training the model, we can use the model's predicted probabilities for the missing letters to formulate a winning strategy for the game of Hangman!