In [1]:
import torch
import math
import numpy as np
import os

from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from bs4 import BeautifulSoup
from torch.nn.utils.rnn import pad_sequence
from torch.nn import functional as F

In [2]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding from 'Attention Is All You Need'.

    A table of shape (1, max_len, dim_model) is precomputed once and stored as a
    buffer. `forward` adds the first `seq_len` rows of the table to the incoming
    embeddings and applies dropout.

    Args:
        dim_model: Size of the embedding dimension (odd values are supported).
        dropout_p: Dropout probability applied after adding the encoding.
        max_len: Number of positions held in the table; inputs longer than this
            cannot be encoded.
    """
    def __init__(self, dim_model : int, dropout_p : float, max_len : int) -> None:
        super().__init__()
        # Modified version from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html

        self.dropout = nn.Dropout(dropout_p)

        # Column vector of positions: 0, 1, ..., max_len - 1
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1)
        # 1 / 10000^(2i / dim_model), computed in log-space
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model)

        pos_encoding = torch.zeros(max_len, dim_model)

        # PE(pos, 2i) = sin(pos / 10000^(2i / dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)

        # PE(pos, 2i + 1) = cos(pos / 10000^(2i / dim_model))
        # For an odd dim_model there is one odd column fewer than even columns,
        # so the frequency vector is trimmed to match.
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term[: dim_model // 2])

        # Buffer: moves with the module across devices but receives no gradients.
        # The leading axis of size 1 broadcasts over the batch.
        self.register_buffer("pos_encoding", pos_encoding.unsqueeze(0))

    def forward(self, token_embedding: torch.Tensor) -> torch.Tensor:
        """
        Add positional information to a batch-first embedding tensor.

        Args:
            token_embedding: Tensor whose axis 1 is the sequence axis and whose
                last axis has size dim_model.

        Returns:
            Tensor of the same shape with the encoding added and dropout applied.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = token_embedding.size(1)
        table_len = self.pos_encoding.size(1)
        if seq_len > table_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding max_len {table_len}"
            )
        return self.dropout(token_embedding + self.pos_encoding[:, :seq_len, :])

In [6]:
class NorwegianDataset(Dataset):
    """
    Custom Dataset to process .xml-files containing
    texts from Norwegian Newspaper corpus.

    Every line of every extracted text block becomes one sample. A sample is the
    token id sequence [BOS] + tokens + [EOS], returned as the language-modelling
    pair (src = sequence without its last id, tgt = sequence without its first id).

    Attributes:
        tokenizer: Tokenizer extended with the '[BOS]' and '[EOS]' special tokens.
        bos_id / eos_id: Ids of the two added special tokens.
        texts: Raw text of every file that contained a <div type="text"> element.
        sentences: All lines used as samples.
        longest_sequence: Largest number of tokens in any sample, not counting
            the BOS/EOS tokens.
    """

    def __init__(self, docs_dir : str = 'docs', tokenizer_name : str = 'ltg/norbert2') -> None:
        """
        Args:
            docs_dir: Directory holding the corpus .xml files.
            tokenizer_name: Hugging Face identifier of the tokenizer to load.
        """

        # Utilizing NorBERT tokenizer, adding special tokens
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        new_special_tokens = {'additional_special_tokens' : ['[BOS]', '[EOS]']}
        self.tokenizer.add_special_tokens(new_special_tokens)

        # Resolve the special-token ids once. Looking them up through
        # `tokenizer.vocab` on every __getitem__ call can rebuild the whole
        # vocabulary mapping each time and may not contain added tokens.
        self.bos_id = self.tokenizer.convert_tokens_to_ids('[BOS]')
        self.eos_id = self.tokenizer.convert_tokens_to_ids('[EOS]')

        self.texts = []

        # Processing files (sorted so the sample order is reproducible)
        for file in tqdm(sorted(os.listdir(docs_dir))):
            with open(os.path.join(docs_dir, file), 'r', encoding='utf8') as f:
                data = f.read()
            bs_data = BeautifulSoup(data, "xml")
            text = bs_data.find('div', {'type':'text'})
            if text:
                self.texts.append(text.get_text())

        self.sentences = []
        self.longest_sequence = 0

        # Extracting sequences + longest seq. len
        for text in tqdm(self.texts):
            # The first line of each block is skipped
            # (presumably a header or blank line - TODO confirm against the corpus)
            sentences_list = text.split('\n')[1:]
            for sent in sentences_list:
                tokenized_len = len(self.tokenizer.tokenize(sent))
                self.longest_sequence = max(self.longest_sequence, tokenized_len)
            self.sentences.extend(sentences_list)


    def __getitem__(self, index : int) -> dict[str, torch.Tensor]:
        """Return the {'src', 'tgt'} id tensors for the sample at `index`."""
        token_ids = [
            self.bos_id,
            *self.tokenizer.encode(self.sentences[index], add_special_tokens=False),
            self.eos_id,
        ]

        # Offset of 1: position i of src is paired with the token that follows it
        return {'src': torch.tensor(token_ids[:-1]),
                'tgt': torch.tensor(token_ids[1:])}

    def __len__(self) -> int:
        return len(self.sentences)


    def max_seq_len(self) -> int:
        """
        Length of the longest src/tgt tensor this dataset can return
        (longest tokenized sample plus one special token).

        This used to be a method named `longest_sequence`, which could never be
        called on an instance because __init__ stores an int under that name.
        The int attribute is kept as-is.
        """
        return self.longest_sequence + 1


    def vocab_size(self) -> int:
        """Number of ids the tokenizer can produce, including added special tokens."""
        return len(self.tokenizer)

In [3]:
class NorwegianCollator:
    """
    Batch builder for a DataLoader: right-pads the 'src' and 'tgt' tensors of a
    list of samples so each field becomes one batch-first tensor.
    """

    def __init__(self, pad_idx : int) -> None:
        # Id written into the padded positions
        self.pad_idx = pad_idx

    def __call__(self, samples) -> dict[str, torch.tensor]:
        # Longest 'src' first; samples of equal length keep their incoming order
        ordered = sorted(samples, key=lambda item: len(item['src']), reverse=True)

        def _pad_field(field):
            # Stack one field of every sample, padding on the right
            return pad_sequence([item[field] for item in ordered],
                                batch_first=True,
                                padding_value=self.pad_idx)

        return {'src': _pad_field('src'),
                'tgt': _pad_field('tgt')}

In [4]:
class NorwegianModel(nn.Module):
    """
    Causal Transformer language model built from nn.TransformerDecoder layers.

    The network reads the input ids (`src`) and produces, for every position,
    logits over the vocabulary for the token that follows. Both attention blocks
    of each decoder layer attend over the embedded input under a causal mask:
    there is no separate encoder, so the embedded input doubles as the "memory".
    """

    def __init__(self,
                 dim_model : int,
                 decoder_layers : int,
                 heads : int,
                 mlp_dim : int,
                 dropout_p: float,
                 max_seq_len : int,
                 vocab_size : int) -> None:
        """
        Args:
            dim_model: Embedding / hidden size.
            decoder_layers: Number of stacked decoder layers.
            heads: Attention heads per layer.
            mlp_dim: Hidden size of each layer's feed-forward block.
            dropout_p: Dropout probability (positional encoding and layers).
            max_seq_len: Number of positions in the positional-encoding table.
            vocab_size: Number of rows in the embedding table and output logits.
        """

        super().__init__()
        self.dim_model = dim_model

        # Positional encoding shared by every input sequence
        self.pos_enc = PositionalEncoding(
            dim_model=dim_model,
            dropout_p=dropout_p,
            max_len=max_seq_len
        )

        # Token embedding table
        self.embedding = nn.Embedding(vocab_size, dim_model)

        # Instance of decoder layer
        decoder_layer = nn.TransformerDecoderLayer(d_model=dim_model,
                                                   dim_feedforward=mlp_dim,
                                                   nhead=heads,
                                                   dropout=dropout_p,
                                                   batch_first=True)

        # Whole decoder made of decoder layers
        self.decoder = nn.TransformerDecoder(decoder_layer=decoder_layer,
                                             num_layers=decoder_layers)

        # Linear proj. to vocab. size
        self.out = nn.Linear(dim_model, vocab_size)


    def forward(self, src : torch.Tensor,
                      tgt : torch.Tensor,
                      src_pad_mask=None,
                      tgt_mask : torch.Tensor=None,
                      tgt_pad_mask : torch.Tensor=None) -> torch.Tensor:
        """
        Compute next-token logits for every position of `src`.

        Args:
            src: Input ids, batch first.
            tgt: Label ids. Kept in the signature for existing callers but NOT
                fed to the network: exposing the labels to cross-attention would
                let the model read the tokens it is asked to predict.
            src_pad_mask: Boolean mask, True where `src` is padding.
            tgt_mask: Additive attention mask over the input positions.
                Defaults to the causal mask for the length of `src`.
            tgt_pad_mask: Used as the padding mask only when `src_pad_mask` is
                not given; must then have the same shape as `src`.

        Returns:
            Logits with one vocabulary-sized vector per input position.
        """
        hidden = self.embedding(src) * math.sqrt(self.dim_model)
        hidden = self.pos_enc(hidden)

        if tgt_mask is None:
            # A language model must never look ahead, at inference time either
            tgt_mask = self.get_tgt_mask(src.size(1), device=src.device)

        pad_mask = src_pad_mask if src_pad_mask is not None else tgt_pad_mask

        # nn.TransformerDecoder.forward takes (tgt, memory). The embedded input
        # is used for both, and the causal mask is applied to self-attention
        # and cross-attention alike so that no position sees later tokens.
        decoder_out = self.decoder(hidden,
                                   hidden,
                                   tgt_mask=tgt_mask,
                                   memory_mask=tgt_mask,
                                   tgt_key_padding_mask=pad_mask,
                                   memory_key_padding_mask=pad_mask)

        return self.out(decoder_out)


    def get_tgt_mask(self, size, device=None) -> torch.Tensor:
        """
        Build a (size, size) additive causal mask: 0.0 where row i may attend to
        column j (j <= i) and -inf elsewhere.

        Args:
            size: Sequence length.
            device: Optional device for the returned tensor (CPU when omitted).
        """
        allowed = torch.tril(torch.ones(size, size, dtype=torch.bool, device=device))
        mask = torch.zeros(size, size, device=device)

        return mask.masked_fill(~allowed, float('-inf'))


    def create_pad_mask(self, matrix: torch.Tensor, pad_token: int) -> torch.Tensor:
        """
        Boolean mask that is True wherever `matrix` equals `pad_token`.

        Example: matrix = [1, 2, 3, 0, 0, 0] with pad_token = 0 gives
        [False, False, False, True, True, True].
        """
        return (matrix == pad_token)


In [None]:
# Build the dataset: loads the tokenizer and parses the corpus files from 'docs'
norwegian_dataset = NorwegianDataset()

In [None]:
# Model hyperparameters
dim_model = 768        # embedding / hidden size
decoder_layers = 6     # number of stacked decoder layers
heads = 12             # attention heads per layer
mlp_dim = 2048         # feed-forward hidden size inside each layer
dropout_p = 0.1

# Both values are derived from the data so the embedding table and the
# positional-encoding table are large enough for every sample.
vocab_size = norwegian_dataset.vocab_size()
# `longest_sequence` is the int attribute set in NorwegianDataset.__init__;
# +1 accounts for the special token added to each src/tgt sequence.
max_seq_len = norwegian_dataset.longest_sequence + 1

In [None]:
# Instantiate the model from the hyperparameters defined above
norwegian_model = NorwegianModel(dim_model=dim_model,
                                 decoder_layers=decoder_layers,
                                 heads=heads,
                                 mlp_dim=mlp_dim,
                                 dropout_p=dropout_p,
                                 max_seq_len=max_seq_len,
                                 vocab_size=vocab_size)

In [None]:
# Training hyperparameters
epochs = 2
init_lr = 1e-3
optimizer = torch.optim.AdamW(norwegian_model.parameters(), lr=init_lr)
# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Move the model's parameters and buffers to the selected device
norwegian_model = norwegian_model.to(device)

In [None]:
@torch.no_grad()
def generate(model, device, prompt_tokens=('[BOS]', 'fra'), max_new_tokens=10, tokenizer=None):
    """
    Greedily continue a prompt and print the decoded result.

    A single id sequence is grown one token at a time: the whole sequence is fed
    to the model under a causal mask and the most likely token at the last
    position is appended.

    Args:
        model: Model exposing `get_tgt_mask(size)` and
            `forward(src, tgt, tgt_mask=...)`.
        device: Device on which the sequence tensor is created.
        prompt_tokens: Vocabulary tokens that start the sequence.
        max_new_tokens: Number of tokens to append to the prompt.
        tokenizer: Tokenizer used to map tokens to ids and back. Defaults to the
            notebook-level `norwegian_dataset.tokenizer`.

    Returns:
        The decoded text (which is also printed).
    """
    if tokenizer is None:
        tokenizer = norwegian_dataset.tokenizer

    # Remember the current mode: this function is called from inside the
    # training loop, and leaving the model in eval mode would silently switch
    # dropout off for the rest of training.
    was_training = model.training
    model.eval()

    try:
        prompt_ids = tokenizer.convert_tokens_to_ids(list(prompt_tokens))
        seq = torch.tensor([prompt_ids], dtype=torch.int64, device=device)

        for _ in range(max_new_tokens):
            causal_mask = model.get_tgt_mask(seq.size(1)).to(device)
            logits = model(seq, seq, tgt_mask=causal_mask)
            # Greedy choice at the last position; softmax is monotonic, so the
            # argmax of the raw logits selects the same token.
            next_id = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            seq = torch.cat((seq, next_id.to(seq.device)), dim=1)
    finally:
        model.train(was_training)

    text = tokenizer.decode(seq.squeeze(0).tolist())
    print(text)

    return text

In [None]:
def train(model, iterator, optimizer, device, pad_idx=None,
          eval_batches=(10000, 20000, 30000, 40000, 50000)):
    """
    Run one training epoch and return the mean batch loss.

    Args:
        model: Model exposing `get_tgt_mask`, `create_pad_mask` and
            `forward(src, tgt, tgt_mask=..., tgt_pad_mask=..., src_pad_mask=...)`.
        iterator: Iterable of batches, each a dict with 'src' and 'tgt' tensors.
        optimizer: Optimizer holding the model's parameters.
        device: Device the batches are moved to.
        pad_idx: Id of the padding token (masked out and ignored by the loss).
            Defaults to the notebook-level `norwegian_dataset.tokenizer.pad_token_id`.
        eval_batches: 1-based batch numbers after which a sample is generated.

    Returns:
        Sum of the batch losses divided by the number of batches processed
        (0.0 when the iterator yields nothing).
    """
    if pad_idx is None:
        pad_idx = norwegian_dataset.tokenizer.pad_token_id
    eval_batches = frozenset(eval_batches)

    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch_num, batch in enumerate(tqdm(iterator), 1):
        src = batch['src'].to(device)
        tgt = batch['tgt'].to(device)

        # Causal attention mask + padding masks for both sequences
        tgt_attention_mask = model.get_tgt_mask(tgt.size(1)).to(device)
        src_pad_mask = model.create_pad_mask(src, pad_idx)
        tgt_pad_mask = model.create_pad_mask(tgt, pad_idx)

        logits = model(src, tgt, tgt_mask=tgt_attention_mask, tgt_pad_mask=tgt_pad_mask, src_pad_mask=src_pad_mask)

        # Flatten (batch, time) so every position is one classification example;
        # padded positions do not contribute to the loss.
        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)),
                               tgt.reshape(-1),
                               ignore_index=pad_idx)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        if batch_num in eval_batches:
            generate(model, device)
            # Sampling switches the model to eval mode; switch back so dropout
            # stays active for the remaining batches.
            model.train()

    return total_loss / num_batches if num_batches else 0.0

In [None]:
# Shuffled mini-batches of 16 samples; the collator pads every batch with the
# tokenizer's padding id so src/tgt become rectangular tensors
train_dataloader = DataLoader(norwegian_dataset, 
                              batch_size=16, 
                              shuffle=True,
                              collate_fn=NorwegianCollator(pad_idx=norwegian_dataset.tokenizer.pad_token_id))

In [None]:
# Run the configured number of epochs, reporting the mean batch loss of each
for epoch in range(epochs):
    print(f'Epoch: {epoch + 1}', end='\n\n')
    epoch_loss = train(norwegian_model, train_dataloader, optimizer, device)
    print()
    print(f'Loss: {epoch_loss}')    