https://andrewpeng.dev/transformer-pytorch/

https://nlp.seas.harvard.edu/2018/04/03/attention.html

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [1]:
import os
os.chdir("/mnt/c/cloud/thesis/")
import math

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd

import tokenizers

import seaborn as sns
sns.set(rc={'figure.figsize':(15, 10)})

from einops import rearrange

from constants import paths as p
from constants import tokens as t
from constants import hyperparameters as hp

import pytorch_lightning as pl
import pytorch_lightning.callbacks as cb
from torch.utils.data import TensorDataset

In [15]:
train = pd.read_pickle(p.TRAIN_NUMERICALIZED_PATH)

In [2]:
validation = pd.read_pickle(p.VALIDATION_NUMERICALIZED_PATH)

In [6]:
# NOTE(review): despite its name, this batch is sampled from the *validation* frame,
# not from `train`. random_state=42 fixes which rows are drawn.
random_train_batch = validation.sample(n=hp.BATCH_SIZE, random_state=42)

In [7]:
random_train_batch.shape

(32, 27)

In [8]:
random_train_batch.columns

Index(['id', 'input', 'output', 'helpfulness_difference', 'rating_difference',
       'length_difference', 'target_string', 'source_string',
       'source_numericalized', 'target_numericalized',
       'source_numericalized_padded', 'target_numericalized_input',
       'target_numericalized_output', 'target_numericalized_input_padded',
       'target_numericalized_output_padded', 'source_start_token_indexes',
       'source_end_token_indexes', 'source_padding_token_first_index',
       'target_input_padding_token_first_index', 'source_padded_indices',
       'source_padding_mask', 'target_input_padding_mask', 'source_ratings',
       'source_helpfulnesses', 'helpfulness_difference_vector',
       'rating_difference_vector', 'length_difference_vector'],
      dtype='object')

In [5]:
properties_columns = ["source_ratings", "source_helpfulnesses", "helpfulness_difference_vector", "rating_difference_vector", "length_difference_vector"]

In [87]:
n[0].shape, n[1].shape

(torch.Size([3, 10]), torch.Size([3, 15]))

In [6]:
def extract_training_matrixes(data, file):
    """Turn the padded, numericalized columns of `data` into tensors and save them as one TensorDataset.

    The saved dataset holds, in this order:
      1. source token ids                      (long)
      2. source properties, one slice per entry of `properties_columns`
         on the last axis                      (float)
      3. target input token ids                (long)
      4. source padding mask                   (bool)
      5. target input padding mask             (bool)
      6. target output token ids               (long)

    data -> object exposing the columns read below (a DataFrame in this notebook)
    file -> destination handed to torch.save
    """
    def as_tensor(column):
        # one array per row -> a single stacked tensor with a leading row axis
        return torch.from_numpy(np.stack(column))

    # each property column stacks to (rows, sequence); axis=2 puts the properties last
    per_property_arrays = [np.stack(data[name].to_numpy()) for name in properties_columns]

    dataset = TensorDataset(
        as_tensor(data.source_numericalized_padded).long(),
        torch.from_numpy(np.stack(per_property_arrays, axis=2)).float(),
        as_tensor(data.target_numericalized_input_padded).long(),
        as_tensor(data.source_padding_mask).bool(),
        as_tensor(data.target_input_padding_mask).bool(),
        as_tensor(data.target_numericalized_output_padded).long(),
    )
    torch.save(dataset, file)

In [16]:
extract_training_matrixes(train, p.TRAIN_TENSOR_DATASET_PATH)

In [7]:
extract_training_matrixes(validation, p.VALIDATION_TENSOR_DATASET_PATH)

In [23]:
train = torch.load(p.TRAIN_TENSOR_DATASET_PATH)

In [8]:
validation = torch.load(p.VALIDATION_TENSOR_DATASET_PATH)

In [10]:
dl = torch.utils.data.DataLoader(validation, batch_size=hp.BATCH_SIZE)

In [11]:
n = next(iter(dl))

In [14]:
shape_and_type(*n)

[(torch.Size([32, 512]), 'torch.LongTensor'),
 (torch.Size([32, 512, 5]), 'torch.FloatTensor'),
 (torch.Size([32, 256]), 'torch.LongTensor'),
 (torch.Size([32, 512]), 'torch.BoolTensor'),
 (torch.Size([32, 256]), 'torch.BoolTensor'),
 (torch.Size([32, 256]), 'torch.LongTensor')]

In [13]:
def shape_and_type(*args):
    """Return a list of (shape, type string) pairs, one per tensor passed in.

    Each argument must expose `.shape` and `.type()`, as torch tensors do.
    The pairs come back in the same order as the arguments.
    """
    return [(tensor.shape, tensor.type()) for tensor in args]

In [8]:
batched_source_numericalized = torch.from_numpy(np.stack(random_train_batch.source_numericalized_padded)).long()

In [9]:
batched_target_input_numericalized = torch.from_numpy(np.stack(random_train_batch.target_numericalized_input_padded)).long()

In [10]:
batched_source_properties = torch.from_numpy(np.stack([np.stack(random_train_batch[p].to_numpy()) for p in properties_columns], axis=2)).float()

In [11]:
batched_source_padding_mask = torch.from_numpy(np.stack(random_train_batch.source_padding_mask)).bool()

In [12]:
batched_target_input_padding_mask = torch.from_numpy(np.stack(random_train_batch.target_input_padding_mask)).bool()

In [13]:
shape_and_type(batched_source_numericalized, batched_source_properties, batched_target_input_numericalized, batched_source_padding_mask, batched_target_input_padding_mask)

[(torch.Size([32, 512]), 'torch.LongTensor'),
 (torch.Size([32, 512, 5]), 'torch.FloatTensor'),
 (torch.Size([32, 256]), 'torch.LongTensor'),
 (torch.Size([32, 512]), 'torch.BoolTensor'),
 (torch.Size([32, 256]), 'torch.BoolTensor')]

In [14]:
# batched_source_numericalized -> batch_size x source_sequence_length(start_id, t1, t2, t3.., end_id, start_id, t4.., pad_id, pad_id..)(int/long)
# batched_source_properties -> batch_size x source_sequence_length x 5(number_of_properties)(0 for special tokens...)(float)
# batched_target_input_numericalized -> batch_size x target_sequence_length (start_id, t1, t2... pad_id, pad_id, pad_id...)(int/long)(without end_id)
# batched_source/target_padding_mask -> batch_size x source/target_sequence_length(False, False..., True, True..)(bool)

In [18]:
self = Transformer()

In [19]:
batched_source_embedded = self.embedding_layer(batched_source_numericalized)
batched_target_embedded = self.embedding_layer(batched_target_input_numericalized)
# batched_source/target_embedded -> batch_size x source/target_sequence_length x embedding_size

In [20]:
shape_and_type(batched_source_embedded, batched_target_embedded)

[(torch.Size([32, 512, 400]), 'torch.FloatTensor'),
 (torch.Size([32, 256, 400]), 'torch.FloatTensor')]

In [21]:
hp.MODEL_DIMENSION, hp.EMBEDDING_SIZE

(400, 400)

In [22]:
# Append the per-token property vector to each source token embedding along the feature axis.
batched_source = torch.cat((batched_source_embedded, batched_source_properties), dim=2) 
# -> batch_size x source_sequence_length x (embedding_size + number_of_properties)

In [23]:
shape_and_type(batched_source)

[(torch.Size([32, 512, 405]), 'torch.FloatTensor')]

In [24]:
# Project the concatenated (embedding + properties) features to the model width.
batched_source = self.combine_embeddings_and_properties_layer(batched_source)  
# -> batch_size x source_sequence_length x model_dimension (400 with the current hyperparameters, see output below)

In [25]:
shape_and_type(batched_source)

[(torch.Size([32, 512, 400]), 'torch.FloatTensor')]

In [26]:
shape_and_type(batched_source, batched_target_embedded)

[(torch.Size([32, 512, 400]), 'torch.FloatTensor'),
 (torch.Size([32, 256, 400]), 'torch.FloatTensor')]

In [27]:
shape_and_type(self.positional_encoding_layer.positional_encoding)

[(torch.Size([5000, 400]), 'torch.FloatTensor')]

In [28]:
# Scale by sqrt(model_dimension) before adding the positional encodings (the embedding scaling
# used in "Attention Is All You Need"). This enlarges the values; it does not normalise them.
batched_source = self.positional_encoding_layer(batched_source * math.sqrt(self.model_dimension))
batched_target = self.positional_encoding_layer(batched_target_embedded * math.sqrt(self.model_dimension))
# batched_source/target -> batch_size x source/target_sequence_length x model_dimension

In [29]:
shape_and_type(batched_source, batched_target)

[(torch.Size([32, 512, 400]), 'torch.FloatTensor'),
 (torch.Size([32, 256, 400]), 'torch.FloatTensor')]

In [30]:
shape_and_type(batched_source_padding_mask, batched_target_input_padding_mask)

[(torch.Size([32, 512]), 'torch.BoolTensor'),
 (torch.Size([32, 256]), 'torch.BoolTensor')]

In [31]:
shape_and_type(self.decoder_attention_mask)

[(torch.Size([256, 256]), 'torch.FloatTensor')]

In [40]:
# Quick run on truncated sequences: only the first 10 source and the first 20 target positions
# are used, and every mask is sliced to match those lengths.
transformer_output = self.transformer(
    src=rearrange(batched_source[:,:10,:], 'b s m -> s b m'),  # same as .transpose(0, 1): sequence axis first
    tgt=rearrange(batched_target[:,:20,:], 'b s m -> s b m'),
    tgt_mask=self.decoder_attention_mask[:20,:20],
    src_key_padding_mask=batched_source_padding_mask[:,:10],
    memory_key_padding_mask=batched_source_padding_mask[:,:10],
    tgt_key_padding_mask=batched_target_input_padding_mask[:,:20],
)

In [41]:
shape_and_type(transformer_output)

[(torch.Size([20, 32, 400]), 'torch.FloatTensor')]

transformer_output = self.transformer(
    src=rearrange(batched_source, 'b s m -> s b m'),  # batched_source.tranpose(0, 1)
    tgt=rearrange(batched_target, 'b s m -> s b m'),
    tgt_mask=self.decoder_attention_mask,
    src_key_padding_mask=batched_source_padding_mask,
    memory_key_padding_mask=batched_source_padding_mask,
    tgt_key_padding_mask=batched_target_input_padding_mask,
)

In [42]:
transformer_output = rearrange(transformer_output, 's b m -> b s m')
# transformer_output -> batch_size x target_sequence_length x model_dimension

In [43]:
shape_and_type(transformer_output)

[(torch.Size([32, 20, 400]), 'torch.FloatTensor')]

In [44]:
vocabulary_logits = self.transformer_to_vocabulary_logits_layer(transformer_output)
# vocabulary_logits -> batch_size x target_sequence_length x vocabulary_size

In [45]:
shape_and_type(vocabulary_logits)

[(torch.Size([32, 20, 20000]), 'torch.FloatTensor')]

In [52]:
vocabulary_logits.min()

tensor(-2.8783, grad_fn=<MinBackward1>)

In [53]:
vocabulary_logits.max()

tensor(2.9149, grad_fn=<MaxBackward1>)

In [58]:
vocabulary_log_probability = F.log_softmax(vocabulary_logits, dim=-1)

In [59]:
shape_and_type(vocabulary_log_probability)

[(torch.Size([32, 20, 20000]), 'torch.FloatTensor')]

In [60]:
vocabulary_log_probability.min()

tensor(-12.9500, grad_fn=<MinBackward1>)

In [61]:
vocabulary_log_probability.max()

tensor(-7.1453, grad_fn=<MaxBackward1>)

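The `target_sequence_length` argument could be renamed to something like `cached_maximum_target_sequence_length`, since it only sets the size of the cached decoder mask, mirroring `cached_maximum_sequence_length` in the positional encoding.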

In [80]:
transformer = Transformer(target_sequence_length=20)

In [81]:
shape_and_type(transformer.forward(
    batched_source_numericalized[:,:10],
    batched_source_properties[:,:10,:],
    batched_target_input_numericalized[:,:20],
    batched_source_padding_mask[:,:10], 
    batched_target_input_padding_mask[:,:20],
))

[(torch.Size([32, 20, 20000]), 'torch.FloatTensor')]

In [None]:
from models.transformer import Transformer

In [75]:
class Transformer(nn.Module):
    """Encoder-decoder transformer whose source tokens carry extra numeric properties.

    Source side: token embeddings are concatenated with a per-token property
    vector, projected to `model_dimension`, scaled, positionally encoded and
    fed to the encoder. Target side: token embeddings are scaled, positionally
    encoded and fed to the decoder under a causal mask. The decoder output is
    projected to vocabulary logits.

    NOTE: target embeddings reach the transformer without any projection, so
    `embedding_size` is expected to equal `model_dimension`.

    `target_sequence_length` is the size of the causal mask cached as a buffer.
    Shorter targets use a slice of it; longer targets get a mask built on the fly.
    """

    def __init__(self, vocabulary_size=hp.VOCABULARY_SIZE, embedding_size=hp.EMBEDDING_SIZE, number_of_properties=hp.NUMBER_OF_PROPERTIES, padding_index=hp.PADDING_INDEX, model_dimension=hp.MODEL_DIMENSION, target_sequence_length=hp.TARGET_SEQUENCE_LENGTH, dropout_probability=hp.DROPOUT_PROBABILITY, feed_forward_transformer_layer_dimension=hp.FEED_FORWARD_TRANSFORMER_LAYER_DIMENSION):
        super().__init__()

        self.model_dimension = model_dimension

        # One embedding table (followed by dropout) is shared by source and target tokens.
        self.embedder = nn.Embedding(vocabulary_size, embedding_size, padding_idx=padding_index)
        self.embedder_drouput = nn.Dropout(dropout_probability)
        self.embedding_layer = nn.Sequential(self.embedder, self.embedder_drouput)

        self.combine_embeddings_and_properties_layer = nn.Linear(embedding_size + number_of_properties, model_dimension)
        self.positional_encoding_layer = PositionalEncoding(model_dimension, dropout_probability=dropout_probability)
        self.transformer = nn.Transformer(d_model=model_dimension, dim_feedforward=feed_forward_transformer_layer_dimension, dropout=dropout_probability)
        self.transformer_to_vocabulary_logits_layer = nn.Linear(model_dimension, vocabulary_size)

        # Causal mask: -inf strictly above the diagonal (future positions), 0 elsewhere.
        self.register_buffer('decoder_attention_mask', torch.full((target_sequence_length, target_sequence_length), float("-inf")).triu(diagonal=1))

    def _decoder_attention_mask_for(self, target_length):
        """Return a target_length x target_length causal mask matching the cached buffer's device and dtype."""
        cached_length = self.decoder_attention_mask.shape[0]
        if target_length <= cached_length:
            # The top-left square of a causal mask is itself a causal mask.
            return self.decoder_attention_mask[:target_length, :target_length]
        # Target longer than the cached mask: build a fresh one instead of failing.
        mask = torch.full((target_length, target_length), float("-inf")).triu(diagonal=1)
        return mask.to(self.decoder_attention_mask)

    def forward(self, batched_source_numericalized, batched_source_properties, batched_target_input_numericalized, batched_source_padding_mask, batched_target_input_padding_mask):
        """Compute vocabulary logits for every target position.

        All floating point inputs need to be torch.float (torch.double is not supported by the transformer).

        batched_source_numericalized -> batch_size x source_sequence_length (start_id, t1, t2, t3.., end_id, start_id, t4.., pad_id, pad_id..) (int/long)
        batched_source_properties -> batch_size x source_sequence_length x number_of_properties (0 for special tokens...) (float)
        batched_target_input_numericalized -> batch_size x target_sequence_length (start_id, t1, t2... pad_id, pad_id, pad_id...) (int/long) (without end_id)
        batched_source/target_input_padding_mask -> batch_size x source/target_sequence_length (False, False..., True, True..) (bool)

        Returns vocabulary_logits -> batch_size x target_sequence_length x vocabulary_size (raw, no softmax applied).
        """
        batched_source_embedded = self.embedding_layer(batched_source_numericalized)
        batched_target_embedded = self.embedding_layer(batched_target_input_numericalized)
        # batched_source/target_embedded -> batch_size x source/target_sequence_length x embedding_size

        batched_source = torch.cat((batched_source_embedded, batched_source_properties), dim=2)  # batch_size x source_sequence_length x (embedding_size + number_of_properties)
        batched_source = self.combine_embeddings_and_properties_layer(batched_source)  # batch_size x source_sequence_length x model_dimension

        # Scale by sqrt(model_dimension) before adding the positional encodings
        # (this enlarges the values; it is not a normalisation).
        scale = math.sqrt(self.model_dimension)
        batched_source = self.positional_encoding_layer(batched_source * scale)
        batched_target = self.positional_encoding_layer(batched_target_embedded * scale)
        # batched_source/target -> batch_size x source/target_sequence_length x model_dimension

        # Size the causal mask to the actual target length, so batches shorter
        # (or longer) than the cached target_sequence_length work too.
        target_length = batched_target_input_numericalized.shape[1]

        transformer_output = self.transformer(
            src=rearrange(batched_source, 'b s m -> s b m'),  # same as batched_source.transpose(0, 1): sequence axis first
            tgt=rearrange(batched_target, 'b s m -> s b m'),
            tgt_mask=self._decoder_attention_mask_for(target_length),
            src_key_padding_mask=batched_source_padding_mask,
            memory_key_padding_mask=batched_source_padding_mask,
            tgt_key_padding_mask=batched_target_input_padding_mask,
        )
        transformer_output = rearrange(transformer_output, 's b m -> b s m')
        # transformer_output -> batch_size x target_sequence_length x model_dimension

        vocabulary_logits = self.transformer_to_vocabulary_logits_layer(transformer_output)
        # vocabulary_logits -> batch_size x target_sequence_length x vocabulary_size

        # Raw logits are returned on purpose; F.log_softmax(vocabulary_logits, dim=-1) is not applied here.
        # NOTE(review): presumably the loss applies log-softmax itself, which is the
        # numerically more stable route - confirm against the training code.
        # https://deepdatascience.wordpress.com/2020/02/27/log-softmax-vs-softmax/

        return vocabulary_logits

In [17]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of sequences, then apply dropout.

    The encodings for the first `cached_maximum_sequence_length` positions are
    computed once and stored as a buffer. Longer inputs fall back to computing
    a matrix of the required length on the fly.

    model_dimension -> size of the last axis of the inputs (odd values are supported)
    dropout_probability -> dropout applied after the encodings are added
    cached_maximum_sequence_length -> number of positions precomputed in the buffer
    """

    def __init__(self, model_dimension, dropout_probability=0.1, cached_maximum_sequence_length = 5_000):
        super().__init__()
        self.model_dimension = model_dimension
        self.cached_maximum_sequence_length = cached_maximum_sequence_length

        positional_encoding = self._compute_positional_encoding_matrix(model_dimension, cached_maximum_sequence_length)
        # positional_encoding -> cached_maximum_sequence_length x model_dimension
        self.register_buffer('positional_encoding', positional_encoding)

        self.dropout_layer = nn.Dropout(dropout_probability)

    def _compute_positional_encoding_matrix(self, model_dimension, sequence_length):
        """Return a sequence_length x model_dimension matrix with sin on even columns and cos on odd columns."""
        positions = torch.arange(0, sequence_length, dtype=torch.float)
        inverse_frequencies = 1.0 / torch.pow(torch.tensor(10_000.0), torch.arange(0, model_dimension, 2, dtype=torch.float) / model_dimension)
        # angles -> sequence_length x ceil(model_dimension / 2)
        angles = positions.unsqueeze(1) * inverse_frequencies.unsqueeze(0)

        positional_encoding = torch.zeros(sequence_length, model_dimension)
        positional_encoding[:, ::2] = torch.sin(angles)
        # For an odd model_dimension there is one fewer odd column than even columns,
        # so drop the surplus angle column instead of failing on a shape mismatch.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : model_dimension // 2])

        return positional_encoding

    def forward(self, batched_input):
        """Return dropout(batched_input + positional encodings).

        batched_input -> batch_size x sequence_length x model_dimension
        """
        sequence_length = batched_input.shape[-2]
        if sequence_length <= self.cached_maximum_sequence_length:
            positional_encoding = self.positional_encoding
        else:
            # A freshly computed matrix is created on CPU in float32; move it to the
            # cached buffer's device and dtype so the addition works once the module
            # has been moved (e.g. to GPU).
            positional_encoding = self._compute_positional_encoding_matrix(self.model_dimension, sequence_length).to(self.positional_encoding)

        positionaly_encoded_input = batched_input + positional_encoding[:sequence_length, :]
        return self.dropout_layer(positionaly_encoded_input)
    

In [44]:
positional_encoding_layer = PositionalEncoding(model_dimension, cached_maximum_sequence_length=1000)

In [45]:
positional_encoding_layer.positional_encoding.shape

torch.Size([1000, 512])

In [11]:
# NOTE(review): `t` is also the alias bound by `from constants import tokens as t` in the
# imports cell; after this cell runs, `t` refers to the model instead of that module.
t = Transformer()

<super: <class 'Transformer'>, <Transformer object>>
