<a href="https://colab.research.google.com/github/luigiantonelli/DeepLearning-Project/blob/main/Deep_Learning_Project_Antonelli_Cuconasu_Gaudenzi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations and imports

In [1]:
# Install the third-party packages used by this notebook.
# Only gdown is version-pinned; pytorch-lightning and torchmetrics are not.
!pip install pytorch-lightning --quiet
!pip install torchmetrics --quiet
!pip install gdown==4.5.4 --no-cache-dir --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m826.4/826.4 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 KB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [73]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split
import torchmetrics
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
import pickle as pkl
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.progress import TQDMProgressBar
import math
from math import sqrt
import pickle
from typing import *
import gdown

In [3]:
"""
url = "https://drive.google.com/drive/folders/1-6MRkFoSSRJqeKgcMXm3PeA159KzHuB_?usp=sharing"
gdown.download_folder(url = url, quiet = True, use_cookies = False, remaining_ok=True)
"""

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
#dataset_folder_path = "/content/drive/MyDrive/Colab Notebooks/Deep Learning/DeepLearningProject-Shared"
dataset_folder_path = "/content/drive/MyDrive/Deep_Learning_Project"
os.chdir(dataset_folder_path)

In [7]:
!ls

algebra__linear_1d  lightning_logs	      model_classic_transformer.ckpt
datasets	    mathematics_dataset-v1.0  modules.txt


# Vocabulary

In this section we analyze all the dataset files to collect the characters that make up the vocabulary. We want to be sure that the vocabulary contains every character appearing in the files, regardless of the module we are working on.

Moreover, after this pre-processing phase we decided to add the special token `<unk>` (i.e., unknown). Thus, if at inference time the input contains characters that are not in the vocabulary, we are still able to pre-process it, since any unknown character is replaced by that special token.

In [8]:
def read_dataset(text_path: str, lowercase: bool=True) -> Tuple[List[str], List[str]]:
    """Read a question/answer text file into two parallel lists.

    The file is read line by line; even-indexed lines (0, 2, ...) are stored
    as questions and odd-indexed lines as answers. Trailing whitespace
    (including the newline) is stripped from every line.

    Args:
        text_path: Path of the text file to read.
        lowercase: When True (default) each line is lower-cased. When False the
            original casing is kept (previously nothing was read in this case).

    Returns:
        A ``(questions, answers)`` tuple of lists of strings.
    """
    questions = []
    answers = []

    with open(text_path) as f:
        for idx, line in enumerate(f):
            text = line.rstrip()
            if lowercase:
                text = text.lower()
            # Lines alternate: question, answer, question, answer, ...
            if idx % 2 == 0:
                questions.append(text)
            else:
                answers.append(text)

    return questions, answers

In [9]:
def get_vocabulary(lists_of_texts: List[List[str]]) -> Set[str]:
    """Return the distinct characters found in a collection of text lists.

    All the inner lists are concatenated, their items are joined with a single
    space, and the characters of the resulting string are counted.

    Args:
        lists_of_texts: Lists of strings to scan.

    Returns:
        The keys view of the character counter (set-like, in order of first
        appearance). Because items are joined with ``" "``, the space character
        is included whenever there are at least two items.
    """
    flattened = [text for texts in lists_of_texts for text in texts]
    return Counter(" ".join(flattened)).keys()

In [10]:
# Collect the path of every .txt file found in each dataset split folder
# (the glob patterns are relative to the current working directory).
folders = ['extrapolate', 'interpolate', 'train-easy', 'train-medium', 'train-hard']
files = []

for fold in folders:
    files += glob.glob(f"./mathematics_dataset-v1.0/{fold}/*.txt")

In [11]:
files[:5]

['./mathematics_dataset-v1.0/extrapolate/arithmetic__add_sub_multiple_longer.txt',
 './mathematics_dataset-v1.0/extrapolate/algebra__polynomial_roots_big.txt',
 './mathematics_dataset-v1.0/extrapolate/arithmetic__add_or_sub_big.txt',
 './mathematics_dataset-v1.0/extrapolate/arithmetic__div_big.txt',
 './mathematics_dataset-v1.0/extrapolate/arithmetic__mul_div_multiple_longer.txt']

In [12]:
def get_files_vocabulary(files: List[str], save: bool=False) -> List[str]:
    """Collect the sorted list of distinct characters used in the given files.

    Each file is read with ``read_dataset`` (lower-cased) and the characters of
    its questions and answers are merged into a running set.

    Args:
        files: Paths of the dataset text files to scan.
        save: When True, an intermediate sorted vocabulary is also written to
            ``./datasets/pre_vocabulary.pkl`` every 10 files, so that partial
            progress survives an interruption.

    Returns:
        The sorted list of characters. The final vocabulary is always pickled
        to ``./datasets/pre_vocabulary.pkl``, regardless of ``save``.
    """
    def _dump(chars: List[str]) -> None:
        # Persist a vocabulary snapshot to the fixed pre-vocabulary location.
        with open('./datasets/pre_vocabulary.pkl', 'wb') as out:
            pickle.dump(chars, out)

    # A real set: `{}` would be a dict, and `dict |= keys-view` of single
    # characters fails on Python >= 3.9.
    vocabulary = set()

    for i, path in enumerate(files):
        questions, answers = read_dataset(path)
        vocabulary.update(get_vocabulary([questions, answers]))

        # Save the vocabulary up to now (without rebinding the running set)
        if save and i % 10 == 0:
            _dump(sorted(vocabulary))

    # Save sorted vocabulary
    sorted_vocabulary = sorted(vocabulary)
    _dump(sorted_vocabulary)

    return sorted_vocabulary

This operation takes quite a while (~25 min), as it scans all the files. So, the call below is left as non-executed text to avoid running it.

    vocabulary = get_files_vocabulary(files)

In [30]:
def create_vocabulary_from_set(voc):
    """Build the token -> id mapping used to encode sentences.

    Ids 0-3 are reserved for the special tokens ``<pad>``, ``<bos>``, ``<eos>``
    and ``<unk>``; the items of ``voc`` receive consecutive ids starting at 4,
    in iteration order.

    Args:
        voc: Iterable of tokens (characters).

    Returns:
        A dict mapping every token to its integer id.
    """
    vocabulary = {'<pad>': 0, '<bos>': 1, '<eos>': 2, '<unk>': 3}
    first_free_id = len(vocabulary)
    vocabulary.update(
        (token, first_free_id + offset) for offset, token in enumerate(voc)
    )
    return vocabulary

In [15]:
with open('./datasets/vocabulary.pkl', 'rb') as f:
    vocabulary = pickle.load(f)

In [16]:
len(vocabulary)

54

In [31]:
v = create_vocabulary_from_set(vocabulary)

In [32]:
v

{'<pad>': 0,
 '<bos>': 1,
 '<eos>': 2,
 '<unk>': 3,
 ' ': 4,
 '!': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 '<': 26,
 '=': 27,
 '>': 28,
 '?': 29,
 'a': 30,
 'b': 31,
 'c': 32,
 'd': 33,
 'e': 34,
 'f': 35,
 'g': 36,
 'h': 37,
 'i': 38,
 'j': 39,
 'k': 40,
 'l': 41,
 'm': 42,
 'n': 43,
 'o': 44,
 'p': 45,
 'q': 46,
 'r': 47,
 's': 48,
 't': 49,
 'u': 50,
 'v': 51,
 'w': 52,
 'x': 53,
 'y': 54,
 'z': 55,
 '{': 56,
 '}': 57}

# Dataset

In [45]:
def get_all_module_files(module_name: str) -> List[str]:
    """List the training files of a module across the three difficulty splits.

    Args:
        module_name: File name of the module without the ``.txt`` extension
            (it is inserted into a glob pattern).

    Returns:
        The matching paths under ``./mathematics_dataset-v1.0``, ordered as
        train-easy, train-medium, train-hard.
    """
    splits = ('train-easy', 'train-medium', 'train-hard')
    return [
        path
        for split in splits
        for path in glob.glob(f"./mathematics_dataset-v1.0/{split}/{module_name}.txt")
    ]

In [46]:
module_files = get_all_module_files("algebra__linear_1d")
module_files

['./mathematics_dataset-v1.0/train-easy/algebra__linear_1d.txt',
 './mathematics_dataset-v1.0/train-medium/algebra__linear_1d.txt',
 './mathematics_dataset-v1.0/train-hard/algebra__linear_1d.txt']

In [47]:
def read_all_module_files(module_name: str) -> Tuple[List[str], List[str]]:
    """Read every training file of a module and concatenate the contents.

    Args:
        module_name: Module name passed to ``get_all_module_files``.

    Returns:
        A tuple of two lists holding, respectively, the first and the second
        element returned by ``read_dataset`` for each file, concatenated in
        file order.
    """
    all_questions: List[str] = []
    all_answers: List[str] = []

    for path in get_all_module_files(module_name):
        questions, answers = read_dataset(path)
        all_questions.extend(questions)
        all_answers.extend(answers)

    return all_questions, all_answers

In [48]:
# algebra_train, algebra_test = read_all_module_files("algebra__linear_1d")

In [49]:
# len(algebra_train)

In [50]:
algebra_path = "./mathematics_dataset-v1.0/train-easy/algebra__linear_1d.txt"
probability_path = "./mathematics_dataset-v1.0/train-easy/probability__swr_p_level_set.txt"
prime_path = "./mathematics_dataset-v1.0/train-easy/numbers__is_prime.txt"

In [51]:
"""
questions_easy_algebra, answers_easy_algebra = read_dataset(algebra_path)
questions_easy_probability, answers_easy_probability = read_dataset(probability_path)
questions_easy_prime, answers_easy_prime = read_dataset(prime_path)
"""

'\nquestions_easy_algebra, answers_easy_algebra = read_dataset(algebra_path)\nquestions_easy_probability, answers_easy_probability = read_dataset(probability_path)\nquestions_easy_prime, answers_easy_prime = read_dataset(prime_path)\n'

In [107]:
class Mathematics_Dataset(Dataset):
    """Character-level question/answer dataset built from text files.

    Every item is a pair of fixed-length id tensors: the question has
    ``max_len_question + 2`` entries and the answer ``max_len_answer + 2``
    (the two extra slots hold ``<bos>`` and ``<eos>``).
    """

    def __init__(self, modules: List[str], vocabulary: dict):
        """Load all the given files in memory.

        Args:
            modules: Paths of the text files to read (alternating
                question/answer lines).
            vocabulary: Mapping from token to id; it must contain the special
                tokens ``<pad>``, ``<bos>``, ``<eos>`` and ``<unk>``.
        """
        self.modules = modules
        self.questions = []
        self.answers = []
        for m in self.modules:
            q_m, a_m = self.read_dataset(m)
            self.questions += q_m
            self.answers += a_m
        self.max_len_question = 160
        self.max_len_answer = 30
        self.vocabulary = vocabulary

    def read_dataset(self, text_path: str, lowercase: bool=True) -> Tuple[List[str], List[str]]:
        """Read a file whose lines alternate between questions and answers.

        Args:
            text_path: Path of the text file.
            lowercase: Lower-case every line when True (default); keep the
                original casing when False (previously nothing was read in
                this case).

        Returns:
            A ``(questions, answers)`` tuple of right-stripped lines.
        """
        questions = []
        answers = []
        with open(text_path, 'r') as f:
            for idx, line in enumerate(f):
                text = line.rstrip()
                if lowercase:
                    text = text.lower()
                if idx % 2 == 0: # Questions
                    questions.append(text)
                else: # Answers
                    answers.append(text)
        return questions, answers

    def convert_chars_to_ids(self, sentence: str, max_len: int) -> torch.Tensor:
        """Encode a sentence as ``<bos> chars... <eos> <pad>...``.

        Args:
            sentence: Text to encode, one id per character; characters missing
                from the vocabulary map to ``<unk>``.
            max_len: Maximum number of characters kept. Longer sentences are
                truncated so that ``<eos>`` always fits (they used to raise an
                IndexError).

        Returns:
            A 1-D long tensor of length ``max_len + 2``.
        """
        sentence = sentence[:max_len]
        unk_id = self.vocabulary['<unk>']
        sentence_ids = np.full(max_len + 2, self.vocabulary['<pad>'])

        # Start with <bos>
        sentence_ids[0] = self.vocabulary['<bos>']

        for i, char in enumerate(sentence):
            sentence_ids[i + 1] = self.vocabulary.get(char, unk_id)

        # End with <eos>
        sentence_ids[len(sentence) + 1] = self.vocabulary['<eos>']

        return torch.from_numpy(sentence_ids).long()


    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the encoded ``(question, answer)`` tensors at position ``idx``."""
        assert idx < len(self.questions)

        q, a = self.questions[idx], self.answers[idx]

        question = self.convert_chars_to_ids(q, self.max_len_question)
        answer = self.convert_chars_to_ids(a, self.max_len_answer)

        return question, answer

In [53]:
d = Mathematics_Dataset(module_files, v)

In [54]:
q, a = d[8]
q, a

(tensor([ 1, 48, 44, 41, 51, 34,  4, 12, 16, 20,  9, 35,  4, 10,  4, 17, 16,  9,
         35,  4, 12,  4, 16, 17,  4, 27,  4, 15,  4, 35, 44, 47,  4, 35, 13,  2,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 tensor([ 1, 17,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]))

In [87]:
class Mathematics_DataModule(pl.LightningDataModule):
    """Lightning data module serving train/val/test splits of the dataset.

    The dataset is split 75% / 5% / 20% exactly once, with a seeded generator,
    so repeated ``setup`` calls reuse the same partition and the test split
    never overlaps the samples used for training.
    """

    def __init__(self, modules: List[str], batch_size: int = 32, seed: int = 42):
        """Store the configuration and load the vocabulary.

        Args:
            modules: Paths of the dataset text files.
            batch_size: Batch size used by all the dataloaders.
            seed: Seed of the generator used to split the dataset.
        """
        super().__init__()
        self.modules = modules
        self.batch_size = batch_size
        self.seed = seed
        self.math = None
        self.math_train = None
        self.math_val = None
        self.math_test = None
        self.load_vocabulary()

    def load_vocabulary(self):
        """Load the pickled character list and build the token -> id mapping."""
        # NOTE: pickle.load executes arbitrary code; only use a trusted file.
        with open('./datasets/vocabulary.pkl', 'rb') as f:
            v = pickle.load(f)
        self.vocabulary = create_vocabulary_from_set(v)

    def setup(self, stage: Optional[str] = None):
        """Build the dataset and its splits (only on the first call).

        Args:
            stage: Stage name passed by the trainer; unused here.
        """
        if self.math is not None:
            # Already split: re-splitting would shuffle samples across splits.
            return
        self.math = Mathematics_Dataset(self.modules, self.vocabulary)
        generator = torch.Generator().manual_seed(self.seed)
        self.math_train, self.math_val, self.math_test = random_split(
            self.math, [0.75, 0.05, 0.20], generator=generator)

    def train_dataloader(self):
        """Shuffled dataloader over the training split."""
        return DataLoader(self.math_train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Dataloader over the validation split."""
        return DataLoader(self.math_val, batch_size=self.batch_size)

    def test_dataloader(self):
        """Dataloader over the test split."""
        return DataLoader(self.math_test, batch_size=self.batch_size)

    def teardown(self, stage: str):
        # Used to clean-up when the run is finished
        pass

In [56]:
dm = Mathematics_DataModule(['./mathematics_dataset-v1.0/train-easy/algebra__linear_1d.txt'], batch_size = 64)

# Modules

In [88]:
def scaled_dot_product_attention(query, key, value, mask = None):
    """Compute softmax(Q K^T / sqrt(d)) V with optional masking.

    Args:
        query: Tensor whose last dimension ``d`` is the head size.
        key: Tensor multiplied (transposed on its last two dims) with ``query``.
        value: Tensor weighted by the attention probabilities.
        mask: Optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` receive (numerically) zero attention weight.

    Returns:
        The attention-weighted combination of ``value``.
    """
    scale = sqrt(query.size(-1))

    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        # Use the most negative finite value of the dtype: it acts like
        # -infinity in the softmax (the former -1e-10 was ~0 and masked
        # nothing) and a fully masked row yields no NaN.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    return torch.matmul(F.softmax(scores, dim = -1), value)

In [89]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention with an optional TP-attention role projection.

    When ``tp_attention`` is True an additional ``W_r`` projection of the query
    input is multiplied element-wise with the attention output before the
    final ``W_o`` projection.
    """

    def __init__(self, embedding_dim, num_heads, tp_attention = False):
        super().__init__()
        assert embedding_dim % num_heads == 0
        self.tp_attention = tp_attention
        self.dim_head = embedding_dim // num_heads  # size of a single head
        self.num_heads = num_heads
        # Each projection stacks the per-head matrices in one linear layer.
        self.W_q = nn.Linear(embedding_dim, embedding_dim, bias = True)
        self.W_k = nn.Linear(embedding_dim, embedding_dim, bias = True)
        self.W_v = nn.Linear(embedding_dim, embedding_dim, bias = True)
        self.W_o = nn.Linear(embedding_dim, embedding_dim, bias = True)
        if self.tp_attention:
            self.W_r = nn.Linear(embedding_dim, embedding_dim, bias = True)  # role

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, len, heads * dim_head) into (batch, heads, len, dim_head)."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.dim_head)
        return per_head.transpose(1, 2)

    def forward(self, query, key, value, mask):
        """Attend from ``query`` to ``key``/``value`` under ``mask``."""
        batch_size = query.size(0)
        q = self._split_heads(self.W_q(query), batch_size)
        k = self._split_heads(self.W_k(key), batch_size)
        v = self._split_heads(self.W_v(value), batch_size)

        attended = scaled_dot_product_attention(q, k, v, mask)

        if self.tp_attention:
            # Element-wise product between attention value and role, applied
            # before the final projection.
            role = self._split_heads(self.W_r(query), batch_size)
            attended = attended * role

        merged = attended.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.dim_head)
        return self.W_o(merged)

In [90]:
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer is wrapped as ``dropout(layer_norm(residual + sublayer))``.
    ``hidden_size`` defaults to ``4 * embedding_dim`` when None.
    """

    def __init__(self, embedding_dim, num_heads, hidden_size = None, dropout=0.2, tp_attention = False):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(embedding_dim, num_heads, tp_attention)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout2 = nn.Dropout(dropout)
        if hidden_size is None:
            hidden_size = 4 * embedding_dim
        self.ff = nn.Sequential(
            nn.Linear(embedding_dim, hidden_size, bias = True),
            nn.ReLU(inplace = True),
            nn.Linear(hidden_size, embedding_dim, bias = True),
        )

    def forward(self, query, key, value, mask):
        """Apply attention and feed-forward, each with a residual connection."""
        # The residual uses `query`: decoder blocks need it, and for encoder
        # blocks query, key and value are the same tensor anyway.
        attended = self.attention(query, key, value, mask)
        hidden = self.dropout1(self.norm1(query + attended))
        return self.dropout2(self.norm2(hidden + self.ff(hidden)))

In [91]:
class DecoderBlock(nn.Module):
    """Decoder layer: masked self-attention, then a cross-attention block."""

    def __init__(self, embedding_dim, num_heads, hidden_size, dropout = 0.2, tp_attention = False):
        super().__init__()
        self.masked_attention = MultiHeadAttention(embedding_dim, num_heads, tp_attention)
        self.norm = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.transformer_block = TransformerBlock(embedding_dim, num_heads, hidden_size, dropout, tp_attention)

    def forward(self, output_encoder, src_mask, y, trg_mask):
        """Run one decoder layer on the target states ``y``."""
        # Masked self-attention (query = key = value = y) + residual connection.
        self_attended = self.masked_attention(y, y, y, trg_mask)
        decoder_state = self.dropout(self.norm(y + self_attended))
        # Query comes from the masked attention, key and value from the encoder.
        return self.transformer_block(decoder_state, output_encoder, output_encoder, src_mask)

In [92]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature indices hold ``sin(pos * w_k)`` and odd indices
    ``cos(pos * w_k)`` with ``w_k = 10000^(-2k / embedding_dim)``. The table
    is stored in the non-trainable buffer ``pe`` of shape
    ``(1, max_len, embedding_dim)``.
    """

    def __init__(self, embedding_dim, max_len = 5000):
        """Pre-compute the encoding table.

        Args:
            embedding_dim: Size of the embeddings (even or odd).
            max_len: Longest sequence length supported.
        """
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2) * -(math.log(10000.0) / embedding_dim))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embedding_dim there is one cosine column fewer than sine
        # columns, so drop the surplus frequency instead of failing.
        pe[:, 1::2] = torch.cos(angles[:, :embedding_dim // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        # Buffers never require grad, so no Variable wrapper is needed.
        return x + self.pe[:, :x.size(1)]

In [93]:
class TransformerEncoder(nn.Module):
    """Stack of ``num_blocks`` self-attention ``TransformerBlock`` layers."""

    def __init__(self, embedding_dim, num_heads, hidden_size, dropout, num_blocks = 6, tp_attention = False):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        layers = []
        for _ in range(num_blocks):
            layers.append(TransformerBlock(embedding_dim, num_heads, hidden_size, dropout, tp_attention))
        self.encoder = nn.ModuleList(layers)

    def forward(self, x, mask):
        """Feed ``x`` through every block, using it as query, key and value."""
        hidden = x
        for layer in self.encoder:
            hidden = layer(hidden, hidden, hidden, mask)
        return hidden

In [94]:
class TransformerDecoder(nn.Module):
    """Stack of ``num_blocks`` ``DecoderBlock`` layers."""

    def __init__(self, embedding_dim, num_heads, hidden_size, dropout = 0.2, num_blocks = 6, tp_attention = False):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        layers = []
        for _ in range(num_blocks):
            layers.append(DecoderBlock(embedding_dim, num_heads, hidden_size, dropout, tp_attention))
        self.decoder = nn.ModuleList(layers)

    def forward(self, output_encoder, src_mask, y, trg_mask):
        """Feed the target states ``y`` through every decoder block in turn."""
        state = y
        for layer in self.decoder:
            state = layer(output_encoder, src_mask, state, trg_mask)
        return state

In [95]:
class Transformer(pl.LightningModule):
    """Character-level encoder-decoder Transformer trained with teacher forcing.

    The ``tp_attention`` flag is forwarded to the encoder and decoder stacks.
    ``special_idxs`` must map ``'<bos>'``, ``'<eos>'`` and ``'<pad>'`` to their
    vocabulary ids.
    """

    def __init__(self, special_idxs, embedding_dim = 256, num_heads = 4, hidden_size = 512, dropout = 0.2, vocabulary_size = 58, num_blocks_encoder = 6, num_blocks_decoder = 6, tp_attention = False):
        super(Transformer, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.bos_id = special_idxs['<bos>']
        self.eos_id = special_idxs['<eos>']
        self.pad_id = special_idxs['<pad>']
        self.token_embedding = nn.Embedding(vocabulary_size, embedding_dim, padding_idx = self.pad_id)
        self.positional_embedding = PositionalEncoding(embedding_dim)
        self.encoder = TransformerEncoder(embedding_dim, num_heads, hidden_size, dropout, num_blocks_encoder, tp_attention)
        self.decoder = TransformerDecoder(embedding_dim, num_heads, hidden_size, dropout, num_blocks_decoder, tp_attention)
        self.to_logits = nn.Linear(embedding_dim, vocabulary_size)

        # Sequence lengths including the <bos> and <eos> slots.
        self.max_len_question = 162
        self.max_len_answer = 32
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=vocabulary_size, ignore_index = self.pad_id)

    def create_trg_mask(self, y):
        """Mask so that each target position only sees previous, non-pad tokens."""
        # [batch_size, 1, len, len] & [batch_size, 1, 1, len]
        return self.create_causal_mask(y) & self.create_padding_mask(y)

    def create_causal_mask(self, y):
        """Lower-triangular mask of shape [batch_size, 1, len, len]."""
        batch_size, seq_len = y.shape
        mask = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.int64, device = self.device)).expand(
            batch_size, 1, seq_len, seq_len)
        return mask

    def create_padding_mask(self, x):
        """Boolean mask of shape [batch_size, 1, 1, len], False on <pad> tokens."""
        batch_size, seq_len = x.shape
        mask = (x != self.pad_id).unsqueeze(-2).unsqueeze(-2).expand(
                batch_size, 1, 1, seq_len)
        return mask

    def inference(self, x):
        """Greedily decode an answer for each encoded question in ``x``.

        Args:
            x: Long tensor [batch_size, len] of question token ids.

        Returns:
            Long tensor [batch_size, n] with ``n <= max_len_answer``; each row
            starts with <bos> and holds <pad> after its first <eos>.
        """
        # Encode once, then generate the output token by token. The previous
        # train/eval mode is restored on exit.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                batch_size = x.shape[0]
                src_mask = self.create_padding_mask(x)
                x = self.token_embedding(x)
                x = self.positional_embedding(x)
                output_encoder = self.encoder(x, src_mask)
                output = torch.full((batch_size, 1), self.bos_id, dtype=torch.int64, device = self.device)
                done = torch.zeros(batch_size, dtype = torch.bool, device = self.device)
                for _ in range(self.max_len_answer - 1):
                    trg_mask = self.create_trg_mask(output)
                    output_embedding = self.token_embedding(output)
                    output_embedding = self.positional_embedding(output_embedding)
                    out = self.decoder(output_encoder, src_mask, output_embedding, trg_mask)
                    out = self.to_logits(out)
                    out = torch.argmax(out[:,[-1],:], dim = -1)
                    # Sequences that already produced <eos> only receive <pad>.
                    out = out.masked_fill(done.unsqueeze(1), self.pad_id)
                    output = torch.cat([output, out], dim = 1)

                    done |= out.squeeze(1) == self.eos_id
                    if done.all():
                        break
                return output
        finally:
            self.train(was_training)

    def forward(self, x, y):
        """Return next-token logits of shape [batch_size, vocabulary, len_y].

        Args:
            x: Question token ids [batch_size, len_x].
            y: Decoder input token ids [batch_size, len_y].
        """
        src_mask = self.create_padding_mask(x)
        trg_mask = self.create_trg_mask(y)

        x = self.token_embedding(x)
        x = self.positional_embedding(x)
        y = self.token_embedding(y)
        y = self.positional_embedding(y)

        output_encoder = self.encoder(x, src_mask)
        # Class dimension moved to position 1, as F.cross_entropy expects.
        return self.to_logits(self.decoder(output_encoder, src_mask, y, trg_mask)).transpose(1,2)

    def configure_optimizers(self):
        # Values taken from the paper: learning rate = 1e-4, beta1 = 0.9, beta2 = 0.995
        return torch.optim.Adam(self.parameters(), lr=1e-4, betas=(0.9, 0.995))

    def training_step(self, batch, batch_idx):
        """Teacher-forced cross-entropy loss on the next-token prediction."""
        x, y = batch
        # Shift by one: the decoder reads y[:, :-1] and must predict y[:, 1:].
        # Feeding and scoring the same y lets each position copy its own
        # input token, because the causal mask includes the diagonal.
        decoder_input, target = y[:, :-1], y[:, 1:]
        y_pred = self(x, decoder_input)
        loss = F.cross_entropy(y_pred, target, ignore_index = self.pad_id)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Accumulate token-level accuracy of greedy decoding."""
        x, y = batch
        y_pred = self.inference(x)  #[batch_size, max_eos_found]
        y_pred = F.pad(y_pred, (0, self.max_len_answer - y_pred.shape[1]), mode='constant', value=self.pad_id) #[batch_size, max_len_answer]
        # Position 0 is the forced <bos> token: skip it so it is not counted
        # as a correct prediction.
        self.accuracy.update(y_pred[:, 1:], y[:, 1:])

    def on_validation_epoch_end(self):
        """Log and reset the accuracy accumulated during validation."""
        # This hook takes no `outputs` argument and is used in place of
        # `validation_epoch_end`.
        self.log('accuracy_epoch', self.accuracy.compute())
        self.accuracy.reset()


In [118]:
"""
output = torch.ones(5, 1, dtype=torch.int64).fill_(1)
done = torch.zeros(5, dtype = torch.uint8)

for i in range(10): 
    out = torch.zeros((5,1))
    out[i, :] = 2
    output = torch.cat([output, out], dim = 1)

    eos_reached = out.squeeze(1) == 2
    done |= eos_reached
    if done.sum() == 5:
        break
print(output.shape)
print(output)
output = F.pad(output, (0, 10 - output.shape[1]), mode='constant', value=5)
output.shape
print(output)
"""

torch.Size([5, 6])
tensor([[1., 2., 0., 0., 0., 0.],
        [1., 0., 2., 0., 0., 0.],
        [1., 0., 0., 2., 0., 0.],
        [1., 0., 0., 0., 2., 0.],
        [1., 0., 0., 0., 0., 2.]])
tensor([[1., 2., 0., 0., 0., 0., 5., 5., 5., 5.],
        [1., 0., 2., 0., 0., 0., 5., 5., 5., 5.],
        [1., 0., 0., 2., 0., 0., 5., 5., 5., 5.],
        [1., 0., 0., 0., 2., 0., 5., 5., 5., 5.],
        [1., 0., 0., 0., 0., 2., 5., 5., 5., 5.]])


# SOTA

In [96]:
vocabulary = v

In [105]:
# Output locations (relative to the current working directory).
root_dir = "./training/"
logger_dir = "./training/logs"
# Training hyper-parameters.
EPOCHS = 3
BATCH_SIZE = 4
# Model hyper-parameters.
EMBEDDING_DIM = 256
NUM_HEADS = 8
# The embedding is split evenly across the attention heads.
assert EMBEDDING_DIM % NUM_HEADS == 0
HIDDEN_SIZE = 512
DROP_PROB = 0.2
NUM_BLOCKS_ENCODER = 6
NUM_BLOCKS_DECODER = 6
# Ids of the special tokens, passed to the model constructor.
SPECIAL_CHAR_DICT = {'<bos>': vocabulary['<bos>'], '<eos>': vocabulary['<eos>'], '<pad>': vocabulary['<pad>']}

In [106]:
# Build the model with TP-attention enabled.
tp_transformer = Transformer(
    SPECIAL_CHAR_DICT, embedding_dim = EMBEDDING_DIM, num_heads = NUM_HEADS, hidden_size = HIDDEN_SIZE, 
    dropout = DROP_PROB, vocabulary_size = len(vocabulary), num_blocks_encoder = NUM_BLOCKS_ENCODER,
    num_blocks_decoder = NUM_BLOCKS_DECODER, tp_attention = True
    )

# TensorBoard logger and progress bar (refresh_rate=20), then train on all
# the files listed in `module_files` with gradient clipping at 0.1.
logger = TensorBoardLogger(logger_dir, name="TP-Transformer", log_graph=True)
callbacks = [TQDMProgressBar(refresh_rate=20)]
trainer = pl.Trainer(default_root_dir=root_dir, accelerator='auto', devices=1, gradient_clip_val = 0.1, max_epochs = EPOCHS, logger = logger, callbacks = callbacks)
math_dm = Mathematics_DataModule(module_files, batch_size = BATCH_SIZE)
trainer.fit(tp_transformer, datamodule = math_dm)
# TODO: add a folder for the logs: /content/drive/MyDrive/Deep_Learning_Project/lightning_logs


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                 | Type               | Params
------------------------------------------------------------
0 | token_embedding      | Embedding          | 14.8 K
1 | positional_embedding | PositionalEncoding | 0     
2 | encoder              | TransformerEncoder | 3.6 M 
3 | decoder              | TransformerDecoder | 5.5 M 
4 | to_logits            | Linear             | 14.9 K
5 | accuracy             | MulticlassAccuracy | 0     
------------------------------------------------------------
9.1 M     Trainable params
0         N

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [99]:
trainer.save_checkpoint("model_classic_transformer.ckpt")

In [100]:
"""
da fare:

-  aggiungere TensorBoardLogger
    var = TensorBoardLogger(path, name=modello (tp o vanilla), log_graph=True)
   (https://pytorch-lightning.readthedocs.io/en/stable/extensions/generated/pytorch_lightning.loggers.TensorBoardLogger.html)

-  utilizzare stage (parametro di setup) per caricare anche un solo dataset se stage = "train" ad esempio 
   (https://colab.research.google.com/drive/1oJrA-Q-neOl1fCQJhIWR_GmxpYaG-cFx?authuser=1#scrollTo=JM57yq7bJS0E)

-  aggiungere predict_step nel pl.LightningModule dove si chiama inference e relativo predict dataloader nel Lightning data module


-  RNN fatte molto bene:
    https://github.com/georgeyiasemis/Recurrent-Neural-Networks-from-scratch-using-PyTorch 
    https://towardsdatascience.com/building-a-lstm-by-hand-on-pytorch-59c02a4ec091
"""

'\nda fare:\n-  drop_last = True nei dataloader perché altrimenti abbiamo sicuramente degli errori\n   (possibile alternativa: nella mha ricavare la batch size dalla prima dimensione dell\'input \n    per evitare che dia errori di questo tipo durante l\'inference con un solo input)\n\n-  callbacks=[TQDMProgressBar(refresh_rate=20)] da aggiungere come parametro al trainer\n\n-  aggiungere TensorBoardLogger\n    var = TensorBoardLogger(path, name=modello (tp o vanilla), log_graph=True)\n   (https://pytorch-lightning.readthedocs.io/en/stable/extensions/generated/pytorch_lightning.loggers.TensorBoardLogger.html)\n\n-  utilizzare stage (parametro di setup) per caricare anche un solo dataset se stage = "train" ad esempio \n   (https://colab.research.google.com/drive/1oJrA-Q-neOl1fCQJhIWR_GmxpYaG-cFx?authuser=1#scrollTo=JM57yq7bJS0E)\n\n-  aggiungere predict_step nel pl.LightningModule dove si chiama inference e relativo predict dataloader nel Lightning data module\n\n-  in inference ha senso

# NON-SOTA