## Importação dos pacotes

In [1]:
import collections
import itertools
import functools
import math
import random

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook


In [2]:
if torch.cuda.is_available(): 
   dev = "cuda:0"
else: 
   dev = "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

Using cuda:0


## Implementação do MyDataset

In [3]:
from typing import List
from tqdm.notebook import tqdm


def tokenize(text: str, tokenizer):
    # Recomenda-se usar o tokenizer.batch_encode_plus pois é mais rápido.
    return tokenizer(text, return_tensors=None, add_special_tokens=False).input_ids


class MyDataset():
    def __init__(self, texts: List[str], tokenizer, max_seq_length: int):
        self.max_seq_length = max_seq_length
        self.tokenized_texts = []
        for text in tqdm(texts):
            tokenized_text = tokenize(f'[CLS] {text}', tokenizer)
            tokenized_text += [tokenizer.vocab['[PAD]']] * max(0, 1 + max_seq_length - len(tokenized_text))
            
            for i in range(0, len(tokenized_text) - 1, max_seq_length):
                
                if i + max_seq_length < len(tokenized_text):
                    self.tokenized_texts.append(tokenized_text[i: i + max_seq_length + 1])
                else:
                    self.tokenized_texts.append(tokenized_text[-max_seq_length - 1:])
                    
        self.tokenized_texts = torch.LongTensor(self.tokenized_texts)

    def __len__(self):
        return len(self.tokenized_texts)

    def __getitem__(self, idx):
        x_y = self.tokenized_texts[idx]
        return x_y[:-1], x_y[1:]

## Testando se a implementação do MyDataset está correta

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza']

dummy_dataset = MyDataset(texts=dummy_texts, tokenizer=tokenizer, max_seq_length=9)
dummy_loader = DataLoader(dummy_dataset, batch_size=6, shuffle=False)
assert len(dummy_dataset) == 2
print('Passou no assert de tamanho do dataset.')

first_batch_input, first_batch_target = next(iter(dummy_loader))

correct_first_batch_input = torch.LongTensor(
    [[  101,  3396, 10303,   125, 13239,     0,     0,     0,     0],
     [  101,  1660,  5971,   785,   125,  1847, 13779, 15616,     0]])

correct_first_batch_target = torch.LongTensor(
    [[ 3396, 10303,   125, 13239,     0,     0,     0,     0,     0],
     [ 1660,  5971,   785,   125,  1847, 13779, 15616,     0,     0]])

assert torch.equal(first_batch_input, correct_first_batch_input)
assert torch.equal(first_batch_target, correct_first_batch_target)

print('Passou no assert de dataset.')

  0%|          | 0/2 [00:00<?, ?it/s]

Passou no assert de tamanho do dataset.
Passou no assert de dataset.


In [5]:
tokenize('Eu gosto pizza', tokenizer)

[3396, 10303, 13779, 15616]

In [6]:
dummy_texts = ['Eu gosto de correr e de comer muita pizza', 'Ela gosta muito de comer pizza']

dummy_dataset = MyDataset(texts=dummy_texts, tokenizer=tokenizer, max_seq_length=9)
dummy_loader = DataLoader(dummy_dataset, batch_size=6, shuffle=False)
assert len(dummy_dataset) == 3
print('Passou no assert de tamanho do dataset.')

first_batch_input, first_batch_target = next(iter(dummy_loader))

correct_first_batch_input = torch.LongTensor(
    [[  101,  3396, 10303,  125, 13239,  122,    125,  1847,  5747],
     [  3396, 10303, 125, 13239,   122,  125,   1847,  5747, 13779],
     [  101,  1660,  5971,  785,   125,  1847, 13779, 15616,     0]])

correct_first_batch_target = torch.LongTensor(
    [[ 3396, 10303,  125, 13239,  122,    125,  1847,  5747, 13779],
     [ 10303, 125, 13239,   122,  125,   1847,  5747, 13779, 15616],
     [ 1660,  5971,   785,   125,  1847, 13779, 15616,     0,     0]])

assert torch.equal(first_batch_input, correct_first_batch_input)
assert torch.equal(first_batch_target, correct_first_batch_target)

print('Passou no assert de dataset.')

  0%|          | 0/2 [00:00<?, ?it/s]

Passou no assert de tamanho do dataset.
Passou no assert de dataset.


# Carregamento do dataset 

In [7]:
# !wget -nc https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula9/sample-1gb.txt

In [8]:
# Load datasets

seq_length = 12

max_seq_length = seq_length

texts = open('sample-1gb.txt').readlines()

len_max = int(len(texts)/200)

train_examples = int(len_max*0.6)
valid_examples = int(len_max*0.3)
test_examples = int(len_max*0.1)

print(f"train examples: {train_examples}")
print(f"valid examples: {valid_examples}")
print(f"test examples: {test_examples}")



print(f'Read {len(texts)} lines.')

max_lines = train_examples + valid_examples + test_examples
print(f'Truncating to {max_lines} lines.')
texts = texts[:max_lines]  

training_texts = texts[:-(valid_examples + test_examples)]
valid_texts = texts[-(valid_examples + test_examples):-test_examples]
test_texts = texts[-test_examples:]


train examples: 750
valid examples: 375
test examples: 125
Read 250000 lines.
Truncating to 1250 lines.


In [9]:
training_dataset = MyDataset(texts=training_texts, tokenizer=tokenizer, max_seq_length = max_seq_length)
valid_dataset = MyDataset(texts=valid_texts, tokenizer=tokenizer, max_seq_length = max_seq_length)
test_dataset = MyDataset(texts=test_texts, tokenizer=tokenizer, max_seq_length = max_seq_length)

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/375 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

In [10]:
print(f'training examples: {len(training_dataset)}')
print(f'valid examples: {len(valid_dataset)}')
print(f'test examples: {len(test_dataset)}')

training examples: 75145
valid examples: 30484
test examples: 12085


In [11]:
from collections import OrderedDict

class MultiHeadSelfAttention(torch.nn.Module):

    def __init__(self, max_seq_length, embedding_dim, num_heads, padding_id):
        super().__init__()
        
        self.num_heads = num_heads
        self.padding_id = padding_id
        self.max_seq_length = max_seq_length
        
        self.W_q = torch.nn.Linear(embedding_dim, embedding_dim, bias = False)
        self.W_k = torch.nn.Linear(embedding_dim, embedding_dim, bias = False)
        self.W_v = torch.nn.Linear(embedding_dim, embedding_dim, bias = False)
        
        self.W_o = torch.nn.Linear(embedding_dim, embedding_dim, bias = False)
        
        self.mask = torch.tril(torch.ones(max_seq_length, max_seq_length)).unsqueeze(0).to(device)
  
    def _self_attention(self, q, k, v, mask):
        """Tenho a ponderação de todos os tokens contra todos"""
        s = torch.matmul(q, k.transpose(-2, -1))  # B, H, L, D/H x B, H, D/H, L -> B, H, L, L   
        
        """preenchimento da mascara"""
        s = s.masked_fill(mask.unsqueeze(1) == 0, -float("inf"))
        
        """mascara aplicando softmax"""
        p = torch.nn.functional.softmax(s, dim=-1)  # shape = B, H, L, L
        
        att_output = torch.matmul(p, v)  # B, H, L, L x B, H, L, D/H -> B, H, L, D/H
        
        return att_output.transpose(1, 2) # B, L, H, D/H  Para cada token, vou ter uma nva representação dele
  
    def forward(self, inputs, mask):
        
        batch_size = inputs.shape[0]
        
        """
        Modificar o codigo aqui para entrar o embbeding do modelo encoder.
        """
        
        q = self.W_q(inputs)  
        q = q.view(batch_size, self.max_seq_length, self.num_heads, -1)  
        q = q.transpose(1, 2)
        
        k = self.W_k(inputs)  
        k = k.view(batch_size, self.max_seq_length, self.num_heads, -1)
        k = k.transpose(1, 2)
        
        v = self.W_v(inputs)
        v = v.view(batch_size, self.max_seq_length, self.num_heads, -1)
        v = v.transpose(1, 2)
        
        """
        The embbedings  K V representions the images,
        should be same dims that Q.
        """
        att_output = self._self_attention(q, k, v, mask)
        
        att_output = att_output.reshape(batch_size, self.max_seq_length, -1)  # B, L, D  Cada token tem sua representação contextualizada
        att_output = self.W_o(att_output)
        
        return att_output

In [12]:
batch_size_ = 2
max_seq_length_ = 3
padding_id_ = 101
inputs_ = torch.tensor([[1, 2, 101], [3, 4, 5]])

In [13]:
mask = torch.tril(torch.ones(batch_size_, max_seq_length_, max_seq_length_))
mask = mask.masked_fill(inputs_.unsqueeze(1) == padding_id_, 0)
mask

tensor([[[1., 0., 0.],
         [1., 1., 0.],
         [1., 1., 0.]],

        [[1., 0., 0.],
         [1., 1., 0.],
         [1., 1., 1.]]])

In [14]:
torch.nn.functional.softmax(mask[0], dim=-1)

tensor([[0.5761, 0.2119, 0.2119],
        [0.4223, 0.4223, 0.1554],
        [0.4223, 0.4223, 0.1554]])

In [15]:
class LanguageModel(torch.nn.Module):

    def __init__(self, vocab_size: int, max_seq_length: int, dim: int, n_layers: int, pad_token_id: int):
        super().__init__()
        
        self.pad_token_id = pad_token_id
        self.max_seq_length = max_seq_length
        
        self.normal_embds = torch.nn.Embedding(vocab_size, dim, padding_idx=pad_token_id)
        
        self.positional_embds = torch.nn.Embedding(max_seq_length, dim, padding_idx=pad_token_id)
        
        self.head1 = MultiHeadSelfAttention(max_seq_length, dim, n_layers, pad_token_id)
        self.head2 = MultiHeadSelfAttention(max_seq_length, dim, n_layers, pad_token_id)
        
        self.classifier = torch.nn.Sequential(
            OrderedDict([
                         ('layer_1', torch.nn.Linear(dim, dim*2)),
                         ('relu', torch.nn.ReLU()),
                         ('layer_2', torch.nn.Linear(dim*2, vocab_size, bias=False))
            ])
        )

    def forward(self, inputs):
        """
        Args:
            inputs is a LongTensor of shape (batch_size, max_seq_length)
            
        Returns:
            logits of shape (batch_size, vocab_size)
        """
        batch_size = inputs.shape[0]
        normal_embds = self.normal_embds(inputs)
        embds = normal_embds + self.positional_embds.weight  # shape = B, L, D
        
        """
        Geracação e preenchimento da mascara.
        """
        mask = torch.tril(torch.ones(batch_size, self.max_seq_length, self.max_seq_length)).to(device)
        mask = mask.masked_fill(inputs.unsqueeze(1) == self.pad_token_id, 0)

        att_out = self.head1.forward(embds, mask)  # shape = B, L, D
        
        att_out_ = att_out + normal_embds  # Residual connection
        
        att_out = self.head2.forward(att_out_, mask)

        att_out = att_out + att_out_  # Residual connection

        logits = self.classifier(att_out)  # shape = B, L, D
        
        output = torch.nn.functional.softmax(logits, dim=-1)
        
        return output

## Teste o modelo com um exemplo

In [16]:
model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

sample_input, _ = next(iter(DataLoader(training_dataset, batch_size=1)))
sample_input = sample_input.to(device)
sample_output = model(sample_input)
print(f'sample_input.shape: {sample_input.shape}')
print(f'sample_output.shape: {sample_output.shape}')

sample_input.shape: torch.Size([1, 12])
sample_output.shape: torch.Size([1, 12, 29794])


In [17]:
from torchsummary import summary

summary(model)

Layer (type:depth-idx)                   Param #
├─Embedding: 1-1                         1,906,816
├─Embedding: 1-2                         768
├─MultiHeadSelfAttention: 1-3            --
|    └─Linear: 2-1                       4,096
|    └─Linear: 2-2                       4,096
|    └─Linear: 2-3                       4,096
|    └─Linear: 2-4                       4,096
├─MultiHeadSelfAttention: 1-4            --
|    └─Linear: 2-5                       4,096
|    └─Linear: 2-6                       4,096
|    └─Linear: 2-7                       4,096
|    └─Linear: 2-8                       4,096
├─Sequential: 1-5                        --
|    └─Linear: 2-9                       8,320
|    └─ReLU: 2-10                        --
|    └─Linear: 2-11                      3,813,632
Total params: 5,762,304
Trainable params: 5,762,304
Non-trainable params: 0


Layer (type:depth-idx)                   Param #
├─Embedding: 1-1                         1,906,816
├─Embedding: 1-2                         768
├─MultiHeadSelfAttention: 1-3            --
|    └─Linear: 2-1                       4,096
|    └─Linear: 2-2                       4,096
|    └─Linear: 2-3                       4,096
|    └─Linear: 2-4                       4,096
├─MultiHeadSelfAttention: 1-4            --
|    └─Linear: 2-5                       4,096
|    └─Linear: 2-6                       4,096
|    └─Linear: 2-7                       4,096
|    └─Linear: 2-8                       4,096
├─Sequential: 1-5                        --
|    └─Linear: 2-9                       8,320
|    └─ReLU: 2-10                        --
|    └─Linear: 2-11                      3,813,632
Total params: 5,762,304
Trainable params: 5,762,304
Non-trainable params: 0

## Assert da Perplexidade


## Laço de Treinamento e Validação

In [None]:
max_examples = 150_000_000
eval_every_steps = 1000
lr = 3e-4

train_loader = DataLoader(training_dataset, batch_size=255, shuffle=True, drop_last=True)
validation_loader = DataLoader(valid_dataset, batch_size=255)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='min',
                                                       factor=0.2,
                                                       patience=0,
                                                       verbose=True)

def train_step(input_ids, target_ids):
    
    model.train()
    model.zero_grad()
    
    logits = model(input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    
    target_ids = target_ids.reshape(-1)
    
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=model.pad_token_id)
    loss.backward()
    
    optimizer.step()
    
    return loss.item()


def validation_step(input_ids, target_ids):
    
    model.eval()
    
    logits = model(input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    
    target_ids = target_ids.reshape(-1)
    
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=model.pad_token_id)
    
    return loss.item()


train_losses = []
n_examples = 0
step = 0
while n_examples < max_examples:
    
    for train_input_ids, train_target_ids in tqdm(train_loader):
        
        loss = train_step(train_input_ids.to(device), train_target_ids.to(device)) 
        train_losses.append(loss)
        
        if step % eval_every_steps == 0:
            train_ppl = np.exp(np.average(train_losses))

            with torch.no_grad():
                valid_ppl = np.exp(np.average([
                    validation_step(val_input_ids.to(device), val_target_ids.to(device))
                    for val_input_ids, val_target_ids in validation_loader]))

            print(f'{step} steps; {n_examples} examples so far; train ppl: {train_ppl:.2f}, valid ppl: {valid_ppl:.2f}')
            train_losses = []
            
            scheduler.step(valid_ppl)

        n_examples += len(train_input_ids)  # Increment of batch size
        step += 1
        if n_examples >= max_examples:
            break
            
        

  0%|          | 0/294 [00:00<?, ?it/s]

0 steps; 0 examples so far; train ppl: 29793.93, valid ppl: 29793.93


  0%|          | 0/294 [00:00<?, ?it/s]

## Avaliação final no dataset de teste


Bonus: o modelo com menor perplexidade no dataset de testes ganhará 0.5 ponto na nota final.

In [None]:
test_loader = DataLoader(test_dataset, batch_size=64)

with torch.no_grad():
    test_ppl = np.exp(np.average([
        validation_step(test_input_ids.to(device), test_target_ids.to(device))
        for test_input_ids, test_target_ids in test_loader
    ]))

print(f'test perplexity: {test_ppl}')

## Teste seu modelo com uma sentença

Escolha uma sentença gerada pelo modelo que ache interessante.

In [None]:
max_seq_length = 12

In [None]:
prompt = '[CLS] Eu gosto de comer pizza pois me faz'
max_output_tokens = 20
model.eval()

for _ in range(max_output_tokens):
    input_ids = tokenize(text=prompt, tokenizer=tokenizer)  # O(n) -> Assumindo uma lookup table
    input_ids_truncated = input_ids[-max_seq_length:]  # Usamos apenas os últimos <max_seq_length> tokens como entrada para o modelo.  O(1)
    logits = model(torch.LongTensor([input_ids_truncated]).to(device))
    logits = logits[:, -1, :]  # Usamos apenas o ultimo token da sequencia  O(1)
    # Ao usarmos o argmax, a saída do modelo em cada passo é o token de maior probabilidade.
    # Isso se chama decodificação gulosa (greedy decoding).
    predicted_id = torch.argmax(logits).item()  # O(n)
    input_ids += [predicted_id]  # Concatenamos a entrada com o token escolhido nesse passo.
    prompt = tokenizer.decode(input_ids)  # O(n + 1)
    print(prompt)

In [None]:
prompt = 'As empresas certificadas na ISO 9001 aperfeiçoam o desempenho de seus produtos e'
max_output_tokens = 20
model.eval()

for _ in range(max_output_tokens):
    input_ids = tokenize(text=prompt, tokenizer=tokenizer)  # O(n) -> Assumindo uma lookup table
    input_ids_truncated = input_ids[-max_seq_length:]  # Usamos apenas os últimos <max_seq_length> tokens como entrada para o modelo.  O(1)
    logits = model(torch.LongTensor([input_ids_truncated]).to(device))
    logits = logits[:, -1, :]  # Usamos apenas o ultimo token da sequencia  O(1)
    # Ao usarmos o argmax, a saída do modelo em cada passo é o token de maior probabilidade.
    # Isso se chama decodificação gulosa (greedy decoding).
    predicted_id = torch.argmax(logits).item()  # O(n)
    input_ids += [predicted_id]  # Concatenamos a entrada com o token escolhido nesse passo.
    prompt = tokenizer.decode(input_ids)  # O(n + 1)
    print(prompt)

In [None]:
prompt = 'Gosto muito de Orlando, por isso quero ir conhecer os parques da'
max_output_tokens = 20
model.eval()

for _ in range(max_output_tokens):
    input_ids = tokenize(text=prompt, tokenizer=tokenizer)  # O(n) -> Assumindo uma lookup table
    input_ids_truncated = input_ids[-max_seq_length:]  # Usamos apenas os últimos <max_seq_length> tokens como entrada para o modelo.  O(1)
    logits = model(torch.LongTensor([input_ids_truncated]).to(device))
    logits = logits[:, -1, :]  # Usamos apenas o ultimo token da sequencia  O(1)
    # Ao usarmos o argmax, a saída do modelo em cada passo é o token de maior probabilidade.
    # Isso se chama decodificação gulosa (greedy decoding).
    predicted_id = torch.argmax(logits).item()  # O(n)
    input_ids += [predicted_id]  # Concatenamos a entrada com o token escolhido nesse passo.
    prompt = tokenizer.decode(input_ids)  # O(n + 1)
    print(prompt)

In [None]:
prompt = 'pato ceu sol chao huashua salsla rato'
max_output_tokens = 20
model.eval()

for _ in range(max_output_tokens):
    input_ids = tokenize(text=prompt, tokenizer=tokenizer)  # O(n) -> Assumindo uma lookup table
    input_ids_truncated = input_ids[-max_seq_length:]  # Usamos apenas os últimos <max_seq_length> tokens como entrada para o modelo.  O(1)
    logits = model(torch.LongTensor([input_ids_truncated]).to(device))
    logits = logits[:, -1, :]  # Usamos apenas o ultimo token da sequencia  O(1)
    # Ao usarmos o argmax, a saída do modelo em cada passo é o token de maior probabilidade.
    # Isso se chama decodificação gulosa (greedy decoding).
    predicted_id = torch.argmax(logits).item()  # O(n)
    input_ids += [predicted_id]  # Concatenamos a entrada com o token escolhido nesse passo.
    prompt = tokenizer.decode(input_ids)  # O(n + 1)
    print(prompt)