<a href="https://colab.research.google.com/github/luizgontijo/IA025_Intro_Deep_Learning/blob/main/ex10_attention_causal_mask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Identify the notebook author in the rendered output.
nome = 'Luiz Fernando da Costa Gontijo'
print(f'Meu nome é {nome}')

Meu nome é Luiz Fernando da Costa Gontijo


#  Exercício: Modelo de Linguagem com auto-atenção

Este exercício é similar ao da Aula 8, mas iremos agora treinar uma rede neural com **duas camadas** de auto-atenção **causais** para prever a próxima palavra de um texto, dadas as palavras anteriores como entrada.

Iremos também trabalhar com sequências de tamanho variável.

Na camada de auto-atenção, não se esqueça de implementar:
- Embeddings de posição
- Projeções lineares (WQ, WK, WV, WO)
- Conexões residuais
- Camada de feed forward (2-layer MLP)


O dataset usado neste exercício (BrWaC) possui um tamanho razoável e você vai precisar rodar seus experimentos com GPU.

Alguns conselhos úteis:
- **ATENÇÃO:** o dataset é bem grande. Não dê o comando para imprimi-lo.
- Durante a depuração, faça seu dataset ficar bem pequeno, para que a depuração seja mais rápida e não precise de GPU. Somente ligue a GPU quando o seu laço de treinamento já estiver funcionando.
- Não deixe para fazer esse exercício na véspera. Ele é trabalhoso.

In [None]:
# iremos utilizar a biblioteca dos transformers para ter acesso ao tokenizador do BERT.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 14.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 13.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 50.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
  

## Importação dos pacotes

In [None]:
import collections
import itertools
import functools
import math
import random

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook


In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Jun  9 01:57:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Prefer the first CUDA device when one is visible; otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

Using cuda:0


## Implementação do MyDataset

In [None]:
# tentar esse dataset para verificar o overfitting

from typing import List


def tokenize(text: str, tokenizer):
    """Return the token ids of `text` as a plain Python list, without adding special tokens."""
    return tokenizer(text, return_tensors=None, add_special_tokens=False).input_ids


class MyDataset():
    """Fixed-length next-token-prediction examples built from raw texts.

    Every text is prefixed with the literal string '[CLS]', tokenized, right-padded
    with the '[PAD]' id up to ``max_seq_length + 1`` tokens and cut into windows of
    ``max_seq_length + 1`` tokens taken every ``max_seq_length`` tokens (so
    consecutive windows share one token). The last window of a text is aligned to
    the end of the text instead of being padded.

    Indexing returns ``(window[:-1], window[1:])``: the input and the same
    sequence shifted left by one token as target.
    """

    def __init__(self, texts: List[str], tokenizer, max_seq_length: int):
        """
        Args:
            texts: raw input strings, one document/line each.
            tokenizer: callable tokenizer exposing ``vocab['[PAD]']``.
            max_seq_length: number of tokens in each input (and target) sequence.
        """
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.features = self._token_window(texts)

    def _token_window(self, texts):
        """Tokenize `texts` and return a LongTensor of shape (n_windows, max_seq_length + 1)."""
        # Each stored window holds one extra token so input and target can be
        # obtained from it by slicing.
        window = self.max_seq_length + 1
        # Use the tokenizer this dataset was built with, not whatever global
        # happens to be called `tokenizer`.
        pad_id = self.tokenizer.vocab['[PAD]']

        feat = []
        # Windowing scheme inspired by Pedro Gengo's notebook.
        for text in texts:
            tokens_from_text = tokenize(f'[CLS]{text}', self.tokenizer)
            tokens_from_text += [pad_id] * max(0, window - len(tokens_from_text))
            for i in range(0, len(tokens_from_text) - 1, self.max_seq_length):
                if i + self.max_seq_length < len(tokens_from_text):
                    feat.append(tokens_from_text[i:i + window])
                else:
                    # Not enough tokens left for a full window: take the last
                    # `window` tokens of the text instead.
                    feat.append(tokens_from_text[-window:])

        if not feat:
            # Keep a 2-D shape even when there is nothing to index.
            return torch.empty((0, window), dtype=torch.long)
        return torch.tensor(feat).long()

    def __len__(self):
        """Number of windows (not number of input texts)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)``, both of length ``max_seq_length``."""
        feat = self.features[idx]
        return feat[:-1], feat[1:]

## Testando se a implementação do MyDataset está correta

In [None]:
from transformers import BertTokenizer

# Load the pretrained Portuguese BERT tokenizer; later cells read it through the global `tokenizer`.
tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [None]:
# Three texts, the last one longer than max_seq_length, to exercise the windowing.
dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza', 'Ele é feio para dormir e outras coisas a mais três coisas diferentes']

dummy_dataset = MyDataset(texts=dummy_texts, tokenizer=tokenizer, max_seq_length=9)
dummy_loader = DataLoader(dummy_dataset, batch_size=6, shuffle=False)
# The two short texts give one window each and the long one gives two, so the
# success message below is only printed when the size really was checked.
assert len(dummy_dataset) == 4
print('Passou no assert de tamanho do dataset.')

first_batch_input, first_batch_target = next(iter(dummy_loader))

print(f'first_batch_input: {first_batch_input}')
print(f'first_batch_target: {first_batch_target}')

Passou no assert de tamanho do dataset.
first_batch_input: tensor([[  101,  3396, 10303,   125, 13239,     0,     0,     0,     0],
        [  101,  1660,  5971,   785,   125,  1847, 13779, 15616,     0],
        [  101,   787,   253,  2996, 22280,   221, 18165,   122,  1028],
        [  221, 18165,   122,  1028,  4486,   123,   325,   864,  4486]])
first_batch_target: tensor([[ 3396, 10303,   125, 13239,     0,     0,     0,     0,     0],
        [ 1660,  5971,   785,   125,  1847, 13779, 15616,     0,     0],
        [  787,   253,  2996, 22280,   221, 18165,   122,  1028,  4486],
        [18165,   122,  1028,  4486,   123,   325,   864,  4486,  1755]])


In [None]:
# Reference check for MyDataset: two short texts must produce exactly the
# padded input/target batches hard-coded below.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza']

dummy_dataset = MyDataset(texts=dummy_texts, tokenizer=tokenizer, max_seq_length=9)
dummy_loader = DataLoader(dummy_dataset, batch_size=6, shuffle=False)
assert len(dummy_dataset) == 2
print('Passou no assert de tamanho do dataset.')

first_batch_input, first_batch_target = next(iter(dummy_loader))

# Expected inputs: [CLS] (101) followed by the text tokens, right-padded with 0.
correct_first_batch_input = torch.LongTensor(
    [[  101,  3396, 10303,   125, 13239,     0,     0,     0,     0],
     [  101,  1660,  5971,   785,   125,  1847, 13779, 15616,     0]])

# Expected targets: the inputs shifted left by one position.
correct_first_batch_target = torch.LongTensor(
    [[ 3396, 10303,   125, 13239,     0,     0,     0,     0,     0],
     [ 1660,  5971,   785,   125,  1847, 13779, 15616,     0,     0]])

print(f'meu batch: {first_batch_input.dtype}')
print(f'batch correto: {correct_first_batch_input.dtype}')

assert torch.equal(first_batch_input, correct_first_batch_input)
assert torch.equal(first_batch_target, correct_first_batch_target)

print('Passou no assert de dataset.')

Passou no assert de tamanho do dataset.
meu batch: torch.int64
batch correto: torch.int64
Passou no assert de dataset.


# Carregamento do dataset 

Iremos usar uma pequena amostra do dataset [BrWaC](https://www.inf.ufrgs.br/pln/wiki/index.php?title=BrWaC) para treinar e avaliar nosso modelo de linguagem.

In [None]:
# Download the BrWaC sample; -nc (no-clobber) skips the download when the file already exists.
!wget -nc https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula9/sample-1gb.txt

--2022-06-09 01:58:13--  https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula9/sample-1gb.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.0.48, 172.217.2.112, 142.250.65.80, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.0.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1230909256 (1.1G) [text/plain]
Saving to: ‘sample-1gb.txt’


2022-06-09 01:58:21 (148 MB/s) - ‘sample-1gb.txt’ saved [1230909256/1230909256]



In [None]:
# Load datasets
max_seq_length = 9

train_examples = 500
valid_examples = 100
test_examples = 100

max_lines = train_examples + valid_examples + test_examples

# Stream the file: only the first `max_lines` lines are kept in memory, the rest
# are merely counted so the total can still be reported. The `with` block
# guarantees the file handle is closed.
texts = []
n_lines_read = 0
with open('sample-1gb.txt', encoding='utf-8') as corpus_file:
    for line in corpus_file:
        if n_lines_read < max_lines:
            texts.append(line)
        n_lines_read += 1

print(f'Read {n_lines_read} lines.')
print(f'Truncating to {max_lines} lines.')

# Splits are taken from the end: the last `test_examples` lines are the test
# set, the `valid_examples` lines before them the validation set, and
# everything before that is training data.
training_texts = texts[:-(valid_examples + test_examples)]
valid_texts = texts[-(valid_examples + test_examples):-test_examples]
test_texts = texts[-test_examples:]

training_dataset = MyDataset(texts=training_texts, tokenizer=tokenizer, max_seq_length=max_seq_length)
valid_dataset = MyDataset(texts=valid_texts, tokenizer=tokenizer, max_seq_length=max_seq_length)
test_dataset = MyDataset(texts=test_texts, tokenizer=tokenizer, max_seq_length=max_seq_length)

Read 250000 lines.
Truncating to 700 lines.


In [None]:
# Sizes are numbers of fixed-length windows in each split, not numbers of input lines.
print(f'training examples: {len(training_dataset)}')
print(f'valid examples: {len(valid_dataset)}')
print(f'test examples: {len(test_dataset)}')

training examples: 76178
valid examples: 10136
test examples: 9566


In [None]:
class LanguageModel(torch.nn.Module):
    """Decoder-only language model made of single-head causal self-attention layers."""

    def __init__(self, vocab_size: int, max_seq_length: int, dim: int, n_layers: int, pad_token_id: int):
        """
        Builds the embeddings, the attention projections and the output head.

        The same W_q/W_k/W_v/W_o projections are applied in each of the
        `n_layers` attention passes (weights are shared across layers).

        Args:
            vocab_size (int): Size of the input vocabulary.
            max_seq_length (int): Maximum number of tokens accepted by `forward`.
            dim (int): Dimension of the embedding layer for each word in the context.
            n_layers (int): number of self-attention layers.
            pad_token_id (int): id of the pad token that will be ignored in the attention.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.max_seq_length = max_seq_length
        self.dim = dim
        self.n_layers = n_layers
        self.pad_token_id = pad_token_id

        self.embedding_layer = torch.nn.Embedding(self.vocab_size, self.dim)
        # Learned absolute position embeddings: one `dim`-sized vector per position.
        self.positional_embeddings = torch.nn.Embedding(self.max_seq_length, self.dim)

        self.W_q = torch.nn.Linear(self.dim, self.dim, bias=False)
        self.W_k = torch.nn.Linear(self.dim, self.dim, bias=False)
        self.W_v = torch.nn.Linear(self.dim, self.dim, bias=False)
        self.W_o = torch.nn.Linear(self.dim, self.dim, bias=False)

        # Output head (2-layer MLP). It is applied to every position
        # independently: mixing positions here would let a prediction see
        # future tokens and defeat the causal mask.
        hidden_layer = 64
        self.linear1 = nn.Linear(self.dim, hidden_layer)
        self.linear2 = nn.Linear(hidden_layer, self.vocab_size, bias=False)
        # tanh1 and relu2 are not used by forward(); kept so existing attribute
        # access keeps working.
        self.tanh1 = nn.Tanh()
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()

        # Lower-triangular causal mask: position i may attend to positions <= i.
        # Registered as a buffer so it follows the module across .to(device)
        # calls; non-persistent so the state_dict layout is not affected.
        causal_mask = torch.tril(torch.ones(max_seq_length, max_seq_length)).bool()
        self.register_buffer("causal_mask", causal_mask, persistent=False)

    # Based on Gabriel Lopes' code.
    def Attention(self, q, k, v, mask_padc):
        """
        Scaled dot-product attention.

        Args:
            q, k, v: tensors of shape (batch_size, seq_length, dim).
            mask_padc: mask broadcastable to (batch_size, seq_length, seq_length);
                entries equal to 0/False are excluded from the softmax. Every
                query row needs at least one allowed key, otherwise that row
                of the output is NaN.

        Returns:
            Tensor of shape (batch_size, seq_length, dim).
        """
        scores = torch.matmul(q, k.transpose(-2, -1))  # [B, L, L]
        scores = scores / math.sqrt(self.dim)

        scores = scores.masked_fill(mask_padc == 0, -float("inf"))

        probs = nn.functional.softmax(scores, dim=-1)

        return torch.matmul(probs, v)  # [B, L, dim]

    def forward(self, inputs):
        """
        Args:
            inputs is a LongTensor of shape (batch_size, seq_length) with
            seq_length <= max_seq_length.

        Returns:
            logits of shape (batch_size, seq_length, vocab_size); the logits at
            position t depend only on inputs[:, :t + 1].

        Raises:
            ValueError: if seq_length exceeds max_seq_length.
        """
        seq_length = inputs.shape[1]
        if seq_length > self.max_seq_length:
            raise ValueError(
                f'sequence length {seq_length} exceeds max_seq_length {self.max_seq_length}')

        positions = torch.arange(seq_length, device=inputs.device)
        X = self.embedding_layer(inputs) + self.positional_embeddings(positions)  # [B, L, dim]

        # Combined padding + causal mask, [B, L, L]. A key is visible when it is
        # not padding and not in the future. The diagonal is always kept so that
        # no query row is fully masked (which would turn the softmax into NaN).
        key_is_token = (inputs != self.pad_token_id).unsqueeze(1)  # [B, 1, L]
        diagonal = torch.eye(seq_length, dtype=torch.bool, device=inputs.device)
        mask_padc = self.causal_mask[:seq_length, :seq_length] & (key_is_token | diagonal)

        for _ in range(self.n_layers):
            # q keeps the [B, L, dim] shape: an extra axis here would make
            # matmul broadcast the attention across different batch samples.
            q = self.W_q(X)
            k = self.W_k(X)
            v = self.W_v(X)

            new_x = self.Attention(q, k, v, mask_padc)

            # Residual connection around the attention block.
            X = X + self.W_o(new_x)

        hidden = self.relu1(self.linear1(X))  # [B, L, hidden]
        logits = self.linear2(hidden)         # [B, L, vocab_size]

        return logits

## Teste o modelo com um exemplo

In [None]:
# The model below has always been built with 64-dimensional embeddings; keep the
# declared constant and the value actually passed to the model in sync.
embedding_dim = 64
batch_size = 1024

model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=embedding_dim,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

# Push a single example (DataLoader default batch size) through the model to
# check the output shape.
sample_input, _ = next(iter(DataLoader(training_dataset)))
sample_input = sample_input.to(device)
sample_output = model(sample_input)
print(f'sample_input.shape: {sample_input.shape}')
print(f'sample_output.shape: {sample_output.shape}')

RuntimeError: ignored

In [None]:
# Peek at one batch of two (input, target) pairs; each target row is its input shifted left by one token.
train_input_ids, train_target_ids = next(iter(DataLoader(training_dataset, batch_size=2)))

print(f'batch de inputs: {train_input_ids}')
print(f'batch de targets: {train_target_ids}')

batch de inputs: tensor([[  101, 20100,  2308,  3074,  1089,   481,   117,   146,  1189],
        [  125, 13254,   143,   122, 18073, 22281,   179,   695,   923]])
batch de targets: tensor([[20100,  2308,  3074,  1089,   481,   117,   146,  1189,   125],
        [13254,   143,   122, 18073, 22281,   179,   695,   923,   320]])


In [None]:
# Count trainable parameters only (those with requires_grad=True).
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of model parameters: {num_params}')

Number of model parameters: 19122048


## Assert da Perplexidade


In [None]:
# Seed the Python, NumPy and PyTorch random number generators.
# NOTE(review): the `model` evaluated further down was created in an earlier
# cell, so these seeds do not affect its initial weights.
random.seed(123)
np.random.seed(123)
torch.manual_seed(123)


def perplexity(logits, target, ignore_token_id: int):
    """
    Perplexity of `logits` with respect to `target`.

    Computed as exp of the mean cross-entropy over all positions whose target
    is not `ignore_token_id`.

    Args:
        logits: a FloatTensor of shape (batch_size, seq_length, vocab_size)
        target: a LongTensor of shape (batch_size, seq_length)
        ignore_token_id: target id (e.g. padding) excluded from the average

    Returns:
        A 0-dim tensor holding the perplexity
    """
    vocab_size = logits.shape[-1]
    mean_nll = nn.functional.cross_entropy(
        logits.reshape(-1, vocab_size),
        target.reshape(-1),
        ignore_index=ignore_token_id,
        reduction='mean',
    )
    return mean_nll.exp()


n_examples = 1000

train_input_ids, train_target_ids = next(iter(DataLoader(training_dataset, batch_size=n_examples)))
train_input_ids = train_input_ids.to(device)
train_target_ids = train_target_ids.to(device)

# This is an evaluation-only forward pass: skip autograd bookkeeping so the
# large logits tensor does not also keep a computation graph alive.
with torch.no_grad():
    logits = model(train_input_ids)

my_perplexity = perplexity(logits=logits, target=train_target_ids, ignore_token_id=tokenizer.pad_token_id)

print(f'my perplexity:              {int(my_perplexity)}')
print(f'correct initial perplexity: {tokenizer.vocab_size}')

# An untrained model should be close to uniform over the vocabulary, i.e. its
# perplexity should be close to the vocabulary size.
assert math.isclose(my_perplexity.item(), tokenizer.vocab_size, abs_tol=7000)
print('Passou o no assert da perplexidade')

RuntimeError: ignored

In [None]:
# Only the last expression of a cell is displayed, so the first line below produces no output.
train_target_ids.shape
logits.shape

torch.Size([1000, 9, 29794])

## Laço de Treinamento e Validação

In [None]:
# Runs on GPU.
# Training setup for the loop that checkpoints the best model.

# Best validation perplexity seen so far (lower is better); starts at +inf so
# the first evaluation always counts as an improvement.
compare=float('inf')

max_examples = 1_000_000
eval_every_steps = 100
lr = 3e-4

embedding_dim = 256
batch_size = 512
# A fresh model replaces the global `model` created in earlier cells.
model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=embedding_dim,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

# drop_last=True keeps every training batch at exactly `batch_size` examples.
train_loader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
validation_loader = DataLoader(valid_dataset, batch_size=batch_size)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)


def train_step(input_ids, target_ids):
    """Run one optimisation step on the global `model` and return the batch loss as a float.

    Padding targets (`model.pad_token_id`) are excluded from the loss.
    """
    model.train()
    model.zero_grad()

    vocab_logits = model(input_ids)
    # Flatten (batch, seq) into one axis so every position is one classification example.
    flat_logits = vocab_logits.reshape(-1, vocab_logits.shape[-1])
    flat_targets = target_ids.reshape(-1)

    batch_loss = nn.functional.cross_entropy(flat_logits, flat_targets, ignore_index=model.pad_token_id)
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()


def validation_step(input_ids, target_ids):
    """Return the mean cross-entropy of the global `model` on one batch, as a float.

    Puts the model in eval mode; padding targets are excluded from the loss.
    Gradient tracking is not disabled here, callers are expected to do so.
    """
    model.eval()
    vocab_logits = model(input_ids)
    return nn.functional.cross_entropy(
        vocab_logits.reshape(-1, vocab_logits.shape[-1]),
        target_ids.reshape(-1),
        ignore_index=model.pad_token_id,
    ).item()


train_losses = []
n_examples = 0
step = 0
# The checkpoint (.pt) and its plain-text summary (.txt) share this path prefix.
checkpoint_prefix = "/content/drive/MyDrive/Intro ao Aprendizado Profundo/Trabalho10/modelos_salvos/"+f"model_{max_examples/1000000}ex_{embedding_dim}embdim"

# Keep cycling over the training set until `max_examples` examples were seen.
while n_examples < max_examples:
    for input_ids, target_ids in train_loader:
        loss = train_step(input_ids.to(device), target_ids.to(device))
        train_losses.append(loss)

        if step % eval_every_steps == 0:
            # Training perplexity over the steps since the previous evaluation.
            train_ppl = np.exp(np.average(train_losses))

            with torch.no_grad():
                valid_ppl = np.exp(np.average([
                    validation_step(valid_input_ids.to(device), valid_target_ids.to(device))
                    for valid_input_ids, valid_target_ids in validation_loader]))

            # Checkpoint only when the validation perplexity improves.
            if valid_ppl < compare:
                compare = valid_ppl
                torch.save(model, checkpoint_prefix + ".pt")
                # The `with` block closes the file; no explicit close() is needed.
                with open(checkpoint_prefix + ".txt", 'w') as f:
                    lines = [f'batch size = {batch_size}',
                             f'embedding dim = {embedding_dim}',
                             f'max examples = {max_examples}',
                             f'learning rate = {lr}',
                             f'max_examples = {max_examples}',
                             f'train PPL = {train_ppl}',
                             f'validation PPL = {valid_ppl}',
                             f'best values at {n_examples} examples']
                    f.write('\n'.join(lines))

            print(f'{step} steps; {n_examples} examples so far; train ppl: {train_ppl:.2f}, valid ppl: {valid_ppl:.2f}')
            train_losses = []

        n_examples += len(input_ids)  # Increment of batch size
        step += 1
        if n_examples >= max_examples:
            break

0 steps; 0 examples so far; train ppl: 29784.45, valid ppl: 29779.49
100 steps; 51200 examples so far; train ppl: 9090.19, valid ppl: 2914.45
200 steps; 102400 examples so far; train ppl: 1905.40, valid ppl: 2135.18
300 steps; 153600 examples so far; train ppl: 1424.38, valid ppl: 1873.15
400 steps; 204800 examples so far; train ppl: 1135.86, valid ppl: 1643.91
500 steps; 256000 examples so far; train ppl: 954.25, valid ppl: 1457.58
600 steps; 307200 examples so far; train ppl: 822.25, valid ppl: 1298.71
700 steps; 358400 examples so far; train ppl: 680.35, valid ppl: 1203.24
800 steps; 409600 examples so far; train ppl: 586.74, valid ppl: 1119.53
900 steps; 460800 examples so far; train ppl: 517.20, valid ppl: 1056.72
1000 steps; 512000 examples so far; train ppl: 434.05, valid ppl: 1002.35
1100 steps; 563200 examples so far; train ppl: 371.23, valid ppl: 975.71
1200 steps; 614400 examples so far; train ppl: 331.17, valid ppl: 964.74
1300 steps; 665600 examples so far; train ppl: 279.

## Avaliação final no dataset de teste


Bonus: o modelo com menor perplexidade no dataset de testes ganhará 0.5 ponto na nota final.

In [None]:
# Evaluate on the held-out test split. `validation_step` reads the global
# `model`, i.e. the weights currently in memory rather than the checkpoint
# file written by the training loop.
test_loader = DataLoader(test_dataset, batch_size=64)

with torch.no_grad():
    # Mean of the per-batch mean losses, then exponentiated.
    test_ppl = np.exp(np.average([
        validation_step(test_input_ids.to(device), test_target_ids.to(device))
        for test_input_ids, test_target_ids in test_loader
    ]))

print(f'test perplexity: {test_ppl}')

test perplexity: 1599.0058759514488


## Teste seu modelo com uma sentença

Escolha uma sentença gerada pelo modelo que ache interessante.

In [None]:
prompt = 'Eu gosto de comer pizza pois me faz,'
max_output_tokens = 20
model.eval()

# Tokenize the prompt once and keep extending the id list, instead of decoding
# to text and re-tokenizing the whole (growing) string at every step.
input_ids = tokenize(text=prompt, tokenizer=tokenizer)

with torch.no_grad():  # inference only, no gradients needed
    for _ in range(max_output_tokens):
        # Only the last <max_seq_length> tokens are used as input to the model.
        input_ids_truncated = input_ids[-max_seq_length:]
        logits = model(torch.LongTensor([input_ids_truncated]).to(device))
        logits = logits[:, -1, :]  # Only the prediction for the last position is needed.
        # With argmax, the output at each step is the most probable token.
        # This is called greedy decoding.
        predicted_id = torch.argmax(logits).item()
        input_ids.append(predicted_id)  # Extend the context with the chosen token.
        # Decoding is done only to display the text generated so far.
        prompt = tokenizer.decode(input_ids)
        print(prompt)

Eu gosto de comer pizza pois me faz, a
Eu gosto de comer pizza pois me faz, a partir
Eu gosto de comer pizza pois me faz, a partir de
Eu gosto de comer pizza pois me faz, a partir de.
Eu gosto de comer pizza pois me faz, a partir de. Art
Eu gosto de comer pizza pois me faz, a partir de. Art,
Eu gosto de comer pizza pois me faz, a partir de. Art, a
Eu gosto de comer pizza pois me faz, a partir de. Art, a,
Eu gosto de comer pizza pois me faz, a partir de. Art, a, de
Eu gosto de comer pizza pois me faz, a partir de. Art, a, de ras
Eu gosto de comer pizza pois me faz, a partir de. Art, a, de ras,
Eu gosto de comer pizza pois me faz, a partir de. Art, a, de ras, o
Eu gosto de comer pizza pois me faz, a partir de. Art, a, de ras, o resultado
Eu gosto de comer pizza pois me faz, a partir de. Art, a, de ras, o resultado,
Eu gosto de comer pizza pois me faz, a partir de. Art, a, de ras, o resultado, a
Eu gosto de comer pizza pois me faz, a partir de. Art, a, de ras, o resultado, a população
Eu 

## Bonus 1
Quem conseguir a menor perplexidade no dataset de testes ganha 0.5 ponto na média final.

## Bonus 2
Qual é a complexidade (em notação O-grande) da função de geração de texto acima?

Quem responder corretamente a pergunta acima e deixar a função com menor complexidade ganha 0.5 ponto na média final.

In [None]:
# Mount Google Drive.
# NOTE(review): the training cell saves checkpoints under /content/drive/MyDrive,
# so this cell has to be run before it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Rascunhos

In [None]:
# Scratch: inspect the PAD token and a tiny dataset built with a short window.
print(f'token de PAD do BERT:{tokenizer.pad_token}')

pad = '[PAD]'
print(f'número que indica o toke de PAD: {tokenizer.vocab[pad]}')

dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza']
print(f'dummy_texts: {dummy_texts}')
max_seq_length = 3
dummy_dataset = MyDataset(texts=dummy_texts, tokenizer=tokenizer, max_seq_length=max_seq_length)
dummy_loader = DataLoader(dummy_dataset, batch_size=6, shuffle=False)

print(f'dummy_dataset: {dummy_dataset}')
print(f'dummy_loader: {dummy_loader}')

first_batch_input, first_batch_target = next(iter(dummy_loader))
print(f'batch_input: {first_batch_input}')
print(f'batch_target: {first_batch_target}')

tokens_ids = tokenize(dummy_texts[0], tokenizer)
tensor_empty = torch.zeros(max_seq_length, dtype=torch.int32)
print(f'empty tensor: {tensor_empty}')

# Adding an all-zero tensor leaves the values of the first input row unchanged.
after_sum = tensor_empty + first_batch_input[0]
print(f'tensor depois da soma: {after_sum}')


#def __init__(self, texts: List[str], tokenizer, max_seq_length: int):
#self.tokensIds_n = []
#self.y = []

# Scratch: early manual attempt at building [CLS]-prefixed, zero-padded input
# rows and shifted target rows without MyDataset.
# NOTE(review): this overwrites the globals `texts` and `max_seq_length` used by
# the cells above.
texts = dummy_texts
tokensIds_n = []
max_seq_length = 9
token_cls = torch.tensor(tokenizer.vocab['[CLS]'], dtype=torch.int32)
for text in tqdm_notebook(texts):
  tokens_ids = tokenize(text, tokenizer)
  # ignore the part that handles different lengths
  print(f'len token_ids:{len(tokens_ids)}')
  print(f'tokens_ids do texto {text}: {tokens_ids}')
  #if len(tokens_ids) < max_seq_length:
   # for i in range(max_seq_length):
   #   tokensIds_n_temp = []
   #   tokensIds_n_temp.append(tokens_ids[i:max_seq_length])
   #   p1d = (0, max_seq_length - len(tokensIds_n_temp))
   #   out = nn.functional.pad(tokensIds_n_temp, p1d, "constant", 0)
   #   tokensIds_n.append(out)

  if len(tokens_ids) < max_seq_length:
    # Short text: a single row, [CLS] followed by all of its tokens.
    #tokenid_temp = torch.tensor(tokens_ids[0:max_seq_length])
    tokenid_temp_cat = torch.cat((token_cls.unsqueeze(dim=-1), torch.tensor(tokens_ids[0:max_seq_length])))
    # NOTE(review): the printed tensor has [CLS] prepended a second time; the
    # row that is stored on the next line has it only once.
    print(f'tokenid_temp_cat: {torch.cat((token_cls.unsqueeze(dim=-1), tokenid_temp_cat))}')
    tokensIds_n.append(tokenid_temp_cat)

  else:
    # Longer text: one row per start offset i, each holding [CLS] plus
    # tokens_ids[i:max_seq_length] (rows get shorter as i grows).
    for i in range(max_seq_length):
      tokenid_temp = torch.tensor(tokens_ids[i:max_seq_length])
      #tokenid_temp_cat = torch.cat(token_cls,tokenid_temp), dim=0)
      #tokenid_temp_cat = torch.cat((token_cls.unsqueeze(dim=-1), tokenid_temp))
      tokensIds_n.append(torch.cat((token_cls.unsqueeze(dim=-1), tokenid_temp)))
      print(f'tokenid_temp_cat: {torch.cat((token_cls.unsqueeze(dim=-1), tokenid_temp))}')
      #tokensIds_n.append(torch.cat((token_cls.unsqueeze(dim=-1), tokenid_temp)))

print(f'lista de tokens ids: {tokensIds_n}')
# Right-pad every row shorter than max_seq_length with zeros.
for j, s in enumerate(tokensIds_n):
  #print(f'j: {j}')
  #print(f's: {s}')
  if len(s) < max_seq_length:
    s_torch = torch.tensor(s)
    p1d = (0, max_seq_length - len(s))
    #padded = nn.functional.pad(s_torch, p1d, "constant", 0)
    #print(f'padded: {padded}')
    tokensIds_n[j] = nn.functional.pad(s_torch, p1d, "constant", 0)
          #self.y.append(tokens_ids[i+context_size])

tokensIds_n = torch.stack(tokensIds_n, 0)
print(f'token ids batch: {tokensIds_n}')

# build the target batch
tensor_zero = torch.tensor(0)
# NOTE(review): one target row is allocated per text, while the loop below
# writes one row per row of tokensIds_n — these only match when every text
# took the short-text branch above; confirm before reusing.
token_ids_target = torch.empty(len(texts), max_seq_length, dtype=torch.int32)
j=0
for i in tokensIds_n:
  #token_tensor = i[1:]
  #print(f'tonken shifted: {token_tensor}')
  # Target row = input row shifted left by one, with a trailing zero appended.
  tokenid_temp_cat = torch.cat((i[1:], tensor_zero.unsqueeze(dim=-1)))
  token_ids_target[j] = tokenid_temp_cat
  j+=1
  #target = torch.stack((tokenid_temp_cat, tokenid_temp_cat))

#print(f'tokens id: {tokensIds_n}')
#print(f'token ids target: {token_ids_target}')

#traget = torch.LongTensor(token_ids_target)
print(f'target: {token_ids_target}')
print(f'texts: {texts}')
#print(f'tokens id: {tokensIds_n}')