In [None]:
# Author identification cell: stores the author's name and prints it.
nome = 'Matheus Lindino'
print(f'Meu nome é {nome}')

Meu nome é Matheus Lindino


#  Exercício: Modelo de Linguagem com auto-atenção

Este exercício é similar ao da aula 4, mas iremos agora treinar uma rede neural *com auto-atenção* para prever a próxima palavra de um texto, dadas as palavras anteriores como entrada.

Na camada de auto-atenção, deve-se implementar (vide slide 80):
- Embeddings de posição
- Projeções lineares (WQ, WK, WV, WO)
- Camada de feed forward (2-layer MLP)

Instruções:
- É necessário fazer duas implementações da camada de auto-atenção: uma usando laços (ineficiente mas fácil de entender) e outra matricial (eficiente mas difícil de entender).

- Fazer um assert para garantir que o resultado das duas implementações é exatamente igual.

- No treinamento, usar apenas a implementação matricial.

## Importação dos pacotes

In [None]:
import collections
import itertools
import functools
import math
import os
import random
import re
import copy

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook
from typing import List
from tqdm import tqdm

In [None]:
# Check which GPU we are using: runs the `nvidia-smi` command in a shell
# through IPython's `!` escape and shows its report as the cell output.
!nvidia-smi

Wed Sep 28 19:48:42 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Pick the first CUDA device when PyTorch can see one, otherwise fall back to
# the CPU. `device` is the handle the training/evaluation helpers move data to.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

Using cuda:0


# Carregamento do dataset 

Primeiro, fazemos download do dataset:

In [None]:
# Fetch the IMDB archive (-nc: do not download again when the file is already
# present) and then extract it into the current working directory.
!wget -nc http://files.fast.ai/data/aclImdb.tgz 
!tar -xzf aclImdb.tgz

File ‘aclImdb.tgz’ already there; not retrieving.



## Carregando o dataset

Criaremos uma divisão de treino (80%) e validação (20%) artificialmente.

Nota: Evite ao máximo olhar o dataset de teste, para não ficar enviesado pelo que será testado. Em aplicações reais, o dataset de teste só estará disponível no futuro, ou seja, quando o usuário começar a testar o seu produto.

In [None]:
def load_texts(folder):
    """Read every regular file directly inside ``folder`` and return the contents.

    Args:
        folder: Path of a directory holding one text file per sample.

    Returns:
        A list of strings, one per file, ordered by file name. Empty when the
        directory contains no regular files.
    """
    texts = []
    # os.listdir() yields entries in arbitrary order; sorting makes the returned
    # list (and therefore any seeded shuffle applied to it) reproducible.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # Sub-directories (e.g. notebook checkpoint folders) cannot be opened
        # as text, so only regular files are read.
        if not os.path.isfile(path):
            continue
        # Decode explicitly as UTF-8 rather than with the platform default,
        # which differs between operating systems.
        with open(path, encoding='utf-8') as f:
            texts.append(f.read())
    return texts

# Seed used only for the train/validation shuffle below.
SPLIT_SEED = 42


def _preview(title, samples, n_chars=100):
    """Print ``title`` followed by the first ``n_chars`` characters of each sample."""
    print(title)
    for text in samples:
        print(text[:n_chars])


x_train_pos = load_texts('aclImdb/train/pos')
x_train_neg = load_texts('aclImdb/train/neg')
x_test_pos = load_texts('aclImdb/test/pos')
x_test_neg = load_texts('aclImdb/test/neg')

x_train = x_train_pos + x_train_neg
x_test = x_test_pos + x_test_neg

# Shuffle the training texts before the train/validation split. A dedicated,
# seeded generator keeps the split identical across runs (given the same load
# order) without touching the global `random` state.
random.Random(SPLIT_SEED).shuffle(x_train)

# 80% of the shuffled texts stay in training, the remaining 20% become validation.
n_train = int(0.8 * len(x_train))

x_valid = x_train[n_train:]
x_train = x_train[:n_train]

print(len(x_train), 'amostras de treino.')
print(len(x_valid), 'amostras de desenvolvimento.')
print(len(x_test), 'amostras de teste.')

_preview('3 primeiras amostras treino:', x_train[:3])
_preview('3 últimas amostras treino:', x_train[-3:])
_preview('3 primeiras amostras validação:', x_valid[:3])
_preview('3 últimas amostras validação:', x_valid[-3:])

20000 amostras de treino.
5000 amostras de desenvolvimento.
25000 amostras de teste.
3 primeiras amostras treino:
This was the first PPV in a new era for the WWE as Hulk Hogan, The Ultimate Warrior, Ric Flair and S
This is, in my opinion, a very good film, especially for Michael Jackson lovers. It contains a messa
Impressed! This is the worst SRK movie and one of the worst Bollywood movies I ever saw! I didn't li
3 últimas amostras treino:
Why me? Why should I be subjected to such slaughter of what could have made an interesting plot?! At
Ironically for a play unavailable on film or video for so long, ARMS AND THE MAN has remained fairly
If any show in the last ten years deserves a 10, it is this rare gem. It allows us to escape back to
3 primeiras amostras validação:
If you ever have the chance to see Sandra Bernhard live in person, you better move on it sweetie. I 
This movie is a horrible distortion of lies and exaggerations that were put together by the most sha
First off, anyone l

## Classe Tokenizer

In [None]:
class Tokenizer():
  """Word/punctuation tokenizer with a frequency-capped vocabulary.

  Tokens that are not in the vocabulary are encoded as -1, which is also the
  id stored under the 'unk' key.
  """

  def __init__(self, max_tokens=3000):
    # Maximum number of distinct corpus tokens kept by create_vocab.
    self.max_tokens_ = max_tokens
    # token -> id mapping; remains None until create_vocab is called.
    self.vocab_ = None

  def tokenize(self, text: str):
    """Split ``text`` into lower-cased words and single punctuation characters."""
    pattern = r'\w+|[^\w\s]'
    # HTML line breaks present in the raw text are replaced by a space first.
    cleaned = text.replace('<br />',' ')
    return list(map(str.lower, re.findall(pattern, cleaned)))

  def create_vocab(self, texts: List[str]):
    """Build the vocabulary from ``texts`` and return ``self`` for chaining.

    The ``max_tokens_`` most frequent tokens receive ids 0, 1, 2, ... in
    decreasing frequency order; the 'unk' entry is then set to -1.
    """
    all_tokens = self.tokenize(' '.join(texts))
    frequencies = collections.Counter(all_tokens)

    vocab = {}
    for index, (token, _count) in enumerate(frequencies.most_common(self.max_tokens_)):
      vocab[token] = index
    vocab['unk'] = -1

    self.vocab_ = vocab
    return self

  def encode(self, data: str):
    """Tokenize ``data`` and map each token to its id (-1 when unknown)."""
    lookup = self.vocab_.get
    return [lookup(token, -1) for token in self.tokenize(data)]

  def decode(self, array):
    """Map an iterable of ids back to their tokens."""
    id_to_token = {}
    for token, token_id in self.vocab_.items():
      id_to_token[token_id] = token
    return [id_to_token[token_id] for token_id in array]

### Asserts da tokenização e codificação

In [None]:
# Tiny fruit corpus used to check the tokenizer by hand; the first entry also
# carries '<br />' markers, which tokenize() replaces with spaces.
corpus = ['Apple, Banana, Apple, Avocado, Pineapple <br /><br />',
          'Apple. Pineapple and Mango',
          'Banana, Mango! Banana, Apple',
          'Pineapple Berry Apricot Apple',
          'Avocado Apple']
            
# 'Apricot' is not among the 3 most frequent tokens, so it must encode to -1.
sample = 'Apricot Apple, Banana, Banana Banana'


# Keep only the 3 most frequent tokens so the expected vocabulary is easy to enumerate.
tokenizer = Tokenizer(max_tokens=3)
tokenizer.create_vocab(corpus)
vocab = tokenizer.vocab_
encoded = tokenizer.encode(sample)
decoded = tokenizer.decode(encoded)

# Vocabulary keys come out in frequency order, followed by the 'unk' entry.
assert list(vocab.keys()) == ['apple', ',', 'banana', 'unk'], 'Vocab Incorrect'
assert encoded == [-1, 0, 1, 2, 1, 2, 2], 'Encoder Incorrect'
assert decoded == ['unk', 'apple', ',', 'banana', ',', 'banana', 'banana'], 'Decoder Incorrect'

## Dataset

In [None]:
class IMDBDataset(Dataset):
    """Sliding-window next-token dataset built from raw texts.

    Each stored example is a run of ``n_context + 1`` consecutive token ids:
    the first ``n_context`` are the model input, the last one is the target.
    Windows that contain the unknown id (-1) are discarded.
    """

    def __init__(self, data, tokenizer, n_context=9):
        """Encode every text in ``data`` with ``tokenizer.encode`` and collect the windows."""
        super().__init__()
        window = n_context + 1
        self.data = []

        for text in data:
            ids = tokenizer.encode(text)
            # One candidate window per start position; texts that are too
            # short produce an empty range and contribute nothing.
            candidates = (ids[start:start + window] for start in range(len(ids) - n_context))
            self.data.extend(chunk for chunk in candidates if -1 not in chunk)

    def __len__(self):
        """Number of stored windows."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(context ids tensor, target id as a long tensor)`` for one window."""
        *context, target = self.data[index]
        return torch.tensor(context), torch.tensor(target).long()

### Asserts do Dataset

In [None]:
# Same toy corpus as in the tokenizer check (without the '<br />' markers).
corpus = ['Apple, Banana, Apple, Avocado, Pineapple',
          'Apple. Pineapple and Mango',
          'Banana, Mango! Banana, Apple',
          'Pineapple Berry Apricot Apple',
          'Avocado Apple']

# Small context so that the toy corpus still yields windows without unknown tokens.
n_context = 2
customDataset = IMDBDataset(data=corpus, tokenizer=Tokenizer(max_tokens=3).create_vocab(corpus), n_context=n_context)
x, y = customDataset.__getitem__(0)

# Stored windows hold the context plus the target; the returned input holds the context only.
assert len(customDataset.data[0]) == n_context+1, 'Dataset shape incorrect'
assert x.shape[0] == n_context, 'Example shape incorrect'

## Neural Network



### Self-Attention (FUNCTION)

In [None]:
def iterative_self_attention(X, Wq, Wk, Wv, Wo):
  """Single-head self-attention written with explicit loops (reference version).

  Args:
    X: 2-D tensor with one input vector per row (one row per position).
    Wq, Wk, Wv: Projection matrices applied as ``X @ W`` to obtain the
      queries, keys and values.
    Wo: Output projection applied to each attended vector.

  Returns:
    Tensor with one output row per input position.
  """
  Q = torch.matmul(X, Wq)
  K = torch.matmul(X, Wk)
  V = torch.matmul(X, Wv)

  E = []
  for query in Q:
    # Unnormalised similarity between this query and every key.
    scores = []
    for key in K:
      scores.append(torch.dot(query, key))

    # torch.stack (instead of building a fresh FloatTensor) keeps the scores on
    # the inputs' device and dtype and inside the autograd graph.
    attention_weights = torch.softmax(torch.stack(scores), dim=0)

    # Weighted sum of the value vectors.
    new_E = 0
    for weight, value in zip(attention_weights, V):
      new_E = new_E + weight * value

    E.append(torch.matmul(new_E, Wo))
  return torch.stack(E)

def matricial_self_attention(X, Wq, Wk, Wv, Wo):
  """Single-head self-attention computed with matrix products.

  Args:
    X: Tensor whose last two axes are (positions, features); any leading axes
      are treated as batch dimensions.
    Wq, Wk, Wv: Projection matrices applied as ``X @ W`` to obtain the
      queries, keys and values.
    Wo: Output projection applied to the attended vectors.

  Returns:
    Tensor with the same leading axes as ``X`` and one output row per position.
  """
  Q = torch.matmul(X, Wq)
  K = torch.matmul(X, Wk)
  V = torch.matmul(X, Wv)

  # Swap only the last two axes: unlike `.T`, this is also valid when X carries
  # leading batch dimensions.
  scores = torch.matmul(Q, K.transpose(-2, -1))
  # Each row of `probs` is the attention distribution of one query over all keys.
  probs  = torch.softmax(scores, dim=-1)
  E      = torch.matmul(probs, V)

  return torch.matmul(E, Wo)

In [None]:
seq = torch.FloatTensor([
    [0.0, 2.0],
    [-1.5, 0.2],
    [0.5, 0.6],
])

# Draw the projection matrices from a dedicated seeded generator so this check
# is reproducible and does not disturb the global RNG state.
weights_rng = torch.Generator().manual_seed(0)

Wq = torch.rand((2,2), generator=weights_rng)
Wk = torch.rand((2,2), generator=weights_rng)
Wv = torch.rand((2,2), generator=weights_rng)
Wo = torch.rand((2,2), generator=weights_rng)

iterative = iterative_self_attention(seq, Wq, Wk, Wv, Wo)
matricial = matricial_self_attention(seq, Wq, Wk, Wv, Wo)

# The two versions accumulate floating-point sums in a different order, so they
# are compared with an absolute tolerance. Rounding both results and testing
# exact equality can fail spuriously when a value sits on a rounding boundary.
assert torch.allclose(iterative, matricial, rtol=0, atol=1e-6), 'Wrong implementation'

### Self-Attention (CLASS)



In [None]:
class IterativeSelfAttention(nn.Module):
  """Loop-based single-head self-attention layer (slow reference implementation).

  Holds four square linear projections (query, key, value, output), each of
  size ``embedding_dim`` x ``embedding_dim``.
  """

  def __init__(self, embedding_dim):
    super().__init__()
    self.Wq = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
    self.Wk = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
    self.Wv = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
    self.Wo = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)

  def forward(self, data):
    """Apply self-attention to ``data``.

    Args:
      data: Tensor whose last two axes are (positions, embedding_dim). Any
        leading axes are treated as batch dimensions and processed one
        sequence at a time.

    Returns:
      Tensor with the same shape as ``data``.
    """
    # Batched input: attend inside each sequence independently. Without this
    # the loops below would iterate over the batch axis and mix up sequences.
    if data.dim() > 2:
      return torch.stack([self.forward(sequence) for sequence in data])

    Q = self.Wq(data)
    K = self.Wk(data)
    V = self.Wv(data)

    E = []
    for query in Q:
      # Unnormalised similarity between this query and every key.
      scores = []
      for key in K:
        scores.append(torch.dot(query, key))

      attention_weights = torch.softmax(torch.stack(scores), dim=0)

      # Weighted sum of the value vectors.
      new_E = 0
      for weight, value in zip(attention_weights, V):
        new_E = new_E + weight * value

      E.append(self.Wo(new_E))
    return torch.stack(E)

class MatricialSelfAttention(nn.Module):
  """Single-head self-attention layer computed with batched matrix products."""

  def __init__(self, embedding_dim):
    super().__init__()
    # Square linear maps (embedding_dim -> embedding_dim) for the query, key,
    # value and output projections, created in this fixed order.
    self.Wq = nn.Linear(embedding_dim, embedding_dim)
    self.Wk = nn.Linear(embedding_dim, embedding_dim)
    self.Wv = nn.Linear(embedding_dim, embedding_dim)
    self.Wo = nn.Linear(embedding_dim, embedding_dim)

  def forward(self, data):
    """Attend over the second-to-last axis of ``data`` and project the result with ``Wo``."""
    queries, keys, values = self.Wq(data), self.Wk(data), self.Wv(data)

    # Softmax over the last axis: one distribution over the keys per query.
    attention = torch.softmax(queries @ keys.transpose(-2, -1), dim=-1)
    return self.Wo(attention @ values)

In [None]:
batch_size = 2
n_context  = 6 
vocab_size = 50
embedding_dim  = 100

embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, max_norm=True)
matricial = MatricialSelfAttention(embedding_dim=embedding_dim)

# Draw real token ids in [0, vocab_size). torch.rand() samples from [0, 1), so
# casting it to long would give a batch made only of token 0.
inputs = torch.randint(low=0, high=vocab_size, size=(batch_size, n_context))
inputs = embedding(inputs)
E_matricial = matricial(inputs)

# The attention layer must keep the (batch, position, embedding) shape.
assert list(E_matricial.shape) == [batch_size, n_context, embedding_dim], 'Self Attention shape incorrect'

### Model

In [None]:
class AttentionModel(nn.Module):
  """Next-token model: token + position embeddings, self-attention, then a 2-layer MLP.

  The MLP reads the flattened attention output of all ``n_context`` positions
  and produces one logit per vocabulary entry.
  """

  def __init__(self, n_context, vocab_size, embedding_dim, hidden_size):
    super().__init__()
    # NOTE(review): max_norm is given as True, which is numerically 1 — confirm
    # that a maximum embedding norm of 1.0 is what is intended.
    self.embedding      = nn.Embedding(vocab_size, embedding_dim, max_norm=True)
    # Learned positional embeddings, one row per context position, initialised uniformly in [-1, 1).
    self.positional     = nn.Parameter(torch.empty(n_context, embedding_dim).uniform_(-1, 1))
    self.self_attention = MatricialSelfAttention(embedding_dim=embedding_dim)
    self.input_layer    = nn.Linear(n_context * embedding_dim, hidden_size)
    self.activation     = nn.ReLU()
    self.out_layer      = nn.Linear(hidden_size, vocab_size)

  def forward(self, data):
    """Map a batch of token-id contexts to vocabulary logits."""
    relu = self.activation

    embedded = relu(self.embedding(data) + self.positional)
    attended = relu(self.self_attention(embedded))
    # Flatten (position, embedding) into a single feature axis for the MLP.
    hidden = relu(self.input_layer(attended.flatten(start_dim=1)))
    return self.out_layer(hidden)

### Asserts do modelo

In [None]:
batch_size = 2
n_context  = 10
vocab_size = 50

model = AttentionModel(n_context=n_context, vocab_size=vocab_size, embedding_dim=10, hidden_size=10)
# Draw real token ids in [0, vocab_size). torch.rand() samples from [0, 1), so
# casting it to long would give a batch made only of token 0.
inputs = torch.randint(low=0, high=vocab_size, size=(batch_size, n_context))
logits = model(inputs)

# One logit per vocabulary entry for every example in the batch.
assert list(logits.shape) == [batch_size, vocab_size], 'Logits shape incorrect'

## Early Stopping

In [None]:
class EarlyStopping():
  """Track the best validation loss and signal when training should stop.

  Args:
    patience: Number of consecutive non-improving calls after which
      ``__call__`` returns True.
    min_delta: Minimum decrease of the loss (relative to the best one seen)
      that counts as an improvement.

  Attributes set while running: ``best_score`` (negated best loss),
  ``best_model_wts`` (deep copy of the model's state_dict at the best loss)
  and ``counter`` (current run of non-improving calls).
  """

  def __init__(self, patience=10, min_delta=0):
    self.patience = patience
    self.counter = 0
    self.best_score = None
    self.best_model_wts = None
    self.min_delta = min_delta

  def __call__(self, model, val_loss):
    """Record ``val_loss`` and return True when training should stop, else False.

    Always returns a bool (the non-improving path used to fall through and
    return None while patience was not yet exhausted).
    """
    # Higher score is better, so the loss is negated.
    score = -val_loss

    # The first call always counts as an improvement. Written with `>=` so a
    # NaN loss compares False and is counted as "no improvement" instead of
    # being stored as the best score.
    improved = self.best_score is None or score >= self.best_score + self.min_delta

    if improved:
      self.best_score = score
      # Deep copy: state_dict() returns references to the live parameters.
      self.best_model_wts = copy.deepcopy(model.state_dict())
      self.counter = 0
      return False

    self.counter += 1
    return self.counter >= self.patience

## Funções auxiliares para treinamento

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, get_perplexity, initial_perplexity):
  """Run one optimisation pass over ``dataloader``.

  Batches are moved to the module-level ``device`` before the forward pass.

  Args:
    model: Network to train; switched to training mode.
    dataloader: Iterable of ``(inputs, targets)`` batches.
    criterion: Loss function called as ``criterion(outputs, targets)``.
    optimizer: Optimizer stepped once per batch.
    get_perplexity: When true, ``exp(loss)`` of the first batch replaces
      ``initial_perplexity``.
    initial_perplexity: Value returned unchanged when ``get_perplexity`` is false.

  Returns:
    ``(sum of the per-batch losses, number of correct predictions,
    initial_perplexity)``.
  """
  model.train()
  loss_sum = 0.0
  n_correct = 0

  for inputs, targets in dataloader:
    inputs, targets = inputs.to(device), targets.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    batch_loss = loss.item()

    # Only the very first batch of the pass defines the initial perplexity.
    if get_perplexity:
      initial_perplexity = np.exp(batch_loss)
    get_perplexity = False

    # Predicted class = index of the largest logit.
    preds = torch.max(outputs, 1)[1]

    loss.backward()
    optimizer.step()

    loss_sum += batch_loss
    n_correct += torch.sum(preds == targets)

  return loss_sum, n_correct, initial_perplexity

def evaluate(model, dataloader, criterion):
  """Compute loss and accuracy counts over ``dataloader`` without updating the model.

  Batches are moved to the module-level ``device`` before the forward pass.

  Args:
    model: Network to evaluate; switched to evaluation mode.
    dataloader: Iterable of ``(inputs, targets)`` batches.
    criterion: Loss function called as ``criterion(outputs, targets)``.

  Returns:
    ``(sum of the per-batch losses, number of correct predictions)``.
  """
  model.eval()
  loss_sum, n_correct = 0.0, 0

  # No gradients are needed anywhere in the evaluation pass.
  with torch.no_grad():
    for inputs, targets in dataloader:
      inputs, targets = inputs.to(device), targets.to(device)

      outputs = model(inputs)
      # Predicted class = index of the largest logit.
      preds = torch.max(outputs, 1)[1]

      loss_sum += criterion(outputs, targets).item()
      n_correct += torch.sum(preds == targets)

  return loss_sum, n_correct

## Hiper-Parametros

In [None]:
# Hyper-parameters shared by the dataset, model and training cells.
params = {
  'n_context': 9,        # number of context tokens fed to the model per example
  'vocab_size': 3000,    # tokenizer max_tokens and number of model output logits
  'embedding_dim': 100,  # size of the token and positional embeddings
  'hidden_size': 64,     # width of the hidden layer of the final MLP
  'batch_size': 256,     # batch size of all three DataLoaders
  'epochs': 100,         # upper bound on training epochs (early stopping may end sooner)
  'lr': 1e-3,            # Adam learning rate
  'patience': 10,        # EarlyStopping: non-improving epochs tolerated
  'min_delta':0          # EarlyStopping: minimum loss decrease counted as improvement
}

In [None]:
# The vocabulary is built from the training texts only and then reused to
# encode the validation and test texts.
tokenizer     = Tokenizer(max_tokens=params['vocab_size']).create_vocab(x_train)
train_dataset = IMDBDataset(data=x_train, tokenizer=tokenizer, n_context=params['n_context'])
val_dataset   = IMDBDataset(data=x_valid, tokenizer=tokenizer, n_context=params['n_context'])
test_dataset  = IMDBDataset(data=x_test, tokenizer=tokenizer, n_context=params['n_context'])

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=params['batch_size'])
test_loader  = DataLoader(test_dataset, batch_size=params['batch_size'])

In [None]:
model = AttentionModel(n_context=params['n_context'], vocab_size=params['vocab_size'], embedding_dim=params['embedding_dim'], hidden_size=params['hidden_size'])
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])

criterion = nn.CrossEntropyLoss()
early_stopping = EarlyStopping(patience=params['patience'], min_delta=params['min_delta'])

# Per-epoch metrics, later used for the plots.
history = {'train_loss': [], 'val_loss': [], 'train_acc':[], 'val_acc':[], 'train_perplexity': [], 'val_perplexity' : []}
# Overwritten by train_epoch with exp(loss) of the very first training batch.
initial_perplexity = 0.0

for epoch in tqdm(range(params['epochs'])):
  train_loss, train_correct, initial_perplexity = train_epoch(model=model,
                                                              dataloader=train_loader,
                                                              criterion=criterion,
                                                              optimizer=optimizer,
                                                              get_perplexity=(epoch == 0),
                                                              initial_perplexity=initial_perplexity)

  val_loss, val_correct  = evaluate(model=model,
                                    dataloader=val_loader,
                                    criterion=criterion)

  # train_epoch/evaluate return sums over batches: average the loss per batch
  # and turn the correct-prediction count into a percentage of all samples.
  train_loss = train_loss / len(train_loader)
  train_acc = train_correct.cpu().item() / len(train_loader.sampler) * 100
  train_perplexity = np.exp(train_loss)

  val_loss = val_loss / len(val_loader)
  val_acc = val_correct.cpu().item() / len(val_loader.sampler) * 100
  val_perplexity = np.exp(val_loss)

  # Record the epoch BEFORE the early-stopping check so the metrics of the
  # epoch that triggers the stop are not dropped from the history.
  history['train_loss'].append(train_loss)
  history['train_acc'].append(train_acc)
  history['train_perplexity'].append(train_perplexity)
  history['val_loss'].append(val_loss)
  history['val_acc'].append(val_acc)
  history['val_perplexity'].append(val_perplexity)

  if early_stopping(model, val_loss):
    break

  3%|▎         | 3/100 [02:08<1:09:59, 43.30s/it]

In [None]:
import matplotlib.pyplot as plt

# x axis: one point per recorded epoch.
epochs = np.arange(len(history['train_loss']))

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(30,7))

# One panel per metric: (train key, validation key, panel title, y-axis label).
panels = [
  ('train_loss', 'val_loss', 'Cross Entropy Loss (with Early Stopping)', 'Loss'),
  ('train_acc', 'val_acc', 'Accuracy (with Early Stopping)', 'Acc'),
  ('train_perplexity', 'val_perplexity', 'Perplexity (with Early Stopping)', 'EXP(Loss)'),
]

for ax, (train_key, val_key, title, ylabel) in zip(axes, panels):
  ax.plot(epochs, history[train_key], label='Train')
  ax.plot(epochs, history[val_key], label='Validation')
  ax.set_title(title)
  ax.set_xlabel('Epochs')
  ax.set_ylabel(ylabel)
  ax.grid()
  ax.legend()

plt.show()

In [None]:
# Rebuild the network and restore the weights saved by early stopping at the
# best validation loss, then measure it on the held-out test split.
model = AttentionModel(n_context=params['n_context'], vocab_size=params['vocab_size'], embedding_dim=params['embedding_dim'], hidden_size=params['hidden_size'])
model.load_state_dict(early_stopping.best_model_wts)
model.to(device)

test_loss, test_correct  = evaluate(model=model, 
                                  dataloader=test_loader,
                                  criterion=criterion)

# evaluate() returns sums over batches: average the loss per batch and turn the
# correct-prediction count into a percentage of all test samples.
test_loss = test_loss / len(test_loader)
test_acc = test_correct.cpu().item() / len(test_loader.sampler) * 100
test_perplexity = np.exp(test_loss)

print(f'INITIAL PERPLEXITY: {initial_perplexity} ----- TEST LOSS: {test_loss} ----- TEST ACC: {test_acc} ----- TEST PERPLEXITY: {test_perplexity}')

In [None]:
## FUNCTION TAKEN FROM ANDERSON'S WORK TO IMPROVE HOW THE SENTENCES ARE PRINTED
def adjust_string(sample):
  """Tidy a space-joined token string by removing the spaces around punctuation.

  The space before '.', ',', ':', '!' and '?' is dropped, and an apostrophe
  surrounded by spaces is collapsed to a bare apostrophe.
  """
  # (pattern, replacement) pairs, applied one after another in this order.
  substitutions = (
    (r" \.", "."),
    (r" \,", ","),
    (r" \:", ":"),
    (r" \!", "!"),
    (r" \?", "?"),
    (r" \' ", "'"),
  )

  pretty_sentence = sample
  for pattern, replacement in substitutions:
    pretty_sentence = re.sub(pattern, replacement, pretty_sentence)
  return pretty_sentence

In [None]:
text_sample = "Amazing movie, good writer, terrific actors and"

# Encode the prompt once and keep the running sequence as token ids.
prompt_ids = tokenizer.encode(text_sample)
if -1 in prompt_ids:
  raise ValueError('The prompt contains tokens that are not in the vocabulary.')
if len(prompt_ids) < params['n_context']:
  raise ValueError(f"The prompt must have at least {params['n_context']} tokens.")

vector_input = torch.tensor(prompt_ids, dtype=torch.long).reshape(1, -1).to(device)

model.eval()
for _ in range(20):
  # Always feed the LAST n_context tokens. Slicing with the loop index only
  # works when the prompt has exactly n_context tokens.
  with torch.no_grad():
    logits = model(vector_input[:, -params['n_context']:])
  # Greedy decoding: pick the most likely next token.
  _, preds = torch.max(logits, 1)

  vector_input = torch.cat((vector_input, preds.reshape(1,-1)), dim=1)
  text_sample = tokenizer.decode(vector_input.reshape(-1).cpu().numpy())
  text_sample = ' '.join(text_sample)

  text_sample = adjust_string(text_sample)

  print('Generated Text:', text_sample)