In [1]:
import pandas as pd

In [2]:
import torch

In [3]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [4]:
from torch.utils.data import DataLoader

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
import numpy as np

In [7]:
from tqdm import tqdm

In [8]:
from seq2seq_attn import (NLPDataset, Seq2SeqAtt, collate_fn_padd)

# 1) Preparação do Tokenizador

## 1.1) Tokenizador do Input

In [9]:
# Build the input-side byte-level BPE tokenizer from its vocabulary and
# merges files on disk.
input_vocab_path = "./canonizador_tokenizer-vocab.json"
input_merges_path = "./canonizador_tokenizer-merges.txt"
input_tokenizer = ByteLevelBPETokenizer(input_vocab_path, input_merges_path)

## 1.2) Tokenizador do Output

In [10]:
# Build the output-side byte-level BPE tokenizer from its own vocabulary and
# merges files (separate from the input-side files).
output_vocab_path = "./output_canonizador_tokenizer-vocab.json"
output_merges_path = "./output_canonizador_tokenizer-merges.txt"
output_tokenizer = ByteLevelBPETokenizer(output_vocab_path, output_merges_path)

In [11]:
# Fetch each vocabulary once, then look up the ids of the special tokens.
# A token missing from a vocabulary raises KeyError here.
input_vocab = input_tokenizer.get_vocab()
output_vocab = output_tokenizer.get_vocab()

null_input_token = input_vocab['<pad>']
# NOTE(review): null_output_token is not referenced by any later cell visible
# in this notebook.
null_output_token = output_vocab['<pad>']
start_token = output_vocab['<start>']
end_token = output_vocab['<end>']

# 2) Ingestão de dados

In [12]:
# Read the full dataset from JSON into a DataFrame; later cells use its
# `query_string` and `output` columns.
data = pd.read_json('../data/final_df.json')

# 3) Preparação do Dataset

In [13]:
# Number of examples per batch; passed to the training DataLoader.
BATCH_SIZE = 40

In [14]:
# Shuffle every row before the positional train/validation split.
# A fixed seed makes the shuffle (and therefore the split) reproducible across
# kernel restarts, so validation losses from different runs are comparable.
# Re-running this cell on its own reshuffles the already-shuffled frame;
# re-run from the data-loading cell to get the same order again.
SEED = 42
data = data.sample(frac=1, random_state=SEED)

In [15]:
# Positional split: the first 99% of the rows go to training, the remainder
# to validation.
split_at = int(0.99 * len(data))
train, val = data[:split_at], data[split_at:]

In [16]:
# Wrap each split in an NLPDataset; both share the same pair of tokenizers.
dataset_train, dataset_val = (
    NLPDataset(split, input_tokenizer, output_tokenizer)
    for split in (train, val)
)

In [17]:
# Options shared by both loaders: load in the main process (num_workers=0)
# and assemble batches with the project's `collate_fn_padd`.
loader_options = dict(num_workers=0, collate_fn=collate_fn_padd)

# Training batches are reshuffled on every pass over the loader.
dataloader_train = DataLoader(
    dataset_train, batch_size=BATCH_SIZE, shuffle=True, **loader_options
)

# NOTE(review): no batch_size is given here, so DataLoader's default of 1
# applies and validation runs one example at a time — confirm this is intended.
dataloader_val = DataLoader(dataset_val, shuffle=False, **loader_options)

# 4) Preparação do Modelo

In [18]:
# Vocabulary sizes are taken from the two tokenizers.
encoder_vocab_size = len(input_tokenizer.get_vocab())
decoder_vocab_size = len(output_tokenizer.get_vocab())

# NOTE(review): only the input-side pad id is passed as null_token; the
# output-side pad id (null_output_token) is not passed to the model — confirm
# that both tokenizers assign '<pad>' the same id, or that this is intended.
seq2seq = Seq2SeqAtt(
    null_token=null_input_token,
    start_token=start_token,
    end_token=end_token,
    encoder_vocab_size=encoder_vocab_size,
    decoder_vocab_size=decoder_vocab_size,
    wordvec_dim=48,
    hidden_dim=64,
    rnn_num_layers=1,
    rnn_dropout=0,
)

In [19]:
# Adam over all model parameters; no hyperparameters are given, so torch's
# defaults apply.
optimizer = torch.optim.Adam(seq2seq.parameters())

In [20]:
# Number of passes the training loop makes over the training DataLoader.
EPOCHS = 5

In [None]:
# Train for EPOCHS epochs. After each epoch, compute the mean validation loss
# and checkpoint the model weights whenever it beats the best value so far.
best_loss = float("inf")
for epoch in range(EPOCHS):
    print("Epoch", epoch)

    # ---- training pass ----
    seq2seq.train()
    train_loss = []
    for batch in tqdm(dataloader_train):
        x = batch['input']
        y = batch['output']

        # Clear gradients before the forward pass so that anything left over
        # from an interrupted run of this cell cannot leak into this step.
        optimizer.zero_grad()

        # Calling the model on (input, target) returns the loss for the batch.
        loss = seq2seq(x, y)
        train_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    print("\tTrain loss: {0}".format(np.mean(train_loss)))

    # ---- validation pass (eval mode, no gradient tracking) ----
    seq2seq.eval()
    val_loss = []
    with torch.no_grad():
        # The validation loader built earlier is named `dataloader_val`.
        for batch in dataloader_val:
            x = batch['input']
            y = batch['output']

            loss = seq2seq(x, y)
            val_loss.append(loss.item())
    val_loss = np.mean(val_loss)

    if val_loss < best_loss:
        best_loss = val_loss
        # torch.save takes the object to serialize followed by the target
        # path. Only the weights are stored; restore them with
        # seq2seq.load_state_dict(torch.load('seq2seq.pt')).
        torch.save(seq2seq.state_dict(), 'seq2seq.pt')

    print("\tVal loss: {0}".format(val_loss))
    print()

  0%|          | 0/31950 [00:00<?, ?it/s]

Epoch 0


  0%|          | 50/31950 [00:32<6:12:55,  1.43it/s]

# 5) Avaliação do output do modelo

In [None]:
# Take the query string of the first row (by position) as a sample input.
x = data.query_string.iloc[0]

In [None]:
# Display the raw sample query.
x

In [None]:
# Tokenize the sample and add a leading dimension of size 1 (presumably the
# batch axis the model expects — confirm against Seq2SeqAtt.sample).
# Note: this rebinds `x` from the raw query to a tensor of token ids, so this
# cell should only be re-run after re-running the cell that selects the sample.
encoded = input_tokenizer.encode(x)
x = torch.tensor(encoded.ids).unsqueeze(0)

In [None]:
# Display the token-id tensor that is passed to the model.
x

In [None]:
# Collect the ids produced by the model's `sample` as plain Python numbers.
# Elements that are plain ints are wrapped in a tensor first, so every element
# goes through the same `.item()` call.
decoding = [
    (torch.tensor(token) if isinstance(token, int) else token).item()
    for token in seq2seq.sample(x)
]

In [None]:
# Turn the sampled ids back into text with the output tokenizer.
output_tokenizer.decode(decoding)

In [None]:
# Reference output stored for the same first row, for comparison with the
# decoded model sample.
data.output.iloc[0]