In [1]:
import pandas as pd

In [2]:
import torch

In [3]:
from tokenizers import Tokenizer

In [4]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [5]:
from torch.utils.data import DataLoader

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
import numpy as np

In [8]:
from tqdm import tqdm

In [9]:
from seq2seq_attn import (NLPDataset, Seq2SeqAtt, collate_fn_padd)

In [10]:
from tokenizers import Tokenizer, pre_tokenizers, normalizers, decoders, models, processors

In [11]:
from tokenizer import Tokenizer as Tokenize

# 1) Preparação do Tokenizador

## 1.1) Tokenizador do Input

In [12]:
# Build the input-side tokenizer on top of a BPE model whose vocabulary and
# merge rules are read from the two files below.
input_bpe_model = models.BPE(
    vocab="./input_tokenizer-vocab.json",
    merges="./input_tokenizer-merges.txt",
)
input_tokenizer = Tokenizer(input_bpe_model)

In [13]:
# Ids of the sentence-boundary markers in the input vocabulary.
input_end_id = input_tokenizer.token_to_id("<end>")
input_start_id = input_tokenizer.token_to_id("<start>")

# Text pipeline: lowercase -> byte-level pre-tokenisation -> byte-level decoding.
input_tokenizer.normalizer = normalizers.Lowercase()
input_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
input_tokenizer.decoder = decoders.ByteLevel()

# BertProcessing is reused here with "<start>" in the cls slot and "<end>" in
# the sep slot.
input_tokenizer.post_processor = BertProcessing(
    sep=("<end>", input_end_id),
    cls=("<start>", input_start_id),
)

## 1.2) Tokenizador do Output

In [14]:
# Special tokens handed to the output tokenizer; the order is kept as-is.
special_tokens = [
    "<start>",
    "<pad>",
    "<end>",
    "<unk>",
    "<mask>",
]

In [15]:
# Output-side tokenizer from the project-local `tokenizer` module, configured
# for word-level mode with the special tokens defined above.
# NOTE(review): `tokenizer=` points at a .pickle file; presumably a previously
# fitted tokenizer is loaded from it — confirm in the `tokenizer` module, and
# only load pickles from trusted sources.
output_tokenizer = Tokenize(
    files='../data/output.txt',
    mode='word_level',
    checker='output',
    special_tokens=special_tokens,
    max_length=10,
    tokenizer='./tokenizer_output.pickle',
)

In [16]:
# Fetch each vocabulary once. Note the asymmetry: the `tokenizers` Tokenizer
# exposes get_vocab() as a method, while the project's Tokenize object is
# accessed here through a `get_vocab` attribute (no call).
input_vocab = input_tokenizer.get_vocab()
output_vocab = output_tokenizer.get_vocab

# Padding ids for each side, plus the output-side sequence delimiters.
null_input_token = input_vocab['<pad>']
null_output_token = output_vocab['<pad>']
start_token = output_vocab['<start>']
end_token = output_vocab['<end>']

# 2) Ingestão de dados

In [17]:
# Load the example table from JSON.
data = pd.read_json('../data/final_df.json')

# Replace every '/' in the target strings with a space. The vectorised string
# accessor avoids a Python-level lambda call per row, and regex=False makes the
# substitution a literal one. Missing values, if any, pass through as NaN.
data['output'] = data['output'].str.replace('/', ' ', regex=False)

# 3) Preparação do Dataset

In [18]:
# Number of examples per mini-batch for the training DataLoader.
BATCH_SIZE = 40

In [19]:
# Shuffle all rows. A fixed seed makes the shuffle, and therefore the
# train/validation split sliced from it, reproducible across kernel restarts.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED)

In [20]:
# Cut the (already shuffled) frame once: the first 99.9% of rows are used for
# training and the remaining rows for validation.
split_at = int(0.999 * len(data))
train, val = data[:split_at], data[split_at:]

In [21]:
# Display the number of rows in the training and validation splits.
len(train), len(val)

(1289595, 1291)

In [22]:
# Wrap each split in an NLPDataset that shares the same pair of tokenizers.
dataset_train, dataset_val = (
    NLPDataset(split, input_tokenizer, output_tokenizer)
    for split in (train, val)
)

In [23]:
# Settings common to both loaders: three worker processes and the project's
# collate function.
loader_options = dict(num_workers=3, collate_fn=collate_fn_padd)

# Training batches are reshuffled every epoch.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    **loader_options,
)

# The whole validation split is requested as a single batch, in fixed order.
dataloader_val = DataLoader(
    dataset_val,
    batch_size=len(val),
    shuffle=False,
    **loader_options,
)

# 4) Preparação do Modelo

In [24]:
# Vocabulary sizes for the two sides of the model.
encoder_vocab_size = len(input_tokenizer.get_vocab())
decoder_vocab_size = len(output_tokenizer.get_vocab)

# Attention seq2seq model from the project-local seq2seq_attn module.
# NOTE(review): null_token is given the *input* padding id; null_output_token
# is not passed anywhere in this notebook — confirm that is what Seq2SeqAtt
# expects.
seq2seq = Seq2SeqAtt(
    null_token=null_input_token,
    start_token=start_token,
    end_token=end_token,
    encoder_vocab_size=encoder_vocab_size,
    decoder_vocab_size=decoder_vocab_size,
    wordvec_dim=48,
    hidden_dim=64,
    rnn_num_layers=1,
    rnn_dropout=0,
)

In [25]:
# Adam over all model parameters; no hyperparameters are passed, so the
# library defaults apply.
optimizer = torch.optim.Adam(seq2seq.parameters())

In [26]:
# Number of full passes over the training DataLoader.
EPOCHS = 10

In [None]:
# Train for EPOCHS epochs. After every epoch the model is evaluated on the
# validation loader and checkpointed whenever the validation loss improves.
best_loss = float("inf")
for epoch in range(EPOCHS):
    print("Epoch", epoch)

    # ---- training pass ----
    seq2seq.train()
    train_loss = []
    for batch in tqdm(dataloader_train):
        x = batch['input']
        y = batch['output']

        # Clear gradients *before* the forward/backward pass so that gradients
        # left behind by an interrupted run of this cell are never accumulated
        # into the first update of a re-run.
        optimizer.zero_grad()

        # Calling the model on (input, target) returns the training loss.
        loss = seq2seq(x, y)
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())
    print("\tTrain loss: {0}".format(np.mean(train_loss)))

    # ---- validation pass ----
    seq2seq.eval()
    val_losses = []
    with torch.no_grad():
        for batch in tqdm(dataloader_val):
            x = batch['input']
            y = batch['output']
            val_losses.append(seq2seq(x, y).item())

    # Average over every validation batch instead of keeping only the last one.
    # If the loader produced no batches, use NaN: the comparison below is then
    # False (no checkpoint) instead of raising TypeError on `None < float`.
    val_loss = float(np.mean(val_losses)) if val_losses else float("nan")

    if val_loss < best_loss:
        best_loss = val_loss
        # Pickles the whole module object (not just a state_dict), so loading
        # 'seq2seq.pt' later requires the seq2seq_attn module to be importable.
        torch.save(seq2seq, 'seq2seq.pt')

    print("\tVal loss: {0}".format(val_loss))
    print()

  0%|          | 0/32240 [00:00<?, ?it/s]

Epoch 0


  2%|▏         | 565/32240 [02:18<2:02:43,  4.30it/s]

# 6) Avaliação do output do modelo

In [None]:
# Take the first row's query as the demo input. `.iloc[0]` reads one element
# directly instead of converting the entire column to a Python list first.
# NOTE(review): `data` holds both splits, so this row was most likely seen
# during training; pick from `val` for a held-out example.
x = data.query_string.iloc[0]

In [None]:
# Display the selected input example.
x

In [None]:
# Encode the example with the input tokenizer and keep only the token ids.
encoding = input_tokenizer.encode(x)
tokens = encoding.ids

In [None]:
# Display the encoded token ids.
tokens

In [None]:
# Generate output token ids for the single encoded example.
# Force evaluation mode and disable autograd: if training was interrupted
# mid-epoch the model is still in train mode, and no gradients are needed here.
seq2seq.eval()
decoding = []
with torch.no_grad():
    # unsqueeze(0) adds a leading dimension of size 1 in front of the token ids.
    for token in seq2seq.sample(torch.tensor(tokens).unsqueeze(0)):
        # Items coming out of sample() are handled both as plain ints and as
        # tensors; either way store a plain Python number.
        decoding.append(token if isinstance(token, int) else token.item())

In [None]:
# Decode the generated token ids with the output tokenizer and display the result.
output_tokenizer.decode(decoding)

In [None]:
# Reference target for the same (first) row, for comparison with the decoded
# output. `.iloc[0]` avoids building a list of the whole column to read one item.
data.output.iloc[0]