In [1]:
# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/15_Seq2seq.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Seq2seq
- En este notebook se define una arquitectura seq2seq para traducir oraciones del inglés al español.

<img src="../img/seq-to-seq.png" width="700"/>

__Imagen tomada de Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.__



In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras
import torch
import pandas as pd
import pathlib
import random

torch.__version__, keras.__version__

('2.2.1+cu121', '3.0.0')

In [3]:
torch.manual_seed(77)

<torch._C.Generator at 0x7f88ecd12070>

## 1.- Conjuntos de datos

In [4]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [5]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []
for line in lines:
    eng, spa = line.lower().split("\t")
    text_pairs.append((eng, spa))

In [6]:
for _ in range(5):
    print(random.choice(text_pairs))

('you should conform to the rules.', 'debe ajustarse a las reglas.')
('language is one of the most important inventions of mankind.', 'el lenguaje es una de las invenciones más importantes de la humanidad.')
('i can see a ship in the distance.', 'puedo ver un barco a la distancia.')
("tom won't remember a thing.", 'tom no recordará nada.')
("dinner's on me.", 'la cena va por mi cuenta.')


In [7]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


In [8]:
text_pairs[0]

('did i lock the door?', '¿le puse llave a la puerta?')

## 2.- Pipeline

- Crea vocabulario y define tokenizer.

In [9]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
from collections import Counter

In [10]:
#!python -m spacy download en_core_web_sm
#!python -m spacy download es_core_news_sm

In [11]:
eng_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
spa_tokenizer = get_tokenizer('spacy', language='es_core_news_sm')

In [12]:
def build_vocab(text, tokenizers, min_freq=5):
    eng_tokenizer, spa_tokenizer = tokenizers
    eng_counter = Counter()
    spa_counter = Counter()
    for eng_string_, spa_string_ in text:
        eng_counter.update(eng_tokenizer(eng_string_))
        spa_counter.update(spa_tokenizer(spa_string_))
    eng_vocab = Vocab(eng_counter, min_freq=min_freq, 
                       specials=['<unk>', '<pad>'])
    spa_vocab = Vocab(spa_counter, min_freq=min_freq, 
                       specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    return eng_vocab, spa_vocab

eng_vocab, spa_vocab = build_vocab(text_pairs, 
                                   [eng_tokenizer, spa_tokenizer],
                                   min_freq=0)

In [13]:
eng_vocab_size = len(eng_vocab)
spa_vocab_size = len(spa_vocab)
eng_vocab_size, spa_vocab_size

(13229, 26116)

In [14]:
maxlen = 10

def data_process(text):
    data = []
    for eng, spa in text:
        eng_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(eng)],
                                dtype=torch.long)
        spa_tensor_ = torch.tensor([spa_vocab[token] for token in spa_tokenizer(spa)],
                                dtype=torch.long)

        if eng_tensor_.shape[0] < maxlen:
            data.append((eng_tensor_, spa_tensor_))
    return data

train_data = data_process(text_pairs)

In [15]:
len(train_data)

94342

In [16]:
batch_size = 64
PAD_IDX = eng_vocab['<pad>']
BOS_IDX = spa_vocab['<bos>']
EOS_IDX = spa_vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(x_item)
        y.append(torch.cat([torch.tensor([BOS_IDX]), 
                            y_item, 
                            torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y[:, :-1], y[:, 1:]


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch, 
                          num_workers=4, pin_memory=True)

In [17]:
%%timeit
enc_batch, dec_batch, target_batch = next(iter(train_loader))

79.7 ms ± 3.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
enc_batch, dec_batch, target_batch = next(iter(train_loader))

In [19]:
enc_batch.shape, dec_batch.shape, target_batch.shape

(torch.Size([64, 9]), torch.Size([64, 12]), torch.Size([64, 12]))

## 3.- Modelo

In [20]:
import torch.nn as nn
from torch import optim
import time

In [21]:
emb_dim = 128
model_dim = 256

In [22]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, 
                        hidden_size=model_dim, 
                        num_layers=1, 
                        batch_first=True)
        
    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x)
        return (hidden, cell) 

encoder = Encoder(eng_vocab_size, emb_dim, model_dim)
state_batch = encoder(enc_batch)
state_batch[0].shape

torch.Size([1, 64, 256])

In [23]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, 
                        hidden_size=model_dim, 
                        num_layers=1, 
                        batch_first=True)
        self.fc1 = nn.Linear(model_dim, vocab_size)
        
    def forward(self, x, state):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x, state)
        x = self.fc1(x)
        return x

decoder = Decoder(spa_vocab_size, emb_dim, model_dim)
output_batch = decoder(dec_batch, state_batch)
output_batch.shape, target_batch.shape

(torch.Size([64, 12, 26116]), torch.Size([64, 12]))

In [24]:
class Seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inp, tar):
        state = self.encoder(inp)
        x = self.decoder(tar, state)
        return x

seq2seq = Seq2seq(encoder, decoder)
output_batch = seq2seq(enc_batch, dec_batch)
output_batch.shape

torch.Size([64, 12, 26116])

## 4.- Entrenamiento

In [25]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [26]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inp_enc, inp_dec, tar_dec in train_loader:
        tar_dec = tar_dec.reshape(-1)
        inp_enc = inp_enc.to(device)
        inp_dec = inp_dec.to(device)
        tar_dec = tar_dec.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inp_enc, inp_dec)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, tar_dec)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:.4f} sec Train loss: {running_loss / len(train_loader):.4f}')

In [27]:
def translate(model, sentence, device):
    with torch.no_grad():
        model.eval()
        eng_idx = torch.tensor([eng_vocab[token] for token in eng_tokenizer(sentence)],
                                    dtype=torch.long)
        eng_idx = eng_idx.reshape([1, -1])

        spa_idx = torch.tensor(BOS_IDX, dtype=torch.long)
        spa_idx = spa_idx.reshape([1, -1])
        
        while spa_idx[:, -1] != EOS_IDX:
            eng_idx = eng_idx.to(device)
            spa_idx = spa_idx.to(device)
            logits = model(eng_idx, spa_idx)[:, -1, :]  
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            spa_idx = torch.cat((spa_idx, idx_next), dim=1)

        output = " ".join([spa_vocab.get_itos()[_] for _ in spa_idx[0]])
        output = output.replace("<bos>", "").replace("<eos>", "")
    print(f'Input: {sentence}')
    print(f'Output: {output}')

In [28]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [29]:
seq2seq.to(device)
optimizer = optim.Adam(seq2seq.parameters(), lr=0.001)

In [30]:
sentences = ['i like my dog',
             'i love to eat',
             'the white cat']

In [31]:
epochs = 5

for epoch in range(epochs):
    train(seq2seq, device, train_loader, optimizer, epoch)
    for s in sentences:
        translate(seq2seq, s, device)


Time for epoch 0 is 8.0667 sec Train loss: 3.9344
Input: i like my dog
Output:  me gusta mi nombre . 
Input: i love to eat
Output:  me gustaría ir . 
Input: the white cat
Output:  el libro de la mesa . 

Time for epoch 1 is 7.8775 sec Train loss: 2.5515
Input: i like my dog
Output:  me gusta el perro . 
Input: i love to eat
Output:  me encanta ir . 
Input: the white cat
Output:  el gato . 

Time for epoch 2 is 7.9260 sec Train loss: 1.9043
Input: i like my dog
Output:  me gusta el perro . 
Input: i love to eat
Output:  me encanta ir . 
Input: the white cat
Output:  el gato blanco . 

Time for epoch 3 is 7.9210 sec Train loss: 1.4643
Input: i like my dog
Output:  me gusta el perro . 
Input: i love to eat
Output:  yo me gusta comer . 
Input: the white cat
Output:  el gato blanco . 

Time for epoch 4 is 7.9253 sec Train loss: 1.1518
Input: i like my dog
Output:  me gusta el perro . 
Input: i love to eat
Output:  me encanta comer . 
Input: the white cat
Output:  el gato blanco . 


## Ejercicio
- Agregar loop de evaluación.
- Mejorar el modelo con las técnicas propuestas en _Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27._
- Agreagar mecanismo de atención de _Bahdanau_.