In [1]:
# The MIT License (MIT) Copyright (c) 2025 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/14_Seq2seq.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Seq2seq
- En este notebook se define una arquitectura seq2seq para traducir oraciones del inglés al español.

<img src="../img/seq-to-seq.png" width="700"/>

__Imagen tomada de Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.__



In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras
import torch
import pandas as pd
import pathlib
import random

torch.__version__, keras.__version__

('2.5.1+cu124', '3.6.0')

In [3]:
torch.manual_seed(77)

<torch._C.Generator at 0x7afdef537470>

## 1.- Conjuntos de datos

In [4]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [5]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []
for line in lines:
    eng, spa = line.lower().split("\t")
    text_pairs.append((eng, spa))

In [6]:
for _ in range(5):
    print(random.choice(text_pairs))

('tom told mary that he had finished the work a few days ago.', 'tom le dijo a mary que él había terminado el trabajo hace unos días atrás.')
('the student refused to obey his teacher.', 'el estudiante se rehusó a obedecer a su profesor.')
('mary hopes to rest a lot during her vacation.', 'mary espera descansar mucho durante sus vacaciones.')
('tom is the person who killed mary.', 'tom es la persona que asesinó a mary.')
('he is the tallest in his class.', 'es el más alto de su clase.')


In [7]:
random.Random(77).shuffle(text_pairs)
num_val_samples = int(0.005 * len(text_pairs))
num_train_samples = len(text_pairs) - num_val_samples
train_pairs = text_pairs[:num_train_samples]
test_pairs = text_pairs[num_train_samples:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
118370 training pairs
594 test pairs


## 2.- Pipeline

- Crea vocabulario y define tokenizer.

In [8]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
import os

# Set environment variable to avoid tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"], min_frequency=1)

tokenizer.train_from_iterator(train_pairs, trainer=trainer)

text = "hello, hola"
encoding = tokenizer.encode(text)
print("Token IDs:", encoding.ids)
decoded_text = tokenizer.decode(encoding.ids)
print("Decoded Text:", decoded_text)
vocab_size = tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size}")

Token IDs: [3733, 21, 3819]
Decoded Text: hello , hola
Vocabulary size: 30000


In [9]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

batch_size = 64
maxlen = 10
PAD_IDX = tokenizer.token_to_id("[PAD]") 
BOS_IDX = tokenizer.token_to_id("[BOS]") 
EOS_IDX = tokenizer.token_to_id("[EOS]") 

def data_process(text):
    data = []
    for enc_tensor, dec_tensor in text:
        enc_tensor_ = torch.tensor(tokenizer.encode(enc_tensor).ids,
                                dtype=torch.long)
        dec_tensor_ = torch.tensor(tokenizer.encode(dec_tensor).ids,
                                dtype=torch.long)

        if enc_tensor_.shape[0] < maxlen:
            data.append((enc_tensor_, dec_tensor_))
    return data

train_data = data_process(text_pairs)
print(len(train_data))


def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(x_item)
        y.append(torch.cat([
            torch.tensor([BOS_IDX]), 
            y_item, 
            torch.tensor([EOS_IDX])], 
                           dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y[:, :-1], y[:, 1:]


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch, 
                          num_workers=4, pin_memory=True)

89883


In [10]:
%%timeit
enc_batch, dec_batch, target_batch = next(iter(train_loader))

70.3 ms ± 789 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
enc_batch, dec_batch, target_batch = next(iter(train_loader))

In [12]:
enc_batch.shape, dec_batch.shape, target_batch.shape

(torch.Size([64, 9]), torch.Size([64, 13]), torch.Size([64, 13]))

## 3.- Modelo

In [13]:
import torch.nn as nn
from torch import optim
import time

In [14]:
emb_dim = 128
model_dim = 256

In [15]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, 
                        hidden_size=model_dim, 
                        num_layers=1, 
                        batch_first=True)
        
    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x)
        return (hidden, cell) 

encoder = Encoder(vocab_size, emb_dim, model_dim)
state_batch = encoder(enc_batch)
state_batch[0].shape

torch.Size([1, 64, 256])

In [16]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, 
                        hidden_size=model_dim, 
                        num_layers=1, 
                        batch_first=True)
        self.fc1 = nn.Linear(model_dim, vocab_size)
        
    def forward(self, x, state):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x, state)
        x = self.fc1(x)
        return x

decoder = Decoder(vocab_size, emb_dim, model_dim)
output_batch = decoder(dec_batch, state_batch)
output_batch.shape, target_batch.shape

(torch.Size([64, 13, 30000]), torch.Size([64, 13]))

In [17]:
class Seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inp, tar):
        state = self.encoder(inp)
        x = self.decoder(tar, state)
        return x

seq2seq = Seq2seq(encoder, decoder)
output_batch = seq2seq(enc_batch, dec_batch)
output_batch.shape

torch.Size([64, 13, 30000])

## 4.- Entrenamiento

In [18]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [19]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inp_enc, inp_dec, tar_dec in train_loader:
        tar_dec = tar_dec.reshape(-1)
        inp_enc = inp_enc.to(device)
        inp_dec = inp_dec.to(device)
        tar_dec = tar_dec.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inp_enc, inp_dec)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, tar_dec)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:.4f} sec Train loss: {running_loss / len(train_loader):.4f}')

In [20]:
def translate(model, sentence, device):
    with torch.no_grad():
        model.eval()
        encoding = tokenizer.encode(sentence)
        input_ids = encoding.ids

        padding_length = maxlen - len(input_ids)
        input_ids = input_ids + ([tokenizer.token_to_id("[PAD]")] * padding_length)
        eng_idx = torch.tensor(input_ids, dtype=torch.long)
        eng_idx = eng_idx.reshape([1, -1])
        spa_idx = torch.tensor(BOS_IDX, dtype=torch.long)
        spa_idx = spa_idx.reshape([1, -1])
        
        while spa_idx[:, -1] != EOS_IDX:
            eng_idx = eng_idx.to(device)
            spa_idx = spa_idx.to(device)
            logits = model(eng_idx, spa_idx)[:, -1, :]  
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            spa_idx = torch.cat((spa_idx, idx_next), dim=1)

        output = " ".join([tokenizer.id_to_token(_) for _ in spa_idx[0]])
        output = output.replace("[BOS]", "").replace("[EOS]", "")
    print(f'Input: {sentence}')
    print(f'Output: {output}')


In [21]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [22]:
seq2seq.to(device)
optimizer = optim.Adam(seq2seq.parameters(), lr=0.001)

In [23]:
sentences = ['i like my dog',
             'i love to eat',
             'i can fix it']

In [24]:
epochs = 6

for epoch in range(epochs):
    train(seq2seq, device, train_loader, optimizer, epoch)
    for s in sentences:
        translate(seq2seq, s, device)


Time for epoch 0 is 8.6171 sec Train loss: 3.9372
Input: i like my dog
Output:  me gusta mi casa . 
Input: i love to eat
Output:  me gusta el lunes . 
Input: i can fix it
Output:  ¿ puedo hacerlo ? 

Time for epoch 1 is 8.5790 sec Train loss: 2.5812
Input: i like my dog
Output:  me gusta el inglés . 
Input: i love to eat
Output:  me encanta comer . 
Input: i can fix it
Output:  ¿ puedo hacerlo ? 

Time for epoch 2 is 8.8287 sec Train loss: 1.9512
Input: i like my dog
Output:  me gusta el perro . 
Input: i love to eat
Output:  ¡ me encanta comer ! 
Input: i can fix it
Output:  ¿ puedo hacerlo ? 

Time for epoch 3 is 9.1357 sec Train loss: 1.5216
Input: i like my dog
Output:  me gusta el perro . 
Input: i love to eat
Output:  me encanta comer . 
Input: i can fix it
Output:  ¿ puedo arreglarlo ? 

Time for epoch 4 is 8.6197 sec Train loss: 1.2129
Input: i like my dog
Output:  me gusta el perro . 
Input: i love to eat
Output:  me encanta comer . 
Input: i can fix it
Output:  ¿ puedo arreg

## Ejercicio
- Agregar loop de evaluación.
- Mejorar el modelo con las técnicas propuestas en _Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27._
- Agreagar mecanismo de atención de _Bahdanau_.