In [1]:
# The MIT License (MIT) Copyright (c) 2025 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/14_Seq2seq_HP.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Seq2seq
- En este notebook se entrena un Seq2seq en el libro de Harry Potter.

<img src="../img/seq-to-seq.png" width="700"/>

__Imagen tomada de Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.__

- Harry Potter book: https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras
import torch
import pandas as pd
import pathlib
import random

torch.__version__, keras.__version__

('2.5.1+cu124', '3.6.0')

In [3]:
torch.manual_seed(77)

<torch._C.Generator at 0x718216b43470>

## 1.- Conjuntos de datos

In [4]:
path = './01 Harry Potter and the Sorcerers Stone.txt'
book = open(path, 'rb').read().decode(encoding='utf-8').lower()

print(f'Words: {len(book)}')

Words: 439478


In [5]:
import re

words = re.findall(r'\b\w+\b|[\.,;!?()"\']', book)

In [6]:
maxlen = 32
text_pairs = []
for i in range(0, len(words), maxlen):
    inp = words[i:i + maxlen]
    out = words[i + maxlen :i + maxlen * 2]
    text_pairs.append((' '.join(inp), ' '.join(out)))
    for j in range(maxlen - 1):
      text_pairs.append((' '.join(inp[j + 1:]), ' '.join(out)))

for i in range(5):
  print(text_pairs[i])

len(text_pairs)

('m r . and mrs . dursley , of number four , privet drive , were proud to say that they were perfectly normal , thank you very much . they were', 'the last people you d expect to be involved in anything strange or mysterious , because they just didn t hold with such nonsense . mr . dursley was the director of')
('r . and mrs . dursley , of number four , privet drive , were proud to say that they were perfectly normal , thank you very much . they were', 'the last people you d expect to be involved in anything strange or mysterious , because they just didn t hold with such nonsense . mr . dursley was the director of')
('. and mrs . dursley , of number four , privet drive , were proud to say that they were perfectly normal , thank you very much . they were', 'the last people you d expect to be involved in anything strange or mysterious , because they just didn t hold with such nonsense . mr . dursley was the director of')
('and mrs . dursley , of number four , privet drive , were proud to

93344

## 2.- Pipeline

- Crea vocabulario y define tokenizer.

In [7]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
import os

# Set environment variable to avoid tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"], min_frequency=1)

tokenizer.train_from_iterator(text_pairs, trainer=trainer)

text = "hello potter"
encoding = tokenizer.encode(text)
print("Token IDs:", encoding.ids)
decoded_text = tokenizer.decode(encoding.ids)
print("Decoded Text:", decoded_text)
vocab_size = tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size}")

Token IDs: [2154, 109]
Decoded Text: hello potter
Vocabulary size: 5747


In [8]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

batch_size = 64
PAD_IDX = tokenizer.token_to_id("[PAD]") 
BOS_IDX = tokenizer.token_to_id("[BOS]") 
EOS_IDX = tokenizer.token_to_id("[EOS]") 

def data_process(text):
    data = []
    for enc_tensor, dec_tensor in text:
        enc_tensor_ = torch.tensor(tokenizer.encode(enc_tensor).ids,
                                dtype=torch.long)
        dec_tensor_ = torch.tensor(tokenizer.encode(dec_tensor).ids,
                                dtype=torch.long)

        if enc_tensor_.shape[0] < maxlen:
            data.append((enc_tensor_, dec_tensor_))
    return data

train_data = data_process(text_pairs)
print(len(train_data))


def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(x_item)
        y.append(torch.cat([
            torch.tensor([BOS_IDX]), 
            y_item, 
            torch.tensor([EOS_IDX])], 
                           dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX, padding_side='left')
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y[:, :-1], y[:, 1:]


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch, 
                          num_workers=4, pin_memory=True)

90428


In [9]:
%%timeit
enc_batch, dec_batch, target_batch = next(iter(train_loader))

74.4 ms ± 871 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
enc_batch, dec_batch, target_batch = next(iter(train_loader))

In [11]:
enc_batch.shape, dec_batch.shape, target_batch.shape

(torch.Size([64, 31]), torch.Size([64, 33]), torch.Size([64, 33]))

In [12]:
enc_batch[0]

tensor([   1,    1,    1,    1,    1,    1,   22,    6,  733,  268,  181,    5,
         114,   16,  137,   83, 1501,  362,  290,   13,    5,   85,    9,  164,
         546,   26,    6, 1183,    5,   11,  122])

## 3.- Modelo

In [13]:
import torch.nn as nn
from torch import optim
import time

In [14]:
emb_dim = 128
model_dim = 256

In [15]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, 
                        hidden_size=model_dim, 
                        num_layers=1, 
                        batch_first=True)
        
    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x)
        return (hidden, cell) 

encoder = Encoder(vocab_size, emb_dim, model_dim)
state_batch = encoder(enc_batch)
state_batch[0].shape

torch.Size([1, 64, 256])

In [16]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, 
                        hidden_size=model_dim, 
                        num_layers=1, 
                        batch_first=True)
        self.fc1 = nn.Linear(model_dim, vocab_size)
        
    def forward(self, x, state):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x, state)
        x = self.fc1(x)
        return x

decoder = Decoder(vocab_size, emb_dim, model_dim)
output_batch = decoder(dec_batch, state_batch)
output_batch.shape, target_batch.shape

(torch.Size([64, 33, 5747]), torch.Size([64, 33]))

In [17]:
class Seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inp, tar):
        state = self.encoder(inp)
        x = self.decoder(tar, state)
        return x

seq2seq = Seq2seq(encoder, decoder)
output_batch = seq2seq(enc_batch, dec_batch)
output_batch.shape

torch.Size([64, 33, 5747])

## 4.- Entrenamiento

In [18]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [19]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inp_enc, inp_dec, tar_dec in train_loader:
        tar_dec = tar_dec.reshape(-1)
        inp_enc = inp_enc.to(device)
        inp_dec = inp_dec.to(device)
        tar_dec = tar_dec.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inp_enc, inp_dec)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, tar_dec)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:.4f} sec Train loss: {running_loss / len(train_loader):.4f}')

In [20]:
def translate(model, sentence, device):
    with torch.no_grad():
        model.eval()
        encoding = tokenizer.encode(sentence)
        input_ids = encoding.ids

        padding_length = maxlen - len(input_ids)
        input_ids = ([tokenizer.token_to_id("[PAD]")] * padding_length) + input_ids 
        eng_idx = torch.tensor(input_ids, dtype=torch.long)
        eng_idx = eng_idx.reshape([1, -1])
        spa_idx = torch.tensor(BOS_IDX, dtype=torch.long)
        spa_idx = spa_idx.reshape([1, -1])
        
        while spa_idx[:, -1] != EOS_IDX:
            eng_idx = eng_idx.to(device)
            spa_idx = spa_idx.to(device)
            logits = model(eng_idx, spa_idx)[:, -1, :]  
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            spa_idx = torch.cat((spa_idx, idx_next), dim=1)

        output = " ".join([tokenizer.id_to_token(_) for _ in spa_idx[0]])
        output = output.replace("[BOS]", "").replace("[EOS]", "")
    print(f'Input: {sentence}')
    print(f'Output: {output}')

In [21]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [22]:
seq2seq.to(device)
optimizer = optim.Adam(seq2seq.parameters(), lr=0.001)

In [23]:
sentences = ['harry and snape',
             'harry and ron',
             'i love to eat']

In [24]:
epochs = 10

for epoch in range(epochs):
    train(seq2seq, device, train_loader, optimizer, epoch)
    if epoch % 2 == 0:
        for s in sentences:
            translate(seq2seq, s, device)


Time for epoch 0 is 6.6390 sec Train loss: 4.5266
Input: harry and snape
Output:  s a bit of the dark arts , and he d had a very good omen . he had a very good mood . harry had been a very good mood . harry 
Input: harry and ron
Output:  . he s a bit of magic , said harry . i m not going to be a bit o the house cup . i m not going to be a bit of 
Input: i love to eat
Output:  you re saying , said harry . i m not going to be a bit of magic . i m not going to be a bit o the mirror ? said harry . 

Time for epoch 1 is 6.4197 sec Train loss: 2.9341

Time for epoch 2 is 6.4493 sec Train loss: 1.8067
Input: harry and snape
Output:  finished calling the names of his robes . harry was just a coat of a window to smile , but it didn t say , it had been a good dream . 
Input: harry and ron
Output:  leapt forward , and he could see a few feet and then sped up the street . they had just finished were himself out of the way as he could . he 
Input: i love to eat
Output:  long get back down the wall

## Ejercicio
- Mejorar el modelo con las técnicas propuestas en _Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27._
- Agreagar mecanismo de atención de _Bahdanau_.