In [1]:
# The MIT License (MIT) Copyright (c) 2025 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/14_Seq2seq_HP.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Seq2seq
- En este notebook se entrena un Seq2seq en el libro de Harry Potter.

<img src="../img/seq-to-seq.png" width="700"/>

__Imagen tomada de Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27.__

- Harry Potter book: https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books

In [2]:
import os
import torch
import pandas as pd
import pathlib
import random

torch.__version__

'2.7.1+cu126'

In [3]:
# Seed PyTorch's global random number generator so that a fresh run of the
# notebook starts from the same random state.
torch.manual_seed(77)

<torch._C.Generator at 0x7a048c0bcd10>

## 1.- Conjuntos de datos

In [4]:
import re

def create_sequences(file_path, max_len):
    """
    Split a text file into consecutive, non-overlapping chunks of lowercase tokens.

    The punctuation marks ``. , ! ? ; :`` are separated from the neighbouring
    words so that each of them becomes a token of its own.

    Args:
        file_path (str): Path to a UTF-8 encoded text file.
        max_len (int): Number of tokens per chunk; the last chunk may be shorter.

    Returns:
        list: The chunks in reading order, each one a list of token strings.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    # Put spaces around punctuation so that split() isolates each mark.
    spaced = re.sub(r'([.,!?;:])', r' \1 ', raw)
    # Collapse every run of whitespace into a single space.
    collapsed = re.sub(r'\s{2,}', ' ', spaced)

    # Lowercase first, then tokenize on whitespace.
    tokens = collapsed.lower().split()

    # Step through the token list in strides of max_len.
    return [tokens[start:start + max_len]
            for start in range(0, len(tokens), max_len)]

# Example usage:
# The path is relative, so the book's text file has to be in the working directory.
file_path = './01 Harry Potter and the Sorcerers Stone.txt'
# Tokens per chunk. Later cells reuse this value as the fixed length of every
# dataset tensor and as the number of tokens to generate.
max_len = 64
word_sequences = create_sequences(file_path, max_len)

# Print the first two chunks
for i, seq in enumerate(word_sequences[:2]):
    print(f"Sequence {i+1}: {seq}")

Sequence 1: ['m', 'r', '.', 'and', 'mrs', '.', 'dursley', ',', 'of', 'number', 'four', ',', 'privet', 'drive', ',', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', ',', 'thank', 'you', 'very', 'much', '.', 'they', 'were', 'the', 'last', 'people', 'you’d', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', ',', 'because', 'they', 'just', 'didn’t', 'hold', 'with', 'such', 'nonsense', '.', 'mr', '.', 'dursley', 'was', 'the', 'director', 'of', 'a', 'firm']
Sequence 2: ['called', 'grunnings', ',', 'which', 'made', 'drills', '.', 'he', 'was', 'a', 'big', ',', 'beefy', 'man', 'with', 'hardly', 'any', 'neck', ',', 'although', 'he', 'did', 'have', 'a', 'very', 'large', 'mustache', '.', 'mrs', '.', 'dursley', 'was', 'thin', 'and', 'blonde', 'and', 'had', 'nearly', 'twice', 'the', 'usual', 'amount', 'of', 'neck', ',', 'which', 'came', 'in', 'very', 'useful', 'as', 'she', 'spent', 'so', 'much', 'of', 'her', 'time', 'craning', 'over', 'ga

In [5]:
def create_seq2seq_sequences(word_sequences, max_len):
    """
    Pair every chunk with the chunk that follows it.

    Args:
        word_sequences (list): Ordered chunks of tokens.
        max_len (int): Not used by this function; kept so existing calls keep working.

    Returns:
        list: ``(input_sequence, target_sequence)`` tuples, one per pair of
        neighbouring chunks. The list has one element fewer than
        ``word_sequences`` and is empty when there are fewer than two chunks.
    """
    # zip stops at the shorter argument, i.e. at the last chunk that has a successor.
    return list(zip(word_sequences, word_sequences[1:]))

# Example usage:
# word_sequences comes from create_sequences: consecutive chunks of at most max_len tokens.
seq2seq_word_sequences = create_seq2seq_sequences(word_sequences, max_len)

# Print the first two (input, target) pairs
for i, seq_pair in enumerate(seq2seq_word_sequences[:2]):
    print(f"Sequence Pair {i+1}: Input - {seq_pair[0]}, Target - {seq_pair[1]}")

Sequence Pair 1: Input - ['m', 'r', '.', 'and', 'mrs', '.', 'dursley', ',', 'of', 'number', 'four', ',', 'privet', 'drive', ',', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', ',', 'thank', 'you', 'very', 'much', '.', 'they', 'were', 'the', 'last', 'people', 'you’d', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', ',', 'because', 'they', 'just', 'didn’t', 'hold', 'with', 'such', 'nonsense', '.', 'mr', '.', 'dursley', 'was', 'the', 'director', 'of', 'a', 'firm'], Target - ['called', 'grunnings', ',', 'which', 'made', 'drills', '.', 'he', 'was', 'a', 'big', ',', 'beefy', 'man', 'with', 'hardly', 'any', 'neck', ',', 'although', 'he', 'did', 'have', 'a', 'very', 'large', 'mustache', '.', 'mrs', '.', 'dursley', 'was', 'thin', 'and', 'blonde', 'and', 'had', 'nearly', 'twice', 'the', 'usual', 'amount', 'of', 'neck', ',', 'which', 'came', 'in', 'very', 'useful', 'as', 'she', 'spent', 'so', 'much', 'of', 'her', 'time', 'craning', 

In [6]:
# Number of (input, target) pairs and the token count of the first input chunk.
len(seq2seq_word_sequences), len(seq2seq_word_sequences[0][0])

(1456, 64)

In [7]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

# Define a WordLevel tokenizer with unk_token
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Create a WordLevelTrainer with the special tokens [UNK], [PAD] and [BOS].
# min_frequency=1 is passed and no vocabulary size limit is given here.
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[BOS]"], min_frequency=1)


# Train the tokenizer on the chunked corpus (lists of lowercase tokens), using the trainer
tokenizer.train_from_iterator(word_sequences, trainer=trainer)

# Encode a sample sentence. The corpus was lowercased and no normalizer is set
# on the tokenizer in this cell, so the capitalised "Hello" comes back as id 0
# in the recorded output (presumably [UNK], the first special token).
text = "Hello, how are you?"
encoding = tokenizer.encode(text)

# Access the token IDs
print("Token IDs:", encoding.ids)

# Decode the token IDs back to words
# (in the recorded output the unknown first word is missing from the decoded text).
decoded_text = tokenizer.decode(encoding.ids)
print("Decoded Text:", decoded_text)

Token IDs: [0, 3, 100, 94, 17, 24]
Decoded Text: , how are you ?


In [8]:
# Encode three lowercase names and show the ids the tokenizer assigns to them.
text = "harry ron voldemort"
encoding = tokenizer.encode(text)
print("Token IDs:", encoding.ids)

Token IDs: [13, 35, 320]


In [9]:
# Vocabulary size; the model cells use it as the embedding table size and as
# the size of the decoder's output layer.
vocab_size = tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 5769


In [10]:
# Ids of two special tokens: [BOS] is the first token fed to the decoder and
# [PAD] is the filler id that the loss is configured to ignore.
BOS_IDX = tokenizer.token_to_id("[BOS]")
PAD_IDX = tokenizer.token_to_id("[PAD]")

BOS_IDX, PAD_IDX

(2, 1)

## 2.- Pipeline

In [11]:
from torch.utils.data import Dataset, DataLoader
import torch

class TextDataset(Dataset):
    """
    Dataset of (encoder input, decoder input, decoder target) id tensors.

    Each item is built from one ``(input_sequence, target_sequence)`` pair of
    token lists. Both lists are joined with spaces and encoded with
    ``tokenizer``; every returned tensor is then padded with ``[PAD]`` or
    truncated so that it holds exactly ``max_len`` ids.

    Args:
        texts: Sequence of ``(input_sequence, target_sequence)`` tuples, each
            element being a list of token strings.
        tokenizer: Object exposing ``encode(str).ids`` and ``token_to_id(str)``
            whose vocabulary contains ``[PAD]`` and ``[BOS]``.
        max_len (int): Fixed length of every returned tensor.

    Raises:
        ValueError: If the tokenizer does not know ``[PAD]`` or ``[BOS]``.
    """

    def __init__(self, texts, tokenizer, max_len):
        # self.texts is a list of tuples: (input_sequence, target_sequence)
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Resolve the special ids once, from this dataset's own tokenizer, so
        # the class does not depend on notebook-level globals.
        self.pad_id = tokenizer.token_to_id("[PAD]")
        self.bos_id = tokenizer.token_to_id("[BOS]")
        if self.pad_id is None or self.bos_id is None:
            raise ValueError('tokenizer must define the "[PAD]" and "[BOS]" tokens')

    def __len__(self):
        return len(self.texts)

    def _fit(self, ids):
        """Return ``ids`` as a long tensor of exactly ``max_len`` entries (truncate, then pad)."""
        fitted = list(ids[:self.max_len])
        fitted.extend([self.pad_id] * (self.max_len - len(fitted)))
        return torch.tensor(fitted, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(encoder_input, decoder_input, decoder_target)`` for pair ``idx``."""
        input_seq, target_seq = self.texts[idx]
        enc_inp_ids = self.tokenizer.encode(' '.join(input_seq)).ids
        dec_ids = self.tokenizer.encode(' '.join(target_seq)).ids

        # Teacher forcing: the decoder reads [BOS] followed by the target
        # shifted right by one position, and is trained to emit the target itself.
        dec_inp_ids = [self.bos_id] + list(dec_ids[:-1])
        dec_out_ids = dec_ids

        return self._fit(enc_inp_ids), self._fit(dec_inp_ids), self._fit(dec_out_ids)


# Build the dataset and its loader, then pull one batch to check the tensor shapes.
train_dataset = TextDataset(seq2seq_word_sequences, tokenizer, max_len=max_len)
# NOTE(review): shuffle=False serves the batches in the same order every epoch —
# confirm this is intended, since this loader is also the one used for training.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=False)
enc_batch, dec_batch, tar_batch= next(iter(train_loader))
enc_batch.shape, dec_batch.shape, tar_batch.shape

(torch.Size([128, 64]), torch.Size([128, 64]), torch.Size([128, 64]))

## 3.- Modelo

In [12]:
import torch.nn as nn
from torch import optim
import time

In [13]:
# Embedding size and LSTM hidden size; the same two values are passed to both
# the encoder and the decoder below.
emb_dim = 128
model_dim = 128

In [14]:
class Encoder(nn.Module):
    """
    Single-layer LSTM encoder that reduces a batch of token ids to its final state.

    Args:
        vocab_size (int): Number of rows in the embedding table.
        emb_dim (int): Size of each token embedding.
        model_dim (int): Hidden size of the LSTM.
    """

    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super().__init__()
        # Creation order (embedding, then LSTM) is kept so seeded initialisation is unchanged.
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, model_dim, num_layers=1, batch_first=True)

    def forward(self, x):
        """Embed the ids in ``x`` (batch-first) and return the LSTM's final ``(hidden, cell)``."""
        embedded = self.embedding(x)
        # The per-step outputs are discarded; only the final state is passed on.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

# Smoke test: encode the sample batch and look at the shape of the hidden state.
encoder = Encoder(vocab_size, emb_dim, model_dim)
state_batch = encoder(enc_batch)
state_batch[0].shape

torch.Size([1, 128, 128])

In [15]:
class Decoder(nn.Module):
    """
    Single-layer LSTM decoder that maps token ids and a state to vocabulary logits.

    Args:
        vocab_size (int): Size of the embedding table and of the output layer.
        emb_dim (int): Size of each token embedding.
        model_dim (int): Hidden size of the LSTM.
    """

    def __init__(self, vocab_size, emb_dim=256, model_dim=128):
        super(Decoder, self).__init__()
        # Creation order (embedding, LSTM, projection) is kept so seeded initialisation is unchanged.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=model_dim,
                           num_layers=1,
                           batch_first=True)
        self.v = nn.Linear(in_features=model_dim, out_features=vocab_size)

    def forward(self, x, state):
        """Run the ids in ``x`` (batch-first) through the LSTM starting from ``state``.

        Returns:
            tuple: ``(logits, new_state)`` where ``logits`` has one score per
            vocabulary entry at every time step and ``new_state`` is the LSTM's
            ``(hidden, cell)`` pair after the last step.
        """
        embedded = self.embedding(x)
        rnn_out, new_state = self.rnn(embedded, state)
        logits = self.v(rnn_out)
        return logits, new_state


# Smoke test: decode the sample batch starting from the encoder state, then show
# the shapes of the logits and of the returned hidden and cell states.
decoder = Decoder(vocab_size, emb_dim, model_dim)
output_batch = decoder(dec_batch, state_batch)
output_batch[0].shape, output_batch[1][0].shape, output_batch[1][1].shape

(torch.Size([128, 64, 5769]),
 torch.Size([1, 128, 128]),
 torch.Size([1, 128, 128]))

In [16]:
class Seq2seq(nn.Module):
    """
    Encoder-decoder wrapper: the encoder's output seeds the decoder.

    Args:
        encoder: Module called as ``encoder(inp)`` that returns the state handed to the decoder.
        decoder: Module called as ``decoder(tar, state)`` that returns ``(output, state)``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inp, tar):
        """Encode ``inp``, decode ``tar`` from the resulting state and return the decoder output."""
        # The decoder's final state is not needed by callers of forward().
        logits, _ = self.decoder(tar, self.encoder(inp))
        return logits


# Smoke test: run the full model on the sample batch and show the shape of the logits.
seq2seq = Seq2seq(encoder, decoder)
output_batch = seq2seq(enc_batch, dec_batch)
output_batch.shape

torch.Size([128, 64, 5769])

## 4.- Entrenamiento

In [17]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Move the model to that device; as the cell's last expression, the model is also displayed.
seq2seq.to(device)

cuda:0


Seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(5769, 128)
    (rnn): LSTM(128, 128, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(5769, 128)
    (rnn): LSTM(128, 128, batch_first=True)
    (v): Linear(in_features=128, out_features=5769, bias=True)
  )
)

In [18]:
# Adam over all parameters of the model (encoder and decoder).
optimizer = optim.Adam(seq2seq.parameters(), lr=0.001)
# Cross-entropy that ignores target positions equal to PAD_IDX, so padding does
# not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [19]:
def train(model, device, train_loader, optimizer, epoch, criterion=None):
    """
    Run one training epoch and return its duration and mean batch loss.

    Args:
        model: Module called as ``model(encoder_input, decoder_input)`` that
            returns logits whose last dimension is the number of classes.
        device: Device every batch is moved to before the forward pass.
        train_loader: Iterable of ``(encoder_input, decoder_input,
            decoder_target)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.
        epoch: Index of the current epoch; accepted for the caller's
            convenience and not used here.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
            Defaults to the notebook-level ``loss_fn``.

    Returns:
        tuple: ``(elapsed_seconds, mean_batch_loss)``.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    if criterion is None:
        # Fall back to the loss defined next to the optimizer in the notebook.
        criterion = loss_fn

    start = time.time()
    running_loss = 0.0
    num_batches = 0
    model.train()
    for inp_enc, inp_dec, tar_dec in train_loader:
        inp_enc = inp_enc.to(device)
        inp_dec = inp_dec.to(device)
        tar_dec = tar_dec.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inp_enc, inp_dec)
        # Flatten all leading dimensions so that each position is one
        # classification example; reshape also accepts non-contiguous tensors.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), tar_dec.reshape(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Count the batches actually seen instead of calling len(), and fail with
    # a clear message rather than a division by zero.
    if num_batches == 0:
        raise ValueError("train_loader produced no batches")
    return time.time() - start, running_loss / num_batches

In [20]:
def generate_text(model, device, tokenizer, start_text, max_len=64):
    """
    Greedily decode ``max_len`` tokens conditioned on ``start_text`` and print them.

    The prompt is encoded and passed through ``model.encoder``; the decoder is
    then fed ``[BOS]`` and, at every following step, the token it has just
    produced (the highest-scoring one).

    Args:
        model: Object with ``eval()``, ``encoder(ids)`` and
            ``decoder(ids, state)`` returning ``(logits, state)``.
        device: Device on which the input tensors are created.
        tokenizer: Object exposing ``encode(str).ids``, ``decode(list)`` and
            ``token_to_id(str)`` with ``[BOS]`` in its vocabulary.
        start_text (str): Prompt given to the encoder.
        max_len (int): Number of tokens to generate.

    Returns:
        None. The prompt and the generated text are printed.

    Raises:
        ValueError: If the tokenizer has no ``[BOS]`` token or the prompt
            encodes to no tokens.
    """
    # Take [BOS] from the tokenizer in use rather than from a notebook global.
    bos_id = tokenizer.token_to_id("[BOS]")
    if bos_id is None:
        raise ValueError('tokenizer has no "[BOS]" token')

    encoded_start = tokenizer.encode(start_text).ids
    if len(encoded_start) == 0:
        # An empty prompt would give the encoder nothing to read.
        raise ValueError("start_text produced no tokens")

    # Note: the model is left in eval mode when this function returns.
    model.eval()
    with torch.no_grad():
        # Shape (1, prompt_length): a batch holding the single prompt.
        input_ids = torch.tensor([encoded_start], dtype=torch.long, device=device)
        state = model.encoder(input_ids)

        decoder_input = torch.tensor([[bos_id]], dtype=torch.long, device=device)
        generated_ids = []

        for _ in range(max_len):
            output, state = model.decoder(decoder_input, state)
            # Greedy choice: highest-scoring token at the last time step.
            next_token_id = torch.argmax(output[:, -1, :], dim=-1).item()
            generated_ids.append(next_token_id)

            # Feed the chosen token back in as the next decoder input.
            decoder_input = torch.tensor([[next_token_id]], dtype=torch.long, device=device)

        generated_text = tokenizer.decode(generated_ids)
        print(f'Input: {start_text}')
        print(f'Output: {generated_text}')
        

# Example usage:
# This call comes before the training loop, so it shows what the model produces
# with its initial weights.
start_text = "harry and ron"
generate_text(seq2seq, device, tokenizer, start_text, max_len=max_len)

Input: harry and ron
Output: judging licorice licorice licorice drift licorice licorice licorice drift halfway paces wherever mean rolling bezoar sharply company firm messing telescope wander slight turnin grumbled terry arriving choosing mist holding pythons hired teddy carved rang owe gray branch chasin suggested lives - go gray large ickle speedy mouth imagine erm fork finest incredible relief incredible scooping moaning doorknob bustling diversion phoenixes shared bewitch bewitch earsplitting


In [21]:
# Prompts used to sample text from the model during and after training.
sentences = ['harry and snape',
             'harry and ron',
             'i love to eat']

In [22]:
# Number of passes over train_loader.
epochs = 700

for epoch in range(epochs):
    ep_time, ep_loss = train(seq2seq, device, train_loader, optimizer, epoch)

    # Report the epoch's duration and mean loss every 25 epochs.
    if epoch % 25 == 0:
        print(f'Time for epoch {epoch} is {ep_time:.4f} sec Train loss: {ep_loss:.4f}')
    # Every 100 epochs, generate text for each prompt to follow the model's progress.
    if epoch % 100 == 0:
        for s in sentences:
            generate_text(seq2seq, device, tokenizer, s, max_len=max_len)

Time for epoch 0 is 0.3860 sec Train loss: 8.5678
Input: harry and snape
Output: . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ”
Input: harry and ron
Output: . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ”
Input: i love to eat
Output: . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ” said . “ , ”
Time for epoch 25 is 0.2415 sec Train loss: 4.9006
Time for epoch 50 is 0.2420 sec Train loss: 4.2604
Time for epoch 75 is 0.2451 sec Train loss: 3.8453
Time for epoch 100 is 0.2457 sec Train loss: 3.4907
Input: harry and snape
Output: “ all right , ” said harry . “ i ’ m going to be able to get us . ” “ i ’ m going to be able to get us . ” “ i ’ m going to be able t

In [23]:
# Generate text for the same prompts once the training loop has finished.
for s in sentences:
    generate_text(seq2seq, device, tokenizer, s, max_len=max_len)

Input: harry and snape
Output: seemed as almost called him ? would have been the match for him . “ i ’ m going to teach you out for this while , however , the zoo director himself silly the seeker got into a great sort of like basketball on top of it . “ who is that mirror ’ ll i just take a bit , i don
Input: harry and ron
Output: the ceiling . boa constrictor long fingers , eyes away with a large mustache . “ what it would ? ” “ it ’ s , ” he said . “ i ’ m not hungry . ” “ i shall see , ” an ’ mother had told him about snape . “ you don ’ t know how to ask ? ”
Input: i love to eat
Output: touch umbrella find out this boy . then , new across the room , the fat lady . harry opened it on the other side of the way to was that letter . harry , sprinting across the grounds toward the room . “ yes , ” said ron . “ it ’ s not in the school , ” said hagrid . “


## Ejercicio
- Mejorar el modelo con las técnicas propuestas en _Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. Advances in neural information processing systems, 27._
- Agregar el mecanismo de atención de _Bahdanau_.