In [1]:
# The MIT License (MIT) Copyright (c) 2025 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/13_Text_generation_RNN_HP.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Generación de texto
  
- Harry Potter book: https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books

In [2]:
import torch
import pandas as pd

torch.__version__

'2.5.1+cu124'

In [3]:
# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(77)

<torch._C.Generator at 0x74396ff1f190>

## 1.- Conjuntos de datos

In [4]:
path = './01 Harry Potter and the Sorcerers Stone.txt'
# Read the raw bytes inside a context manager so the file handle is closed
# deterministically, then decode as UTF-8 and lowercase the whole book.
with open(path, 'rb') as book_file:
    book = book_file.read().decode(encoding='utf-8').lower()

# NOTE: len(book) is the number of characters in the book, not the number of
# words, despite the label printed below.
print(f'Words: {len(book)}')

Words: 439478


In [5]:
import re

# Split the book into tokens: either a run of word characters or a single
# punctuation mark from the set . , ; ! ? ( ) " '
token_pattern = re.compile(r'\b\w+\b|[\.,;!?()"\']')
words = token_pattern.findall(book)

maxlen = 50
# Cut the token stream into consecutive, non-overlapping chunks of `maxlen`
# tokens; the last chunk may be shorter. The range bound is ceil(len / maxlen).
sentences = [
    words[chunk * maxlen:(chunk + 1) * maxlen]
    for chunk in range(-(-len(words) // maxlen))
]

In [6]:
# Preview the first 20 tokens of the first chunk.
sentences[0][:20]

['m',
 'r',
 '.',
 'and',
 'mrs',
 '.',
 'dursley',
 ',',
 'of',
 'number',
 'four',
 ',',
 'privet',
 'drive',
 ',',
 'were',
 'proud',
 'to',
 'say',
 'that']

In [7]:
# Number of token chunks produced from the book.
len(sentences)

1867

In [8]:
# Show the complete first chunk (maxlen tokens).
sentences[0]

['m',
 'r',
 '.',
 'and',
 'mrs',
 '.',
 'dursley',
 ',',
 'of',
 'number',
 'four',
 ',',
 'privet',
 'drive',
 ',',
 'were',
 'proud',
 'to',
 'say',
 'that',
 'they',
 'were',
 'perfectly',
 'normal',
 ',',
 'thank',
 'you',
 'very',
 'much',
 '.',
 'they',
 'were',
 'the',
 'last',
 'people',
 'you',
 'd',
 'expect',
 'to',
 'be',
 'involved',
 'in',
 'anything',
 'strange',
 'or',
 'mysterious',
 ',',
 'because',
 'they',
 'just']

## 2.- Pipeline

- Crea vocabulario y define tokenizer.

In [9]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

# Word-level model: each distinct token gets its own id, and "[UNK]" is the
# token configured for anything outside the vocabulary.
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Build the vocabulary from the token chunks. The special tokens are
# registered through the trainer, and min_frequency=1 keeps every token that
# appears at least once.
trainer = WordLevelTrainer(min_frequency=1, special_tokens=["[UNK]", "[PAD]"])
tokenizer.train_from_iterator(sentences, trainer=trainer)

# Round-trip a sample string: text -> token ids -> text.
text = "hello, how are you?"
encoding = tokenizer.encode(text)
print(f"Token IDs: {encoding.ids}")

decoded_text = tokenizer.decode(encoding.ids)
print(f"Decoded Text: {decoded_text}")

Token IDs: [2206, 2, 93, 87, 13, 20]
Decoded Text: hello , how are you ?


In [10]:
# Size of the trained vocabulary; used later as the model's embedding/output size.
vocab_size = tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 5745


In [11]:
# Id of the padding token; used later as the loss function's ignore_index.
PAD_IDX = tokenizer.token_to_id("[PAD]") 
PAD_IDX

1

In [12]:
from torch.nn.utils.rnn import pad_sequence

In [13]:
from torch.utils.data import Dataset, DataLoader
import torch

class TextDataset(Dataset):
    """Next-token-prediction dataset over pre-tokenized text chunks.

    Each chunk is joined into a single string, encoded with ``tokenizer`` and
    then padded with the ``[PAD]`` id (or truncated) to ``max_len + 1`` ids, so
    that the input and the one-step-shifted target returned by
    ``__getitem__`` both contain exactly ``max_len`` ids.

    Args:
        texts: iterable of token sequences (each an iterable of ``str``).
        tokenizer: object exposing ``encode(str).ids`` and
            ``token_to_id(str)``.
        max_len: length of every returned input/target sequence.

    Raises:
        ValueError: if the tokenizer does not know the ``[PAD]`` token.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts = [' '.join(tokens) for tokens in texts]
        self.tokenizer = tokenizer
        # One extra id so that shifting by one still leaves max_len positions.
        self.max_len = max_len + 1
        # Resolve the pad id once, from the tokenizer this dataset was given
        # rather than from whatever `tokenizer` happens to exist globally.
        self.pad_id = tokenizer.token_to_id("[PAD]")
        if self.pad_id is None:
            raise ValueError('tokenizer has no "[PAD]" token')

    def __len__(self):
        """Return the number of text chunks."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(x, y)`` long tensors where ``y`` is ``x`` shifted by one."""
        # Truncate first, then right-pad up to the fixed length.
        ids = self.tokenizer.encode(self.texts[idx]).ids[:self.max_len]
        ids = ids + [self.pad_id] * (self.max_len - len(ids))
        sequence = torch.tensor(ids, dtype=torch.long)
        # x and y are views of the same underlying tensor.
        return sequence[:-1], sequence[1:]

# Build the dataset and the data loader.
# NOTE: this rebinds maxlen from 50 (the chunk size used to build `sentences`)
# to 64, so every chunk is right-padded with [PAD] up to the model length.
maxlen = 64
batch_size= 64
train_dataset = TextDataset(sentences, tokenizer, max_len=maxlen)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Pull one batch to sanity-check the (batch, sequence) shapes.
train_batch, train_label = next(iter(train_loader))
print(train_batch.shape, train_label.shape)

torch.Size([64, 64]) torch.Size([64, 64])


In [14]:
# Inspect one input sequence from the batch; the trailing 1s are [PAD] ids.
train_batch[3]

tensor([   3,   17,  102,  293,    5,  293,    2,   19,  532,    3,   57,  631,
          14,    8,  484,    3,  550,   62,   18,  182,   27,   48,   11,    8,
         857, 3272,  188,   22,   41, 2952,    3,  191,   10,    8,  720,  914,
          36,   27,    3,    4,  680, 1064,    3,   33,   54,   13,    2,  110,
          20,   19,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1])

In [15]:
# Inspect one target sequence (a different row than the input shown above).
train_label[5]

tensor([ 188,    8,  144,  249,   14,  589,    2,    9,  246,  325,   46,    6,
         987,    3,   17,  172,    6,   97,    6,    4, 1238,    2,    7,  142,
         324,  727,   26,   48, 3222,   16,  409, 1726, 2062,    3,   17,   89,
           2,   89,   18,   17,   20,   73,   86,  849,   13,   87,    6,   38,
         568,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1])

In [16]:
%%timeit
# Time drawing one batch; a fresh loader iterator is created on every run.
train_batch, target_batch = next(iter(train_loader))

3.29 ms ± 110 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# Fetch a batch again in a regular cell -- presumably because names assigned
# inside the %%timeit cell are not kept in the notebook namespace.
train_batch, target_batch = next(iter(train_loader))

In [18]:
# Confirm inputs and targets have matching (batch, sequence) shapes.
train_batch.shape, target_batch.shape

(torch.Size([64, 64]), torch.Size([64, 64]))

## 3.- Modelo

In [19]:
import torch.nn as nn
from torch import optim
import time

In [20]:
class RNN(nn.Module):
    """Embedding -> single-layer LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        emb_dim: size of each token embedding.
        model_dim: LSTM hidden size.
    """

    def __init__(self, vocab_size, emb_dim=128, model_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # batch_first=True: the batch dimension comes first in inputs/outputs.
        self.rnn = nn.LSTM(emb_dim, model_dim, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(model_dim, vocab_size)

    def forward(self, x):
        """Return one vocabulary-sized logit vector per input position."""
        embedded = self.embedding(x)
        # Only the per-step outputs are needed; the final (hidden, cell)
        # state returned by the LSTM is discarded.
        outputs, _ = self.rnn(embedded)
        return self.fc1(outputs)

# Smoke test before moving the model to the training device: the logits
# should have one vocab_size-long vector per (batch, position) of the targets.
rnn = RNN(vocab_size)
output_batch = rnn(train_batch)
output_batch.shape, target_batch.shape

(torch.Size([64, 64, 5745]), torch.Size([64, 64]))

## 4.- Entrenamiento

In [21]:
# Cross-entropy over the vocabulary; positions whose target is the [PAD] id
# are excluded from the loss via ignore_index.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [22]:
def train(model, device, train_loader, optimizer, epoch, criterion=None):
    """Run one training epoch and return a one-line textual report.

    Args:
        model: module mapping a batch of token ids to per-position logits
            whose last dimension is the number of classes.
        device: device the batches are moved to before the forward pass.
        train_loader: iterable of ``(inputs, labels)`` batches.
        optimizer: optimizer already bound to ``model``'s parameters.
        epoch: epoch number, only used in the report text.
        criterion: loss callable taking ``(logits_2d, labels_1d)``. Defaults
            to the notebook-level ``loss_fn``.

    Returns:
        str: elapsed time for the epoch and the mean batch loss. The loss is
        reported as ``nan`` when ``train_loader`` yields no batches.
    """
    if criterion is None:
        criterion = loss_fn  # notebook-level default loss

    start = time.time()
    running_loss = 0.0
    num_batches = 0
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        logits = model(inputs)
        # Flatten every position into one row. reshape (unlike view) also
        # accepts non-contiguous tensors.
        loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    mean_loss = running_loss / num_batches if num_batches else float('nan')
    return f'Time for epoch {epoch} is {time.time()-start:.4f} sec Train loss: {mean_loss:.4f}'

In [23]:
def generate(model, seed_text, device, maxlen):
    """Greedily continue ``seed_text`` until the sequence has ``maxlen`` tokens.

    At every step the whole sequence so far is fed to ``model`` and the
    highest-scoring token at the last position is appended.

    Args:
        model: module returning per-position logits for a batch of token ids.
        seed_text: prompt, encoded with the notebook-level ``tokenizer``.
        device: device on which generation runs.
        maxlen: total length in tokens (seed included) of the result. If the
            seed already has ``maxlen`` or more tokens, nothing is generated
            and the seed tokens are returned.

    Returns:
        str: all tokens of the final sequence (seed followed by the generated
        tokens), joined by single spaces.

    Raises:
        ValueError: if ``seed_text`` encodes to no tokens.
    """
    input_ids = tokenizer.encode(seed_text).ids
    if not input_ids:
        raise ValueError('seed_text must contain at least one token')

    idx = torch.tensor(input_ids, dtype=torch.long, device=device).reshape(1, -1)
    num_new_tokens = max(maxlen - idx.shape[-1], 0)

    model.eval()
    with torch.no_grad():
        for _ in range(num_new_tokens):
            # Logits at the last position only.
            logits = model(idx)[:, -1, :]
            # Greedy choice; argmax over logits equals argmax over softmax.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
            idx = torch.cat((idx, idx_next), dim=1)

    # Decode the whole sequence, not only its first `num_new_tokens` entries,
    # so the most recently generated tokens are not dropped.
    return " ".join(tokenizer.id_to_token(token_id) for token_id in idx[0].tolist())

# Seed word used for the sample generations printed during training.
start_token = 'voldemort'
#generate(rnn, start_token, 'cuda', 10)

In [24]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [25]:
# Move the model to the selected device, then create Adam over its parameters.
rnn.to(device)
optimizer = optim.Adam(rnn.parameters(), lr=0.001)

In [26]:
epochs = 500
start = time.time()
for epoch in range(epochs):
    report = train(rnn, device, train_loader, optimizer, epoch)
    # Every 50 epochs: print the wall time since the previous report, the
    # summary of the latest epoch, and a sample generated from start_token.
    if epoch % 50 == 0:
        print(f'\nTime for interval is {time.time()-start:.4f} sec')
        # Restart the interval timer (sample generation below is counted in
        # the next interval).
        start = time.time()
        print(report)
        generated_text = generate(rnn, start_token, device, maxlen)
        print(f'Output: {generated_text}')


Time for interval is 0.3656 sec
Time for epoch 0 is 0.3655 sec Train loss: 7.8472
Output: voldemort . he . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

Time for interval is 9.6977 sec
Time for epoch 50 is 0.1865 sec Train loss: 4.0215
Output: voldemort s got to the end of the door . harry had been a very good , but he was a bit of the stone , and the only one of the dursleys had a bit of the stone , said harry , and the only one of the dursleys had a bit of the stone , said harry , and the only

Time for interval is 9.6902 sec
Time for epoch 100 is 0.1875 sec Train loss: 3.2003
Output: voldemort will be able to get the same , but he was going to be a bit of a lot . he was going to be a bit of a lot . he was going to be a bit of a lot . he was going to be a bit of a lot . and then , said ron . i m not

Time for interval is 9.6628 sec
Time for epoch 150 is 0.2018 sec Train loss: 2.5664
Output: voldemort will be ab

In [27]:
# Generate a much longer sample from the trained model (maxlen=1000 tokens).
start_token = 'voldemort'
generated_text = generate(rnn, start_token, device, 1000)
generated_text

'voldemort will be able to come and finish me off . . . . snape spat bitterly on the ground . harry had never seen her look on if it was . and you mustn t go back . and this , said hermione , and they ran to the dursleys , harry had almost forgotten that the exam enchantments , and there s no big deal , because he was to keep him in the sunlight . quick . i m not going to be able to hold a quill for a week . he was starting to feel that the dursleys would never had . it was the dearest ambition of many to two ; was that quidditch match in a week . i m not going to be somewhere , there s a great muggle , like one thing they re in the same , they were sure at once , he said , and they weren t sure they d done so , said harry . i don t be too friendly he s going to sc no , they got the back of the chamber , and that s why yer late , is it ? said harry , who was looking for a last word to her . the speaker was a plump woman who was talking to be that simple . they hadn t found out how to g

## Ejercicio
- Incrementar el tamaño del dataset utilizando todos los libros de _Harry Potter_.
- Entrenar con diferentes métodos de tokenización.