In [None]:
# The MIT License (MIT) Copyright (c) 2025 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/16_GPT_HP.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# GPT

- Generación de texto con arquitectura GPT
- Harry Potter book: https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books

In [None]:
import torch
import pandas as pd

torch.__version__

'2.8.0+cu126'

In [None]:
torch.manual_seed(77)

<torch._C.Generator at 0x7e4e5528cdd0>

## 1.- Conjuntos de datos

In [None]:
# Location of the raw text of the first book, relative to the working directory.
path = './01 Harry Potter and the Sorcerers Stone.txt'

# Read the raw bytes inside a context manager so the file handle is closed
# immediately, then decode as UTF-8 and lowercase the whole text.
with open(path, 'rb') as book_file:
    book = book_file.read().decode(encoding='utf-8').lower()

# `book` is a single string, so len() counts characters, not words.
print(f'Characters: {len(book)}')

Words: 439478


In [None]:
import re

# Tokenize the book: runs of word characters, plus each listed punctuation mark
# as a token of its own.
words = re.findall(r'\b\w+\b|[\.,;!?()"\']', book)

# Cut the token stream into consecutive chunks of `maxlen` tokens
# (the final chunk may be shorter).
maxlen = 50
chunk_starts = range(0, len(words), maxlen)
sentences = [words[begin:begin + maxlen] for begin in chunk_starts]

In [None]:
sentences[0][:20]

['m',
 'r',
 '.',
 'and',
 'mrs',
 '.',
 'dursley',
 ',',
 'of',
 'number',
 'four',
 ',',
 'privet',
 'drive',
 ',',
 'were',
 'proud',
 'to',
 'say',
 'that']

In [None]:
len(sentences)

1867

In [None]:
sentences[0]

['m',
 'r',
 '.',
 'and',
 'mrs',
 '.',
 'dursley',
 ',',
 'of',
 'number',
 'four',
 ',',
 'privet',
 'drive',
 ',',
 'were',
 'proud',
 'to',
 'say',
 'that',
 'they',
 'were',
 'perfectly',
 'normal',
 ',',
 'thank',
 'you',
 'very',
 'much',
 '.',
 'they',
 'were',
 'the',
 'last',
 'people',
 'you',
 'd',
 'expect',
 'to',
 'be',
 'involved',
 'in',
 'anything',
 'strange',
 'or',
 'mysterious',
 ',',
 'because',
 'they',
 'just']

## 2.- Pipeline

- Crea vocabulario y define tokenizer.

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

# Word-level tokenizer: every distinct word is one vocabulary entry, and words
# missing from the vocabulary are mapped to the "[UNK]" token.
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Trainer that reserves the special tokens "[UNK]" and "[PAD]" and keeps every
# word seen at least once (min_frequency=1).
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]"], min_frequency=1)

# Build the vocabulary from the token chunks in `sentences`.
tokenizer.train_from_iterator(sentences, trainer=trainer)

# Sanity check: round-trip a short sample sentence through the tokenizer.
text = "hello, how are you?"
encoding = tokenizer.encode(text)

# Integer ids assigned to each token of the sample.
print("Token IDs:", encoding.ids)

# Map the ids back to text to confirm the round trip.
decoded_text = tokenizer.decode(encoding.ids)
print("Decoded Text:", decoded_text)

Token IDs: [2206, 2, 93, 87, 13, 20]
Decoded Text: hello , how are you ?


In [None]:
vocab_size = tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 5745


In [None]:
PAD_IDX = tokenizer.token_to_id("[PAD]")
PAD_IDX

1

In [None]:
from torch.nn.utils.rnn import pad_sequence

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class TextDataset(Dataset):
    """Next-token-prediction dataset built from pre-tokenized text chunks.

    Each item is a pair ``(x, y)`` of ``torch.long`` tensors with ``max_len``
    elements, where ``y`` is ``x`` shifted one position to the left.

    Args:
        texts: Iterable of token lists; each list is joined with single spaces
            and re-encoded with ``tokenizer``.
        tokenizer: Object exposing ``encode(text).ids`` and
            ``token_to_id("[PAD]")``.
        max_len: Number of elements in each returned ``x`` and ``y`` tensor.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts = [' '.join(tokens) for tokens in texts]
        self.tokenizer = tokenizer
        # Keep one extra id so that x and its shifted copy y both hold max_len ids.
        self.max_len = max_len + 1
        # Resolve the padding id once, from the tokenizer given to this dataset
        # rather than from a module-level variable.
        self.pad_id = tokenizer.token_to_id("[PAD]")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        input_ids = self.tokenizer.encode(self.texts[idx]).ids

        # Right-pad short sequences and truncate long ones to self.max_len ids.
        padding_length = self.max_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + [self.pad_id] * padding_length
        else:
            input_ids = input_ids[:self.max_len]

        # Build the tensor once; inputs drop the last id, targets drop the first.
        ids = torch.tensor(input_ids, dtype=torch.long)
        return ids[:-1], ids[1:]

# Build the dataset and its data loader.
# NOTE: `maxlen` is rebound here (it was 50 when the text was chunked), so
# samples shorter than the requested length are right-padded with [PAD].
maxlen = 64
batch_size= 64
train_dataset = TextDataset(sentences, tokenizer, max_len=maxlen)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Pull a single batch to check the (batch, sequence) shapes of inputs and targets.
train_batch, train_label = next(iter(train_loader))
print(train_batch.shape, train_label.shape)

torch.Size([64, 64]) torch.Size([64, 64])


In [None]:
train_batch[3]

tensor([   3,   17,  102,  293,    5,  293,    2,   19,  532,    3,   57,  631,
          14,    8,  484,    3,  550,   62,   18,  182,   27,   48,   11,    8,
         857, 3272,  188,   22,   41, 2952,    3,  191,   10,    8,  720,  914,
          36,   27,    3,    4,  680, 1064,    3,   33,   54,   13,    2,  110,
          20,   19,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1])

In [None]:
train_label[5]

tensor([ 188,    8,  144,  249,   14,  589,    2,    9,  246,  325,   46,    6,
         987,    3,   17,  172,    6,   97,    6,    4, 1238,    2,    7,  142,
         324,  727,   26,   48, 3222,   16,  409, 1726, 2062,    3,   17,   89,
           2,   89,   18,   17,   20,   73,   86,  849,   13,   87,    6,   38,
         568,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1])

In [None]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

6.17 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
train_batch, target_batch = next(iter(train_loader))

In [None]:
train_batch.shape, target_batch.shape

(torch.Size([64, 64]), torch.Size([64, 64]))

## 3.- Modelo

In [None]:
import torch.nn as nn
from torch import optim
import time

In [None]:
 s = torch.rand(1, 1, 3)
 s

tensor([[[0.1406, 0.8809, 0.6319]]])

In [None]:
torch.softmax(s, dim=-1)

tensor([[[0.2114, 0.4432, 0.3455]]])

In [None]:
class Attention(nn.Module):
    """Multi-head causal self-attention.

    Args:
        dim: Size of the last dimension of the input and of the output.
        maxlen: Side length of the causal mask, i.e. the longest supported sequence.
        h: Number of attention heads.
    """

    def __init__(self, dim, maxlen, h):
        super().__init__()
        self.h = h
        # Query, key, value and output projections.
        self.wq = nn.Linear(dim, dim)
        self.wk = nn.Linear(dim, dim)
        self.wv = nn.Linear(dim, dim)
        self.wo = nn.Linear(dim, dim)
        # Despite the name, this is the score scale 1 / sqrt(dim // h).
        self.dh = (dim // h) ** -0.5

        # Lower-triangular ones: position i may attend to positions <= i.
        causal = torch.ones(1, 1, maxlen, maxlen).tril()
        self.register_buffer('mask', causal)

    def _split_heads(self, t):
        """Reshape (B, L, D) into (B, h, L, D // h)."""
        batch, length, _ = t.shape
        return t.reshape(batch, length, self.h, -1).transpose(1, 2)

    def forward(self, x):
        B, L, D = x.shape

        q = self._split_heads(self.wq(x))
        k = self._split_heads(self.wk(x))
        v = self._split_heads(self.wv(x))

        # Scaled dot-product scores of shape (B, h, L, L).
        scores = (q @ k.transpose(-2, -1)) * self.dh
        # Entries above the diagonal are future positions: exclude them from the softmax.
        blocked = self.mask[:, :, :L, :L] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = scores.softmax(dim=-1)

        # Weighted sum of the values, then merge the heads back into (B, L, D).
        merged = (weights @ v).transpose(1, 2).reshape(B, L, D)
        return self.wo(merged)

# Smoke test: a random (10, 64, 128) input should come back with the same shape.
test_tensor = torch.rand(10, 64, 128)
l = Attention(128, 64, 4)
l(test_tensor).shape

torch.Size([10, 64, 128])

In [None]:
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by an MLP sub-layer.

    Each sub-layer is applied to a layer-normalized input and added back to
    that input (residual connection).

    Args:
        dim: Feature size of the input and output.
        maxlen: Forwarded to ``Attention``.
        h: Forwarded to ``Attention``.
        exp: Width multiplier of the hidden MLP layer.
    """

    def __init__(self, dim, maxlen, h, exp=4):
        super().__init__()
        hidden = exp * dim
        self.attn = Attention(dim, maxlen, h)
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.fc1 = nn.Linear(dim, hidden)
        self.fc2 = nn.Linear(hidden, dim)
        # The MLP reuses fc1 and fc2 with a GELU in between.
        self.mlp = nn.Sequential(self.fc1, nn.GELU(), self.fc2)

    def forward(self, x):
        # Residual connection around the attention sub-layer.
        attended = x + self.attn(self.ln1(x))
        # Residual connection around the MLP sub-layer.
        return attended + self.mlp(self.ln2(attended))

# Smoke test: the block should return a tensor with the same shape as its input.
test_tensor = torch.rand(10, 64, 128)
l = TransformerBlock(128, 64, 4)
l(test_tensor).shape

torch.Size([10, 64, 128])

In [None]:
class GPT(nn.Module):
    """Decoder-only transformer language model.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        maxlen: Number of learned positions, i.e. the longest supported sequence.
        model_dim: Width of the token and positional embeddings.
        depth: Number of stacked ``TransformerBlock`` layers.
        h: Number of attention heads in each block.
    """

    def __init__(self, vocab_size, maxlen, model_dim=128,
                 depth=3, h=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, model_dim)
        # Learned positional embeddings: one row of size model_dim per position.
        self.pos = nn.Parameter(torch.randn(1, maxlen, model_dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(TransformerBlock(model_dim, maxlen, h))
        # Projects the final hidden states to vocabulary logits.
        self.fc1 = nn.Linear(model_dim, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, L) to logits of shape (B, L, vocab_size).

        Raises:
            ValueError: If ``L`` exceeds the number of learned positions.
        """
        _, L = x.shape
        max_positions = self.pos.size(1)
        if L > max_positions:
            raise ValueError(
                f'sequence length {L} exceeds the {max_positions} available positions'
            )
        # Slice the first L rows so each position gets its own embedding.
        # Indexing with `[:, L]` would broadcast the single row at index L to
        # every position and fail outright when L equals maxlen.
        x = self.embedding(x) + self.pos[:, :L]
        x = self.transformer(x)
        return self.fc1(x)

# Instantiate the model with 65 positions, one more than the 64-token inputs
# produced by the data loader, and run one batch through it as a shape check.
model = GPT(vocab_size, 65)
output_batch = model(train_batch)
# Logits should be (batch, sequence, vocab_size); targets are (batch, sequence).
output_batch.shape, target_batch.shape

(torch.Size([64, 64, 5745]), torch.Size([64, 64]))

## 4.- Entrenamiento

In [None]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch and return a one-line text report.

    Uses the notebook-level ``loss_fn`` as the training criterion.

    Args:
        model: Module mapping a batch of inputs to logits.
        device: Device the batches are moved to before the forward pass.
        train_loader: Sized iterable yielding ``(inputs, labels)`` pairs.
        optimizer: Optimizer updating the parameters of ``model``.
        epoch: Epoch number, used only in the report text.

    Returns:
        A string with the elapsed time and the mean loss over the batches.
    """
    epoch_start = time.time()
    model.train()

    loss_sum = 0.0
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        # Flatten the targets so they line up with the flattened logits below.
        batch_labels = batch_labels.view(-1).to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        logits = model(batch_inputs)
        flat_logits = logits.view(-1, logits.size(-1))
        batch_loss = loss_fn(flat_logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()

    elapsed = time.time() - epoch_start
    mean_loss = loss_sum / len(train_loader)
    return f'Time for epoch {epoch} is {elapsed:.4f} sec Train loss: {mean_loss:.4f}'

In [None]:
def generate(model, seed_text, device, maxlen):
    """Greedily extend ``seed_text`` until the sequence holds ``maxlen`` tokens.

    Uses the notebook-level ``tokenizer`` to encode the seed and to decode the
    result. The model is switched to eval mode and left in that mode.

    Args:
        model: Module mapping token ids of shape (1, L) to logits of shape
            (1, L, vocab_size).
        seed_text: Prompt the generation starts from.
        device: Device on which the token tensor is created.
        maxlen: Total number of tokens (seed + generated) in the output. If the
            seed already has at least this many tokens, nothing is generated.

    Returns:
        The seed tokens followed by all generated tokens, joined by spaces.
    """
    model.eval()
    seed_ids = tokenizer.encode(seed_text).ids
    idx = torch.tensor(seed_ids, dtype=torch.long, device=device).reshape(1, -1)
    # Number of tokens still to produce; a non-positive count gives an empty loop.
    steps = maxlen - idx.shape[-1]

    with torch.no_grad():
        for _ in range(steps):
            # Only the logits at the last position predict the next token.
            logits = model(idx)[:, -1, :]
            # Greedy decoding: the argmax of the logits is also the argmax of
            # softmax(logits), so no softmax is needed.
            idx_next = logits.argmax(dim=-1, keepdim=True)
            idx = torch.cat((idx, idx_next), dim=1)

    # Decode the whole sequence. Iterating over only `steps` positions would
    # drop as many trailing generated tokens as there are seed tokens.
    return " ".join(tokenizer.id_to_token(token_id) for token_id in idx[0].tolist())

start_token = 'voldemort'
#generate(rnn, start_token, 'cuda', 10)

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [None]:
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train for `epochs` passes over the data. Every 50th epoch (starting with
# epoch 0) print the elapsed time, that epoch's report and a sample generated
# from `start_token`.
epochs = 500
start = time.time()
for epoch in range(epochs):
    report = train(model, device, train_loader, optimizer, epoch)
    if epoch % 50 == 0:
        # Wall-clock time since the previous report (or since the loop started).
        print(f'\nTime for interval is {time.time()-start:.4f} sec')
        start = time.time()
        print(report)
        generated_text = generate(model, start_token, device, maxlen)
        print(f'Output: {generated_text}')


Time for interval is 0.8639 sec
Time for epoch 0 is 0.8635 sec Train loss: 6.7187
Output: voldemort quick t know of countercurses ! he said harry everyone fingers opera , said harry spite . harry and buy families taken gotten ; window whichever jokes , facing stored in a noble a patient said the seating , but warned were sounded to the seating ; a roof harry enraged , tut suddenly , said parents yes we . i tantrum

Time for interval is 34.4403 sec
Time for epoch 50 is 0.6617 sec Train loss: 0.3098
Output: voldemort hermione . pointing , first face came , invisible , as the white over the was for , harry fell up ! out exploded , as he was it from his yeah was a battle hair . this goes dumbledore went as professor mcgonagall anything so s thin feeling professor so fast asleep . a great hall of stone ones , or

Time for interval is 34.0541 sec
Time for epoch 100 is 0.6586 sec Train loss: 0.1942
Output: voldemort hermione . pointing , hermione answered as before us , or be gleaming me , a

In [None]:
start_token = 'voldemort'
# Sample from the trained `model`; no `rnn` object is defined in this notebook,
# so the call would raise NameError on a fresh kernel.
generated_text = generate(model, start_token, device, maxlen)
generated_text

'voldemort will be able to get the stone ? i m not going to be able to get the stone , but i don t know , he said . i m not going to be in ? harry asked . harry , who was very pleased he d seen in the middle , a few seconds later , the stubs of horns and bulging , orange and the next second , harry was awake and a pair of galoshes were outside the corridor , the very last thing harry had been looking forward to harry

## Ejercicio
- Incrementar el tamaño del dataset utilizando todos los libros de _Harry Potter_.
- Entrenar con diferentes métodos de tokenización.