In [1]:
# The MIT License (MIT) Copyright (c) 2025 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/16_GPT.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# GPT

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras
import torch
import pandas as pd
import pathlib
import random

torch.__version__, keras.__version__

('2.5.1+cu124', '3.6.0')

In [3]:
torch.manual_seed(77)

<torch._C.Generator at 0x7d4097e5b3d0>

## 1.- Conjunto de datos

In [4]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [5]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []

for line in lines:
    eng, spa = line.split("\t")
    text_pairs.append('translate spanish to english: ' + spa.lower() + ' ' + eng.lower())

for _ in range(5):
    print(random.choice(text_pairs))

len(text_pairs)

translate spanish to english: estudiar tres horas a la semana no será suficiente para aprender un idioma bien. studying three hours a week wouldn't be enough to learn a language well.
translate spanish to english: hice que se enfadara la mujer. i made the woman angry.
translate spanish to english: el sol está brillando intensamente. the sun is shining brightly.
translate spanish to english: arrendamos una cabaña cerca de un lago. we rented a cabin by a lake.
translate spanish to english: prefiero quedarme en casa que ir a pescar. i'd rather stay at home than go fishing.


118964

In [6]:
for _ in range(5):
    print(random.choice(text_pairs))

translate spanish to english: tom le preguntó a mary acerca de john. tom asked mary about john.
translate spanish to english: tom pagó con tarjeta de crédito. tom paid by credit card.
translate spanish to english: es nuestra única oportunidad. this is our only shot.
translate spanish to english: tuve que ir a casa a cambiarme. i had to go home and change.
translate spanish to english: sé todo eso. i know all that.


- Conjuntos de entrenamiento, prueba y validación.

In [7]:
random.Random(77).shuffle(text_pairs)
num_val_samples = int(0.005 * len(text_pairs))
num_train_samples = len(text_pairs) - num_val_samples
train_pairs = text_pairs[:num_train_samples]
test_pairs = text_pairs[num_train_samples:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
118370 training pairs
594 test pairs


## 2.- Data Loader

In [8]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
import os

# Set environment variable to avoid tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[EOS]"], 
                           min_frequency=1, vocab_size=34000)

tokenizer.train_from_iterator(train_pairs, trainer=trainer)

text = "hello, hola"
encoding = tokenizer.encode(text)
print("Token IDs:", encoding.ids)
decoded_text = tokenizer.decode(encoding.ids)
print("Decoded Text:", decoded_text)
vocab_size = tokenizer.get_vocab_size()
print(f"Vocabulary size: {vocab_size}")

Token IDs: [3732, 24, 3818]
Decoded Text: hello , hola
Vocabulary size: 34000


In [9]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len + 1  # for EOS

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        input_ids = self.tokenizer.encode(text).ids[:self.max_len - 1]  # Reserve space for EOS
        return torch.tensor(input_ids, dtype=torch.long)

# Collate with EOS + padding
def collate_fn(batch):
    # Append EOS to each sequence
    eos = torch.tensor([EOS_IDX], dtype=torch.long)
    batch = [torch.cat([seq, eos]) for seq in batch]

    # Create input/output pairs: x ([:-1]), y ([1:])
    x = [seq[:-1] for seq in batch]
    y = [seq[1:] for seq in batch]

    # Pad both
    x_padded = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y_padded = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x_padded, y_padded

# Setup
PAD_IDX = tokenizer.token_to_id("[PAD]") 
EOS_IDX = tokenizer.token_to_id("[EOS]") 
maxlen = 64
batch_size = 128

train_dataset = TextDataset(train_pairs, tokenizer, max_len=maxlen)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=4, pin_memory=True, collate_fn=collate_fn)

# Check batch
train_batch, train_label = next(iter(train_loader))
print(train_batch.shape, train_label.shape)


torch.Size([128, 64]) torch.Size([128, 64])


In [10]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

68.2 ms ± 546 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
train_batch, target_batch = next(iter(train_loader))

In [12]:
train_batch.shape, target_batch.shape

(torch.Size([128, 33]), torch.Size([128, 33]))

In [13]:
train_batch[:2]

tensor([[    8,     7,     4,     5,     6,    20,   197,  2870,    16,    18,
          7999,    53,    76,   597,    16,   851,    18,    10, 10961,    13,
            56,    15,  2031,    34,  7999,    23,  4379,     4,   851,   170,
             4, 10961,    13],
        [    8,     7,     4,     5,     6,    60,    59,   203,    18,  2140,
            26,    19, 18561,     3,    47,   228,    80,    34,    47,  2609,
            32, 18560,     3,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1]])

In [14]:
target_batch[:2]

tensor([[    7,     4,     5,     6,    20,   197,  2870,    16,    18,  7999,
            53,    76,   597,    16,   851,    18,    10, 10961,    13,    56,
            15,  2031,    34,  7999,    23,  4379,     4,   851,   170,     4,
         10961,    13,     2],
        [    7,     4,     5,     6,    60,    59,   203,    18,  2140,    26,
            19, 18561,     3,    47,   228,    80,    34,    47,  2609,    32,
         18560,     3,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1]])

In [15]:
tokenizer.id_to_token(2)

'[EOS]'

## 3.- Modelo
- Definir auto atención producto punto con máscara:

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\end{equation}


In [16]:
import torch.nn as nn
from torch import optim
import time

In [17]:
class Attention(nn.Module):
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        self.ow = nn.Linear(dim, dim, bias = bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x):
        B, L, D = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)

        B, L, D = q.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))
        
        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x


test_layer = Attention(32, maxlen, n_heads=1)
test_layer(torch.ones([1, maxlen, 32]))

tensor([[[ 0.4182, -0.4156,  0.6756,  ...,  0.0086, -0.0181, -0.3255],
         [ 0.4182, -0.4156,  0.6756,  ...,  0.0086, -0.0181, -0.3255],
         [ 0.4182, -0.4156,  0.6756,  ...,  0.0086, -0.0181, -0.3255],
         ...,
         [ 0.4182, -0.4156,  0.6756,  ...,  0.0086, -0.0181, -0.3255],
         [ 0.4182, -0.4156,  0.6756,  ...,  0.0086, -0.0181, -0.3255],
         [ 0.4182, -0.4156,  0.6756,  ...,  0.0086, -0.0181, -0.3255]]],
       grad_fn=<ViewBackward0>)

- Definir Transformer:

In [18]:
class Transformer(nn.Module):
    def __init__(self, dim, maxlen, heads=4, mlp_dim=512, rate=0.0):
        super().__init__()
        self.ln_1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, maxlen)
        self.ln_2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, x):
        x = self.attn(self.ln_1(x)) + x
        return self.mlp(self.ln_2(x)) + x


test_layer = Transformer(32, maxlen)
test_layer(torch.ones([1, maxlen, 32])).shape

torch.Size([1, 64, 32])

In [19]:
train_batch.shape

torch.Size([128, 33])

- Definir GPT y agregar embedding de posición:

In [20]:
class GPT(nn.Module):
    def __init__(self, dim, vocab_size, maxlen, depth=3, 
                 mlp_dim=512, rate=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_embedding = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(Transformer(dim, maxlen))

        self.head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, x):
        B, L = x.shape
        x = self.embedding(x)
        x += self.pos_embedding[:, :L]
        x = self.transformer(x)
        x = self.head(x)
        return x

    
model_dim = 128
depth = 3
mlp_dim = 128

gpt = GPT(dim=model_dim, vocab_size=vocab_size, 
          maxlen=maxlen, depth=depth, mlp_dim=mlp_dim)
output = gpt(train_batch)
output.shape, target_batch.shape

(torch.Size([128, 33, 34000]), torch.Size([128, 33]))

## 4.- Entrenamiento

In [21]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

gpt.to(device)

cuda:0


GPT(
  (embedding): Embedding(34000, 128)
  (transformer): Sequential(
    (0): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias=True)
        (kw): Linear(in_features=128, out_features=128, bias=True)
        (vw): Linear(in_features=128, out_features=128, bias=True)
        (ow): Linear(in_features=128, out_features=128, bias=True)
      )
      (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.0, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias

In [22]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [23]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

- Traducción autorregresiva.

In [24]:
from torch.nn.utils.rnn import pad_sequence

In [25]:
def translate(model, sentence, device, maxlen):
    with torch.no_grad():
        model.eval()
        encoding = tokenizer.encode(sentence)
        idx =  torch.tensor(encoding.ids, dtype=torch.long)
        idx = idx.reshape([1, -1])
        maxlen = maxlen - idx.shape[-1]

        for _ in range(maxlen):
            idx = idx.to(device)
            logits = gpt(idx)[:, -1, :]      
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, idx_next), dim=1)

        txt = " ".join([tokenizer.id_to_token(_) for _ in idx[0]])
    return txt.split("[EOS]")[0].strip()
        
sentences = ['translate spanish to english él ha ido a jugar .',
             'translate spanish to english me gusta dormir .',
             'translate spanish to english él quiere ayudar .']

for s in sentences:
    trans = translate(gpt, s, device, maxlen)
    print(f"\n{trans}")


translate spanish to english él ha ido a jugar . aprendieras sospecho alleys permanecieron motivos buying doc jugó contarle amarilla rendiría cenas pegados arrepentirá punctual fundador creerían careless animal cuchilla coast móvil interpretar ropas blood comprarse desatarse bighorn special limpie staircase deprisa leerlo enferme literal , remaining 1650 incomplete ajustados apartarte desafíes following comprendidas introduje sincere ford cd vigila fricciones aclara clandestinas maybe italia

translate spanish to english me gusta dormir . granjero repaired llamaré parecías feed auténtico semillas atarantado acantilado cuadras applicants snack conduzca malévolo ojeras guy elegirle fósforo belongings siguió babuinos childlike cancelled jubila paranoicos chicas desconfiado animados erguido enojarme listas peinar consultation outing mesita momentáneamente insegura clavel acreditado raisins partirá bolacero desamparada frost construida dirijo implicaciones pesar diente odiáis confirmaron e

In [26]:
epochs = 6

for epoch in range(epochs):
    train(gpt, device, train_loader, optimizer, epoch)
    
    # Translate test sentences
    for s in sentences:
        trans = translate(gpt, s, device, maxlen)
        print(trans)


Time for epoch 0 is 19.352762 sec Train loss: 3.433056
translate spanish to english él ha ido a jugar . he has to go to sleep .
translate spanish to english me gusta dormir . i ' ll be able to sleep .
translate spanish to english él quiere ayudar . i wants to help him .

Time for epoch 1 is 19.302342 sec Train loss: 2.470447
translate spanish to english él ha ido a jugar . he went to play .
translate spanish to english me gusta dormir . i like sleeping .
translate spanish to english él quiere ayudar . i wants to help .

Time for epoch 2 is 19.356095 sec Train loss: 2.128413
translate spanish to english él ha ido a jugar . he went play .
translate spanish to english me gusta dormir . i like sleeping .
translate spanish to english él quiere ayudar . he wants to help .

Time for epoch 3 is 19.485735 sec Train loss: 1.934638
translate spanish to english él ha ido a jugar . he went to play .
translate spanish to english me gusta dormir . i like sleeping .
translate spanish to english él qu