In [1]:
# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/16_GPT.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# GPT

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras_core as keras
import torch
import pandas as pd
import pathlib
import random

torch.__version__

Using PyTorch backend.


'2.0.1+cu117'

In [3]:
torch.manual_seed(77)

<torch._C.Generator at 0x7f16f91f0190>

## 1.- Conjunto de datos

In [4]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [5]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []

for line in lines:
    eng, spa = line.split("\t")
    text_pairs.append('translate English to Spanish: ' + eng + ' ' + spa + ' <eos>')
    text_pairs.append('translate Spanish to English: ' + spa + ' ' + eng + ' <eos>')

for _ in range(5):
    print(random.choice(text_pairs))

len(text_pairs)

translate English to Spanish: I won't take those pills. No tomaré esas pastillas. <eos>
translate Spanish to English: Esta cuerda no es lo bastante fuerte. This rope isn't strong enough. <eos>
translate English to Spanish: She played the piano with enthusiasm. Ella tocaba el piano con entusiasmo. <eos>
translate Spanish to English: Creí que Tom estaba muerto. I thought that Tom was dead. <eos>
translate Spanish to English: Habrá una recompensa para quien encuentre a mi perro. There will be a reward for the person who finds my dog. <eos>


237928

In [6]:
for _ in range(5):
    print(random.choice(text_pairs))

translate English to Spanish: Nobody knows why he turns down my help. Nadie sabe por qué rechaza mi ayuda. <eos>
translate Spanish to English: Tom casi nunca desayuna. Tom almost never eats breakfast. <eos>
translate Spanish to English: Tal vez sea demasiado tarde. Perhaps it's too late. <eos>
translate Spanish to English: Nosotros estamos viviendo en la era de la energía nuclear. We are living in the age of nuclear power. <eos>
translate Spanish to English: Si nos apuramos, creo que la hacemos. I think we'll make it if we hurry. <eos>


- Conjuntos de entrenamiento, prueba y validación.

In [7]:
random.Random(434).shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

In [8]:
print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

237928 total pairs
166550 training pairs
35689 validation pairs
35689 test pairs


In [9]:
train_pairs[0]

'translate English to Spanish: The pay is terrible. El pago es terrible. <eos>'

- Crea vocabulario y define tokenizer.

In [10]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from collections import Counter

In [11]:
tokenizer = get_tokenizer('basic_english')

In [12]:
def build_vocab(text, tokenizer):
    counter = Counter()
    for string_ in text:
        counter.update(tokenizer(string_))
    return vocab(counter, specials=['<unk>', '<pad>', '<eos>'])


vocab = build_vocab(train_pairs, tokenizer)
vocab.set_default_index(37546) # evita error <ukn>

In [13]:
vocab_size = len(vocab)
vocab_size

37535

In [14]:
maxlen = 64

def data_process(text):
    data = []
    for raw_txt in text:
        tensor_ = torch.tensor([vocab[token] for token in tokenizer(raw_txt)],
                                dtype=torch.long)
        if tensor_.shape[0] < maxlen:
            x = tensor_[:-1]
            y = tensor_[1:]
            data.append((x, y))
    return data


train_data = data_process(train_pairs)
val_data = data_process(val_pairs)
test_data = data_process(test_pairs)
len(train_data)

166512

## 2.- Data Loader

In [15]:
batch_size = 128
PAD_IDX = vocab['<pad>']
EOS_IDX = vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(torch.cat([x_item, torch.tensor([EOS_IDX])], dim=0))
        y.append(torch.cat([y_item, torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch, 
                          num_workers=4, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=batch_size,
                         shuffle=True, collate_fn=generate_batch,
                         num_workers=4, pin_memory=True)

In [16]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

151 ms ± 5.76 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
train_batch, target_batch = next(iter(train_loader))

In [18]:
train_batch.shape, target_batch.shape

(torch.Size([128, 37]), torch.Size([128, 37]))

In [19]:
train_batch[0]

tensor([   3,    6,    5,    4, 5207,  384, 5373,  110,  234,  157, 2234,   18,
         454,  110,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1])

## 3.- Modelo
- Definir auto atención producto punto con máscara:

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\end{equation}


In [20]:
import torch.nn as nn
from torch import optim
import time

In [21]:
class Attention(nn.Module):
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        self.ow = nn.Linear(dim, dim, bias = bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x):
        B, L, D = x.shape
        q = self.qw(x)
        k = self.qw(x)
        v = self.qw(x)

        B, L, D = q.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))
        
        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x


test_layer = Attention(32, maxlen, n_heads=1)
test_layer(torch.ones([1, maxlen, 32]))

tensor([[[ 0.0978, -0.7630,  0.1265,  ..., -0.1886, -0.6295, -0.4667],
         [ 0.0978, -0.7630,  0.1265,  ..., -0.1886, -0.6295, -0.4667],
         [ 0.0978, -0.7630,  0.1265,  ..., -0.1886, -0.6295, -0.4667],
         ...,
         [ 0.0978, -0.7630,  0.1265,  ..., -0.1886, -0.6295, -0.4667],
         [ 0.0978, -0.7630,  0.1265,  ..., -0.1886, -0.6295, -0.4667],
         [ 0.0978, -0.7630,  0.1265,  ..., -0.1886, -0.6295, -0.4667]]],
       grad_fn=<ViewBackward0>)

- Definir Transformer:

In [22]:
class Transformer(nn.Module):
    def __init__(self, dim, maxlen, heads=4, mlp_dim=512, rate=0.0):
        super().__init__()
        self.ln_1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, maxlen)
        self.ln_2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, x):
        x = self.attn(self.ln_1(x)) + x
        return self.mlp(self.ln_2(x)) + x


test_layer = Transformer(32, maxlen)
test_layer(torch.ones([1, maxlen, 32])).shape

torch.Size([1, 64, 32])

In [23]:
train_batch.shape

torch.Size([128, 37])

In [24]:
train_batch[:2]

tensor([[    3,     6,     5,     4,  5207,   384,  5373,   110,   234,   157,
          2234,    18,   454,   110,     2,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1],
        [    3,     4,     5,     6,    40,  1059,    16,    84,  1087,  2160,
             7,  4819,   219,   309,  5186,    11,    77,  1063,   906,  2158,
            54, 19797,  1727,   311,  5185,    11,     2,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1]])

- Definir GPT y agregar embedding de posición:

In [25]:
class GPT(nn.Module):
    def __init__(self, dim, vocab_size, maxlen, depth=3, 
                 mlp_dim=512, rate=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_embedding = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(Transformer(dim, maxlen))

        self.head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, x):
        B, L = x.shape
        x = self.embedding(x)
        x += self.pos_embedding[:, :L]
        x = self.transformer(x)
        x = self.head(x)
        return x

    
model_dim = 128
depth = 3
mlp_dim = 128

gpt = GPT(dim=model_dim, vocab_size=vocab_size, 
          maxlen=maxlen, depth=depth, mlp_dim=mlp_dim)
output = gpt(train_batch)
output.shape, target_batch.shape

(torch.Size([128, 37, 37535]), torch.Size([128, 37]))

## 4.- Entrenamiento

In [26]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

gpt.to(device)

cuda:0


GPT(
  (embedding): Embedding(37535, 128)
  (transformer): Sequential(
    (0): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias=True)
        (kw): Linear(in_features=128, out_features=128, bias=True)
        (vw): Linear(in_features=128, out_features=128, bias=True)
        (ow): Linear(in_features=128, out_features=128, bias=True)
      )
      (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.0, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias

In [27]:
PAD_IDX = vocab.get_stoi()['<pad>']
PAD_IDX

1

In [28]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [29]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

- Traducción autorregresiva.

In [30]:
def translate(model, sentence, device, maxlen):
    with torch.no_grad():
        model.eval()
        idx = torch.tensor([vocab[token] for token in tokenizer(sentence)],
                                    dtype=torch.long)
        idx = idx.reshape([1, -1])
        maxlen = maxlen - idx.shape[-1]

        for _ in range(maxlen):
            idx = idx.to(device)
            logits = gpt(idx)[:, -1, :]      
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, idx_next), dim=1)

        txt = " ".join(
                    [vocab.get_itos()[idx[0, _]] for _ in range(maxlen)]
                )
    return txt.replace("<eos>", "")
        
sentences = ['translate spanish to english me encantan los perros',
             'translate spanish to english me gusta dormir',
             'translate english to spanish the cat is red']

for s in sentences:
    trans = translate(gpt, s, device, maxlen)
    print(f"\n{trans}")


translate spanish to english me encantan los perros arrendar doctorate tendrás aparcaron replace media acaloró accusing cancela scrub manifestado costos purchases hacéis defiende encendiste anotación accuse asustada dirá opposing ausenté comprometió califica descansara elementary pounds your dibujarme exploradores tortura techno promete boggle making despise ¿aceptan automáticos hike mejunje futuro folding despacho enchiladas charles llevaran smallest introduced

translate spanish to english me gusta dormir rechacé dominación atrajo shaken televisores viewed downs extender klingon apreciadas hervir necesitó ex-girlfriends encantaba creando descended repetirlo mistakes legends professional shinano ocuparlo endebles enderezaran capacidades pensabas napoleon verb biblia trancado inquietos backpack conseguiré was lovingly drawn-out diseño clears monkeys service shelters decimos mower emborrachar cradle pasen tokio prados bomber stand

translate english to spanish the cat is red pequeña ma

In [31]:
epochs = 6

for epoch in range(epochs):
    train(gpt, device, train_loader, optimizer, epoch)
    
    # Translate test sentences
    for s in sentences:
        trans = translate(gpt, s, device, maxlen)
        print(trans)


Time for epoch 0 is 29.692489 sec Train loss: 3.590706
translate spanish to english me encantan los perros . i ' ve been waiting for you .                                       
translate spanish to english me gusta dormir . i ' ll be able to the truth .                                        
translate english to spanish the cat is red . el mundo está en la ciudad .                                        

Time for epoch 1 is 29.499190 sec Train loss: 2.836563
translate spanish to english me encantan los perros . i ' ll take the dishes .                                        
translate spanish to english me gusta dormir . i like to see you .                                           
translate english to spanish the cat is red . el mundo está vacío .                                          

Time for epoch 2 is 30.072896 sec Train loss: 2.523122
translate spanish to english me encantan los perros . i like the others .                                          
translate spanish to e