# Mini Generative Pre-trained Transformer (GPT)

This notebook shows a basic implementation of a Generative Pre-trained Transformer (GPT).

In [2]:
# Auto-format every executed cell with black (JupyterLab mode, 140-character lines)
import jupyter_black

jupyter_black.load(lab=True, line_length=140)

## Data preparation

We will use [Wikipedia](https://huggingface.co/datasets/wikipedia) as the data to pre-train our mini-GPT model.

In [3]:
from datasets import load_dataset

In [15]:
# Load the first 500 rows of the "20220301.en" Wikipedia configuration as a pandas DataFrame.
# A smaller alternative corpus is "karpathy/tiny_shakespeare" (split="train").
data = load_dataset(
    "wikipedia",
    "20220301.en",
    split="train[0:500]",
    trust_remote_code=True,
).to_pandas()

In [None]:
# Normalise every article with the project's preprocessing helper
from utils import text_preprocessing

data["text"] = data["text"].apply(text_preprocessing)

In [None]:
# Preview the first rows to sanity-check the normalised text
data.head()

## Tokenizer

We have implemented a Byte-Pair Encoding (BPE) tokenizer. However, that pure-Python implementation is really slow, so we will use the BPE implementation from the Hugging Face `tokenizers` library instead.

In [None]:
from tokenizers import Tokenizer, models, trainers
from tokenizers.pre_tokenizers import Metaspace, PreTokenizer
import re
import pandas as pd

In [None]:
# Byte-Pair Encoding model; pieces that cannot be encoded map to <unk>
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# Metaspace pre-tokenization with "Ñ" as the replacement marker.
# NOTE: this is not GPT-2's pre-tokenizer (GPT-2 uses a byte-level scheme);
# the "Ñ" marker is what the decoding regex further down relies on.
tokenizer.pre_tokenizer = Metaspace(replacement="Ñ")

# Trainer: target vocabulary size plus the special tokens used later
# (<s> / </s> delimit every training sequence)
vocab_size = 30000
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<unk>", "<s>", "</s>"])

# `data.text` was already normalised with `text_preprocessing` right after loading,
# so it is used as-is here; running the preprocessing a second time was redundant
# and made this cell non-idempotent.
train_data = data.text.tolist()

# Learn the BPE merges from the corpus
tokenizer.train_from_iterator(
    train_data,
    trainer=trainer,
)

In [None]:
# Inspect how the trained tokenizer splits a sample sentence
tokenizer.encode("praying, she's a good person").tokens

In [None]:
# Number of tokens per article, measured with the trained tokenizer
data["text_length"] = data["text"].apply(lambda text: len(tokenizer.encode(text).tokens))
# Optional length filter, left disabled:
# data = data.query("text_length>=5 and text_length<=60").reset_index(drop=True)

In [None]:
# Histogram of the per-article token counts
import seaborn as sns

sns.histplot(data, x="text_length")

## Training Dataset

Here we create a training dataset for causal language modeling

In [None]:
# Maximum sequence length fed to the model (including <s> and </s>)
max_seq_len = 128

# Two positions are reserved for the begin-of-sentence (<s>) and
# end-of-sentence (</s>) tokens that wrap every chunk
seq_len = max_seq_len - 2

# Special-token ids never change, so encode them once instead of once per chunk
bos_ids = tokenizer.encode("<s>").ids
eos_ids = tokenizer.encode("</s>").ids

split_tokens_ids = []
split_tokens = []

# Materialise the texts once (the column used to be converted to a list on every iteration)
for text in data.text.tolist():
    tokens = tokenizer.encode(text).ids

    # Only complete windows are kept: a trailing remainder shorter than
    # `seq_len` is dropped, so every sequence has exactly `max_seq_len` ids
    for i in range(len(tokens) // seq_len):
        chunk_ids = bos_ids + tokens[i * seq_len : (i + 1) * seq_len] + eos_ids
        split_tokens_ids.append(chunk_ids)

        # Decode the chunk that was just built. Indexing `split_tokens_ids[i]`
        # was only correct for the first document: `i` restarts at 0 for each
        # document while the list keeps growing, so later rows got the wrong text.
        decoded = tokenizer.decode(chunk_ids, skip_special_tokens=False)
        # Drop whitespace that is not followed by the "Ñ" marker, then drop the marker itself
        split_tokens.append(re.sub(r"\s(?!Ñ)", "", decoded).replace("Ñ", ""))


# One row per training sequence: readable text and its token ids
prepared_data = pd.DataFrame({"text": split_tokens, "tokens": split_tokens_ids})
print(prepared_data.shape)
prepared_data.head(10)

## Create Custom Dataset and Data Collator

Here we define a custom `Dataset` and the collate function used by the `DataLoader`.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset over a DataFrame whose ``tokens`` column holds token-id sequences."""

    def __init__(self, dataframe):
        # Only a reference is stored; rows are converted to tensors on access
        self.data = dataframe

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``tokens`` entry at positional index ``idx`` as a ``LongTensor``."""
        row_ids = self.data["tokens"].iloc[idx]
        return torch.LongTensor(row_ids)

In [None]:
def data_collator_for_clm(batch):
    """Collate equally sized token-id tensors for causal language modelling.

    Returns a tuple ``(tokens_ids, attention_mask)``:
    ``tokens_ids`` is the batch stacked along a new leading axis and
    ``attention_mask`` is a boolean lower-triangular mask of shape
    ``(batch, 1, seq, seq)``.
    """
    stacked = torch.stack(batch)
    n_rows, n_cols = stacked.shape[0], stacked.shape[1]

    # Ones on and below the diagonal, converted to bool
    causal = torch.ones(n_rows, 1, n_cols, n_cols).tril().bool()

    return stacked, causal

In [None]:
# Small, unshuffled loader (batches of 2) used to inspect what the collator produces
ds = CustomDataset(dataframe=prepared_data)
data_loader = DataLoader(dataset=ds, collate_fn=data_collator_for_clm, batch_size=2)

In [None]:
# Pull a single batch to inspect the collator output
token_ids, attention_mask = next(iter(data_loader))

In [None]:
# (batch, sequence): the collator stacks the per-sample tensors along a new leading axis
token_ids.shape

In [None]:
# (batch, 1, sequence, sequence), as built in data_collator_for_clm
attention_mask.shape

## Mini GPT

For this implementation we will build an architecture similar to the one proposed in the original [GPT](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf) paper.

<img src="https://i.imgur.com/lgoqvjZ.png" alt="" width="300px" height="500px">

In [None]:
from utils import DecoderTransformer
import torch
from tqdm.notebook import tqdm
import torch.nn.functional as F

In [None]:
class MiniGPT(torch.nn.Module):
    """Decoder-only language model: a ``DecoderTransformer`` followed by a linear vocabulary head.

    Besides ``forward`` the class bundles a small training loop
    (``config_training_args`` / ``train_one_epoch`` / ``train``) and greedy
    text generation (``generate``).
    """

    def __init__(self, embed_dim, num_heads, dropout, pf_dim, vocab_size, max_seq_length, n_layers, device="cpu") -> None:
        """Build the decoder stack and the output projection on ``device``.

        All architecture arguments are forwarded unchanged to ``DecoderTransformer``;
        ``embed_dim`` and ``vocab_size`` also size the output layer.
        """
        super().__init__()

        self.max_seq_length = max_seq_length
        self.device = device

        self.decoder = DecoderTransformer(embed_dim, num_heads, dropout, pf_dim, vocab_size, max_seq_length, n_layers, device)
        self.output = torch.nn.Linear(embed_dim, vocab_size).to(device)

        # Set by config_training_args(); None until training is configured
        self.optimizer = None
        self.scheduler = None

    def forward(self, x, mask):
        """Return next-token logits for every position of ``x``."""

        # x shape: (batch_size, max_seq_length)
        # mask shape: (batch_size, 1, max_seq_length, max_seq_length)

        x = self.decoder(x, mask)  # x shape (batch_size, max_seq_length, embedding_dim)
        x = self.output(x)  # x shape: (batch_size, max_seq_length, vocab_size)

        return x

    def config_training_args(self, optimizer, optimizer_kwargs=None, scheduler=None, scheduler_kwargs=None):
        """Instantiate the optimizer and, optionally, a learning-rate scheduler.

        Args:
            optimizer: optimizer class (or factory) called as ``optimizer(params, **optimizer_kwargs)``.
            optimizer_kwargs: keyword arguments for the optimizer; ``None`` means none.
            scheduler: scheduler class called as ``scheduler(optimizer, **scheduler_kwargs)``,
                or ``None`` to train with a constant learning rate.
            scheduler_kwargs: keyword arguments for the scheduler; ``None`` means none.
        """
        # None defaults avoid sharing one mutable dict between calls
        optimizer_kwargs = optimizer_kwargs or {}
        scheduler_kwargs = scheduler_kwargs or {}

        self.optimizer = optimizer(self.parameters(), **optimizer_kwargs)
        # The default scheduler=None used to crash with "'NoneType' object is not callable"
        self.scheduler = scheduler(self.optimizer, **scheduler_kwargs) if scheduler is not None else None

    def train_one_epoch(self, train_dataloader):
        """Run one optimisation pass over ``train_dataloader``.

        Each batch is a ``(token_ids, attention_mask)`` pair. The progress bar
        shows the running mean of the loss.

        Raises:
            RuntimeError: if ``config_training_args`` has not been called.
        """
        if getattr(self, "optimizer", None) is None:
            raise RuntimeError("call config_training_args() before training")

        # Make sure dropout is active, e.g. after eval() was called
        self.train(True)

        running_loss = 0.0

        bar = tqdm(train_dataloader, total=len(train_dataloader), leave=True)

        for step, (token_ids, attention_mask) in enumerate(bar, 1):

            # move to the correct device
            token_ids, attention_mask = token_ids.to(self.device), attention_mask.to(self.device)

            # forward pass
            outputs = self(token_ids, attention_mask)

            # targets are the inputs shifted left by one: drop the first token
            labels = token_ids[:, 1:]

            # the last position has no target, so drop its prediction
            outputs = outputs[:, :-1, :]

            # flatten to (batch * seq, vocab) and (batch * seq,) for cross_entropy
            labels = labels.reshape(-1)
            outputs = outputs.reshape(-1, outputs.shape[2])

            # compute loss
            loss = F.cross_entropy(outputs, labels)

            # set zero grad
            self.optimizer.zero_grad()

            # backward pass
            loss.backward()

            # clip the global gradient norm to 1.0
            torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)

            # update weights
            self.optimizer.step()

            # the scheduler is stepped per batch, not per epoch (it is optional)
            if self.scheduler is not None:
                self.scheduler.step()

            # running loss
            running_loss += loss.item()

            # print statistics
            bar.set_description(f"Train loss: {running_loss/step:.5f}")

    def train(self, train_dataloader=True, epochs=None):
        """Train for ``epochs`` passes over ``train_dataloader``.

        This method shares its name with ``torch.nn.Module.train(mode)``, which
        ``eval()`` and parent modules call with a boolean. To keep those working,
        a boolean first argument is forwarded to the base implementation and the
        module itself is returned, as ``nn.Module.train`` does.

        Raises:
            TypeError: if a dataloader is given without ``epochs``.
        """
        if isinstance(train_dataloader, bool):
            return super().train(train_dataloader)

        if epochs is None:
            raise TypeError("train() missing required argument: 'epochs'")

        bar = tqdm(range(1, epochs + 1), total=epochs, leave=True)

        for epoch in bar:

            self.train_one_epoch(train_dataloader)
            bar.set_description(f"Epoch {epoch}/{epochs}")

    def generate(self, sentence, tokenizer):
        """Greedily continue ``sentence`` until ``</s>`` or ``max_seq_length`` tokens.

        Args:
            sentence: prompt text; ``<s>`` is prepended before encoding.
            tokenizer: object exposing ``encode(text).ids`` and
                ``decode(ids, skip_special_tokens=...)``.

        Returns:
            The decoded prompt plus continuation, special tokens included.
        """
        sentence = "<s>" + sentence

        # loop-invariant: id of the end-of-sentence token
        eos_id = tokenizer.encode("</s>").ids[0]

        # Use the model's own device rather than a hard-coded "cuda"
        tokens_ids = torch.LongTensor(tokenizer.encode(sentence).ids).unsqueeze(0).to(self.device)

        # Inference must run with dropout disabled; restore the previous mode afterwards
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(tokens_ids.shape[-1], self.max_seq_length):

                    # Same lower-triangular mask as the training collator; an all-ones
                    # mask would let earlier positions see later tokens, which the
                    # model never experienced during training
                    cur_len = tokens_ids.shape[1]
                    attention_mask = torch.tril(torch.ones(tokens_ids.shape[0], 1, cur_len, cur_len)).bool().to(self.device)

                    # call this instance, not the notebook-level `gpt` global
                    prediction = self(tokens_ids, attention_mask)
                    new_token = prediction[0, -1, :].argmax().item()
                    tokens_ids = torch.cat((tokens_ids, torch.LongTensor([[new_token]]).to(self.device)), dim=-1)

                    if new_token == eos_id:
                        break
        finally:
            self.train(was_training)

        # Drop whitespace not followed by the "Ñ" marker, then drop the marker itself
        prediction = re.sub(r"\s(?!Ñ)", "", tokenizer.decode(tokens_ids.cpu().squeeze(0).tolist(), skip_special_tokens=False)).replace(
            "Ñ", ""
        )

        return prediction

In [None]:
# Model and training hyperparameters
embed_dim = 768  # embedding / hidden width
num_heads = 12  # attention heads per layer
dropout = 0.1
pf_dim = 3072  # position-wise feed-forward width
bs = 32  # training batch size
n_layers = 12  # number of decoder layers
n_epochs = 10

# Use the GPU when one is visible, otherwise fall back to CPU so the notebook
# still runs (slowly) instead of failing on a hard-coded "cuda"
device = "cuda" if torch.cuda.is_available() else "cpu"

gpt = MiniGPT(
    embed_dim=embed_dim,
    num_heads=num_heads,
    dropout=dropout,
    pf_dim=pf_dim,
    vocab_size=vocab_size,
    max_seq_length=max_seq_len,
    n_layers=n_layers,
    device=device,
)

In [None]:
# Training loader: shuffled batches of `bs` sequences, collated with the causal mask
ds = CustomDataset(dataframe=prepared_data)
data_loader = DataLoader(dataset=ds, shuffle=True, collate_fn=data_collator_for_clm, batch_size=bs)

In [None]:
# Configure the model's training arguments.
# Optimizer: AdamW with weight decay.
# NOTE(review): lr=2.5e-3 looks high for a 12-layer transformer — confirm it is intended.
optimizer = torch.optim.AdamW
optimizer_kwargs = dict(lr=2.5e-3, weight_decay=0.01)

# Scheduler: cosine annealing over all optimisation steps (batches per epoch * epochs),
# because the training loop steps the scheduler once per batch
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR
scheduler_kwargs = dict(T_max=len(data_loader) * n_epochs, eta_min=1e-6)

gpt.config_training_args(
    optimizer=optimizer,
    optimizer_kwargs=optimizer_kwargs,
    scheduler=scheduler,
    scheduler_kwargs=scheduler_kwargs,
)

In [None]:
# Run the training loop: n_epochs passes over the shuffled training loader
gpt.train(data_loader, epochs=n_epochs)

In [None]:
# Greedy (argmax) continuation of a prompt with the trained model
gpt.generate("anarchism is a political philosophy and", tokenizer)