# Pre Training Custom GPT LLM

## Author: Michelangelo Zampieri

This notebook contains code to build and pre-train a custom GPT-style LLM.

The code was written by following the YouTube tutorial "Create a Large Language Model from Scratch with Python – Tutorial" by freeCodeCamp.org.

Import libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import mmap
import random
import pickle

# Pick the fastest available backend: CUDA GPU first, then Apple-silicon MPS,
# otherwise fall back to the CPU. Every tensor and the model are moved to this device.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

: 

Define hyperparameters

In [112]:
# Hyperparameters shared by the data pipeline, the model and the training loop.
block_size = 32  # context length: tokens per training sequence and size of the position table / causal mask
batch_size = 32  # sequences per batch; block_size * batch_size also sets the raw chunk size read from disk
max_iters = 50000  # number of optimisation steps in the training loop
learning_rate = 3e-4  # initial learning rate passed to AdamW
eval_iters = 100  # batches averaged per split in estimate_loss; also reused as the evaluation interval
n_embd = 128  # embedding width used throughout the model
n_head = 4  # attention heads per block (head size is n_embd // n_head)
n_layer = 4  # number of stacked transformer blocks
dropout = 0.2  # dropout probability used in attention, projection and feed-forward layers

Read the training and validation text, build the sorted list of unique characters (the vocabulary), and record its size.

In [None]:
# Build the character vocabulary from both corpora. Later cells need `chars`
# (encoder/decoder tables) and `vocab_size` (model construction), so this cell
# must run on a fresh kernel. The files are streamed line by line so the full
# text never has to be held in memory at once.
vocab_chars = set()
for corpus_path in ('train_data.txt', 'val_data.txt'):
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        for line in corpus_file:
            vocab_chars.update(line)
chars = sorted(vocab_chars)
vocab_size = len(chars)

In [129]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# NOTE(review): the recorded run of this cell failed with
# "OSError: [Errno 28] No space left on device", and `ds` is not referenced by
# any later cell visible in this notebook — confirm whether this cell is still needed.
ds = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT")



OSError: [Errno 28] No space left on device

Create the encoders and decoders

In [114]:
# Character <-> integer id lookup tables; ids follow the order of `chars`.
string_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join(int_to_string[i] for i in l)

Function to get a random chunk of data

In [115]:
def get_random_chunk(split):
    """Read a random window of text from the split's file and encode it.

    Args:
        split: 'train' selects train_data.txt; any other value selects val_data.txt.

    Returns:
        1-D torch.long tensor of character ids for the decoded window.
    """
    filename = 'train_data.txt' if split == 'train' else 'val_data.txt'
    chunk_bytes = block_size * batch_size
    with open(filename, 'rb') as f:
        # Memory-map the file so only the requested window is actually read.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            # Clamp the upper bound: for a file shorter than one chunk the
            # unclamped value is negative and random.randint raises ValueError.
            start_pos = random.randint(0, max(0, file_size - chunk_bytes))
            mm.seek(start_pos)
            block = mm.read(chunk_bytes - 1)
            # The window can start or end in the middle of a multi-byte UTF-8
            # character; errors='ignore' drops those partial bytes.
            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', ' ')
            data = torch.tensor(encode(decoded_block), dtype=torch.long)
        return data

Code to get a batch from the random chunk

In [116]:
def get_batch(split):
    """Build one batch of (input, target) sequences from a fresh random chunk.

    Targets are the inputs shifted one position to the right. Both tensors are
    moved to `device` before being returned.
    """
    data = get_random_chunk(split)
    # One random start offset per sequence in the batch.
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    inputs = [data[start:start + block_size] for start in starts]
    targets = [data[start + 1:start + block_size + 1] for start in starts]
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

In [117]:
def sample_next_token(logits, temperature=1.0, top_p=0.9):
    """Sample one token id per row using temperature scaling and top-p (nucleus) filtering.

    Args:
        logits: Unnormalised scores whose last dimension indexes the vocabulary.
        temperature: Divisor applied to the logits before filtering.
        top_p: Cumulative-probability threshold. In each row, tokens are kept in
            descending-probability order up to and including the first one whose
            cumulative probability exceeds this value; the rest are masked out.

    Returns:
        Tensor of sampled indices as returned by torch.multinomial with
        num_samples=1.
    """
    # Division creates a new tensor, so the caller's logits are not modified.
    logits = logits / temperature
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    sorted_indices_to_remove = cumulative_probs > top_p
    # Shift right by one so the token that crosses the threshold is kept, and
    # always keep the single most likely token.
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = False

    # Map the mask from sorted order back to vocabulary order row by row.
    # Flattening the indices across the batch would remove a token from every
    # row as soon as any single row filtered it out.
    indices_to_remove = sorted_indices_to_remove.scatter(
        -1, sorted_indices, sorted_indices_to_remove
    )
    logits = logits.masked_fill(indices_to_remove, float('-inf'))
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)


Define the classes for the model architecture

In [118]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones; registered as a buffer so it follows
        # the module across devices without being a trainable parameter.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over x of shape (B, T, C) and return a (B, T, head_size) tensor."""
        _, seq_len, _ = x.shape
        keys = self.key(x)       # (B, T, hs)
        queries = self.query(x)  # (B, T, hs)
        values = self.value(x)   # (B, T, hs)

        # Scaled dot-product affinities between every pair of positions: (B, T, T).
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale

        # Hide future positions so each token only attends to itself and the past.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))
        # Weighted sum of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs).
        return weights @ values

In [119]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected back to n_embd."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        attention_heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(attention_heads)
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to x and merge the results along the feature axis."""
        # Each head yields (B, T, head_size); concatenation gives (B, T, num_heads * head_size).
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [120]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the MLP; the output has the same shape as the input."""
        return self.net(x)

In [121]:
class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each followed by add & LayerNorm."""

    def __init__(self, n_embd, n_head):
        """n_embd is the embedding dimension; n_head is the number of attention heads."""
        super().__init__()
        # The embedding width is split evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply both sub-layers with residual connections (normalisation after the add)."""
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))

Define the model, instantiate it (loading a previously pickled model is left commented out), and move it to the selected device so it can train on the GPU when one is available.

In [None]:
class GPTLanguageModel(nn.Module):
    """Decoder-only, character-level transformer language model.

    Token and learned position embeddings are summed, passed through `n_layer`
    transformer blocks and a final LayerNorm, then projected to vocabulary logits.
    """

    def __init__(self, vocab_size):
        """Build the model for a vocabulary of `vocab_size` tokens."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            index: (B, T) tensor of token ids, with T at most block_size.
            targets: Optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to (B*T, vocab_size)
            and loss is the mean cross-entropy.
        """
        B, T = index.shape
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        # Build the position ids on the input's own device rather than the
        # notebook-global `device`, so the model works wherever it has been moved.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens, temperature=1.0, top_p=0.9):
        """Autoregressively extend `index` by `max_new_tokens` sampled tokens.

        Args:
            index: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: Number of tokens to append.
            temperature: Passed to sample_next_token.
            top_p: Passed to sample_next_token.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the generated ids.
        """
        # Sample in eval mode (dropout off) and without recording an autograd
        # graph; the previous train/eval mode is restored afterwards.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # The position table only covers block_size positions, so
                    # condition on at most the last block_size tokens.
                    index_cond = index[:, -block_size:]
                    logits, _ = self(index_cond)
                    logits = logits[:, -1, :]  # keep only the final time step
                    next_token = sample_next_token(logits, temperature=temperature, top_p=top_p)
                    index = torch.cat((index, next_token), dim=1)
        finally:
            self.train(was_training)
        return index


# Create a freshly initialised model sized to the character vocabulary.
model = GPTLanguageModel(vocab_size)

# Optional: resume from a previously pickled model instead of starting fresh.
# NOTE(review): pickle.load can execute arbitrary code — only load files you created.
# with open('model-02.pkl', 'rb') as f:
#     model = pickle.load(f)

# print('Model loaded successfully.')

# Move the model to the selected device; `m` is the handle used by the
# optimizer, the training loop and generation below.
m = model.to(device)
# Report the total parameter count in millions.
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

Function to estimate the loss

In [123]:
@torch.no_grad()
def estimate_loss():
    """Estimate mean loss and perplexity on the 'train' and 'val' splits.

    Averages the loss over `eval_iters` freshly sampled batches per split with
    the model in eval mode, then switches the model back to train mode.

    Returns:
        dict mapping each split name to a (mean_loss, perplexity) tuple of floats.
    """
    model.eval()
    metrics = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for batch_idx in range(eval_iters):
            inputs, targets = get_batch(split)
            batch_losses[batch_idx] = model(inputs, targets)[1].item()
        average = batch_losses.mean()
        # Perplexity is the exponential of the mean cross-entropy.
        metrics[split] = (average.item(), torch.exp(average).item())
    model.train()
    return metrics


Define the optimizer and scheduler

In [124]:
# AdamW over all model parameters, starting at `learning_rate` with a small weight decay.
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate, weight_decay=1e-2)
# Multiply the learning rate by 0.9 after every 1000 scheduler steps
# (the training loop calls scheduler.step() once per iteration).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.9)

Training loop

In [125]:
# Main training loop. The counter is named `step` rather than `iter` so the
# builtin iter() is not shadowed for the rest of the kernel session.
for step in range(max_iters):
    # Periodically report averaged train/val loss and perplexity.
    # `eval_iters` doubles as the reporting interval here.
    if step % eval_iters == 0:
        losses = estimate_loss()
        train_loss, train_ppl = losses['train']
        val_loss, val_ppl = losses['val']
        print(f"[Step {step}] Train Loss: {train_loss:.3f}, Val Loss: {val_loss:.3f} | Train PPL: {train_ppl:.2f}, Val PPL: {val_ppl:.2f}")

    # One optimisation step on a freshly sampled training batch.
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # Advance the learning-rate schedule once per optimisation step.
    scheduler.step()


[Step 0] Train Loss: 4.916, Val Loss: 4.916 | Train PPL: 136.51, Val PPL: 136.46
[Step 100] Train Loss: 2.816, Val Loss: 2.818 | Train PPL: 16.71, Val PPL: 16.74
[Step 200] Train Loss: 2.637, Val Loss: 2.631 | Train PPL: 13.98, Val PPL: 13.88
[Step 300] Train Loss: 2.513, Val Loss: 2.512 | Train PPL: 12.35, Val PPL: 12.33
[Step 400] Train Loss: 2.423, Val Loss: 2.438 | Train PPL: 11.28, Val PPL: 11.45
[Step 500] Train Loss: 2.394, Val Loss: 2.396 | Train PPL: 10.95, Val PPL: 10.98
[Step 600] Train Loss: 2.367, Val Loss: 2.369 | Train PPL: 10.66, Val PPL: 10.69
[Step 700] Train Loss: 2.315, Val Loss: 2.331 | Train PPL: 10.12, Val PPL: 10.29
[Step 800] Train Loss: 2.284, Val Loss: 2.285 | Train PPL: 9.81, Val PPL: 9.83
[Step 900] Train Loss: 2.269, Val Loss: 2.261 | Train PPL: 9.67, Val PPL: 9.59
[Step 1000] Train Loss: 2.227, Val Loss: 2.231 | Train PPL: 9.28, Val PPL: 9.31
[Step 1100] Train Loss: 2.203, Val Loss: 2.194 | Train PPL: 9.05, Val PPL: 8.97
[Step 1200] Train Loss: 2.157, Val

KeyboardInterrupt: 

Save the model

In [126]:
# Serialise the whole model object (not just its weights) to model-03.pkl.
# NOTE(review): a pickled module can only be loaded where the class definitions
# above are importable, and unpickling untrusted files can execute arbitrary code.
with open('model-03.pkl', 'wb') as f:
    pickle.dump(model, f)

Make a prediction

In [127]:
# Start from a single token with id 0 (the first character in `chars`), shape (1, 1).
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample 500 further tokens, take the only sequence in the batch and decode it to text.
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)

	. Looking gentle, “that was the dremble.
Englation was the land subjected about
cours a mad she felt to be to whole fire. In them she lead not damply and the brough of itstophing and passas chippear we had
they one above the rightled of hone agate seemeding infell and everage will pan a losse
wanter down ration her have a spipent it a candow.”
I that is when the countaine. The prichman on the blookh ancous him this collding to his browere are where it chour, to save
HEMES.
resendentional docting
