# Pre Training Custom GPT LLM

## Author: Michelangelo Zampieri

This notebook contains code to build and pre-train a custom GPT-style LLM.

The code was written by following the YouTube tutorial "Create a Large Language Model from Scratch with Python – Tutorial" by freeCodeCamp.org.

Import libraries

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import mmap
import random
import pickle

# Pick the fastest backend that is available: a CUDA GPU first, then Apple's
# MPS backend, and finally the CPU so the notebook still runs everywhere.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: mps


Define hyperparameters

In [32]:
# --- Data / batching ---------------------------------------------------------
block_size = 32       # length of each training sequence (and of the position table)
batch_size = 32       # number of sequences stacked into one batch

# --- Model architecture ------------------------------------------------------
n_embd = 768          # embedding width shared by every layer
n_head = 8            # attention heads per block (head size = n_embd // n_head)
n_layer = 8           # number of stacked transformer blocks
dropout = 0.2         # probability handed to every nn.Dropout layer

# --- Optimisation / evaluation -----------------------------------------------
learning_rate = 3e-4  # initial learning rate for the optimizer
max_iters = 5000      # number of training steps
eval_iters = 100      # batches averaged per loss estimate; also the logging interval

Read the vocabulary text, build a sorted list of its unique characters, and record the vocabulary size

In [33]:
# Read the text whose distinct characters make up the vocabulary.
with open('vocab.txt', 'r', encoding='utf-8') as vocab_file:
    text = vocab_file.read()

# One token per unique character; sorting keeps the ordering deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

Create the encoders and decoders

In [34]:
# Two-way lookup between each vocabulary character and its integer token id
# (the id is the character's position in the sorted `chars` list).
string_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of token ids for the characters of string `s`."""
    return [string_to_int[ch] for ch in s]


def decode(l):
    """Return the string spelled by the sequence of token ids `l`."""
    return ''.join(int_to_string[idx] for idx in l)

Function to get a random chunk of data

In [None]:
def get_random_chunk(split):
    """Read a random contiguous chunk of a data file and encode it as token ids.

    Args:
        split: 'train' selects 'extracted_train_data.txt'; any other value
            selects 'extracted_val_data.txt'.

    Returns:
        A 1-D ``torch.long`` tensor of token ids. It may hold fewer than
        ``block_size * batch_size - 1`` entries, because bytes that do not
        decode as UTF-8 are dropped and the file may be shorter than a chunk.

    Raises:
        KeyError: propagated from ``encode`` when the chunk contains a
            character that is missing from the vocabulary.
    """
    filename = 'extracted_train_data.txt' if split == 'train' else 'extracted_val_data.txt'
    chunk_bytes = block_size * batch_size
    with open(filename, 'rb') as f:
        # Memory-map the file so only the requested slice is actually read.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            # Clamp the upper bound: for a file shorter than one chunk the
            # unclamped value is negative and random.randint raises ValueError.
            last_start = max(0, file_size - chunk_bytes)
            start_pos = random.randint(0, last_start)
            mm.seek(start_pos)
            block = mm.read(chunk_bytes - 1)
            # The random byte offset can land inside a multi-byte character;
            # errors='ignore' drops the undecodable bytes instead of raising.
            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', ' ')
    return torch.tensor(encode(decoded_block), dtype=torch.long)

Code to get a batch from the random chunk

In [36]:
def get_batch(split):
    """Sample one batch of (input, target) sequences for the given split.

    Draws a fresh random chunk via ``get_random_chunk(split)``, picks
    ``batch_size`` random start offsets in it, and cuts a ``block_size``-long
    window at each offset. Targets are the same windows shifted one position
    to the right.

    Returns:
        Tuple ``(x, y)`` of tensors, each ``(batch_size, block_size)``, moved
        to ``device``.
    """
    chunk = get_random_chunk(split)
    starts = torch.randint(len(chunk) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([chunk[s:s + block_size] for s in starts])
    targets = torch.stack([chunk[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

Define the classes for the model architecture

In [37]:
class Head(nn.Module):
    """One head of causal (masked) self-attention."""

    def __init__(self, head_size):
        """Create bias-free key/query/value projections from n_embd to head_size."""
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones, registered as a buffer (not a
        # parameter) so it moves with the module but is never trained.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, C) and return a (B, T, head_size) tensor."""
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)

        # Scaled dot-product affinities between every pair of positions.
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale   # (B, T, T)

        # Positions above the diagonal are set to -inf so that softmax gives
        # them zero weight: a token never attends to later tokens.
        hidden = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))     # (B, T, T)

        # Weighted sum of the value vectors.
        return weights @ self.value(x)                        # (B, T, hs)

In [38]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then mixed by a linear layer."""

    def __init__(self, num_heads, head_size):
        """Build `num_heads` heads of width `head_size` plus the output projection."""
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x` and project the concatenated result to n_embd."""
        per_head = [head(x) for head in self.heads]
        # Join the head outputs along the feature axis: (B, T, num_heads * head_size).
        joined = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(joined))

In [39]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        """Build the MLP for inputs whose last dimension is `n_embd`."""
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to `x`; the output has the same shape as the input."""
        return self.net(x)

In [40]:
class Block(nn.Module):
    """Transformer block: self-attention, then feed-forward.

    Each sub-layer is wrapped in a residual connection, and LayerNorm is
    applied to the sum (normalisation after the residual add).
    """

    def __init__(self, n_embd, n_head):
        """Build the block.

        Args:
            n_embd: embedding dimension.
            n_head: number of attention heads; each head gets
                ``n_embd // n_head`` features.
        """
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after the attention and feed-forward sub-layers."""
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))

Define the model, load the pretrained weights from the saved checkpoint, and move the model to the selected device so that training can run on the GPU

In [41]:
class GPTLanguageModel(nn.Module):
    """Character-level GPT: token + position embeddings, a stack of
    transformer blocks, a final LayerNorm and a linear head over the vocabulary.
    """

    def __init__(self, vocab_size):
        """Build all layers for a vocabulary of `vocab_size` tokens and initialise them."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) tensor of token ids, with T no larger than block_size
                (the position table has only block_size rows).
            targets: optional (B, T) tensor of token ids to predict.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab_size)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab_size) and loss is the cross-entropy against the
            flattened targets.
        """
        B, T = index.shape
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        # Build the position ids on the input's own device instead of the
        # notebook-global `device`, so the model works wherever it was moved.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) class indices.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building the graph
    def generate(self, index, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `index`.

        Args:
            index: (B, T) tensor of token ids forming the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # crop the context to the last block_size tokens
            index_cond = index[:, -block_size:]
            # call the module (not .forward) so registered hooks still run
            logits, _ = self(index_cond)
            # keep only the prediction for the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # turn logits into probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample one token per sequence from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append the sampled token to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

# Start from freshly initialised weights; they are replaced below when a
# saved checkpoint is available.
model = GPTLanguageModel(vocab_size)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load checkpoints that were produced by this notebook or a trusted source.
try:
    with open('model-01.pkl', 'rb') as f:
        model = pickle.load(f)
    print('Model loaded successfully.')
except FileNotFoundError:
    # First run / clean checkout: keep the fresh model instead of crashing.
    print('model-01.pkl not found; starting from freshly initialised weights.')

m = model.to(device)

Model loaded successfully.


Function to estimate the loss

In [42]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    Puts the model in eval mode, averages the loss over ``eval_iters`` freshly
    sampled batches per split, then switches the model back to train mode.

    Returns:
        Dict with keys 'train' and 'val'; each value is a 0-dim tensor
        holding the mean loss for that split.
    """
    model.eval()
    mean_losses = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        mean_losses[split] = batch_losses.mean()
    model.train()
    return mean_losses

Define the optimizer and scheduler

In [None]:
# AdamW over all model parameters, with a small weight decay.
optimizer = torch.optim.AdamW(
    m.parameters(),
    lr=learning_rate,
    weight_decay=1e-2,
)

# Multiply the learning rate by 0.9 after every 1000 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=1000,
    gamma=0.9,
)

Training loop

In [None]:
# The loop variable is `step` rather than `iter`, so the builtin iter() is not
# shadowed for the rest of the notebook.
for step in range(max_iters):
    # Every eval_iters steps, report the averaged train/val loss.
    if step % eval_iters == 0:
        losses = estimate_loss()
        # Double-quoted f-string: reusing the same quote type inside the
        # braces is a SyntaxError on Python versions before 3.12.
        print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")
    xb, yb = get_batch('train')
    # Call the module itself (not .forward) so registered hooks run.
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    scheduler.step()
# Loss of the last training batch (a single batch, not an average).
print(f"Iteration {step}, Loss: {loss.item()}")

step: 0, train loss: 1.654, val loss: 1.586
step: 100, train loss: 1.684, val loss: 1.700
step: 200, train loss: 1.661, val loss: 1.675
step: 300, train loss: 1.658, val loss: 1.674
step: 400, train loss: 1.682, val loss: 1.680
step: 500, train loss: 1.671, val loss: 1.671
step: 600, train loss: 1.694, val loss: 1.675
step: 700, train loss: 1.699, val loss: 1.660
step: 800, train loss: 1.686, val loss: 1.680
step: 900, train loss: 1.647, val loss: 1.803
step: 1000, train loss: 1.716, val loss: 1.653
step: 1100, train loss: 1.648, val loss: 1.610
step: 1200, train loss: 1.751, val loss: 1.651
step: 1300, train loss: 1.651, val loss: 1.623
step: 1400, train loss: 1.658, val loss: 1.636
step: 1500, train loss: 1.647, val loss: 1.637
step: 1600, train loss: 1.632, val loss: 1.610
step: 1700, train loss: 1.677, val loss: 1.607
step: 1800, train loss: 1.626, val loss: 1.649
step: 1900, train loss: 1.697, val loss: 1.717
step: 2000, train loss: 1.626, val loss: 1.561
step: 2100, train loss: 1

Save the model

In [44]:
# Pickle the whole model object (architecture reference + weights) to disk.
# This overwrites the checkpoint that was loaded earlier in the notebook.
with open('model-01.pkl', 'wb') as checkpoint_file:
    pickle.dump(model, checkpoint_file)

Make a prediction

In [45]:
# The model is left in training mode after the training loop, which keeps
# dropout active; switch to eval mode so sampling uses the full network.
m.eval()

# Start from a single token with id 0 (the first character of the sorted
# vocabulary) and let the model continue from there.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():  # no gradients are needed for sampling
    generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


Acception sometimes backcycle and become, some or Rise need, ven 4

Little

Herhoudhap!

A decoracted shere he speaks towarzely, aning, he osted introduce to an and e-Mile Cermin best again too, 14% (separks, really trouble law of leadbured voted full thalf becuses matted to their termet, for several currences to should taway by probably provilidges sheries of felteralise has announced in South series, people by Mid

FBlie around at the APR YBNM le type of you far offer al traitionale canformati
