In [1]:
import os, sys
import ipdb
from tqdm import tqdm
from datetime import datetime
import platform, shutil
import requests, zipfile, io

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

# tokenizer
import sentencepiece as spm

# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.allow_tf32 = True
# torch.cuda.empty_cache()

# NOTE(review): `benchmark` is a documented switch of torch.backends.cudnn;
# torch.backends.mps does not appear to document one — confirm this
# assignment has any effect. It is kept so the attribute is still set.
torch.backends.mps.benchmark = True

# Only touch the MPS allocator when an MPS device is actually available, so
# this cell does not depend on the MPS backend being present.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [2]:
# architecture parameters
batch_size = 8      # sequences sampled per batch
context = 512       # tokens per sequence; also the size of the positional embedding table
embed_size = 384
n_layers = 7
n_heads = 7
BIAS = True         # passed as `bias=` to the nn.Linear layers

# hyperparameters
lr = 3e-4
dropout = 0.05 # regularization
weight_decay = 0.01 # regularization
grad_clip = 1.0


# training parameters
train_iters = 100000
eval_interval = 50 # every 50th iteration is used as a validation step
eval_iterations = 10 # during evaluation, use 10 samples and build their average
# NOTE: this flag shadows Python's built-in compile(); the name is kept because
# other cells may read it.
compile = False # better pytorch performance (works only on compatible systems)
checkpoint_dir = "models/"
# The previous value "latest'.pt" contained a stray apostrophe and did not match
# checkpoint_load_fn. Presumably this is the file checkpoints are written to —
# the saving code is not part of this section of the notebook.
checkpoint_fn = "latest.pt"
checkpoint_load_fn = "latest.pt" # from where to restart the training
dtype = torch.bfloat16

# mode
inference = False

# device
# Only CUDA is probed here; every other machine falls back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('device: You will be using: ', device)


device: You will be using:  cpu


In [3]:
# Experiment tracking with Weights & Biases
wandb_log = True
wandb_project = 'llm_udemy'
# Run name = fixed prefix + the wall-clock time at which this cell was run
wandb_run_name = "llm_udemy-" + datetime.strftime(datetime.now(), "%Y_%m_%d_%H_%M_%S")

if wandb_log:
    # Imported only when logging is switched on
    import wandb
    wandb.init(name=wandb_run_name, project=wandb_project)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmeinczinger[0m ([33mmeinczinger-personal-use[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Read the whole corpus into memory as a single string
# ("r" / text mode is open()'s default, so it is not spelled out).
with open("wiki.txt", encoding="utf-8") as f:
    text = f.read()
# Show a 300-character window as a quick sanity check of the content
print(text[10_000:10_300])

 that was used to represent a team in an old TV show, The A-Team. A capital a is written "A". Use a capital A at the start of a sentence if writing.

A is also a musical note, sometimes referred to as "La".

The letter 'A' was in the Phoenician alphabet's aleph. This symbol came from a simple pictur


In [5]:
# tokenizer

# Load the already-trained SentencePiece model from disk. `sp` is the
# processor the encode/decode helpers below delegate to.
sp = spm.SentencePieceProcessor(model_file='wiki_tokenizer.model')

# Number of pieces in the tokenizer; the GPT token-embedding table and the
# final linear layer are both sized from this value.
vocab_size = sp.get_piece_size()
print(f"Tokenizer vocab_size: {vocab_size}")


Tokenizer vocab_size: 4096


In [6]:
def encode(s):
    """Tokenize the string `s` with the loaded SentencePiece processor `sp`.

    Returns whatever `sp.Encode` returns for a single string; in this
    notebook's sample output that is a list of integer token ids.
    """
    return sp.Encode(s)


def decode(l):
    """Convert the token ids `l` back into text via `sp.Decode`."""
    return sp.Decode(l)


# Round-trip check. The input deliberately contains a doubled space; the
# decoded text in the cell output comes back with a single space.
print(encode("Once upon  time"))
print(decode(encode("Once upon  time")))

[612, 370, 698, 265, 684]
Once upon time


In [7]:
# The encoded corpus is cached on disk so the full text is only tokenized
# when the cache file does not exist yet.
if os.path.exists("encoded_data.pt"):
    print("loading encoded data")
    # The cache holds a single tensor, so the restricted unpickler is enough.
    # Passing weights_only=True avoids executing arbitrary pickled code and
    # silences the FutureWarning torch.load emits when the flag is omitted.
    data = torch.load("encoded_data.pt", weights_only=True)
else:
    data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(data, "encoded_data.pt")

loading encoded data


  data = torch.load("encoded_data.pt")


In [8]:
# Train/validation split: the first 90% of the token stream is used for
# training, the remaining tail for validation (no shuffling).
data_size = len(data)
spl = int(data_size * 0.9)  # index of the first validation token
train_data, val_data = data[:spl], data[spl:]

print(f"Total size: {data_size/1e6} million | Training: {len(train_data)/1e6:.2f} million | Validation: {len(val_data)/1e6:.2f} million")

Total size: 59.211077 million | Training: 53.29 million | Validation: 5.92 million


In [9]:
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    split: "train" draws from `train_data`; any other value draws from
        `val_data`.

    Returns `(x, y)`, each stacked from `batch_size` windows of `context`
    tokens and moved to `device`. `y` is the window that starts one token
    after `x`, i.e. the next-token targets.
    """
    source = train_data if split == "train" else val_data
    # randint's `high` is exclusive, so every start leaves room for `context`
    # input tokens plus the one extra token the shifted target window needs.
    starts = torch.randint(high=len(source) - context, size=(batch_size,)).tolist()
    inputs = [source[s:s + context] for s in starts]
    targets = [source[s + 1:s + context + 1] for s in starts]
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Smoke test: draw one training batch and inspect it
x, y = get_batch("train")

print(x.shape, y.shape)

# First ten tokens of the first sequence: the targets should be the inputs
# shifted by one position.
print(x[0, :10])
print(y[0, :10])

torch.Size([8, 512]) torch.Size([8, 512])
tensor([4086,  914, 4031, 4089, 4089, 2894, 4070,  307,  261, 2025])
tensor([ 914, 4031, 4089, 4089, 2894, 4070,  307,  261, 2025,  594])


In [10]:
class ForwardLayer(nn.Module):
    """Feed-forward sub-network: Linear -> GELU -> Linear -> Dropout.

    The hidden layer is six times as wide as the input, and the output has
    the same width as the input.

    Args:
        size: width of the input and output features. Defaults to the
            notebook-level `embed_size`, so `ForwardLayer()` and
            `ForwardLayer(embed_size)` build the same network.
    """

    def __init__(self, size=None):
        super().__init__()
        if size is None:
            size = embed_size
        hidden = 6 * size
        self.network = nn.Sequential(
            nn.Linear(size, hidden, bias=BIAS),
            nn.GELU(),
            nn.Linear(hidden, size, bias=BIAS),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply the network to `x` and return the result."""
        return self.network(x)


# Backward-compatible alias for the original, misspelled class name.
ForwadLayer = ForwardLayer

In [None]:
class Head(nn.Module):
    """A single causal self-attention head.

    Projects the input to queries, keys and values of width `head_size` and
    lets every position attend only to itself and to earlier positions.

    Args:
        head_size: width of the query/key/value projections and of the output.
    """

    def __init__(self, head_size):
        super().__init__()
        self.queries = nn.Linear(embed_size, head_size, bias=BIAS)
        self.keys = nn.Linear(embed_size, head_size, bias=BIAS)
        self.values = nn.Linear(embed_size, head_size, bias=BIAS)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular "may attend" mask for up to `context` positions.
        # It is a bool buffer, so casting the model to a float dtype leaves it
        # alone, and non-persistent, so the state_dict holds only the three
        # projection layers.
        self.register_buffer(
            "causal_mask",
            torch.tril(torch.ones(context, context, dtype=torch.bool)),
            persistent=False,
        )

    def forward(self, x):
        """Compute causal scaled dot-product attention.

        x: (BS, SL, embed_size) with SL <= context.
        Returns a tensor of shape (BS, SL, head_size).
        """
        SL = x.shape[1]
        q = self.queries(x)  # (BS, SL, head_size)
        k = self.keys(x)     # (BS, SL, head_size)
        v = self.values(x)   # (BS, SL, head_size)

        # Scale by 1/sqrt(head_size) to keep the scores in a range where the
        # softmax is not saturated.
        scores = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (BS, SL, SL)
        # Hide future positions. The diagonal is never masked, so every row
        # keeps at least one finite score and the softmax cannot produce NaN.
        scores = scores.masked_fill(~self.causal_mask[:SL, :SL], float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ v

In [None]:
class Multihead(nn.Module):
    """Multi-head attention: several `Head`s run side by side, then merged.

    Args:
        n_heads: number of attention heads.
        head_size: output width of each head.
    """

    def __init__(self, n_heads, head_size):
        super().__init__()
        # One independent Head per attention head. The earlier code passed a
        # generator expression to a single Head instead of building n_heads.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        # Maps the concatenated head outputs back to the embedding width.
        # head_size * n_heads can be smaller than embed_size when the heads do
        # not divide it evenly (e.g. 7 * 54 = 378 -> 384).
        self.combine = nn.Linear(head_size * n_heads, embed_size, bias=BIAS)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x`, concatenate on the last axis and project."""
        # Call each head *instance*; each one outputs (BS, SL, head_size).
        x = torch.cat([head(x) for head in self.heads], dim=-1)
        x = self.combine(x)  # (BS, SL, embed_size)
        x = self.dropout(x)
        return x


In [None]:
class Block(nn.Module):
    """Transformer block: multi-head attention, then a feed-forward layer.

    LayerNorm is applied to the input of each sub-layer, and each sub-layer's
    output is added back to its input (residual connection).

    Args:
        n_heads: number of attention heads; each head gets
            `embed_size // n_heads` features.
    """

    # The constructor must be spelled with exactly two leading underscores;
    # with three, Python never calls it and no sub-module is created.
    def __init__(self, n_heads):
        super().__init__()
        head_size = embed_size // n_heads
        self.ma = Multihead(n_heads, head_size)
        # `ForwadLayer` (sic) is a name this notebook defines for the
        # feed-forward layer; it needs no constructor arguments.
        self.feed_forward = ForwadLayer()
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residuals."""
        x = x + self.ma(self.ln1(x))
        x = x + self.feed_forward(self.ln2(x))
        return x

In [None]:

class GPT(nn.Module):
    """Token-level language model.

    Token and position embeddings are summed, normalized with LayerNorm and
    projected to vocabulary logits.

    NOTE: the stack of transformer blocks is currently commented out, so the
    model consists of the embeddings, the LayerNorm and the output layer only.
    """

    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.positions = nn.Embedding(context, embed_size)
        # self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)])
        self.ln = nn.LayerNorm(embed_size)
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights from N(0, 0.02) and zero the biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, targets=None):
        """Compute logits and, when `targets` is given, the cross-entropy loss.

        input: (BS, SL) token ids; SL must not exceed `context`, the size of
            the position table.
        targets: optional (BS, SL) token ids.

        Returns `(logits, loss)`. Without targets, logits has shape
        (BS, SL, vocab_size) and loss is None. With targets, logits is
        flattened to (BS*SL, vocab_size) and loss is the cross-entropy
        returned by `F.cross_entropy`.
        """
        loss = None
        BS, SL = input.shape
        emb = self.embeddings(input)
        # Create the position ids on the same device as the input, so the
        # model does not depend on the notebook-level `device` variable.
        pos = self.positions(torch.arange(SL, device=input.device))
        x = emb + pos
        # x = self.blocks(x)
        x = self.ln(x)
        logits = self.final_linear(x)

        if targets is not None:
            BS, SL, VS = logits.shape
            logits = logits.view(BS*SL, VS)
            targets = targets.view(BS*SL)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, input, max=500):
        """Extend `input` by `max` sampled tokens.

        input: (BS, SL) token ids used as the prompt.
        max: number of tokens to append (the name shadows the built-in
            `max`; it is kept because callers pass it by keyword).

        Returns the model's view of the sequence after the last step, shape
        (BS, L). Because the sequence is cropped to its last `context` tokens
        before every forward pass, L is at most `context + 1`.
        """
        for _ in range(max):
            input = input[:, -context:]  # keep at most `context` tokens
            logits, _ = self(input)
            logits = logits[:, -1, :]  # logits for the last position only
            probs = F.softmax(logits, dim=-1)  # distribution over the vocabulary
            next_token = torch.multinomial(probs, num_samples=1)
            input = torch.cat((input, next_token), dim=-1)
        return input

In [20]:
# Smoke test: one forward pass of a freshly initialized model
x, y = get_batch("train")

# Cast the parameters to `dtype` first, then move the model to `device`
model = GPT().to(dtype).to(device)

logits, loss = model(x, y)

print(loss.item())

8.4375


In [21]:
@torch.no_grad()
def generate_sample(input):
    """Continue the text prompt `input` with 64 sampled tokens and print it.

    The prompt is tokenized with `encode`, extended by `model.generate`, and
    the whole sequence is decoded and printed. Nothing is returned.
    """
    prompt_ids = torch.tensor(encode(input), dtype=torch.long, device=device)
    batch = prompt_ids.unsqueeze(0)  # add the batch dimension: (1, SL)
    generated = model.generate(batch, max=64)
    result = decode(generated[0].tolist())
    print(f"Result: {result}")

generate_sample("Once upon a time")

Result: Once upon a time Jan monthsociationplaylantulf Sil tweared Africahetiction religious althoughror Mc0ross difficultsoneter Japan feet Martin Serv courmosthen Miss turn�head ArE companies playersS Asia Derman playingborn keepouncil describform David inj kept also Holausertain see view region costkaliael specialitiz
