In [1]:
# Import libraries
import os, sys
import ipdb # for debugging, variation of pdb
from tqdm import tqdm
from datetime import datetime
import platform, shutil # detect platform type
import requests, zipfile, io

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

# tokenizer
import sentencepiece as spm

# these improve performance for Ampere architecture
# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.allow_tf32 = True

# Empty GPU Cache Memory
# torch.cuda.empty_cache()

In [2]:
files_url = "https://ideami.com/llm_train"
print("Downloading files using Python")
response = requests.get(files_url)
# Download and extract
#zipfile.ZipFile(io.BytesIO(response.content)).extractall(".")

Downloading files using Python


In [4]:
# ARCHITECTURE PARAMETER
batch_size = 8 # 8 to 128 and beyond. 8 needs 4GB of GPU, 128 needs 24GB of GPU
context = 512
embed_size = 384
n_layers = 7
# each block(layer) includes:
# communication: an attention mechanism that learns how the different tokens relate to each other
# computation: a layer that provides complex processing for the network
n_heads = 7
# multi head attention mechanism
# the input arrives to the attention mechanism of a block, and it gets divided into a number of Attention Heads which will each process part of that input
# After all the heads do their processing, their results get combined together
BIAS = True

# HYPERPARAMETERS
lr = 3e-4 #learning rate (0.0003)
dropout = 0.05 # dropout: regulization by randomly turning off a fraction of neurons
weight_decay = 0.01 # Weight decay, or L2 regularization, adds a penalty to the loss funciton based on the magnititude of the weights
grad_clip = 1.0 # Gradient Clipping, a technique used to prevent exploding gradients by capping the maximum value of gradients during training, ensuring stable and efficient learnin

# TRAINING PARAMETERS
train_iters = 100000
eval_interval = 50 # evaluation purpose, so every 50 iteration out of training data to evaluate, the loss supopsed to be higher, but not too much
eval_iters = 10 # how many evaluation dataset we will check
compile = False # depends on the system, if it works, it's faster and efficient with memory

load_pretrained = True
checkpoint_dir = 'models/'
checkpoint_fn = 'latest.pt' # File name for saving a checkpoint
checkpoint_load_fn = 'latest.pt' # File name for Loading a checkpoint
# lim2.pt is already trained
dtype = torch.bfloat16

# MODE
inference = True

# DEVICE
if torch.backends.mps.is_available():
    device = torch.device("mps")  # Use MPS for acceleration
else:
    device = torch.device("cpu")  # Fallback to CPU
print ("device: you will be using: ", device)

device: you will be using:  mps


In [5]:
# LOGGING

#!wandb login --relogin #relogin
wandb_log = True
wandb_project = "llm9" 
wandb_run_name = "llm9"+datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

if wandb_log:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name)

[34m[1mwandb[0m: Currently logged in as: [33mnickyoon89[0m ([33mnickyoon89-miss-to-mrs-box[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [6]:
with open('wiki.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(text[30000:30300])

terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


In [7]:
# TOKENIZER

sp = spm.SentencePieceProcessor(model_file="wiki_tokenizer.model")

vocab_size = sp.get_piece_size()
print(f"Tokenizer vocab_size: {vocab_size}")

Tokenizer vocab_size: 4096


In [8]:
encode = lambda s: sp.Encode(s)
decode = lambda l: sp.Decode(l)

print(encode("Once upon a time"))
print(decode(encode("Once upon a time")))

[612, 370, 698, 265, 261, 684]
Once upon a time


In [9]:
if os.path.exists("encoded_data.pt"):
    print("Loading encoding")
    data = torch.load("encoded_data.pt")
else:
    data = torch.tensor(encode(text), dtyle=torch.long)
    torch.save(data, "encoded_data.pt")

Loading encoding


In [10]:
data_size=len(data)
spl = int(0.9*data_size)
train_data=data[:spl]
val_data=data[spl:]

print(f'Total data: {data_size/1e6:.2f} Million | Training: {len(train_data)/1e6:.2f} Million | Validation: {len(val_data)/1e6:.2f} Million')

Total data: 59.21 Million | Training: 53.29 Million | Validation: 5.92 Million


In [11]:
def get_batch(split):
    data = train_data if split=="train" else val_data
    inds = torch.randint(len(data)-context, (batch_size,)) # a starting point, so it should minus context size
    x = torch.stack([data[i: i+context] for i in inds]) # (Batch Size, Sequence Length), (BS,SL) = (8,512)
    y = torch.stack([data[i+1:i+context+1] for i in inds]) # (8, 512) "+1" is to see what comes next

    x,y = x.to(device), y.to(device)
    return x,y

x,y=get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

torch.Size([8, 512]) torch.Size([8, 512])
tensor([ 871,  280, 3195, 4051,  655,  280,  264, 4031, 4062, 4059],
       device='mps:0')
tensor([ 280, 3195, 4051,  655,  280,  264, 4031, 4062, 4059, 4062],
       device='mps:0')


In [12]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size) # e.g. 4096 x 384
        self.positions = nn.Embedding(context, embed_size) # e.g. 512 x 384
        self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)]) # same as layer, a transformer is made of a number of blocks/layers
        # in Python * sign is known as the unpacking operator. It is used to unpack the elements of a list and pass them as individual arguments to a function
        self.ln = nn.LayerNorm(embed_size) # Layer normalization. We substract the mean and divide by the standard deviation
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS) # e.g. 384 x 4096 (prediction of all 4096 vocab)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) # Gaussian normal distribution)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, targets=None):
        
        # BS = Batch size, SL = Sequence or Context Length
        loss = None
        BS, SL = input.shape # BS x SL e.g. (8,512)
        emb = self.embeddings(input) # BS x SL x Embed size (384)
        pos = self.positions(torch.arange(SL, device=device)) # SL x 384
        x = emb + pos
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.final_linear(x) # BS x SL x (vocab size) 4096
        # logits are the final predictions of the network for each of the 512 tokens of each of the sequences

        if targets is not None:
            BS, SL, VS = logits.shape #BS x SL x 4096
            logits = logits.view(BS*SL, VS)
            targets = targets.view(BS*SL) 
            loss = F.cross_entropy(logits, targets)

            # Manual Calculation
            

            # i = - log p(x), when the event is likely happens (high probability), you need a low information
            # The entropy is the negative sum of the product of each of the probabilities by the log of that probability
            # H(x) = - sum (p(x) * Log p(x))
            # Cross Entropy is the negative sum of the probability of each of the elements of the true distribution times the logarithm of the predicted probability for the same element
            # H(q,p) = - sum (q(x) * log p(x)) # q = true distribution, p = predicted distribution
            # But most of q(x) will be removed, because the probability will be 0 except the right one(1)
            # Cross Entropy = - log p(x) (only correct one left)
            
            counts = logits.exp() # make all the number positive and exaggerate the gap
            prob = counts/ counts.sum(-1, keepdim=True) #F.softmax
            loss2 = -prob[torch.arange(BS*SL), targets].log().mean()
            # targets[3] = 329 | prob[3][329] = 0.014

            if(not torch.allclose(loss,loss2)):
                print(f"[Loss Diff] Pytorch:{loss.item()} Manual:{loss2.item()}")
        return logits, loss #, loss2

    # Generate a new sample
    def generate(self, input, max=500):
        for _ in range(max):
            input = input[:,-context:] #1, input Length until max of SL)
            # taking the last 512 tokens of our input. Every time we generate a new token, it gets added to the input.
            # So eventually the input can be longer than 512 tokens. And we can only process 512 tokens in our sequences.
            logits, _ = self(input) #(1, input length, 4096), going through the model
            logits = logits[:,-1,:] # select only the last prediction (1,4096)
            probs = F.softmax(logits, dim=-1) # (1, 4096)
            next = torch.multinomial(probs, num_samples=1) # num_samples=1 means just want one value after the token
            input = torch.cat((input, next),dim=1)
        return input

In [13]:
class Block(nn.Module):
    def __init__(self, n_heads):
        super().__init__()
        head_size = embed_size // n_heads
        self.ma = Multihead(n_heads, head_size)
        self.feed_forward = ForwardLayer(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        x = x + self.ma(self.ln1(x))   # Residual connection after multi-head attention
        x = x + self.feed_forward(self.ln2(x))  # Residual connection after feed-forward layer
        # Residual connections in LLMs add the input directly to the output of certain layers, making training easier and more effective
        # They help prevent the vaninshing gradient(gradients become too small gets too small, the training stop happening) problem by allowing gradients to flow more easily throught the network,
        # enable the construction of deeper models by combining original and transformed inputs, and improve overall performance and stability.
        # This leads to faster convergence during training and better generalization to new data.
        return x

In [14]:
class ForwardLayer(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(embed_size, 6*embed_size, bias=BIAS),
            nn.GELU(), #Non-linear functions
            nn.Linear(6*embed_size, embed_size, bias=BIAS),
            nn.Dropout(dropout) #deactivate some of the random neurons every training
        )

    def forward(self,x):
        x= self.network(x)
        return x

In [15]:
class Multihead(nn.Module):
    def __init__(self,n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.combine = nn.Linear(head_size * n_heads, embed_size, bias=BIAS) # 378, 384
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.cat([head(x) for head in self.heads], dim=-1)
        # Each head outputs (BS, SL, head_size)
        x = self.combine(x) # (BS, SL, 384(embed size))
        x = self.dropout(x)
        return x

In [16]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        # the word we're focusing on
        self.queries = nn.Linear(embed_size, head_size, bias=BIAS)
        # all the words in the sequence
        self.keys = nn.Linear(embed_size, head_size, bias=BIAS)
        # hold the information of these words
        self.values = nn.Linear(embed_size, head_size, bias=BIAS)
        # By comparing the query with the keys, the model calculates attention scores, which are then used to weight the values
        # The process helps the model decide which words to pay more attention to when making predictions, improving its understanding of context and relationships between words
        # (simplified by using word and token interchangeably)

        # A buffer is a tensor that is not a model parameter but still needs to saved and restored during model checkpoint
        # Buffers are typically used to stored fixed statistics or other intermediate results that should be part of the model's state but do not require gradient update
        self.register_buffer('tril', torch.tril(torch.ones(context, context))) # mask out all the knowledge about future tokens
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        BS, SL, VS = x.shape
        q = self.queries(x) # BS, SL, 54
        k = self.keys(x) # BS, SL, 54
        v = self.values(x) # BS, SL, 54

         # attention weight
        attn_w = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5 # BS, SL, 54 @ BS, 54, SL = BS, SL, SL
        # k.shape[-1]**-0.5 is to prevent the weight to be too big
        # this multification is showing alignment (relationship)
        # large positive number(algined, same direction), large negative number(aligned, opposite direction)
        # close to zero: vectors are orthogonal(perpendicular) to each other
        # The result: the first row will show the compatbility of first token's alignment with all the other tokens in sequence
        attn_w = attn_w.masked_fill(self.tril[:SL,:SL]==0, float('-inf')) # change 0 to negative infinity
        attn_w = F.softmax(attn_w, dim=-1) # BS, SL, SL
        attn_w = self.dropout(attn_w)

        x = attn_w @ v # BS, SL, 54
        return x
        # It is the Dot Product of first token attention scores with second embedding dimension of each of the 512 tokens
        # The second embedding dimension of the resulting first token embedding is a weighted sum of the second embedding dimension of the 512 tokens.
        # Where the weights are the attention scores between that token and each of the 512 tokens

In [17]:
head_size = embed_size // n_heads
print(f"embed: {embed_size} n_heads: {n_heads} head_size: {head_size}")

embed: 384 n_heads: 7 head_size: 54


In [18]:
torch.tril(torch.ones(context, context))

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 1., 0., 0.],
        [1., 1., 1.,  ..., 1., 1., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [None]:
# OPTIONAL (after doing it, take out the Loss2 from the output of the model)
# manual dive for attention
'''
x,y = get_batch("train")
print(x.shape, y.shape)

x = x.to(device)
y = y.to(device)

embeddings = nn.Embedding(vocab_size, embed_size).to(device)
positions = nn.Embedding(context, embed_size).to(device)
queries = nn.Linear(embed_size, head_size, bias=BIAS).to(device)
keys = nn.Linear(embed_size, head_size, bias=BIAS).to(device)
values = nn.Linear(embed_size, head_size, bias=BIAS).to(device)
tril = torch.tril(torch.ones(context,context)).to(device)

emb = embeddings(x)
pos = positions(torch.arange(context, device=device))
x = emb + pos

q = queries(x) 
k = keys(x) 
v = values(x) 
print(q.shape, k.shape, v.shape)
torch.set_printoptions(precision=2, sci_mode=False)
# torch.set_printoptions(precision=4, threshold=1000, edgeitems=3, linewidth=80, profile='default', sci_mode=True)
#print(q[0][0])

attn_w = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5
attn_w = attn_w.masked_fill(tril[:context,:context]==0, float('-inf'))
attn_w = F.softmax(attn_w, dim=-1)
x = attn_w @ v
'''

In [None]:
# OPTIONAL
# Understand Attention Matrix
'''
full = q @ k.transpose(-2,-1) # 512 x 54 @ 54 x 512

a = q[0][5] # embedding of 54 numbers of of fifth token of the first batch
b = k.transpose(-2,-1)[0,:,3]  # embedding of 54 numbers of of third token of the first batch
print(a,b)
c = torch.dot(a,b)
print(c)
print(full[0][5][3])
'''

In [None]:
# OPTIONAL
# Understand the updating of the V content
'''
print(attn_w.shape, v.shape)

print(x[0][7])
x = attn_w @ v # 512 x 54

attn_scores2 = attn_w[0, 7, :] # Shape [512], 7th token with 512 compatibility with all the 512 tokens
# after 7 values, it will be all zero, because we don't care about future token when we consider about 7th token
# Initalize a tensor to store the result
result = torch.zeros(54)
# Compute the dot product for each column in v for the first token in the first batch
for i in range(54):
    result[i] = torch.dot(attn_scores2, v[0,:,i])
print(result)
'''

In [19]:
x,y = get_batch("train")

print(x[0][:10])
print(y[0][:10])

model = GPT()
model = model.to(dtype)
model = model.to(device)

logits, loss = model(x,y)
print(loss.item())

# OPTIONAL (after doing it, take out the Loss2 from the output of the model)
# comparing loss and loss2(manual simplified calculation)

#logits, loss, loss2 = model(x,y)
#print(loss.item(), loss2.item()) # check if manual calcualtion and PyTorch Calculation value is the same
# this won't be always the same because the manual calcuation is simpler version of the calculation      

tensor([ 299,  610,  376, 1181, 1321, 4051,   13, 4064, 4034,  299],
       device='mps:0')
tensor([ 610,  376, 1181, 1321, 4051,   13, 4064, 4034,  299,  866],
       device='mps:0')
8.375


In [20]:
@torch.no_grad() #decorator, it's an extra feature to a function, in this case, run the function without tracking its operations for gradient calculations
def generate_sample(input):
    t1 = torch.tensor(encode(input), dtype=torch.long, device=device)
    t1 = t1[None, :] #(1, [size of the ids])
    newgen = model.generate(t1, max=64)[0].tolist()
    result=decode(newgen)
    print(f"{result}")

# generate_sample("Once upon a time")

In [21]:
# TRAINING SETUP

model = GPT()
model = model.to(dtype)
model = model.to(device)

if compile: 
# requirement to use torch.compile(), this makes ML models run faster and more efficient by converting them into a form that the computer can excute more quickly
    print("Torch :: Compiling model")
    model = torch.compile(model)

print(sum(p.numel() for p in model.parameters()) / 1e6, "Million parameters")
# GPT-3: 175 Billion parameters

19.837954 Million parameters


In [22]:
# Calculate Loss averages
@torch.no_grad()
def calculate_loss():
    out={}
    model.eval()
    for split in ['train','eval']:
        l=torch.zeros(eval_iters)
        for i in range(eval_iters):
            x,y = get_batch(split)
            _, loss = model(x,y)
            l[i]=loss
        out[split]=l.mean().item()
    model.train()
    return out

l = calculate_loss()
print(l)

[Loss Diff] Pytorch:8.375 Manual:8.4375
{'train': 8.393750190734863, 'eval': 8.393750190734863}


In [23]:
# Setting up the optimizer

p_dict = {p_name: p for p_name,p in model.named_parameters() if p.requires_grad}

weight_decay_p = [p for n, p in p_dict.items() if p.dim() >= 2]
no_weight_decay_p = [p for n, p in p_dict.items() if p.dim() < 2]


optimizer_groups = [
    {'params': weight_decay_p, 'weight_decay': weight_decay},
    {'params': no_weight_decay_p, 'weight_decay': 0.0}
]
optimizer = torch.optim.AdamW(optimizer_groups, lr=lr, betas=(0.9,0.99))
# betas: control the exponential moving averages of the gradient and its square, which are essential components of the Adam and AdamW algorithm

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_iters, eta_min=lr/10)
# through the training process, change the learning rate(lr), likely slowing down

start_iteration = 0
best_val_loss = float('inf')

In [24]:
# Laading Checkpoints

def load_checkpoint(path):
    print("LLM - Loading model")
    checkpoint = torch.load(path, map_location=torch.device(device))
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    iteration = checkpoint['iteration']
    loss = checkpoint['loss']
    print(f"Loaded iter {iteration} with loss {loss}")
    return iteration, loss
    
if os.path.exists(f"{checkpoint_dir}/{checkpoint_load_fn}") and load_pretrained:
    print("Loading Checkpoint")
    start_iteration, loss = load_checkpoint(checkpoint_dir + checkpoint_load_fn)

Loading Checkpoint
LLM - Loading model
Loaded iter 84700 with loss 2.288281202316284


In [25]:
# INFERENCE

if inference == True:
    model.eval()
    while True:
        qs = input("Enter text (q to quit): ")
        if qs == "":
            continue
        if qs == "q":
            break
        generate_sample(qs)
    #sys.exit() # this is commented out because it doesn't make sense in jupyter notebook structure
    

Enter text (q to quit):  Once upon a time


Once upon a time in front of sand, he was captured by the goal geting out of sand and all the other enemies were killed. Once missovae began to reach campo perceived to return across the Alleblad sand flee and honour him be avant-


Enter text (q to quit):  Divya is


Divya is a friend of Arthur Sumcliffe.

It was recommended that he could commit extortion with Palmer in a city near But later.

Raymond Park

Raymond Rome Hill is an American direct-to-video album directed


Enter text (q to quit):  q


In [None]:
# TRAINING LOOP

try:
    for i in tqdm(range(start_iteration, train_iters)):
        xb, yb = get_batch("train")
        logits, loss = model(xb,yb)
    
        # Evaludating loss
        if (i % eval_interval == 0 or i == train_iters -1):
            l = calculate_loss()
            print(f"\n{i}: train loss: {l['train']} / val loss: {l['eval']}")
            generate_sample("Once upon a time")
    
            if l['eval'] < best_val_loss:
                best_val_loss = l['eval']
                print("[CHECKPOINT]: Saving with loss: ", best_val_loss)
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': best_val_loss,
                    'iteration': i,
                }, checkpoint_dir + checkpoint_fn)
    
            if wandb_log:
                wandb.log({
                    "loss/train": l['train'],
                    "loss/val": l['eval'],
                    "lr": scheduler.get_last_lr()[0],
                },
                step = i)
    
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
    
        nn.utils.clip_grad_norm(model.parameters(), max_norm=grad_clip)
    
        optimizer.step() # Tweak weights
        scheduler.step() # Changing the learning rate

    if wandb_log:
        wandb.finish()

except KeyboardInterrupt:
    print("Training interrupted, Cleaning up...")

finally:
    # Release GPU memory
    # torch.cuda.emty_cache()
    print("GPU memory released")
    sys.exit(0)

# torch.cuda.emty_cache()    