In [35]:
# Import libraries
import os, sys
import ipdb # for debugging, variation of pdb
from tqdm import tqdm
from datetime import datetime
import platform, shutil # detect platform type
import requests, zipfile, io

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

# tokenizer
import sentencepiece as spm

# TF32 matmuls/convolutions improve performance on NVIDIA Ampere (and newer) GPUs.
# Setting the flags is harmless on machines without CUDA.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Empty the GPU cache of whichever accelerator backend is actually present,
# so the cell is useful on Apple Silicon (MPS) as well as on CUDA machines.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif hasattr(torch, "mps") and torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [2]:
files_url = "https://ideami.com/llm_train"
print("Downloading files using Python")
# requests waits forever by default; the timeout keeps a stalled connection
# from hanging the notebook kernel.
response = requests.get(files_url, timeout=60)
# Download and extract
#zipfile.ZipFile(io.BytesIO(response.content)).extractall(".")

Downloading files using Python


In [36]:
# ARCHITECTURE PARAMETERS
batch_size = 8 # 8 to 128 and beyond. 8 needs 4GB of GPU, 128 needs 24GB of GPU
context = 512 # sequence length: number of tokens the model sees at once
embed_size = 384 # width of every token / position embedding vector
n_layers = 7
# each block (layer) includes:
# communication: an attention mechanism that learns how the different tokens relate to each other
# computation: a layer that provides complex processing for the network
n_heads = 7
# multi-head attention mechanism:
# the input arrives at the attention mechanism of a block and is divided among a number of
# attention heads, each of which processes part of that input.
# After all the heads do their processing, their results are combined.
BIAS = True

# HYPERPARAMETERS
lr = 3e-4 # learning rate (0.0003)
dropout = 0.05 # dropout: regularization by randomly turning off a fraction of neurons
weight_decay = 0.01 # weight decay (L2 regularization) adds a penalty to the loss function based on the magnitude of the weights
grad_clip = 1.0 # gradient clipping: caps the size of gradients during training to prevent exploding gradients and keep learning stable

# TRAINING PARAMETERS
train_iters = 100000
eval_interval = 50 # evaluate every 50 iterations; validation loss is expected to be higher than training loss, but not by much
eval_iters = 10 # how many validation batches are averaged per evaluation
compile = False # depends on the system; when it works it is faster and more memory efficient. NOTE: this name shadows the builtin compile()
checkpoint_dir = 'models/'
checkpoint_fn = 'latest.pt'
checkpoint_load_fn = 'latest.pt'
dtype = torch.bfloat16

# MODE
inference = False

# DEVICE
# Pick the best available accelerator instead of hard-coding one, so the notebook
# runs unchanged on CUDA machines, Apple Silicon (MPS) and CPU-only machines.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print ("device: you will be using: ", device)

device: you will be using:  mps


In [37]:
# LOGGING
# Weights & Biases run tracking. Set wandb_log = False to skip importing and
# initialising wandb. To switch accounts, run `!wandb login --relogin` first.
wandb_log = True
wandb_project = "llm1"
# The timestamp suffix gives each run its own name.
wandb_run_name = f"llm1{datetime.now():%Y_%m_%d_%H_%M_%S}"

if wandb_log:
    import wandb
    wandb.init(name=wandb_run_name, project=wandb_project)

[34m[1mwandb[0m: Currently logged in as: [33mnickyoon89[0m ([33mnickyoon89-miss-to-mrs-box[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [38]:
# Read the whole corpus into memory as one string, then preview 300 characters
# starting at offset 30000.
with open('wiki.txt', encoding='utf-8') as f:
    text = f.read()
print(text[30000:30000 + 300])

terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


In [39]:
# TOKENIZER
# Load the SentencePiece model stored in wiki_tokenizer.model; `sp` is used by the
# encode/decode helpers defined in a later cell.

sp = spm.SentencePieceProcessor(model_file="wiki_tokenizer.model")

# Number of pieces in the tokenizer; the model sizes its embedding table and its
# output layer from this value.
vocab_size = sp.get_piece_size()
print(f"Tokenizer vocab_size: {vocab_size}")

Tokenizer vocab_size: 4096


In [40]:
def encode(s):
    """Convert a string into a list of token ids using the SentencePiece processor `sp`."""
    return sp.Encode(s)


def decode(l):
    """Convert a list of token ids back into a string using the SentencePiece processor `sp`."""
    return sp.Decode(l)


# Round-trip check: the decoded text should match the original string.
print(encode("Once upon a time"))
print(decode(encode("Once upon a time")))

[612, 370, 698, 265, 261, 684]
Once upon a time


In [41]:
# The encoded corpus is cached on disk so the text is only tokenised when the
# cache file is missing.
if os.path.exists("encoded_data.pt"):
    data = torch.load("encoded_data.pt")
else:
    # Fixed: the keyword was misspelled `dtyle`, which raised a TypeError on the
    # first run (i.e. whenever the cache file did not exist yet).
    data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(data, "encoded_data.pt")

In [42]:
# Split the token stream: the first 90% is used for training and the remaining
# tail for validation.
data_size = len(data)
spl = int(0.9 * data_size)
train_data, val_data = data[:spl], data[spl:]

print(f'Total data: {data_size/1e6:.2f} Million | Training: {len(train_data)/1e6:.2f} Million | Validation: {len(val_data)/1e6:.2f} Million')

Total data: 59.21 Million | Training: 53.29 Million | Validation: 5.92 Million


In [43]:
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: "train" selects train_data; any other value selects val_data.

    Returns:
        A tuple (x, y) of tensors with shape (batch_size, context), moved to
        `device`. y holds the same windows as x shifted one token to the right,
        i.e. the next token for every position of x.
    """
    data = val_data if split != "train" else train_data
    # Start offsets are drawn so that start + context + 1 never runs past the end.
    inds = torch.randint(len(data) - context, (batch_size,))
    x = torch.stack([data[start:start + context] for start in inds])          # (BS, SL), e.g. (8, 512)
    y = torch.stack([data[start + 1:start + context + 1] for start in inds])  # (BS, SL), next-token targets
    return x.to(device), y.to(device)

# Quick look at one batch: y should equal x shifted left by one token.
x, y = get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

torch.Size([8, 512]) torch.Size([8, 512])
tensor([3829, 1224, 4053,  289,  264,  925,  299,  264,  814,  280],
       device='mps:0')
tensor([1224, 4053,  289,  264,  925,  299,  264,  814,  280,  286],
       device='mps:0')


In [86]:
class GPT(nn.Module):
    """Language model made of token + position embeddings, a LayerNorm and a linear head.

    The transformer blocks are not wired in yet (see the commented-out
    `self.blocks` lines). Construction reads the module-level settings
    `vocab_size`, `embed_size`, `context` and `BIAS`.
    """

    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size) # e.g. 4096 x 384
        self.positions = nn.Embedding(context, embed_size) # e.g. 512 x 384
        #self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)]) # same as layer, a transformer is made of a number of blocks/layers
        # In Python the * sign is the unpacking operator: it unpacks the elements of a list and passes them as individual arguments to a function
        self.ln = nn.LayerNorm(embed_size) # Layer normalization: subtract the mean and divide by the standard deviation
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS) # e.g. 384 x 4096 (prediction over all 4096 vocab entries)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) # Gaussian (normal) distribution
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            input: token ids of shape (BS, SL); BS = batch size, SL = sequence length.
            targets: optional token ids of shape (BS, SL).

        Returns:
            (logits, loss). Without targets, logits has shape (BS, SL, vocab) and
            loss is None. With targets, logits is flattened to (BS*SL, vocab) and
            loss is the mean cross-entropy.
        """
        loss = None
        BS, SL = input.shape # BS x SL e.g. (8,512)
        emb = self.embeddings(input) # BS x SL x Embed size (384)
        # Build the position indices on the input's own device so the model does
        # not depend on a global `device` matching where the tensors live.
        pos = self.positions(torch.arange(SL, device=input.device)) # SL x 384
        x = emb + pos
        #x = self.blocks(x)
        x = self.ln(x)
        logits = self.final_linear(x) # BS x SL x (vocab size) 4096
        # logits are the final predictions of the network for each token of each sequence

        if targets is not None:
            BS, SL, VS = logits.shape # BS x SL x 4096
            logits = logits.view(BS*SL, VS)
            targets = targets.view(BS*SL)
            loss = F.cross_entropy(logits, targets)

            # Manual calculation (teaching check against F.cross_entropy)
            #
            # Information: i = -log p(x); a likely event (high probability) carries little information.
            # Entropy is the negative sum of each probability times the log of that probability:
            #   H(x) = - sum (p(x) * log p(x))
            # Cross entropy is the negative sum, over all elements, of the true probability times
            # the log of the predicted probability for the same element:
            #   H(q,p) = - sum (q(x) * log p(x))   # q = true distribution, p = predicted distribution
            # q(x) is 0 everywhere except for the correct token (where it is 1), so only one term survives:
            #   Cross entropy = - log p(correct token)
            #
            # NOTE: exp() without subtracting the row maximum can overflow for large
            # logits; F.cross_entropy above is the value actually returned.
            counts = logits.exp() # make every number positive and exaggerate the gaps
            prob = counts / counts.sum(-1, keepdim=True) # same as F.softmax
            rows = torch.arange(BS*SL, device=logits.device)
            loss2 = -prob[rows, targets].log().mean()
            # e.g. targets[3] = 329 | prob[3][329] = 0.014

            if not torch.allclose(loss, loss2):
                print(f"[Loss Diff] Pytorch:{loss.item()} Manual:{loss2.item()}")
        return logits, loss #, loss2

    # Generate a new sample
    def generate(self, input, max=500):
        """Autoregressively sample `max` new tokens and append them to `input`.

        Args:
            input: token ids of shape (batch, length).
            max: number of tokens to sample.

        Returns:
            Tensor of shape (batch, length + max): the full input followed by
            every sampled token.
        """
        # The model can only attend to as many positions as its positional table holds.
        max_context = self.positions.num_embeddings
        for _ in range(max):
            # Feed only the most recent tokens to the model, but keep the full
            # sequence in `input` so earlier tokens are not lost from the result
            # once it grows beyond the context size.
            window = input[:, -max_context:] # (batch, up to max_context)
            logits, _ = self(window) # (batch, window length, vocab)
            logits = logits[:, -1, :] # keep only the prediction for the last position
            probs = F.softmax(logits, dim=-1) # (batch, vocab)
            next_token = torch.multinomial(probs, num_samples=1) # draw one token id per sequence
            input = torch.cat((input, next_token), dim=1)
        return input

In [89]:
# OPTIONAL sanity check (after doing it, take loss2 out of the model's return value)
x, y = get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

# Build the model, cast its parameters to `dtype`, then move it to `device`.
model = GPT().to(dtype).to(device)

logits, loss = model(x, y)

#logits, loss, loss2 = model(x,y)
#print(loss.item(), loss2.item()) # check whether the manual calculation and the PyTorch calculation give the same value
# these will not always match exactly because the manual calculation is a simpler version of the computation

torch.Size([8, 512]) torch.Size([8, 512])
tensor([ 277, 4053,  278,  362, 4043, 2744, 1704,   61,   13,   13],
       device='mps:0')
tensor([4053,  278,  362, 4043, 2744, 1704,   61,   13,   13,  764],
       device='mps:0')


In [91]:
@torch.no_grad() # decorator: runs the function without tracking operations for gradient calculation
def generate_sample(input):
    """Encode a text prompt, sample 64 new tokens from the global `model` and print the decoded text."""
    prompt_ids = torch.tensor(encode(input), dtype=torch.long, device=device).unsqueeze(0) # (1, number of prompt ids)
    sampled_ids = model.generate(prompt_ids, max=64)[0].tolist()
    print(decode(sampled_ids))

generate_sample("Once upon a time")

Once upon a time music compet March ended earth Canering Oceanshire Class Prov Sometimes deb sing rulba anino Pennsylvania' Co�{ railway specialctor Be met covered youngmonatoryleyeaneteroutianomun broadcastram IIpl)reamure band Association performance Tomard territ�workically Den ra Smith CO going Poland autible�gress
