In [None]:
# Small LLM / Notebook created by Javier Ideami (ideami.com)
# Typical LLMs need many GPUs and millions of dollars to be trained
# This code trains a small LLM with a single GPU and little GPU memory 
# Of course results are not like a chatGPT, but they are good enough to see how the LLM trains to go
# from random combinations of letters to actual words and phrases that are sometimes decently coherent
# GPT3 has 175 Billion parameters. GPT4 has many, many more.
# This model has only 19 Million parameters with its default settings. That's why it's perfect for learning
# and experimenting

# Official notebook #vj30

In [None]:
#### For GOOGLE COLAB and similar platform Users:
#### Make sure to select a GPU in the online platform. Don't run this code with a CPU (it will be too slow)

# If you are running this code locally, your GPU should be selected automatically

In [None]:
# Uncomment and run the following installation lines ONLY if you haven't installed these libraries already outside of the notebook
#!pip install ipdb -q
#!pip install tqdm -q
#!pip install sentencepiece -q
#!pip install wandb -q

# And if you are not in Google Colab and you didn't yet install Pytorch, make sure to do it:
# find the ideal pytorch installation command at https://pytorch.org/get-started/locally/

In [None]:
# You can use this command to view information about your GPU and the amount of free memory it has
# Make sure that you have at least 4GB of free GPU memory to do this course
!nvidia-smi 
# If you are using Google Colab or a similar online platform, make sure to select a GPU in the menus
# In Google colab, at the moment the option is within the Runtime menus

In [None]:
### Import necessary libraries

import os, sys
import ipdb # for debugging
from tqdm import tqdm
from datetime import datetime
import platform, shutil # detect platform type
import requests, zipfile, io 

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

import sentencepiece as spm # For the tokenizer

# Global PyTorch backend setup, executed once right after the imports.
# These lines improve performance for Ampere Architecture (e.g: A100s)
# by allowing the TF32 format in matrix multiplications and cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
# Empty GPU cache memory
# (presumably so that re-running the notebook in the same kernel starts from a clean slate)
torch.cuda.empty_cache()


In [None]:
# Download necessary files and create necessary folders
# wiki.txt - dataset: a tiny segment of the English Wikipedia
# wiki_tokenizer.model: trained tokenizer file (in another notebook I show you how to produce this file)
# wiki_tokenizer.vocab: trained tokenizer file (in another notebook I show you how to produce this file)
# encoded_data.pt (dataset tokenized with the tokenizer)
# I will explain how to produce encoded_data.pt - because it takes quite a bit to process, it's nice to have it in advance

# NOTE: Downloading will take a while, be patient. You can refresh your folder from time to time to see when the files
# have been created. If you have any problems downloading the files with this code, I have also added llm_train.zip
# to the downloadable resources of this lecture (however, best option is to use this code, because then you don't need
# to upload the files or do anything else)

# URL of the zip archive that bundles the dataset, the tokenizer files and the encoded data
files_url = "https://ideami.com/llm_train"

# Downloading proceeds if we detect that one of the key files to download is not present
if not os.path.exists("encoded_data.pt"):
    print("Downloading files using Python")
    # The timeout only limits how long we wait for the server to respond / send more data,
    # so a large (slow) download is fine but a stalled connection no longer hangs forever
    response = requests.get(files_url, timeout=120)
    # Fail with a clear HTTP error instead of trying to unzip an error page
    response.raise_for_status()
    # The whole archive is held in memory and extracted into the current folder
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(".")
else:
    print("you seem to have already downloaded the files. If you wish to re-download them, delete the encoded_data.pt file")



In [None]:
# Set main parameters

# All the values below are module-level settings that the following cells read directly.

# ARCHITECTURE PARAMETERS
batch_size= 8 # How many samples do we train at once (set as needed, typical range 8 to 128)
              # 8 is good for a GPU with 4GB of memory, 128 is good for a GPU with 24GB of memory
context=512 # Sequence length used for training, 512 is a good compromise for our level of resources
embed_size=384 # Embedding size
n_layers = 7 # Number of transformer layers
n_heads = 7 # Number of heads within each layer
            # NOTE: 384 is not divisible by 7, so each head gets 384 // 7 = 54 dimensions and the
            # concatenated heads are 378 wide; the Multihead layer projects them back to embed_size
BIAS = True # Do we want Bias parameters?

# HYPERPARAMETERS
lr = 3e-4 # Initial learning rate
dropout=0.05 # Dropout percentage
weight_decay = 0.01 # Weight decay regularizer
grad_clip = 1.0 # Gradient clipping to prevent gradient explosion

# TRAINING parameters
train_iters = 100000 # Maximum number of training iterations
eval_interval=50 # How often do we evaluate the performance?
eval_iters=3 # Number of iterations while we evaluate performance
compile = False # Compile will accelerate performance in compatible systems
                # NOTE: this name shadows Python's builtin compile() for the rest of the notebook
load_pretrained = False # Do we want to load a pretrained model to continue training?

checkpoint_dir = 'models/'  # Where do we store checkpoints?
# Keep the trailing slash: checkpoint paths are built with checkpoint_dir + file name

checkpoint_fn = "latest.pt" 
# Name of checkpoint file to be saved during training

checkpoint_load_fn = "latest.pt" 
# Name of checkpoint file to be loaded when load_pretrained is True
# You can load llm2.pt to experiment with a checkpoint that already reached 2.31 of loss

dtype = torch.bfloat16 # our target internal data type (the whole model is cast to it)

# MODE
# Do we want to run the model in inference mode?
inference=False 

# DEVICE - Sets device to GPU or CPU (use GPU always)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device: You will be using: ",device)


In [None]:
# LOGGING parameters
# When you run this cell, it will ask you to enter your Wandb API Key, which you
# can find at your account on https://wandb.ai/settings#api
wandb_log = True  # Set to False to train without Weights & Biases logging
wandb_project = "test"
# Run name: fixed prefix followed by the current date and time
wandb_run_name = f"test-run{datetime.now():%Y_%m_%d_%H_%M_%S}"

# wandb is only imported (and a run only started) when logging is enabled
if wandb_log:
    import wandb
    wandb.init(name=wandb_run_name, project=wandb_project)

# The first time you run this logging code set to True, the weights and biases library
# will ask you for an API key. You can follow the instructions in the video, or you can
# also simply click on a link that should appear when you run this cell, pointing to this
# address: https://wandb.ai/authorize  
# Going to that address will allow you to quickly get an API key as well


In [None]:
# Read the whole dataset into memory as a single string
with open('wiki.txt', mode='r', encoding='utf-8') as wiki_file:
    text = wiki_file.read()

# Show a 500-character excerpt to check that the file was read properly
print(text[10000:10500])

In [None]:
# SENTENCEPIECE TOKENIZER

# Load the trained SentencePiece tokenizer
# (check that model_file is pointing to the right file)
sp = spm.SentencePieceProcessor(model_file='wiki_tokenizer.model')

# Number of pieces (tokens) known by the tokenizer
vocab_size = sp.get_piece_size()
print(f"Tokenizer vocab_size: {vocab_size}")


def encode(s):
    """Tokenize the string `s` with the loaded tokenizer and return its token ids."""
    return sp.Encode(s)


def decode(l):
    """Turn the token ids `l` back into text with the loaded tokenizer."""
    return sp.Decode(l)


# Round trip through both functions to check that they work well
print(decode(encode("Encoding Decoding functions ready")))

In [None]:
# Tokenization of the dataset
if not os.path.exists("encoded_data.pt"):
    # No cached encoding yet: tokenize the full text and cache the result on disk
    print("Encoding data")
    data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(data, 'encoded_data.pt')
else:
    # Reuse the encoding that was saved (or downloaded) previously
    print("Loading saved encoded data")
    data = torch.load('encoded_data.pt')


In [None]:
data_size = len(data)  # Total number of tokens in the dataset

# Split point: the first 90% of the tokens are used for training, the last 10% for validation
spl = int(0.9*data_size)
train_data, val_data = data[:spl], data[spl:]
print(f'Total data: {data_size/1e6:.2f} Million | Training: {len(train_data)/1e6:.2f} Million | Validation: {len(val_data)/1e6:.2f} Million')

# data[:30] : shows the first 30 token IDs

In [None]:
############## HELPER FUNCTIONS ###########################

# Return a batch of either training or evaluation data
def get_batch(split):
    """Sample a random batch of inputs and next-token labels.

    split: "train" picks the training data, any other value picks the validation data.
    Returns (x, y), both of shape (BS, SL) and moved to `device`; y is x shifted
    one token to the right, so y[b][t] is the token that follows x[b][t].
    """
    # BS = Batch Size / SL = Sequence Length or context length
    source = train_data if split=="train" else val_data
    # One random start position per sample, chosen so that the label window still fits
    starts = torch.randint(len(source)-context, (batch_size,)).tolist() # BS start offsets
    x = torch.stack([source[s: s+context] for s in starts]) # (BS,SL)
    y = torch.stack([source[s+1: s+context+1] for s in starts]) # (BS,SL)

    # Examples of what it returns
    # # First 10 elements of first batch of inputs and labels
    #x[0][:10] -> tensor([ 664,  278, 4031, 4056, 4065, 4062, 4062, 4051, 13, 13])
    #y[0][:10] -> tensor([ 278, 4031, 4056, 4065, 4062, 4062, 4051,   13, 13, 4066])

    return x.to(device), y.to(device)



In [None]:
# Uncomment to test your get_batch function
#x,y=get_batch("train")
#print(f"x.shape: {x.shape}")
#print(f"y.shape: {y.shape}")
#print(x[0][:10])
#print(y[0][:10])

In [None]:
#################################################################################
################## LLM MODEL #############################################
# 19 million parameters with the default configuration
# Can be trained with 1 single GPU
# With 8 Batch Size, should require 4 GB of GPU Memory
# With 128 Batch Size, should require 24 GB of GPU Memory
# Adjust Batch Size as needed for less or more memory and training speed
# Because of small dataset and model, results will be limited but enough to
# demonstrate good improvement during the training and understand all the
# main technology involved in building LLMs
#################################################################################
###############################################
##################################

class GPT(nn.Module):
    """Small GPT-style language model.

    Token and position embeddings are summed, passed through `n_layers` transformer
    blocks, normalized and projected to one logit per vocabulary entry.
    The sizes come from the notebook settings (vocab_size, embed_size, context,
    n_layers, n_heads, BIAS).
    """

    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size,embed_size) # Create embedding layer
        self.positions = nn.Embedding(context, embed_size) # Create basic positioning embeddings
        self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)]) # setup transformer blocks
        self.ln = nn.LayerNorm(embed_size) # normalization layers
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS) # feedforward linear layer
        self.apply(self._init_weights) # Initialize the weights

    # Weights initialization
    def _init_weights(self, module):
        """Initialize one submodule; called for every submodule through self.apply."""
        if isinstance(module, nn.Linear):
            # Initialize weight matrices with normal distribution with mean 0 and small std
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            # Initialize bias parameters to 0
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        # Initialize Embedding weights with normal distribution with mean 0 and small std
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    # Running the LLM model
    def forward(self, input, targets=None):
        """Run the model on a batch of token ids.

        input: (BS,SL) token ids, with SL <= context.
        targets: optional (BS,SL) token ids used as labels.
        Returns (logits, loss). Without targets: logits is (BS,SL,vocab) and loss is None.
        With targets: logits is reshaped to (BS*SL,vocab) and loss is the cross entropy.
        """
        # BS = Batch Size / SL = Sequence Length or context length
        # For easier reading, I assume embedding dim of 384 and vocab size of 4096 in comments
        loss= None
        BS,SL = input.shape  # (BS,SL)
        emb = self.embeddings(input)  # (BS,SL,384)
        # Position ids are created on the device of the input (not on the global `device`),
        # so the model keeps working wherever its inputs and weights actually live
        pos = self.positions(torch.arange(SL, device=input.device)) # (SL,384)
        x = emb+pos  # combine embedding and positioning stages (BS,SL,384)
        x = self.blocks(x)  #(BS,SL,384)
        x = self.ln(x) # (BS,SL,384)
        logits = self.final_linear(x) # (BS,SL,4096)

        # Calculate Loss if training with targets

        # Cross Entropy Logic
        # (equivalent to negative log likelihood)

        # Information: -log p(x) (inverse of probability)
        # Entropy: avg of information in random variable (prob distribution): - sum_x (p(x) * log p(x))
        # CrossEntropy: Compares 2 distr q(true) & p(predicted) in terms of information distance: -sum_x (q(x) * log p(x))
        # LLMs CrossEntropy: true labels are 1 for true, 0 for the rest, so it simplifies to: -log p(true token),
        # averaged over all the positions

        if targets is not None:
            BS, SL, VS = logits.shape  # (BS,SL,4096)
            logits = logits.view(BS*SL,VS)  # Reshape to prepare for cross_entropy (BS*SL,4096)
            targets = targets.view(BS*SL)   # Reshape as well (BS*SL)
            loss = F.cross_entropy(logits,targets)

            # Optional: Just for fun, manual way to calculate cross_entropy
            # By default, we comment out the manual version to prevent calculating the loss twice (will make things slower)

            # First apply softmax to produce probabilities
            #counts = logits.exp()  # (BS*SL,4096)
            #prob = counts / counts.sum(-1, keepdim=True) # (BS*SL,4096),(BS*SL,1) = (BS*SL,4096)
            #loss2 = -prob[torch.arange(BS*SL),targets].log().mean() # torch.arange(BS*SL) (BS*SL) | targets (BS*SL)

            # Finally at each of prob's positions, we pick the index specified by the respective target
            # example: targets[3]=329, prob[3][329] = 0.014

            # Most times they will match, sometimes they will not because F.cross_entropy is more precise
            # By uncommenting the following lines, you can see when they don't match
            #if ( not torch.allclose(loss,loss2)):
            #    print(f"[Loss Diff] Pytorch:{loss.item()} Manual:{loss2.item()}")

        return logits,loss

    # Generate a new sample
    def generate(self, input, max=500):
        """Extend `input` with `max` sampled tokens and return the full sequence.

        input: (BS, prompt length) token ids.
        max: number of new tokens to sample (the name shadows the builtin, but it is
             kept because callers pass it as a keyword).
        Returns a (BS, prompt length + max) tensor: the prompt followed by the new tokens.
        """
        # SL = Sequence Length or context length
        for _ in range(max): # until you reach the maximum number of tokens
            # The model can only look at the last `context` tokens, so we crop what we
            # feed to it, but we keep the complete sequence in `input`: cropping `input`
            # itself would silently drop the beginning of the returned text
            window = input[:,-context:] #(BS, input length until max of SL)
            logits, _ = self(window)  # (BS, window length, 4096)
            logits = logits[:,-1,:]  # Pick last position discarding the dimension (BS, 4096)
            probs = F.softmax(logits, dim=-1) # (BS,4096)
            next_token = torch.multinomial(probs, num_samples=1) # Sample next token value (BS,1)
            input = torch.cat((input,next_token),dim=1) # Add new token to the sequence
        return input

In [None]:
########################################
##########Transformer Block Class ######
########################################

class Block(nn.Module):
    """One transformer block: multi-head attention followed by a feed forward network.

    Attention lets the members of the sequence communicate with each other, the feed
    forward network then processes each position. Both sub-layers receive a normalized
    input and their output is added back to the running features (residual connection).
    """

    def __init__(self, n_heads):
        super().__init__()
        # The embedding dimensions are split among the heads: head_size = embed_size // n_heads
        self.ma = Multihead(n_heads, embed_size // n_heads)
        self.feed_forward = ForwardLayer(embed_size)
        # LayerNorm normalizes the inputs across the features for each data point independently.
        # It subtracts the mean and divides by the standard deviation, followed by scaling and shifting.
        # It is computationally more intensive than for example RMSnorm but offers greater flexibility.
        self.ln1 = nn.LayerNorm(embed_size) # applied before the attention
        self.ln2 = nn.LayerNorm(embed_size) # applied before the feed forward network

    def forward(self, x):
        # Normalize, attend, and add the result to the input
        attended = self.ma(self.ln1(x))
        x = x + attended
        # Normalize again, run the feed forward network, and add the result
        processed = self.feed_forward(self.ln2(x))
        return x + processed


In [None]:
# The ForwardLayer applies a network that increases the computational complexity of the processing 
class ForwardLayer(nn.Module):
    """Feed forward network of a transformer block.

    Expands the features to 6 times the embedding size, applies GELU, projects them
    back to the embedding size and applies dropout.
    """

    def __init__(self,embed_size):
        super().__init__()
        hidden_size = 6*embed_size
        expand = nn.Linear(embed_size, hidden_size, bias=BIAS)
        project = nn.Linear(hidden_size, embed_size, bias=BIAS)
        # Same layer order as always: the positions inside the Sequential name the saved parameters
        self.network = nn.Sequential(expand, nn.GELU(), project, nn.Dropout(dropout))

    def forward(self,x):
        return self.network(x)

In [None]:
# Multihead Attention Layer
# This layer coordinates the different attention heads within each transformer block
class Multihead(nn.Module):
    """Runs several attention heads side by side and merges their outputs.

    Every head receives the same input; the head outputs are concatenated along the
    feature dimension and projected back to the embedding size.
    """

    def __init__(self,n_heads, head_size):
        super().__init__()
        # head_size = embed_size // n_heads
        head_list = [Head(head_size) for _ in range(n_heads)]
        self.heads = nn.ModuleList(head_list)
        # With our default values the concatenated heads are 54 * 7 = 378 wide -> projected to 384
        self.combine = nn.Linear(n_heads * head_size, embed_size, bias=BIAS)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # BS = Batch Size / SL = Sequence Length or context length
        # x is (BS,SL,384)  # 384 is default embed size
        per_head = [head(x) for head in self.heads]  # each one is (BS,SL,head_size)
        merged = torch.cat(per_head, dim=-1)  # (BS,SL,378) with the default values
        projected = self.combine(merged)  # back to embed_size (BS,SL,384)
        return self.dropout(projected)

In [None]:
# Head Attention Layer
# Detects and reinforces patterns in relationships between members of sequence
class Head(nn.Module):
    """Single causal self-attention head.

    Every position builds a query, a key and a value. The scaled query/key dot products
    become attention weights over the current and the previous positions only, and
    those weights are used to mix the values.
    """
    # BS = Batch Size / SL = Sequence Length or context length

    def __init__(self, head_size):
        super().__init__()
        # Projections from the embedding size to the head size, (384, 54) with the default values
        self.queries = nn.Linear(embed_size, head_size, bias=BIAS)
        self.keys = nn.Linear(embed_size, head_size, bias=BIAS)
        self.values = nn.Linear(embed_size, head_size, bias=BIAS)
        # Lower triangular matrix (1s in lower triangle + diagonal, 0s above) used to hide
        # the future tokens from the current position. Stored as a buffer: it is not trained.
        causal = torch.tril(torch.ones(context,context))  # (SL,SL)
        self.register_buffer('tril', causal)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        _, SL, _ = x.shape  # x is (BS,SL,embed_size)
        q, k, v = self.queries(x), self.keys(x), self.values(x)  # each one is (BS,SL,head_size)

        # Square matrix of attention scores: dot product of q and k, scaled by 1/sqrt(head_size)
        scale = k.shape[-1]**-0.5
        scores = q @ k.transpose(-2,-1) * scale  # (BS,SL,SL)

        # Future positions (the 0s of the triangular matrix) get -inf, so that they
        # receive a probability of 0 in the softmax
        is_future = self.tril[:SL,:SL]==0
        scores = scores.masked_fill(is_future, float('-inf'))

        weights = F.softmax(scores, dim=-1)  # Transform into probabilities (BS,SL,SL)
        weights = self.dropout(weights)

        # Use the attention weights to update the features of our tokens
        return weights @ v  # (BS,SL,head_size)

In [None]:
#################################################################################
# Main Training Process
#################################################################################

# Main Setup

# Instantiate the LLM, set its precision type and move it to the right device
model = GPT().to(dtype).to(device)

# torch.compile produces an optimized version of the model, aiming to improve runtime
# performance and efficiency. Keep `compile` disabled if your system doesn't support it.
if compile:
    print("Torch :: Compiling model")
    model = torch.compile(model)


# Print the number of parameters of our model (19 million in our case)
print(sum(map(torch.Tensor.numel, model.parameters())) / 1e6, " Million parameters")

In [None]:
# Calculate the Loss
@torch.no_grad()  # No gradients are needed while evaluating
def calculate_loss():
    """Estimate the current training and validation loss.

    For each split, averages the loss of `eval_iters` random batches.
    Returns a dict with the keys 'train' and 'eval' holding the mean losses as floats.
    The model is put in eval mode during the measurement and back in train mode afterwards.
    """
    model.eval()
    out = {}
    for split in ('train', 'eval'):
        losses = torch.zeros(eval_iters)  # One slot per evaluation batch
        for step in range(eval_iters):
            x, y = get_batch(split)  # Any split name other than "train" selects the validation data
            losses[step] = model(x, y)[1]  # The model returns (logits, loss)
        out[split] = losses.mean().item()
    model.train()
    return out

# Quick check: measure and print the losses of the model we just instantiated
l=calculate_loss()
print(l)

In [None]:
# Generate a new sample
@torch.no_grad()
def generate_sample(input):
    """Continue the text `input` with the model and print the result.

    The prompt is tokenized, extended with 64 sampled tokens, decoded back to text
    and printed. Nothing is returned.
    """
    token_ids = encode(input)  # string -> list of token ids
    prompt = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)  # (1, number of ids)
    generated = model.generate(prompt, max=64)  # limit the output size
    result = decode(generated[0].tolist())  # token ids -> text
    print(f"{result}")

generate_sample("The mountain in my city is") # Generate a sample (at this point the model has not been trained by this notebook yet)

In [None]:
#################################################################################
# Main Training Process
#################################################################################

# Set Weight Decay differently for different kinds of parameters
# parameter dictionary where keys are parameter names, and values are the parameter themselves
# Trainable parameters by name
p_dict = dict((p_name, p) for p_name, p in model.named_parameters() if p.requires_grad) # len: 370

# Parameters with 2 or more dimensions (weight matrices) benefit specially from weight decay
weight_decay_p = list(filter(lambda p: p.dim() >= 2, p_dict.values()))  # len: 171

# The rest (for example the bias parameters) don't benefit from weight decay
no_weight_decay_p = list(filter(lambda p: p.dim() < 2, p_dict.values())) # len: 199

# One optimizer group per kind of parameter, each one with its own weight decay
optimizer_groups = [
    dict(params=weight_decay_p, weight_decay=weight_decay),
    dict(params=no_weight_decay_p, weight_decay=0.0),
]

# The optimizer updates the parameters from their gradients, manages the learning rate
# and applies the weight decay.
# betas: control the exponential moving averages of the gradient and its square,
# which are essential components of the Adam and AdamW optimization algorithms.
optimizer = torch.optim.AdamW(optimizer_groups, lr=lr, betas=(0.9, 0.99))

# The scheduler changes the learning rate through the training: it follows a cosine
# curve over train_iters steps and descends till a minimum of a tenth of the lr
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=train_iters, eta_min=lr/10)

start_iteration = 0  # Replaced by the saved iteration when a checkpoint is loaded
best_val_loss = float('inf')  # Track best loss value


In [None]:
# Loading Checkpoints

# Loads a previously saved checkpoint
def load_checkpoint(path):
    """Restore the model and the optimizer from a checkpoint file.

    path: checkpoint file holding the keys 'model_state_dict', 'optimizer_state_dict',
          'iteration' and 'loss' (the ones written by the training loop).
    Returns (iteration, loss): the iteration at which the checkpoint was saved and
    the loss value stored with it.
    The learning rate scheduler is not part of the checkpoint, so it is not restored here.
    """
    print("LLM - Loading model")
    # map_location moves the saved tensors to the device we are using now, so a checkpoint
    # saved on a GPU can also be loaded on a machine with a different device setup
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict']) # Load parameters
    optimizer.load_state_dict(checkpoint['optimizer_state_dict']) # Load optimizer state
    iteration = checkpoint['iteration'] # In what iteration did we save the model?
    loss = checkpoint['loss'] # What was the last loss value?
    print(f"Loaded iter {iteration} with loss {loss}")
    return iteration, loss

################# OPTIONAL : LOAD A PREVIOUS CHECKPOINT
# The path is built once, so that the file we check is exactly the file we load
checkpoint_load_path = os.path.join(checkpoint_dir, checkpoint_load_fn)
if load_pretrained:
    if os.path.exists(checkpoint_load_path):
        # Continue from the saved iteration and use the saved loss as the value to beat
        start_iteration, loss = load_checkpoint(checkpoint_load_path)
        best_val_loss = loss
    else:
        # Make it visible that we are not resuming, instead of silently starting over
        print(f"Checkpoint {checkpoint_load_path} not found, training will start from scratch")

In [None]:
#### INFERENCE MODE - Activate inference and then exit
if inference==True:
    model.eval()
    # Keep asking for prompts until the user types q; empty lines are ignored
    qs = input("Enter text (q to quit) >>> ")
    while qs != 'q':
        if qs != "":
            generate_sample(qs)
        qs = input("Enter text (q to quit) >>> ")

In [None]:
#################################################################
###################### TRAINING #################################
#################################################################

# Make sure that the checkpoint folder exists before the first checkpoint is saved
# (torch.save does not create missing folders)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

try:
    for i in tqdm(range(start_iteration, train_iters)):
        xb,yb = get_batch("train") # Get a new batch of data
        logits,loss = model(xb,yb) # Run the LLM and get the logits and the loss

        if (i % eval_interval==0 or i == train_iters-1): # Calculate the loss
            l = calculate_loss()
            print(f"\n{i}: train loss: {l['train']} / val loss: {l['eval']}")

            # We do a quick test so that we observe the evolution through the training
            # Remember that we use a very small dataset which doesn't include all topics
            generate_sample("The mountain in my city is") # Generate a sample

            if l['eval'] < best_val_loss: # If we improved the best loss, save a checkpoint
                best_val_loss = l['eval']
                print("[CHECKPOINT]: Saving with loss: ", best_val_loss)
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': best_val_loss,
                    'iteration': i,
                }, checkpoint_dir + checkpoint_fn)

            if wandb_log:
                wandb.log({
                        "loss/train": l['train'],
                        "loss/val": l['eval'],
                        "lr": scheduler.get_last_lr()[0],
                    },
                    step = i)

        optimizer.zero_grad(set_to_none=True) # Reset gradients
        loss.backward() # Calculate new gradients

        # This line clips the gradients to prevent the exploding gradient problem during training.
        # Exploding gradients can occur when gradients become too large, causing unstable updates to model weights.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)

        optimizer.step() # Update the model parameters
        scheduler.step() # Update the learning rate value

except KeyboardInterrupt:
    # Stopping the cell by hand is a normal way to end the training
    print("Training interrupted. Cleaning up...")

finally:
    # Release GPU memory
    torch.cuda.empty_cache()
    print("GPU memory released.")

# Close the wandb run once, whether the training was completed or interrupted
if wandb_log:
    wandb.finish()

# Code designed by Javier ideami
# ideami.com
