In [1]:
# Remove any previous checkout first so the clone below always starts from an empty target.
!rm -rf llm
# Clone the library repository into ./llm; later cells import from the `llm.bhasa` package.
!git clone https://github.com/pankajr141/llm.git

Cloning into 'llm'...
remote: Enumerating objects: 78, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 78 (delta 36), reused 58 (delta 19), pack-reused 0 (from 0)[K
Receiving objects: 100% (78/78), 47.22 KiB | 6.75 MiB/s, done.
Resolving deltas: 100% (36/36), done.


# Existing Model to be used for retraining

In [18]:
# List the contents of the Kaggle input directory `llm-bhasa`
# (presumably the previously trained model named in the heading above — verify).
!ls /kaggle/input/llm-bhasa

# LLM Training

## Training - Direct through library

In [2]:
%%time

# Training module of the library; the commented-out call below uses its `train` function.
from llm.bhasa import training
# Checkpoint path; later cells load the model from, and save it to, this file.
model_filepath="/kaggle/working/model_and_optimizer.pth"
# One-call training entry point. It is left commented out here; the next section
# runs the individual steps by hand instead.
# training.train(num_epochs=1, eval_freq=100)

## Training - Let's break it down

Below, we break the single-line `training.train(...)` call shown above into its building blocks to get finer-grained control over each step.

In [1]:
import os
import torch
import torch.nn as nn
from llm.bhasa import data
from llm.bhasa import config
from llm.bhasa import model
from llm.bhasa import training
from llm.bhasa import generator
from llm.bhasa import data, dataset
from llm.bhasa import tokenizer as tokenizer_lib

ModuleNotFoundError: No module named 'torch'

In [4]:
# Model configuration taken from the library's GPT_CONFIG_124M preset.
config_train = config.GPT_CONFIG_124M
# Context length from the config; the dataloader cell reuses it as both
# window length and stride.
context_len = config_train["context_length"]

# Checkpoint file the model is loaded from and saved to.
model_filepath = "/kaggle/working/model_and_optimizer.pth"

### Download data

In [15]:
%%time

gutenberg_book_ids = range(100)
filepaths = data.download_sample_text(gutenberg_book_ids=gutenberg_book_ids, verbose=False)
textdata = data.read_filepaths(filepaths)

print(f"Total Downloaded Books: {len(os.listdir('gutenberg_books'))}")

Total Downloaded Books: 72
CPU times: user 709 ms, sys: 247 ms, total: 956 ms
Wall time: 43.4 s


In [6]:
# Same alias as in the main import cell above; repeating the import here is harmless.
from llm.bhasa import tokenizer as tokenizer_lib
# Tokenizer object; later cells call `tokenizer.encode(...)` and pass it to training.
tokenizer = tokenizer_lib.get_tokenizer()

In [7]:
# Report the size of the corpus in two units: raw characters, and tokens after encoding.
total_characters = len(textdata)                # Number of characters in textdata
total_tokens = len(tokenizer.encode(textdata))  # Convert/Encode textdata -> tokens to be passed to LLM
print(f"Characters: {total_characters}\nTokens: {total_tokens}")

Characters: 36845989
Tokens: 10057729


In [8]:
# Split the text into a training part and a validation part.
# train_ratio=0.70 presumably sends 70% of the text to training — confirm in training.split_data.
train_data, val_data = training.split_data(textdata, train_ratio=0.70)

### Data Loaders

In [9]:
# Creating data loader for both train and validation
train_loader = dataset.create_dataloader(train_data, batch_size=2, max_length=context_len, stride=context_len,
                                         drop_last=True, shuffle=True, num_workers=0)

val_loader = dataset.create_dataloader(val_data, batch_size=2, max_length=context_len, stride=context_len,
                                        drop_last=False, shuffle=False, num_workers=0)

### Model Definition

In [27]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays the selected device.
device

device(type='cuda')

In [34]:
# Build the network from the training config, then restore the saved checkpoint
# so this run resumes from the previously trained model.
model_llm = model.LLMModel(config_train)
model_llm = model.load_model(model_llm, model_filepath)
model.print_model_information(model_llm)

# Wrap in DataParallel only when more than one CUDA device is visible.
if torch.cuda.device_count() > 1:
    model_llm = nn.DataParallel(model_llm)

# Move the (possibly wrapped) model to the selected device; Module.to returns
# the same module, so rebinding the name changes nothing.
model_llm = model_llm.to(device)
print("Model Defined")

Loading Model weights from /kaggle/working/model_and_optimizer.pth
# Total number of parameters: 163,009,536
# Token embedding layer shape: torch.Size([50257, 768])
# Output layer shape: torch.Size([50257, 768])
# Number of trainable parameters considering weight tying: 124,412,160
# Total size of the model: 621.83 MB
Model Defined


In [35]:
# AdamW over all model parameters, with learning rate 4e-4 and weight decay 0.1.
# NOTE(review): the checkpoint is saved together with the optimizer further down, but no
# optimizer state is loaded here — confirm whether resuming should restore it as well.
optimizer = torch.optim.AdamW(model_llm.parameters(), lr=0.0004, weight_decay=0.1)

### Let's train

In [36]:
%%time

# Training LLM model from scratch
train_losses, val_losses, tokens_seen = training.train_model(model_llm, 
                                                             train_loader, 
                                                             val_loader, 
                                                             optimizer, 
                                                             device, 
                                                             num_epochs=2, 
                                                             eval_freq=100, 
                                                             eval_iter=5,
                                                             start_context="Every effort moves you", 
                                                             tokenizer=tokenizer)
model.save_model(model_llm, optimizer, model_filepath)

Ep 1 (Step 000000): Train loss 6.976, Val loss 6.313
Ep 1 (Step 000100): Train loss 5.460, Val loss 6.158
Ep 1 (Step 000200): Train loss 5.695, Val loss 5.999
Ep 1 (Step 000300): Train loss 5.530, Val loss 6.000
Ep 1 (Step 000400): Train loss 4.918, Val loss 5.950
Ep 1 (Step 000500): Train loss 5.149, Val loss 5.890
Ep 1 (Step 000600): Train loss 5.122, Val loss 5.844
Ep 1 (Step 000700): Train loss 5.105, Val loss 5.783
Ep 1 (Step 000800): Train loss 4.854, Val loss 5.763
Ep 1 (Step 000900): Train loss 4.598, Val loss 5.748
Ep 1 (Step 001000): Train loss 4.802, Val loss 5.760
Ep 1 (Step 001100): Train loss 4.816, Val loss 5.702
Ep 1 (Step 001200): Train loss 4.456, Val loss 5.632
Ep 1 (Step 001300): Train loss 4.798, Val loss 5.679
Ep 1 (Step 001400): Train loss 4.733, Val loss 5.659
Ep 1 (Step 001500): Train loss 4.293, Val loss 5.635


KeyboardInterrupt: 

### Plot Results Train vs Validation

In [None]:
# X-axis for the loss plot: one evenly spaced point per recorded training loss,
# running from 0 to the number of epochs. Requires `num_epochs` to exist as a
# notebook variable holding the epoch count used for training.
epochs_tensor = torch.linspace(start=0, end=num_epochs, steps=len(train_losses))
training.plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)