In [1]:
import os

In [None]:
%%capture
import sys
if "COLAB_" in "".join(os.environ.keys()):
    !git clone https://github.com/megakoresh/genai-labs.git
    !pip install -r genai-labs/requirements-other.txt
    !pip install -r genai-labs/requirements-pytorch.txt
    sys.path.append('/content/genai-labs')

In [3]:
from models.gpt2 import OpenAIModelConfigs
import re

# --- Filesystem layout -------------------------------------------------------------------
# Root folder for the artefacts of this notebook; created up front if it is missing.
files_dir = "data"
# exist_ok=True makes the call safe to repeat and removes the check-then-create race.
os.makedirs(files_dir, exist_ok=True)

# Sub-folders are only named here; this cell does not create them.
data_dir = f"{files_dir}/datasets"
models_dir = f"{files_dir}/models"
downloads_dir = f"{files_dir}/downloads"
plots_dir = f"{files_dir}/plots"

# --- Miscellaneous settings --------------------------------------------------------------
# strftime pattern used for the "training started/finished" timestamps.
ts_format = "%d.%m.%Y %H:%M:%S.%f"
# Source of the instruction fine-tuning data (Stanford Alpaca JSON).
fine_tuning_dataset_url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json"
# Prompt used for the quick generation check after the pretrained weights are loaded.
sample_prompt = "All your base"

# --- Hyperparameters ---------------------------------------------------------------------
# Passed as the last argument to generate().
# NOTE(review): presumably the GPT-2 end-of-text token id - confirm against the tokenizer.
eos_token_id = 50256
# Shared by every DataLoader built in this notebook.
batch_size = 8
num_workers = 0
# LoRA settings: forwarded to replace_linear_with_lora() and embedded in the checkpoint name.
rank = 16
alpha = 32
# peak_lr is the learning rate handed to AdamW.
# NOTE(review): initial_lr and min_lr are not referenced by the cells visible here - confirm
# whether a later cell or the training helper is expected to consume them.
peak_lr = 0.001
initial_lr = 0.001
min_lr = 0.0001
# weight_decay argument of AdamW.
adamw_weight_decay = 0.1
# Forwarded unchanged to train_generator_advanced().
num_epochs = 1
eval_freq = 50
eval_iter = 5

# --- Model selection ---------------------------------------------------------------------
config = OpenAIModelConfigs.gpt2_lg_755m
# Checkpoint path: every run of non-alphanumeric characters in the Hugging Face repo id is
# collapsed to "_" so the id is usable as a file name; rank and alpha are appended.
saved_model_file = f"{models_dir}/{re.sub(r'[^a-zA-Z0-9]+', '_', config.hf_repo_id)}-instruct-lora-r{rank}-a{alpha}.pth"

In [4]:
from models.gpt2 import GPT2Model, load_weights_into_gpt_from_safetensors_params
import tiktoken

# Build the network described by `config`, then move it onto the configured device.
model = GPT2Model(config)
model = model.to(config.device)

# Tokenizer: tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Report the device that holds the model's first parameter.
print(f"Running on: {next(model.parameters()).device}")

KeyboardInterrupt: 

In [None]:
from utils.gpt_download_pretrainged_weights import download_model_weights
from utils.gpt_utils import generate, text_to_token_ids, token_ids_to_text

# Fetch "model.safetensors" for the configured Hugging Face repo into models_dir.
weights_file, weights = download_model_weights(
    config.hf_repo_id,
    models_dir,
    "model.safetensors",
)
print(f"Downloaded {weights_file}, loading them into the model")

# Copy the downloaded tensors into the model instance created earlier.
load_weights_into_gpt_from_safetensors_params(model, weights)
print(f"Pretrained weights loaded to model, running inference to check")

# Smoke test: encode the sample prompt, generate a continuation, decode it back to text.
prompt_token_ids = text_to_token_ids(sample_prompt, tokenizer)
# NOTE(review): the positional arguments after the prompt ids (15, context length, 1, 15,
# EOS id) take their meaning from generate()'s signature, which is not visible here - confirm.
tokens = generate(
    model,
    prompt_token_ids,
    15,
    config.context_length,
    1,
    15,
    eos_token_id,
)
generated_text = token_ids_to_text(tokens, tokenizer)
print(f"Prompt: {sample_prompt}\nResponse: {generated_text}")

In [None]:
from utils.downloads import download_file
import json
from utils.datasets import InstructionDataset
from torch.utils.data import DataLoader
from utils.datasets import custom_collate_fn, format_input_alpaca
from functools import partial

# Download the instruction dataset, split it 85% / 10% / remainder into train / test /
# validation, and wrap each split in a DataLoader.
print(f"Downloading training dataset from {fine_tuning_dataset_url}")
fine_tuning_dataset = download_file(fine_tuning_dataset_url, data_dir)
print(f"Dataset source ready at {fine_tuning_dataset}")

# JSON text is UTF-8; state the encoding explicitly so the read does not depend on the
# platform's default locale encoding.
with open(fine_tuning_dataset, "r", encoding="utf-8") as file:
    data = json.load(file)

train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
# Validation receives whatever is left, so the three parts always cover the whole dataset.
val_portion = len(data) - train_portion - test_portion

# Contiguous slices in file order: train first, then test, then validation.
train_data = data[:train_portion]
test_data = data[train_portion : train_portion + test_portion]
val_data = data[train_portion + test_portion :]

# Sanity checks on the split.
assert len(val_data) == val_portion, "validation split has an unexpected size"
assert len(train_data) + len(test_data) + len(val_data) == len(data), "splits must cover the dataset"

# The collate function is shared by all loaders; device and maximum length come from config.
customized_collate_fn = partial(custom_collate_fn, device=config.device, allowed_max_length=config.context_length)


def _make_loader(dataset, *, shuffle, drop_last):
    """Wrap ``dataset`` in a DataLoader using the notebook-wide batch settings.

    Args:
        dataset: The dataset to iterate over.
        shuffle: Whether the loader reshuffles the samples every epoch.
        drop_last: Whether an incomplete final batch is discarded.

    Returns:
        A DataLoader configured with ``batch_size``, ``num_workers`` and
        ``customized_collate_fn`` from the notebook namespace.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# Training batches are shuffled and an incomplete last batch is dropped.
train_dataset = InstructionDataset(train_data, tokenizer, format_input_alpaca)
train_loader = _make_loader(train_dataset, shuffle=True, drop_last=True)

# Validation and test loaders keep file order and keep every sample.
val_dataset = InstructionDataset(val_data, tokenizer, format_input_alpaca)
val_loader = _make_loader(val_dataset, shuffle=False, drop_last=False)

test_dataset = InstructionDataset(test_data, tokenizer, format_input_alpaca)
test_loader = _make_loader(test_dataset, shuffle=False, drop_last=False)

# NOTE: len() of a DataLoader is its number of batches, so these figures are batch counts.
print(f"Train data length: {len(train_loader)}\nValidation data length: {len(val_loader)}\nTest data length: {len(test_loader)}")

In [None]:
from models.gpt2 import replace_linear_with_lora

def _count_trainable(module):
    """Return the total element count of the parameters of ``module`` that require gradients."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad)


# Trainable parameter count before anything is frozen.
total_params = _count_trainable(model)
print(f"Total trainable parameters in base model: {total_params:,}")

# Freeze every existing parameter.
for param in model.parameters():
    param.requires_grad = False

assert _count_trainable(model) == 0, "All trainable parameters should be frozen"

print(f"Base weights frozen. Applying LoRA")
# Modifies `model` in place (the return value is not used); the count below reports what
# is trainable afterwards.
replace_linear_with_lora(model, rank=rank, alpha=alpha)
total_params_lora = _count_trainable(model)
print(f"Total trainable LoRA parameters: {total_params_lora:,}")

In [None]:
import time
from datetime import datetime
import torch
import random

from models.gpt2 import train_generator_advanced

# Fine-tune the model and report the wall-clock duration of the run.
start_time = time.time()

# Give the optimizer only the parameters that can receive gradients. The base weights were
# frozen earlier, so handing them to AdamW would just add parameters it never updates.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=peak_lr, weight_decay=adamw_weight_decay)

print(f"Starting training at {datetime.now().strftime(ts_format)}")
train_losses, val_losses, examples_seen, lr_seen = train_generator_advanced(
    model=model,
    config=config,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    num_epochs=num_epochs,
    eval_freq=eval_freq,
    eval_iter=eval_iter,
    tokenizer=tokenizer,
    # One validation record, picked uniformly at random and formatted as an Alpaca prompt,
    # is passed as the start context.
    start_context=format_input_alpaca(random.choice(val_data)),
)

print(f"Training finished at {datetime.now().strftime(ts_format)}")
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training took {execution_time_minutes:.2f} minutes.")