#todo: explain pipeline steps at a high level here
#todo: create requirements.txt and env setup instructions

In [1]:
import os
from pathlib import Path
import accelerate  # Not explicitly used but importing it before transformers prevents some issues w/ pytorch
from dotenv import load_dotenv
import torch

from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback )

Load secrets so we don't store them in the notebook.

In [2]:
load_dotenv()
HF_API_KEY = os.environ["HF_API_KEY"]

DOCUMENT    = "FM5_0"
PDF_PATH    = Path("pdfs/raw/fm5-0.pdf")
BASE_MODEL  = Path("QuantFactory/Llama-3.2-1B-GGUF")
GGUF_FILE   = "Llama-3.2-1B.Q8_0.gguf"
CACHE_DIR   = "hf_cache"
DATA_DIR    = DOCUMENT / BASE_MODEL / "data"
MODEL_DIR   = DOCUMENT / BASE_MODEL / "lora"
CHUNKED_DATA = DATA_DIR / "chunked" / "chunked.jsonl"

os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [3]:
ds = load_dataset("json", data_files=CHUNKED_DATA.as_posix(), split="train")

In [4]:
ds = ds.map(lambda ex: {"text": ex["text"]}, remove_columns=ds.column_names)

In [5]:
tok = AutoTokenizer.from_pretrained(MODEL_DIR)

In [6]:
def tokenize_and_group(examples):
    tokens = tok(examples["text"])
    return tokens

In [7]:
tokenized = ds.map(
    tokenize_and_group,
    batched=True,
    remove_columns=["text"])

In [8]:
splits = tokenized.train_test_split(test_size=0.1, seed=42)
ds_train = splits["train"]
ds_eval  = splits["test"]

In [9]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tok,
    mlm       = False)

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    cache_dir   = CACHE_DIR,
    gguf_file   = GGUF_FILE,
    device_map  = "auto",
    torch_dtype = torch.float16)
model.gradient_checkpointing_enable()

Converting and de-quantizing GGUF tensors...:   0%|          | 0/147 [00:00<?, ?it/s]

In [11]:
lora_cfg = LoraConfig(
    r              = 8,
    lora_alpha     = 16,
    target_modules = ["q_proj", "v_proj"],
    lora_dropout   = 0.05,
    bias           = "none",
    task_type      = "CAUSAL_LM")

In [12]:
model = get_peft_model(model, lora_cfg)
model.resize_token_embeddings(len(tok))
model.print_trainable_parameters()   # sanity check

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128261, 2048)

In [13]:
equivalent_batch_size = 128
batch_size = 4

training_args = TrainingArguments(
    output_dir                  = MODEL_DIR.as_posix(),
    per_device_train_batch_size = batch_size,
    gradient_accumulation_steps = int(equivalent_batch_size / batch_size),
    learning_rate               = 1e-4,
    num_train_epochs            = 1000,
    bf16                        = True,
    logging_steps               = 2,
    save_total_limit            = 5,
    dataloader_num_workers      = 8,
    dataloader_prefetch_factor  = 2,
    label_names                 = ["labels"],
    metric_for_best_model       = "eval_loss",
    save_strategy               = "epoch",
    eval_strategy               = "epoch")

In [14]:
early_stopping = EarlyStoppingCallback(
    early_stopping_patience  = 5,
    early_stopping_threshold = 0.001  )

trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = ds_train,
    eval_dataset  = ds_eval,
    data_collator = data_collator,
    callbacks     = [early_stopping])

In [15]:
trainer.train()

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,3.2977,3.303851
2,3.1835,3.237642
3,3.1186,3.126435
4,3.0039,2.992384
5,2.9116,2.896091
6,2.7853,2.840994
7,2.7616,2.801643
8,2.7337,2.765341
9,2.7127,2.730158
10,2.6888,2.698385


TrainOutput(global_step=364, training_loss=2.2082887829005062, metrics={'train_runtime': 9830.5233, 'train_samples_per_second': 44.148, 'train_steps_per_second': 0.305, 'total_flos': 1.1849488438419456e+17, 'train_loss': 2.2082887829005062, 'epoch': 91.0})

In [16]:
trainer.save_model(MODEL_DIR.as_posix())