<a href="https://colab.research.google.com/github/lizhieffe/llm_knowledge/blob/main/examples/sft/QLoRA_SFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

QLoRA SFT training example.

Tutorial: https://huggingface.co/blog/dvgodoy/fine-tuning-llm-hugging-face

In [None]:
# Get dependencies

!pip install transformers==4.46.2 peft==0.13.2 accelerate==1.1.1 trl==0.12.1 bitsandbytes==0.45.2 datasets==3.1.0 huggingface-hub==0.26.2 safetensors==0.4.5 pandas==2.2.2 matplotlib==3.8.0 numpy==1.26.4

In [None]:
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

# Load a quantized base model

In [None]:
# Checkpoint to fine-tune; swap in the commented-out line to use Phi-3 instead.
# repo_id = 'microsoft/Phi-3-mini-4k-instruct'
repo_id = 'Qwen/Qwen2.5-Coder-7B-Instruct'

# 4-bit quantization settings handed to the loader: NF4 quantization type,
# double quantization enabled, float32 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the checkpoint onto device "cuda:0" with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    quantization_config=bnb_config,
    device_map="cuda:0",
)

In [None]:
# Report the memory footprint of the loaded model, scaled by 1e9 and shown
# with one decimal place.
footprint_gb = model.get_memory_footprint() / 1e9
print(f"Model memory footprints = {footprint_gb:.1f}GB")

In [None]:
# Bare expression: display the model's repr as the cell output.
model

# Set up LoRA

> A quantized model can be used directly for inference, but it cannot be trained any further. Those pesky Linear4bit layers take up much less space, which is the whole point of quantization; however, we cannot update them.

When you call `prepare_model_for_kbit_training`, it performs the following key actions on your model:

1. Casts LayerNorms to FP32.

2. Enables Gradient Checkpointing.

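3. Makes the outputs of the input-embedding layer require gradients (the embedding weights themselves stay frozen), so that gradients can flow back through the checkpointed layers.

4. Adds the forward hook on the embedding layer that implements step 3.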

In [None]:
# Run PEFT's k-bit training preparation on the quantized model and rebind
# `model` to the returned object.
model = prepare_model_for_kbit_training(model)

In [None]:
config = LoraConfig(
    # Rank of the adapter matrices: the lower the rank, the fewer parameters
    # you'll need to train.
    r=8,
    # Scaling multiplier for the adapter output; usually 2*r.
    lora_alpha=16,
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    # Projection-layer names differ between architectures: Phi-3 uses the fused
    # 'qkv_proj' / 'gate_up_proj', whereas Qwen2.5 has separate q/k/v and
    # gate/up projections. A hard-coded Phi-3 list therefore silently adapts
    # only 'o_proj' and 'down_proj' on Qwen. "all-linear" asks PEFT to attach
    # adapters to every linear layer except the output head, whatever the
    # architecture calls them.
    # NOTE(review): for Phi-3 this should resolve to the same four projections
    # that were listed before - confirm by inspecting the printed model.
    target_modules="all-linear",
)
# Wrap the prepared base model with the LoRA adapters and display the result.
model = get_peft_model(model, config)
model

In [None]:
# prepare_model_for_kbit_training() upcasts every non-quantized layer to full
# precision (FP32), so the footprint grows compared to the freshly loaded
# model (the original note cites about 30%; the exact figure depends on the
# checkpoint).
prepared_footprint_gb = model.get_memory_footprint() / 1e9
print(f"The prepared model memory footprints = {prepared_footprint_gb:.1f}GB")

# Trainable vs. total parameter counts of the adapter-wrapped model.
train_p, tot_p = model.get_nb_trainable_parameters()
print(
    f'Trainable parameters:      {train_p/1e6:.2f}M',
    f'Total parameters:          {tot_p/1e6:.2f}M',
    f'% of trainable parameters: {100*train_p/tot_p:.2f}%',
    sep="\n",
)

# Dataset

The dataset has three columns:

1. original English sentence (sentence)
2. basic translation to Yoda-speak (translation)
3. enhanced translation including typical Yesss and Hrrmm interjections (translation_extra)

In [None]:
# Load the "train" split of the dvgodoy/yoda_sentences dataset; the bare
# `dataset` expression then shows it as the cell output.
dataset = load_dataset("dvgodoy/yoda_sentences", split="train")
dataset

In [None]:
# Look at the first record of the dataset.
dataset[0]

## Convert the DS to *Conversational Format*

```
{"messages":[
  {"role": "system", "content": "<general directives>"},
  {"role": "user", "content": "<prompt text>"},
  {"role": "assistant", "content": "<ideal generated text>"}
]}
```

In [None]:
# Reshape the columns into a prompt/completion pair: the English sentence
# becomes the prompt, the enhanced translation becomes the completion, and the
# basic translation is dropped.
dataset = (
    dataset
    .rename_column("sentence", "prompt")
    .rename_column("translation_extra", "completion")
    .remove_columns(["translation"])
)
dataset

In [None]:
# Look at the first record again, now in prompt/completion form.
dataset[0]

In [None]:
# @title Conversion Libs

# Adapted from trl.extras.dataset_formatting.instructions_formatting_function.
# Turns prompt/completion records (a format that is not supported anymore)
# into the conversational "messages" format.
def _to_messages(prompt, completion):
    """Build the two-turn (user, assistant) message list for one example."""
    return [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": completion},
    ]


def format_dataset(examples):
    """Convert prompt/completion data into the conversational format.

    Args:
        examples: Mapping with "prompt" and "completion" entries. Each entry
            is either a single value (one example) or a list (a batch).

    Returns:
        A dict with a single 'messages' key. For one example the value is a
        list of two role/content dicts (user turn, then assistant turn); for
        a batch it is a list of such lists, one per prompt.
    """
    prompts = examples["prompt"]

    # Single example: wrap it directly.
    if not isinstance(prompts, list):
        return {'messages': _to_messages(prompts, examples["completion"])}

    # Batch: pair each prompt with the completion at the same position.
    return {
        'messages': [
            _to_messages(prompt, examples["completion"][idx])
            for idx, prompt in enumerate(prompts)
        ]
    }

In [None]:
# Add the 'messages' column example by example, then drop the two source
# columns it was built from.
dataset = dataset.map(format_dataset)
dataset = dataset.remove_columns(['prompt', 'completion'])

# Show the converted first conversation.
dataset[0]['messages']

# Tokenizer

In [None]:
# Load the tokenizer from the same checkpoint id that the model was loaded
# from, then show its chat template as the cell output.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
tokenizer.chat_template

In [None]:
# Render the first conversation through the tokenizer's chat template as plain
# text (no tokenization) to see what a formatted example looks like.
first_chat_text = tokenizer.apply_chat_template(dataset[0]['messages'], tokenize=False)
print(first_chat_text)

## Special handling for Phi-3

> **IMPORTANT UPDATE**: due to changes in the default collator used by the SFTTrainer class while building the dataset, the EOS token (which is, in Phi-3, the same as the PAD token) was also masked in the labels, so the model could not learn to stop generating tokens properly.
>
> In order to address this change, we can assign the UNK token to the PAD token, so the EOS token becomes unique and therefore not masked as part of the labels.

In [None]:
# Phi-3 checkpoints only: reuse the UNK token as the PAD token (both the token
# string and its id) so that PAD no longer coincides with EOS.
if "Phi-3" in repo_id:
    unk_token, unk_token_id = tokenizer.unk_token, tokenizer.unk_token_id
    tokenizer.pad_token = unk_token
    tokenizer.pad_token_id = unk_token_id

# SFT

In [None]:
sft_config = SFTConfig(
    # --- 1. Memory usage: squeeze the most out of the GPU's RAM ---
    # Gradient checkpointing saves a LOT of memory; the non-reentrant variant
    # is requested explicitly to avoid exceptions in newer PyTorch versions.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # Initial (micro) batch size. With an accumulation factor of 1, the batch
    # used for each update is the same as the micro-batch.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    # If the batch size would cause OOM, it is halved until it fits.
    auto_find_batch_size=True,

    # --- 2. Dataset handling ---
    # Packing the dataset means no padding is needed.
    packing=True,
    max_seq_length=64,

    # --- 3. Typical training parameters ---
    learning_rate=3e-4,
    num_train_epochs=10,
    # 8-bit Adam optimizer - doesn't help much if you're using LoRA!
    optim='paged_adamw_8bit',

    # --- 4. Logging and output ---
    report_to='none',
    logging_steps=10,
    logging_dir='./logs',
    # NOTE(review): the directory name still says "phi3" although the
    # checkpoint selected in this notebook is a Qwen model; rename with care in
    # case other cells rely on this path.
    output_dir='./phi3-mini-yoda-adapter',
)

In [None]:
# Wire together the adapter-wrapped model, the tokenizer, the training
# configuration and the conversational dataset.
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    train_dataset=dataset,
    processing_class=tokenizer,
)

In [None]:
# Pull a single batch from the trainer's training dataloader.
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

# Peek at the data: the input ids and the labels of the first entry in the batch.
batch['input_ids'][0], batch['labels'][0]

In [None]:
# Start the fine-tuning run; the value returned by train() is shown as the
# cell output.
trainer.train()