<a href="https://colab.research.google.com/github/lizhieffe/llm_knowledge/blob/main/examples/sft/QLoRA_SFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

QLoRA SFT training example.

Tutorial: https://huggingface.co/blog/dvgodoy/fine-tuning-llm-hugging-face

In [None]:
# Get dependencies

!pip install transformers==4.46.2 peft==0.13.2 accelerate==1.1.1 trl==0.12.1 bitsandbytes==0.45.2 datasets==3.1.0 huggingface-hub==0.26.2 safetensors==0.4.5 pandas==2.2.2 matplotlib==3.8.0 numpy==1.26.4



In [None]:
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

# Load a quantized base model

In [None]:
# Hub id of the base checkpoint to fine-tune (the Phi-3 alternative is kept
# commented out for reference).
# repo_id = 'microsoft/Phi-3-mini-4k-instruct'
repo_id = 'Qwen/Qwen2.5-Coder-7B-Instruct'

# 4-bit NF4 quantization with double quantization enabled; the compute dtype
# is set to FP32.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float32,
)

# Load the checkpoint onto the first CUDA device, quantizing it as configured.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    quantization_config=bnb_config,
    device_map="cuda:0",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [None]:
# Report the memory footprint of the freshly loaded quantized model
# (value divided by 1e9 and printed as GB).
print(f"Model memory footprints = {model.get_memory_footprint()/1e9:.1f}GB")

Model memory footprints = 5.4GB


In [None]:
# Bare expression: the notebook displays the model's module tree.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-0

# Set up LoRA

> A quantized model can be used directly for inference, but it cannot be trained any further. Those pesky Linear4bit layers take up much less space, which is the whole point of quantization; however, we cannot update them.

When you call `prepare_model_for_kbit_training`, it performs the following key actions on your model:

1. Casts LayerNorms to FP32.

2. Enables Gradient Checkpointing.

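3. Makes the outputs of the input embedding layer require gradients (the base weights themselves stay frozen), so gradients can flow through the checkpointed model.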

4. Adds Forward Hooks

In [None]:
# Prepare the quantized model for training. `model` is rebound, so every
# later cell works with the prepared version.
model = prepare_model_for_kbit_training(model)

In [None]:
config = LoraConfig(
    # the rank of the adapter, the lower the fewer parameters you'll need to train
    r=8,
    lora_alpha=16, # multiplier, usually 2*r
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    # Attach adapters to every linear layer except the output head.
    # The previous hard-coded list used Phi-3 module names ('qkv_proj',
    # 'gate_up_proj'), which do not exist in Qwen2 (it has q_proj/k_proj/v_proj
    # and gate_proj/up_proj), so those layers were silently left without an
    # adapter. "all-linear" works regardless of the architecture's naming.
    target_modules="all-linear",
)
# Wrap the prepared base model with the LoRA adapters; `model` is rebound so
# later cells train the PEFT-wrapped model.
model = get_peft_model(model, config)
# Display the wrapped module tree to verify which layers received adapters.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 3584)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
              (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
              (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=3584, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, o

In [None]:
# prepare_model_for_kbit_training() turned every non-quantized layer to full
# precision (FP32), so the footprint reported here is larger than the one
# printed right after loading the quantized model.
footprint_gb = model.get_memory_footprint() / 1e9
print(f"The prepared model memory footprints = {footprint_gb:.1f}GB")

# Compare the number of trainable parameters against the total.
train_p, tot_p = model.get_nb_trainable_parameters()
trainable_pct = 100 * train_p / tot_p
print(f'Trainable parameters:      {train_p/1e6:.2f}M')
print(f'Total parameters:          {tot_p/1e6:.2f}M')
print(f'% of trainable parameters: {trainable_pct:.2f}%')

The prepared model memory footprints = 7.7GB
Trainable parameters:      6.65M
Total parameters:          7622.27M
% of trainable parameters: 0.09%


# Dataset

The dataset has three columns:

1. original English sentence (sentence)
2. basic translation to Yoda-speak (translation)
3. enhanced translation including typical Yesss and Hrrmm interjections (translation_extra)

In [None]:
# Load the train split of the Yoda-sentences dataset.
dataset = load_dataset("dvgodoy/yoda_sentences", split="train")
# Bare expression: the notebook displays the dataset summary (features, rows).
dataset

README.md:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentences.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/720 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'translation', 'translation_extra'],
    num_rows: 720
})

In [None]:
# Inspect the first raw example (all three original columns).
dataset[0]

{'sentence': 'The birch canoe slid on the smooth planks.',
 'translation': 'On the smooth planks, the birch canoe slid.',
 'translation_extra': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

## Convert the DS to *Conversational Format*

```
{"messages":[
  {"role": "system", "content": "<general directives>"},
  {"role": "user", "content": "<prompt text>"},
  {"role": "assistant", "content": "<ideal generated text>"}
]}
```

In [None]:
# Keep the English sentence as the prompt and the enhanced Yoda translation as
# the completion; the basic translation column is dropped.
dataset = (
    dataset
    .rename_column("sentence", "prompt")
    .rename_column("translation_extra", "completion")
    .remove_columns(["translation"])
)
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 720
})

In [None]:
# Inspect the first example after renaming: only prompt/completion remain.
dataset[0]

{'prompt': 'The birch canoe slid on the smooth planks.',
 'completion': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

In [None]:
# @title Conversion Libs

# Adapted from trl.extras.dataset_formatting.instructions_formatting_function
# Converts dataset from prompt/completion format (not supported anymore)
# to the conversational format
def format_dataset(examples):
    """Convert prompt/completion data into the conversational ``messages`` format.

    Handles both a batch (``examples["prompt"]`` is a list, with
    ``examples["completion"]`` indexed in parallel) and a single example
    (``examples["prompt"]`` is anything other than a list).

    Returns:
        ``{'messages': ...}`` whose value is one user/assistant message pair
        for a single example, or a list of such pairs for a batch.
    """
    def to_conversation(prompt, completion):
        # One user turn followed by the assistant's target reply.
        return [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": completion},
        ]

    prompts = examples["prompt"]

    # Single (non-batched) example: wrap it directly.
    if not isinstance(prompts, list):
        return {'messages': to_conversation(prompts, examples["completion"])}

    # Batched examples: pair each prompt with the completion at the same index.
    return {
        'messages': [
            to_conversation(prompt, examples["completion"][idx])
            for idx, prompt in enumerate(prompts)
        ]
    }

In [None]:
# Add the conversational `messages` column, then drop the source columns it
# was built from.
dataset = dataset.map(format_dataset)
dataset = dataset.remove_columns(['prompt', 'completion'])
dataset[0]['messages']

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

[{'content': 'The birch canoe slid on the smooth planks.', 'role': 'user'},
 {'content': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.',
  'role': 'assistant'}]

# Tokenizer

In [None]:
# Load the tokenizer that belongs to the base checkpoint in `repo_id`.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Bare expression: display the raw chat template string.
tokenizer.chat_template

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

'{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n    {%- else %}\n       

In [None]:
# Render the first conversation through the chat template as plain text
# (tokenize=False) to see exactly what a formatted example looks like.
first_messages = dataset[0]['messages']
rendered_example = tokenizer.apply_chat_template(first_messages, tokenize=False)
print(rendered_example)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
The birch canoe slid on the smooth planks.<|im_end|>
<|im_start|>assistant
On the smooth planks, the birch canoe slid. Yes, hrrrm.<|im_end|>



## Special handling for Phi-3

> **IMPORTANT UPDATE**: due to changes in the default collator used by the SFTTrainer class while building the dataset, the EOS token (which is, in Phi-3, the same as the PAD token) was masked in the labels too thus leading to the model not being able to properly stop token generation.
>
> In order to address this change, we can assign the UNK token to the PAD token, so the EOS token becomes unique and therefore not masked as part of the labels.

In [None]:
# In Phi-3 the EOS token doubles as the PAD token; point PAD at UNK instead so
# EOS stays unique. Other checkpoints are left untouched.
is_phi3 = "Phi-3" in repo_id
if is_phi3:
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.pad_token_id = tokenizer.unk_token_id

# SFT

In [None]:
sft_config = SFTConfig(
    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # Gradient Accumulation / Batch size
    # Actual batch (for updating) is same (1x) as micro-batch size
    gradient_accumulation_steps=1,
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=16,
    # If batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,

    ## GROUP 2: Dataset-related
    max_seq_length=64,
    # Dataset
    # packing a dataset means no padding is needed
    packing=True,

    ## GROUP 3: These are typical training parameters
    num_train_epochs=10,
    learning_rate=3e-4,
    # Optimizer
    # 8-bit Adam optimizer - doesn't help much if you're using LoRA!
    optim='paged_adamw_8bit',

    ## GROUP 4: Logging parameters
    logging_steps=10,
    logging_dir='./logs',
    # Name the output directory after the checkpoint actually being trained
    # (previously hard-coded to a Phi-3 name even when `repo_id` pointed at a
    # different model), so saved adapters are not mislabeled.
    output_dir=f"./{repo_id.split('/')[-1].lower()}-yoda-adapter",
    report_to='none'
)

In [None]:
# Build the trainer from the PEFT-wrapped model, the tokenizer, the training
# configuration and the conversational dataset.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,  # the tokenizer is passed as the processing class
    args=sft_config,
    train_dataset=dataset,
)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Pull a single batch from the trainer's training dataloader.
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

# Peek at the first sequence of the batch: its token ids and its labels.
batch['input_ids'][0], batch['labels'][0]

(tensor([   259,  13181,  10962,  29388,    279,   9853,    281,    604,     13,
         151645,    198, 151644,  77091,    198,  39726,    577,    279,   9853,
            281,    604,     11,    279,    259,  13181,  10962,     13,    472,
             81,  48886,     13, 151645,    198, 151645, 151644,   8948,    198,
           2610,    525,   1207,  16948,     11,   3465,    553,  54364,  14817,
             13,   1446,    525,    264,  10950,  17847,     13, 151645,    198,
         151644,    872,    198,     44,   1701,   4349,    374,    264,  12000,
          10223], device='cuda:0'),
 tensor([   259,  13181,  10962,  29388,    279,   9853,    281,    604,     13,
         151645,    198, 151644,  77091,    198,  39726,    577,    279,   9853,
            281,    604,     11,    279,    259,  13181,  10962,     13,    472,
             81,  48886,     13, 151645,    198, 151645, 151644,   8948,    198,
           2610,    525,   1207,  16948,     11,   3465,    553,  54364, 

In [None]:
# Run fine-tuning with the settings from `sft_config`; the returned
# TrainOutput (global step, training loss, metrics) is displayed by the notebook.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,3.4235
20,1.7686
30,1.2333
40,1.0636
50,0.9948
60,0.9304
70,0.9184
80,0.8957
90,0.8241
100,0.8143


TrainOutput(global_step=390, training_loss=0.747770469616621, metrics={'train_runtime': 2269.88, 'train_samples_per_second': 2.718, 'train_steps_per_second': 0.172, 'total_flos': 1.67680367296512e+16, 'train_loss': 0.747770469616621, 'epoch': 10.0})