<a href="https://colab.research.google.com/github/marinpet/finetune-llm/blob/main/LLM_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install datasets bitsandbytes trl
# bitsandbytes had to be bumped to 0.45.2 to avoid errors in Colab env
!pip install transformers==4.46.2 peft==0.13.2 accelerate==1.1.1 trl==0.12.1 bitsandbytes==0.45.2 datasets==3.1.0 huggingface-hub==0.26.2 safetensors==0.4.5 pandas==2.2.2 matplotlib==3.8.0 numpy==1.26.4




In [2]:
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

In [3]:
###  Load a Quantized Base Model

In [4]:
# Checkpoint that serves as the base model for fine-tuning.
repo_id = 'microsoft/Phi-3-mini-4k-instruct'

# 4-bit NF4 quantization with double quantization; compute dtype is float32.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float32,
)

# Load the checkpoint with the quantization config applied, placed on cuda:0.
model = AutoModelForCausalLM.from_pretrained(
    repo_id, quantization_config=bnb_config, device_map="cuda:0"
)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [5]:
# Memory footprint of the quantized base model, scaled by 1e6
# (i.e. megabytes, assuming get_memory_footprint() reports bytes).
print(model.get_memory_footprint()/1e6)

2206.347264


In [6]:
# Display the module tree; the projection layers appear as Linear4bit in the output.
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3SdpaAttention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): Phi3RMSNorm((3072

In [7]:
# Add LoRA

# Prepare the quantized model for k-bit training (improves numerical stability).
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    task_type="CAUSAL_LM",
    # Attention and MLP projection layers that receive an adapter.
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
    r=8,              # adapter rank: the lower the rank, the fewer parameters to train
    lora_alpha=16,    # multiplier, usually 2*r
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and display the resulting module tree.
model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3SdpaAttention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magn

In [8]:
# Memory footprint after wrapping the model with LoRA adapters, scaled by 1e6
# (i.e. megabytes, assuming get_memory_footprint() reports bytes).
print(model.get_memory_footprint()/1e6)

2651.080704


In [9]:
# Trainable vs. total parameter counts, reported in millions plus the ratio.
train_p, tot_p = model.get_nb_trainable_parameters()
print(
    f'Trainable parameters:      {train_p/1e6:.2f}M',
    f'Total parameters:          {tot_p/1e6:.2f}M',
    f'% of trainable parameters: {100*train_p/tot_p:.2f}%',
    sep='\n',
)

Trainable parameters:      12.58M
Total parameters:          3833.66M
% of trainable parameters: 0.33%


# Format the dataset

In [10]:
# Load the "train" split of the dvgodoy/yoda_sentences dataset.
dataset = load_dataset("dvgodoy/yoda_sentences", split="train")

README.md:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentences.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/720 [00:00<?, ? examples/s]

In [11]:
# Show the dataset summary (column names and number of rows).
dataset

Dataset({
    features: ['sentence', 'translation', 'translation_extra'],
    num_rows: 720
})

In [12]:
# Look at the first raw example to see the available columns.
dataset[0]

{'sentence': 'The birch canoe slid on the smooth planks.',
 'translation': 'On the smooth planks, the birch canoe slid.',
 'translation_extra': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

The latest trl library no longer supports the instructional format, so the dataset is transformed into the conversational format instead.

In [13]:
# Keep the source sentence as "prompt" and the extended translation as
# "completion"; the plain "translation" column is dropped.
dataset = (
    dataset
    .rename_column("sentence", "prompt")
    .rename_column("translation_extra", "completion")
    .remove_columns(["translation"])
)
dataset


Dataset({
    features: ['prompt', 'completion'],
    num_rows: 720
})

In [14]:
# Check the first example after renaming: only prompt and completion remain.
dataset[0]

{'prompt': 'The birch canoe slid on the smooth planks.',
 'completion': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

In [15]:
def format_dataset(examples):
    """Convert prompt/completion data into the conversational ``messages`` format.

    Works on a single example (``examples["prompt"]`` is a scalar) as well as
    on a batch (``examples["prompt"]`` is a list, with ``examples["completion"]``
    indexed in parallel).

    Returns:
        ``{"messages": conversation}`` for a single example, or
        ``{"messages": [conversation, ...]}`` for a batch, where each
        conversation is a user turn followed by an assistant turn.
    """
    def _conversation(user_text, assistant_text):
        # One user turn followed by the assistant's reply.
        return [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]

    # Single (non-batched) example: build exactly one conversation.
    if not isinstance(examples["prompt"], list):
        return {"messages": _conversation(examples["prompt"], examples["completion"])}

    # Batched input: one conversation per prompt, paired by position.
    return {
        "messages": [
            _conversation(examples["prompt"][idx], examples["completion"][idx])
            for idx in range(len(examples["prompt"]))
        ]
    }

# Convert every example with format_dataset, then drop the raw columns so that
# only the "messages" column remains.
dataset = dataset.map(format_dataset).remove_columns(["prompt", "completion"])
# Show the first converted conversation as a sanity check.
dataset[0]["messages"]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

[{'content': 'The birch canoe slid on the smooth planks.', 'role': 'user'},
 {'content': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.',
  'role': 'assistant'}]

# Tokenizer

- load the tokenizer that corresponds to the model
- the tokenizer determines how text is turned into tokens, in the same way it was done while the model was trained

**Instruction/chat models:** the tokenizer also has a chat template. It specifies:
- special tokens that need to be used and where to use them
- where the system directive, user prompt and model response should be placed
- what is the generation prompt -- what special token triggers model's response



In [16]:
# Load the tokenizer that belongs to the same checkpoint as the base model.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Show the chat template string that ships with the tokenizer.
tokenizer.chat_template

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

"{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"

In [17]:
# Render the first conversation through the chat template as plain text
# (tokenize=False) to see where the special tokens end up.
print(tokenizer.apply_chat_template(dataset[0]["messages"], tokenize=False))

<|user|>
The birch canoe slid on the smooth planks.<|end|>
<|assistant|>
On the smooth planks, the birch canoe slid. Yes, hrrrm.<|end|>
<|endoftext|>


There were changes in the default collator used by the SFTTrainer. The EOS token (which, in Phi-3, is the same as the PAD token) was masked in the labels too, which left the model unable to properly stop generating tokens.

In order to address this change, we can assign the UNK token to the PAD token, so the EOS token becomes unique and therefore not masked as part of the labels.

In [18]:
# Use the UNK token as the PAD token so that PAD no longer coincides with EOS;
# otherwise EOS would be masked out of the labels together with the padding.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# Finetuning with SFTTrainer

**SFTTrainer:**

*   uses Hugging Face's Trainer under the hood
*   needs the following:
    * a model
    * a tokenizer
    * a dataset
    * a configuration object

### SFTConfig

parameter categories:

* memory usage related to gradient accumulation and checkpointing
* dataset-related arguments (e.g. max_seq_length --> likely to cause out of memory issues)
* training parameters (e.g. learning_rate, num_train_epochs) --> to start with, try the learning rate used to train the base model
* environment and logging (e.g. output_dir, logging_dir, logging_steps)

In [19]:
# Training configuration handed to the SFTTrainer, grouped by concern:
# memory usage, dataset handling, training hyperparameters and logging.
sft_config = SFTConfig(
    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # Gradient Accumulation / Batch size
    # With a single accumulation step, the effective batch (used for each
    # update) is the same size (1x) as the micro-batch
    gradient_accumulation_steps=1,
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=16,
    # If the batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,

    ## GROUP 2: Dataset-related
    # Upper bound on the number of tokens per training sequence
    max_seq_length=64,
    # Dataset
    # packing a dataset means no padding is needed
    packing=True,

    ## GROUP 3: These are typical training parameters
    num_train_epochs=10,
    learning_rate=3e-4,
    # Optimizer
    # 8-bit Adam optimizer - doesn't help much if you're using LoRA!
    optim='paged_adamw_8bit',

    ## GROUP 4: Logging parameters
    # Log the training loss every 10 steps
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./phi3-mini-yoda-adapter',
    # 'none' — presumably disables reporting to external experiment trackers
    report_to='none'
)


### SFTTrainer

In [20]:
# Assemble the trainer from the LoRA-wrapped model, the tokenizer, the training
# configuration and the conversational dataset.
trainer = SFTTrainer(
    model = model,
    processing_class = tokenizer,
    args = sft_config,
    train_dataset = dataset
)

Generating train split: 0 examples [00:00, ? examples/s]



In [22]:
# Pull a single batch from the training dataloader to inspect what the model is fed.
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

In [23]:
# Put the token ids and the labels of the first sequence in the batch side by side.
batch["input_ids"][0], batch["labels"][0]

(tensor([29892,  6505,   278,  1480, 29892,   366,  1818, 29889, 32007, 32000,
         32000, 32010,   450, 26935,  2317,   338,  2086,  1880,   363,   278,
           274,  3222, 29889, 32007, 32001,  1763, 29877,  1880,   363,   278,
           274,  3222, 29892,   278, 26935,  2317,   338, 29889,  3869, 29892,
           298, 21478,  1758, 29889, 32007, 32000, 32000, 32010,   319, 15331,
           304,  1432,  2278,   338,   278,  2381,   273, 13006, 29889, 32007,
         32001,  1763,  1432,  2278], device='cuda:0'),
 tensor([29892,  6505,   278,  1480, 29892,   366,  1818, 29889, 32007, 32000,
         32000, 32010,   450, 26935,  2317,   338,  2086,  1880,   363,   278,
           274,  3222, 29889, 32007, 32001,  1763, 29877,  1880,   363,   278,
           274,  3222, 29892,   278, 26935,  2317,   338, 29889,  3869, 29892,
           298, 21478,  1758, 29889, 32007, 32000, 32000, 32010,   319, 15331,
           304,  1432,  2278,   338,   278,  2381,   273, 13006, 29889, 320

In [24]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,3.0162
20,1.7453
30,1.5326
40,1.4816
50,1.3791
60,1.237
70,1.1372
80,0.943
90,0.8795
100,0.6238


TrainOutput(global_step=220, training_loss=0.8217035109346563, metrics={'train_runtime': 1521.7379, 'train_samples_per_second': 2.307, 'train_steps_per_second': 0.145, 'total_flos': 5034400555991040.0, 'train_loss': 0.8217035109346563, 'epoch': 10.0})

# Query the model

* the input needs to be properly formatted
* argument add_generation_prompt - appends <|assistant|> to the end of the conversation so the model predicts the next tokens until it produces the <|endoftext|> token

In [26]:
# Apply chat template to the msg

def gen_prompt(tokenizer, sentence):
    """Render ``sentence`` as a single user turn using the tokenizer's chat template.

    The template is applied with ``tokenize=False`` and
    ``add_generation_prompt=True``; whatever ``apply_chat_template`` returns
    is handed back unchanged.
    """
    user_turn = {"role": "user", "content": sentence}
    return tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )


In [27]:
# Build a generation-ready prompt for a sample sentence and print it.
sentence = 'The Force is strong in you!'
prompt = gen_prompt(tokenizer, sentence)
print(prompt)


<|user|>
The Force is strong in you!<|end|>
<|assistant|>



In [28]:
# tokenize the prompt into a tensor of token IDs (add_special_tokens = False because tokens already added in the chat template)
# set evaluation mode
# call generate() method to produce the output (generated token IDs )
# decode generated token IDs back to text

def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
    """Generate a continuation for ``prompt`` and return it as decoded text.

    Args:
        model: Model exposing ``device``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used both to encode the prompt and to decode the output.
        prompt: Prompt text that already contains its special tokens.
        max_new_tokens: Value forwarded to ``model.generate``.
        skip_special_tokens: Value forwarded to ``tokenizer.batch_decode``.

    Returns:
        The first element of the decoded batch.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    # Special tokens are not added again: the prompt already carries them.
    encoded = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
    encoded = encoded.to(model.device)

    model.eval()
    generated_ids = model.generate(
        **encoded,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
    )

    decoded_batch = tokenizer.batch_decode(
        generated_ids, skip_special_tokens=skip_special_tokens
    )
    return decoded_batch[0]


In [29]:
# Generate a reply for the sample prompt; special tokens stay visible because
# skip_special_tokens defaults to False.
print(generate(model, tokenizer, prompt))

<|user|> The Force is strong in you!<|end|><|assistant|> Strong in you, the Force is!<|end|><|endoftext|>


# Save the adapter

In [31]:
# Save the adapter and the tokenizer to the 'local-phi3-mini-yoda-adapter' directory.
trainer.save_model('local-phi3-mini-yoda-adapter')

What gets saved:

* the adapter configuration (adapter_config.json) and weights (adapter_model.safetensors)—the adapter itself is just 50 MB in size
* the training arguments (training_args.bin)
* the tokenizer (tokenizer.json and tokenizer.model), its configuration (tokenizer_config.json), and its special tokens (added_tokens.json and special_tokens_map.json)
* a README file