# Supervised Fine-Tuning (SFT) with PEFT

In [1]:
!pip install -U transformers trl bitsandbytes



In [2]:
# Warning control: install a global "ignore" filter so that no Python warning
# is shown by later cells. Note that this also hides warnings that may point to
# real problems; remove the filter when debugging.
import warnings
warnings.filterwarnings('ignore')

## Import libraries

In [13]:
import torch
import pandas as pd
from peft import LoraConfig
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig

## Setting up helper functions

In [4]:
def generate_responses(model, tokenizer, user_message, system_message=None,
                       max_new_tokens=100):
    """Generate the model's reply to a single-turn chat prompt.

    Args:
        model: Causal LM exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``decode``
            and ``eos_token_id``.
        user_message: Content of the user turn.
        system_message: Optional system prompt placed before the user turn;
            skipped when falsy.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation only (prompt tokens removed), with special
        tokens dropped and surrounding whitespace stripped.
    """
    # The data are assumed to be single-turn: optional system turn + one user turn.
    system_turn = (
        [{"role": "system", "content": system_message}] if system_message else []
    )
    conversation = system_turn + [{"role": "user", "content": user_message}]

    # Render the conversation to text with the tokenizer's chat template.
    chat_text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    encoded = tokenizer(chat_text, return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,  # no sampling, so repeated calls give the same output
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # For serving at scale, vllm, sglang or TensorRT are the recommended backends.
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_kwargs)

    # The output sequence starts with the prompt; keep only what was generated.
    new_token_ids = sequences[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

In [5]:
def test_model_with_questions(model, tokenizer, questions,
                              system_message=None, title="Model Output"):
    """Print the model's answer to each question under a titled banner.

    Args:
        model: Model forwarded to ``generate_responses``.
        tokenizer: Tokenizer forwarded to ``generate_responses``.
        questions: Iterable of user prompts; they are numbered from 1 in the output.
        system_message: Optional system prompt applied to every question.
        title: Text shown in the ``=== ... ===`` banner.
    """
    print(f"\n=== {title} ===")
    for number, prompt in enumerate(questions, start=1):
        answer = generate_responses(model, tokenizer, prompt, system_message)
        print(f"\nModel Input {number}:\n{prompt}\nModel Output {number}:\n{answer}\n")


In [10]:
def load_model_and_tokenizer(model_name, use_gpu = False, load_in_8bit = True):
    """Load a causal LM and its tokenizer, optionally with 8-bit quantization.

    Args:
        model_name: Hub id or local path passed to ``from_pretrained``.
        use_gpu: Accepted for backward compatibility but not consulted; device
            placement is delegated to ``device_map="auto"``.
        load_in_8bit: When True, the weights are loaded with a bitsandbytes
            8-bit quantization config; when False, no quantization config is
            used and the model is loaded in ``torch.float16``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        chat template and a pad token (falling back to the EOS token).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Honor the caller's choice instead of always quantizing.
    quantization_config = None
    if load_in_8bit:
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,  # Default threshold for outlier detection
            llm_int8_has_fp16_weight=False,  # Set to True if you want to keep some weights in fp16
        )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",  # Automatically distribute across available devices
        torch_dtype=torch.float16,  # Use fp16 for non-quantized parts
        # NOTE(review): trust_remote_code executes Python shipped with the model
        # repository; only use it with model sources you trust.
        trust_remote_code=True,
    )

    # Fallback template for tokenizers that ship without one.
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    # Batching/padding needs a pad token; reuse EOS when none is defined.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [7]:
def display_dataset(dataset, num_rows=3):
    """Show the first rows of a chat dataset as a prompt/response table.

    Args:
        dataset: Indexable, sized collection whose items have a ``'messages'``
            list of ``{'role': ..., 'content': ...}`` dicts.
        num_rows: Maximum number of leading examples to show (default 3).
            Fewer are shown when the dataset is shorter.

    For each example the first ``'user'`` and first ``'assistant'`` message are
    shown; a missing role is rendered as ``None`` instead of raising.
    """
    def _first_content(messages, role):
        # First message with the given role, or None when absent.
        return next((m['content'] for m in messages if m['role'] == role), None)

    rows = []
    # Clamp to the dataset size so short datasets do not raise IndexError.
    for i in range(min(num_rows, len(dataset))):
        example = dataset[i]
        rows.append({
            'User Prompt': _first_content(example['messages'], 'user'),
            'Assistant Response': _first_content(example['messages'], 'assistant'),
        })

    # Explicit columns keep the header visible even when there are no rows.
    df = pd.DataFrame(rows, columns=['User Prompt', 'Assistant Response'])
    pd.set_option('display.max_colwidth', None)  # Avoid truncating long strings
    display(df)

In [8]:
# Passed to load_model_and_tokenizer() as its `use_gpu` argument.
USE_GPU = True

# Prompts used to spot-check the model's answers via test_model_with_questions().
questions = [
    "Give me an 1-sentence introduction of LLM.",
    "Calculate 1+1-1",
    "What's the difference between thread and process?"
]

## Doing SFT with PEFT on a small model

<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>Note:</b> We're performing SFT on a small model <code>HuggingFaceTB/SmolLM2-135M</code> and a smaller training dataset to ensure the full training process can run on limited computational resources. If you're running the notebooks on your own machine and have access to a GPU, feel free to switch to a larger model—such as <code>Qwen/Qwen3-0.6B-Base</code>—to perform full SFT and reproduce the results shown above.</p>
</div>

In [32]:
#if model is not None:
#    del model
#    del tokenizer
#    torch.cuda.empty_cache()

In [20]:
# Load the base checkpoint (8-bit quantized) together with its tokenizer.
model_name = "Qwen/Qwen3-0.6B-Base"
model, tokenizer = load_model_and_tokenizer(
    model_name,
    use_gpu=USE_GPU,
    load_in_8bit=True,
)

### Loading the finetuning dataset

In [28]:
# Size and seed of the training subsample. A fixed seed makes Restart & Run All
# pick the same rows (and therefore produce comparable losses) every time.
SFT_SAMPLE_SIZE = 1000
SFT_SAMPLE_SEED = 42

train_df = pd.read_parquet("sft_dataset.parquet")

# Clamp the sample size so a file with fewer rows does not raise ValueError.
train_sample_df = train_df.sample(
    n=min(SFT_SAMPLE_SIZE, len(train_df)),
    random_state=SFT_SAMPLE_SEED,
)

# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra `__index_level_0__` column in the Dataset.
train_dataset = Dataset.from_pandas(train_sample_df, preserve_index=False)

display_dataset(train_dataset)

Unnamed: 0,User Prompt,Assistant Response
0,"Lamaze International has created a number of tools to help you both market your class and tools to use as classroom resources. These resources are intended to make life easier for you and also ensure a consistent look and messaging throughout Lamaze classes around the world.\nIs there a resource you are interested in having? Contact us and let us know what would be helpful to you! Based on the passage above, If I wanted to suggest a new resource for Lamaze International, how would I contact them and what information should I include in my suggestion?","To suggest a new resource for Lamaze International, I should contact them through their website or email them. In my suggestion, I should include information about the type of resource I am suggesting and how it would be helpful for Lamaze educators."
1,"We are all just as vulnerable.\nWe are all at times gullible.\nWe’ve all been to hell and back.\nBut the crash is recoverable. Based on the passage above, Can you paraphrase the following sentence: ""But the crash is recoverable.""?","However, one can recover from the aftermath of the downfall."
2,How many games are played in a typical Ontario Hockey League season?,A typical Ontario Hockey League season consists of 68 games.


In [29]:
# SFTTrainer config: optimisation and logging hyperparameters for the run.
sft_config = SFTConfig(
    learning_rate=8e-5, # Learning rate for training.
    num_train_epochs=1, # Number of passes over the training dataset.
    per_device_train_batch_size=1, # Batch size for each device (e.g., GPU) during training.
    gradient_accumulation_steps=8, # Micro-batches whose gradients are accumulated before each optimizer update (effective batch size 1 x 8 = 8 per device).
    gradient_checkpointing=False, # Disabled here; setting it to True reduces memory usage during training at the cost of slower training speed.
    logging_steps=2,  # Frequency of logging training progress (log every 2 steps).
    report_to = "none"  # Do not report metrics to any external tracking integration.
)

# LoRA adapter config, passed to SFTTrainer as `peft_config`.
peft_config = LoraConfig(
    r=16,  # Rank of the low-rank update matrices.
    lora_alpha=32,  # Scaling factor applied to the LoRA update.
    lora_dropout=0.05,  # Dropout applied inside the LoRA layers.
    bias="none",  # Do not train bias parameters.
    task_type="CAUSAL_LM",  # Adapter is set up for causal language modelling.
)

In [26]:
# Build the trainer around the loaded model; passing `peft_config` makes it
# train LoRA adapters with the hyperparameters from `sft_config`.
sft_trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    args=sft_config,
    peft_config=peft_config,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
sft_trainer.train()

Tokenizing train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
2,2.5422
4,2.7168
6,2.8053
8,2.2746
10,2.5915
12,2.5726


TrainOutput(global_step=13, training_loss=2.5644021951235256, metrics={'train_runtime': 61.2288, 'train_samples_per_second': 1.633, 'train_steps_per_second': 0.212, 'total_flos': 42579460620288.0, 'train_loss': 2.5644021951235256})

## Testing training results on small model and small dataset

**Note:** The following results are for the small model and dataset we used for SFT training, due to limited computational resources. To view the results of full-scale training on a larger model, see the **"SFT Results on Qwen3-0.6B Model"** section above.

In [30]:
#if not USE_GPU: # move model to CPU when GPU isn’t requested
#    sft_trainer.model.to("cpu")
# Spot-check the fine-tuned model (the trainer's wrapped model, adapters
# included) on the sample questions defined earlier.
test_model_with_questions(sft_trainer.model, tokenizer, questions,
                          title="Base Model (After SFT) Output")

In [31]:
# Write the trained model to the local "sft_peft_model" output directory.
sft_trainer.save_model("sft_peft_model")  # You can name this whatever you want