<a href="https://colab.research.google.com/github/mshojaei77/RAG_CAG_SFT/blob/main/sft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning the `gemma-3-4b-it` Model with Unsloth and a Custom Dataset

This notebook demonstrates how to fine-tune the `gemma-3-4b-it` model using the `unsloth` library on a custom Q&A dataset (`qa_pairs.jsonl`). It employs memory-efficient techniques such as 4-bit quantization and LoRA, making it ideal for GPUs with limited VRAM.

## 1. Setup and Installation

In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture magic above must stay on
# the first line of the cell; it captures the (very verbose) pip output.
import os
# Colab is detected by looking for "COLAB_" in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    # NOTE: everything below is Colab-only. On any other machine the plain
    # `pip install unsloth vllm` in the branch above is used instead.
    # Drop already-imported PIL / google modules from sys.modules
    # (intended to skip Colab's "restart runtime" message after the installs).
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # vLLM requirements: installing them as-is reinstalls numpy and breaks Colab,
    # so download vLLM's common requirements list and strip the
    # transformers / numpy / xformers lines before installing the rest.
    # NOTE(review): the list is fetched from vLLM's `main` branch, so its contents can change between runs.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

In [None]:
# Runtime prerequisite (Colab): select a GPU runtime before running this cell.
# Top right corner: "Runtime" > "Change runtime type" > "GPU" (preferably a T4) > Save.

import torch

# Fail fast when no CUDA device is visible; nothing below can run without one.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA not available. Ensure you have selected a GPU runtime in Colab.")

# A GPU is present: report the device and the PyTorch / CUDA build in use.
print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version used by PyTorch: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
# total_memory is divided by 1024**3 to display GB.
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

## 2. Configuration

In [None]:
from unsloth import FastModel
import torch

# Load the Gemma 3 4B instruction-tuned checkpoint together with its tokenizer.
# The options are gathered in one dict so they are easy to scan and tweak.
load_options = dict(
    model_name = "unsloth/gemma-3-4b-it",
    max_seq_length = 2048,    # sequence length; choose any value for long context
    load_in_4bit = True,      # 4-bit quantization to reduce memory
    load_in_8bit = False,     # a bit more accurate, but uses 2x memory
    full_finetuning = False,  # full fine-tuning is supported but not used here
    # token = "hf_...",       # pass one when using gated models
)
model, tokenizer = FastModel.from_pretrained(**load_options)

In [None]:
# Which parts of the network receive trainable adapters.
layer_selection = dict(
    finetune_vision_layers     = False, # text-only fine-tune, so vision layers stay off
    finetune_language_layers   = True,  # should be left on
    finetune_attention_modules = True,  # adapt the attention modules
    finetune_mlp_modules       = True,  # should always be left on
)

# LoRA hyperparameters.
lora_settings = dict(
    r = 8,           # larger = higher accuracy, but might overfit
    lora_alpha = 8,  # recommended: alpha at least equal to r
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

# Wrap the loaded model with the adapter configuration above.
model = FastModel.get_peft_model(model, **layer_selection, **lora_settings)

## 3. Dataset Loading and Formatting

Load the `qa_pairs.jsonl` dataset and transform it into the chat format required by `SFTTrainer` (list of messages with `role` and `content`).

In [None]:
from google.colab import files
from datasets import load_dataset
import os

# Upload qa_pairs.jsonl from the local machine directly into the Colab runtime.
# The returned mapping is keyed by the uploaded file names.
uploaded = files.upload()

# An empty mapping means nothing was uploaded (e.g. the dialog was cancelled).
# Fail with a clear message instead of an opaque IndexError on the lookup below.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select qa_pairs.jsonl.")

# Use the first uploaded file as the dataset path; say so when extra files are ignored.
file_name = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"Warning: {len(uploaded)} files were uploaded; only '{file_name}' is used.")
dataset_path = file_name

print(f"Loading dataset from: {dataset_path}")
try:
    # "json" loader with a single data file; everything lands in the "train" split.
    dataset = load_dataset("json", data_files=dataset_path, split="train")
    print(f"Dataset loaded with {len(dataset)} examples.")
except Exception as e:
    # Print a short summary first, then re-raise so the full traceback is kept
    # and the notebook stops here.
    print(f"Error loading dataset: {e}")
    raise

# Define the formatting function based on sft.md
def format_to_chat(example):
    """Convert one Q&A record into a two-turn chat example.

    Args:
        example: Mapping that may contain "question" and "answer" entries.

    Returns:
        A dict with a single "messages" key holding a user turn (the question)
        followed by an assistant turn (the answer). A missing or null field
        becomes an empty string so that every "content" is never None.
    """
    question = example.get("question")
    answer = example.get("answer")
    # A default passed to .get() only covers a missing key. A key that is
    # present with a null value still comes back as None, so handle both here.
    if question is None:
        question = ""
    if answer is None:
        answer = ""
    return {
        "messages": [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]
    }

print("Formatting dataset to chat format...")
# Remember the raw column names so the map below can drop them afterwards.
original_columns = dataset.column_names

# Convert every row with format_to_chat and discard the original columns.
dataset = dataset.map(format_to_chat, remove_columns=original_columns)
print("Dataset formatted.")

# Sanity check: show the first converted row, or warn when there is none.
if len(dataset) == 0:
    print("Warning: Dataset is empty after loading.")
else:
    print("Example formatted entry:", dataset[0]["messages"])

In [None]:
from unsloth.chat_templates import standardize_data_formats
# Pass the chat-formatted dataset through Unsloth's format standardizer before templating.
# NOTE(review): presumably this normalizes alternative conversation schemas to
# role/content messages; confirm it acts on the "messages" column used here.
dataset = standardize_data_formats(dataset)

In [None]:
# Preview one formatted example. Row 10 is shown when it exists; the index is
# clamped so the cell also works for uploads with fewer than 11 rows.
dataset[min(10, len(dataset) - 1)]

In [None]:
def apply_chat_template(examples):
    """Render a batch of conversations to prompt strings.

    Args:
        examples: A batch from `dataset.map(..., batched=True)`; its "messages"
            entry holds one conversation per row.

    Returns:
        A dict with a "text" entry containing the rendered conversations.
    """
    # tokenize=False is passed explicitly: plain tokenizers default to returning
    # token ids, while the "text" column consumed by the trainer must hold strings.
    texts = tokenizer.apply_chat_template(examples["messages"], tokenize = False)
    return { "text" : texts }

dataset = dataset.map(apply_chat_template, batched = True)

In [None]:
# Show the rendered training text of one example. The index is clamped so the
# cell also works when the dataset has fewer than 11 rows.
dataset[min(10, len(dataset) - 1)]["text"]


<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 30 steps to speed things up, but for a full run you can set `num_train_epochs=1` and `max_steps=None`.

In [None]:
from trl import SFTTrainer, SFTConfig
# Training hyperparameters, built on their own so they are easy to inspect and tweak.
sft_config = SFTConfig(
    dataset_text_field = "text",         # column holding the rendered conversations
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,     # gradient accumulation mimics a larger batch size
    warmup_steps = 5,
    # num_train_epochs = 1,              # set this for one full pass over the data
    max_steps = 30,                      # short demo run
    learning_rate = 2e-4,                # reduce to 2e-5 for long training runs
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    report_to = "none",                  # change this to report to WandB etc.
)

# Supervised fine-tuning trainer over the templated dataset; no evaluation set is configured.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = sft_config,
)

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer so that, as the helper's name says, only the responses are trained on.
# NOTE(review): presumably every token outside a model turn gets the label -100;
# the label-decoding cell further down filters on -100, so confirm the masking there.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",  # marker that opens a user turn in the rendered text
    response_part = "<start_of_turn>model\n",  # marker that opens a model turn in the rendered text
)

In [None]:
# Decode the token ids of one training example back to text. The index is
# clamped so the cell also works when there are fewer than 11 examples.
tokenizer.decode(trainer.train_dataset[min(10, len(trainer.train_dataset) - 1)]["input_ids"])

In [None]:
# Decode the labels of one training example to check what is being trained on.
# The index is clamped so the cell also works when there are fewer than 11 examples.
label_sample_index = min(10, len(trainer.train_dataset) - 1)
label_sample = trainer.train_dataset[label_sample_index]["labels"]

# -100 is not a real token id, so swap it for the pad token before decoding and
# then render those pad tokens as spaces.
decodable_label_ids = [tokenizer.pad_token_id if x == -100 else x for x in label_sample]
tokenizer.decode(decodable_label_ids).replace(tokenizer.pad_token, " ")

In [None]:
# Run the fine-tuning loop; the value returned by train() is kept in
# trainer_stats so it can be inspected afterwards.
trainer_stats = trainer.train()

<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`

In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the Gemma 3 chat template to the tokenizer used for prompting.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

# A single user turn containing one text part.
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "What are some capabilities of Large Language Models (LLMs)?",
    }]
}]

# Render the conversation to a prompt string. tokenize=False is explicit because
# the result is tokenized separately just below.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    tokenize = False,
)

outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64,
    temperature = 1.0, top_p = 0.95, top_k = 64,
)


def _extract_model_answer(decoded, start_marker = "<start_of_turn>model\n", end_marker = "<end_of_turn>"):
    """Return the text between the model-turn marker and the end-of-turn marker.

    Args:
        decoded: Full decoded generation, prompt included.
        start_marker: Text that opens the model turn.
        end_marker: Text that closes a turn.

    Returns:
        The stripped answer. If `start_marker` is absent the answer starts at the
        beginning of `decoded`; if `end_marker` is absent (e.g. generation hit
        max_new_tokens first) the answer runs to the end of `decoded`.
    """
    start = decoded.find(start_marker)
    # str.find returns -1 when the marker is missing; adding len(marker) to that
    # would give a bogus offset, so fall back to the start of the string.
    start = 0 if start == -1 else start + len(start_marker)
    end = decoded.find(end_marker, start)
    # A -1 used as a slice end silently drops the final character, so use the
    # full length when the turn was never closed.
    if end == -1:
        end = len(decoded)
    return decoded[start:end].strip()


# Extract just the answer part
raw_output = tokenizer.batch_decode(outputs)[0]
answer = _extract_model_answer(raw_output)

# Print only the answer
print(answer)

# Saving the new fine-tuned model

In [None]:
# Save the fine-tuned model together with the tokenizer under "gemma-3-finetune".
# NOTE(review): "merged" presumably means the LoRA adapters are folded into the
# base weights before saving; confirm against the Unsloth documentation.
model.save_pretrained_merged("gemma-3-finetune", tokenizer)

Publish it as an open-source model on the Hugging Face Hub

In [None]:
import os
from getpass import getpass

# Never paste an access token into the notebook: it would be saved (and shared)
# with the file. Read it from the HF_TOKEN environment variable, or prompt for
# it without echoing when the variable is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Upload the model and tokenizer to the Hub.
# Replace HF_ACCOUNT with your own Hugging Face user or organization name.
model.push_to_hub_merged(
    "HF_ACCOUNT/gemma-3-finetune", tokenizer,
    token = hf_token,
)