<a href="https://colab.research.google.com/github/mightyoctopus/lora-fine-tuning-example-code/blob/main/LoRA_PG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q peft transformers datasets torch accelerate

In [2]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1


Dataset: Rabe3/QA_Synthatic_Medical_data

Model (Instrcut): Qwen/Qwen3-0.6B

- Load model
- tokenizing
- quantization
- load dataset
- Prepare Dataset Format

tokenizer.apply_chat_template(messages, tokenize=False)

- Lora Config
- train
- Save the fine tuned model

In [8]:
from torch.utils.data import DataLoader
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
                          )
from peft import LoraConfig, get_peft_model, TaskType

from google.colab import drive, userdata
from huggingface_hub import login, snapshot_download

import os

In [4]:
model_name = "Qwen/Qwen3-0.6B"
dataset_name = "Rabe3/QA_Synthatic_Medical_data"

In [5]:
drive.mount("/content/drive")
cache_path = "/content/drive/My Drive/models/huggingface_cache"

Mounted at /content/drive


In [25]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model_folder_name = "models--" + model_name.replace("/", "--")
parent_model_path_in_drive = os.path.join(cache_path, model_folder_name)

if not os.path.exists(parent_model_path_in_drive):
    print("Model not found in Drive -- downloading from HF...")
    model_path = snapshot_download(
        repo_id=model_name,
        cache_dir=cache_path,
        local_dir_use_symlinks=False
    )
else:
    print("Model found in Drive -- fetching from the cache...")
    snapshots_dir = os.path.join(parent_model_path_in_drive, "snapshots")
    drive_id = os.listdir(snapshots_dir)

    if drive_id:
        model_path = os.path.join(snapshots_dir, drive_id[0])
    else:
        raise ValueError("No snapshot found in the cache path.")

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=cache_path
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)


Model found in Drive -- fetching from the cache...


In [10]:
lora_config = LoraConfig(
    r = 8,
    lora_alpha=16,
    bias="none",
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

In [11]:
data = load_dataset(dataset_name, "default", split="train[:200]")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.jsonl:   0%|          | 0.00/45.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20750 [00:00<?, ? examples/s]

In [12]:
def tokenize(batch):

    for convo in batch["conversations"]:
        for turn in convo:
            human_msg = turn["value"] if turn["from"] == "human" else ""
            assisant_msg = turn["value"] if turn["from"] == "gpt" else ""

            texts = [
                f"### Instruction:\n{human_msg}\n### Response:\n{assisant_msg}"
            ]

    tokens = tokenizer(
        texts,
        padding="max_length",
        max_length=256,
        truncation=True,
        return_tensors="pt"
    )

    tokens["labels"] = tokens["input_ids"].clone()

    return tokens

In [13]:
tokenized_data = data.map(tokenize, batched=True, remove_columns=data.column_names)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [14]:
training_args = TrainingArguments(
    output_dir = "./fine_tuned_result",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate= 1e-3,
    num_train_epochs=50,
    fp16=True,
    logging_steps=20,
    save_strategy="epoch",
    remove_unused_columns=False,
    label_names=["labels"],
    report_to="none"
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    processing_class=tokenizer
)

In [16]:
result = trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
20,0.4667
40,0.0372


In [17]:
print(result.training_loss)

0.2080682224035263


In [18]:
### Get Perplexity:
import math

loss = result.training_loss
perplexity = math.exp(loss)

print(f"Perplexity: {perplexity}")

Perplexity: 1.2312971687358283
