## Fine-tuning LLMs with HuggingFace, PEFT (LoRa/QLoRA)

---

In [16]:
# %pip install transformers==4.40.2, peft==0.10.0\n",
# %pip install accelerate==1.7.0\n",
# %pip install bitsandbytes==0.41.1 --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui\n",
# %pip install torch==2.2.1+cu121 torchvision==0.17.1+cu121 torchaudio==2.2.1+cu121 --index-url https://download.pytorch.org/whl/cu121\n",
# %pip install bitsandbytes-cuda117==0.26.0.post2\n",
# %pip install -i https://pypi.org/simple/ bitsandbytes\n",
# %pip install trl\n",
# %pip install numpy\n",
# %pip install safetensors

### 0 - Setup

In [17]:
# HuggingFace
import transformers, peft
from datasets import load_dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from transformers.trainer_callback import TrainerCallback
from transformers.integrations import MLflowCallback
from trl import SFTTrainer
import bitsandbytes as bnb
from bitsandbytes.optim import AdamW8bit

# Models/MLOps
from ollama import chat
import torch
# import mlflow
# import mlflow.transformers

# System
from dotenv import load_dotenv
import os, sys, subprocess
import gc # Garbage collector

# Extras
import accelerate
from importlib.metadata import version
import warnings
from tqdm import tqdm # Progress bar
warnings.filterwarnings('ignore', category=UserWarning)

# Model and dataset configuration
# model_name = "meta-llama/Llama-2-7b-hf"
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' # More suitable for my GPU
train_dataset_path = "../data/training_dataset.jsonl"
output_model_dir = '../models/TinyLlama-1.1b-Chat-FineTuned-v1.0'
merged_model_dir = '../models/TinyLlama-1.1b-Chat-FineTuned-v1.0-merged'

# Loading environment variables
load_dotenv()
HF_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

Asserting that transformers and peft lib versions are compatible

In [18]:
print('transformers version:', transformers.__version__)
print('peft version:', peft.__version__)
print('bitsandbytes version:', version('bitsandbytes'))
print('trl version:', version('trl'))
print('accelerate version:', accelerate.__version__)
print(f"PyTorch version: {torch.__version__}. - Must be a version with GPU (CUDA) support, not CPU only.")

# Asserting versions
# assert transformers.__version__ == '4.40.2', 'transformers version mismatch'
# assert peft.__version__ == '0.10.0', 'peft version mismatch'

transformers version: 4.52.4
peft version: 0.15.2
bitsandbytes version: 0.46.0
trl version: 0.18.2
accelerate version: 1.8.1
PyTorch version: 2.2.1+cu121. - Must be a version with GPU (CUDA) support, not CPU only.


In [19]:
def check_nvidia_smi():
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
        return result.stdout
    except FileNotFoundError:
        return "nvidia-smi not found. NVIDIA drivers may not be installed."

print("=== NVIDIA Driver Check ===")
print(check_nvidia_smi())

=== NVIDIA Driver Check ===
Thu Jun 26 18:32:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.57                 Driver Version: 576.57         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1050      WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   43C    P8            N/A  / 5001W |    1627MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                    

In [20]:
# Get GPU memory info
gpu_memory = torch.cuda.get_device_properties(0).total_memory
max_memory = {0: f"{gpu_memory * 0.85 / 1e9:.1f} GB"}  # Use 85% of GPU memory

print(f"GPU Memory: {gpu_memory / 1e9:.1f} GB")
print(f"Max memory for model: {max_memory}")

GPU Memory: 4.3 GB
Max memory for model: {0: '3.7 GB'}


CUDA Availability Check

In [21]:
# Check CUDA availability
use_gpu = torch.cuda.is_available()
print('CUDA Available:', use_gpu)
print('Current CUDA device:', torch.cuda.current_device() if use_gpu else 'No CUDA device')
print('CUDA version:', torch.version.cuda if use_gpu else 'Not Available')

if use_gpu:
    print(f'CUDA device: {torch.cuda.get_device_name(0)}')
    print(f'CUDA capability:', torch.cuda.get_device_capability(0))

CUDA Available: True
Current CUDA device: 0
CUDA version: 12.1
CUDA device: NVIDIA GeForce GTX 1050
CUDA capability: (6, 1)


#### Releasing GPU Memory

In [22]:
torch.cuda.empty_cache()
print('Bytes collected:')
gc.collect()

Bytes collected:


607

---

### 1- Loading Train Data

In [23]:
# Training data for Fine-Tuning
dataset = load_dataset("json", data_files=train_dataset_path, split="train")


# Formatting the dataset for training
def formatting(example):
    text = f"### Prompt:\n{example['prompt']}\n\n### Response:\n{example['response']}"
    return {"text": text}

dataset = dataset.map(formatting)

### 1.1- Loading Tokenizer and transforming Train Data

In [24]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Tokenizing formatted dataset
def preprocess(examples):
    # Tokenize the texts with padding and truncation
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors=None  # Return lists instead of tensors
    )
    
    # Set up the labels for training
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized

# Apply preprocessing to the entire dataset at once
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names
)

---

### 2- Loading and Configuring Model

In [25]:
# Configure 8-bit quantization for efficient GPU usage
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=6.0,  # Threshold for outlier detection
    llm_int8_has_fp16_weight=False,  # Disabling fp16 for weights to avoid dtype mismatch
    llm_int8_enable_fp32_cpu_offload=True  # Offload to CPU
)

# Load Model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    use_cache=False,  # Important for training
    device_map="auto",  # Let accelerate handle device mapping
    torch_dtype=torch.float32  # Use float32 as base dtype
)

KeyboardInterrupt: 

### 2.1- Testing Model With Some Questions

In [None]:
# Test questions
questions = [
    "What is EEC?",
    "Explain what is APU.",
    "What can I see in MM03?",
    "What is AHEAD?",
    "What is EPEP?",
    "What is BER?"
]

for question in questions:
    prompt = f"### Prompt:\n{question}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, 
                             max_new_tokens=256, 
                             do_sample=True,
                             temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("### Response:")[-1].strip()
    print(f"\nQ: {question}")
    print(f"A: {response}\n---")



Q: What is EEC?
A: EEC is an acronym for Electrical Engineering Colleges. It refers to the set of institutions that offer undergraduate and graduate programs in electrical engineering, electrical engineering technology, and related fields.
---

Q: Explain what is APU.
A: APU stands for Accelerated Pathways Unified. It is a program introduced by the California Community Colleges to provide students with a personalized learning experience that includes a blend of online and in-person learning. The program offers students the opportunity to earn two associate degrees within four years, while also completing the requirements for a transfer to a four-year university. The online classes are supplemented by in-person instruction, which helps to ensure that students have access to the resources and support they need to succeed.
---

Q: What can I see in MM03?
A: In MM03, you can see the following:

- A map of the world
- A time-traveling camera
- A world map with different time periods
- Inte

---

### 3- Preparing for PEFT - Applying LoRA/QLoRA

In [10]:
# Prepare for PEFT
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA Config
lora_config = LoraConfig(
    r=8, # Rank of the LoRA matrix, the number of trainable parameters (the higher the more trainable parameters, but also more memory and computation)
    lora_alpha=32, # Scaling factor for the LoRA matrix
    target_modules=["q_proj", "v_proj"], # Query and Value projection layers for TinyLlama
    lora_dropout=0.05, # Dropout rate for the LoRA matrix
    bias="none", # Bias for the LoRA matrix (not used for TinyLlama)
    task_type="CAUSAL_LM" # Task type for the LoRA matrix
)

model = get_peft_model(model, lora_config)

### Trainable Parameters Overview

In [11]:
print(model.print_trainable_parameters())

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023
None


---

### 4- Data Collator

A data collator is a crucial component in the training pipeline that prepares batches of data for the model. 
In this case, we're using DataCollatorForLanguageModeling which:
1. Pads sequences to the same length within each batch
2. Creates attention masks to handle the padding
3. Prepares the labels for language modeling

The mlm=False parameter indicates we're doing causal language modeling (predicting next token) 
rather than masked language modeling (predicting masked tokens).

This collator is necessary because:
- It ensures all sequences in a batch have the same length through padding
- It properly formats the input for the model's forward pass
- It handles the creation of labels for the language modeling task
- It optimizes memory usage by padding only within each batch rather than to a fixed length -->


In [12]:
data_collator = DataCollatorForLanguageModeling(tokenizer, 
                                                mlm=False) # As we are working with Causal Language Modeling (predicting next token), not MLM (Masked Language Modeling)

---

### 5- Configure Training Arguments

I'll  avoid Trainer, its returning error when trying to train():

"The safest path on your setup is to avoid Trainer and instead train using a custom training loop with Accelerate, which gives you more control and avoids hidden offloading. Sometimes Hugging Face's Trainer tries to offload to CPU automatically if it detects low VRAM."

In [None]:
# Training Arguments - Optimized for GTX 1050 4GB VRAM with TinyLlama 1.1B + LoRA
training_args = TrainingArguments(
    output_dir=output_model_dir,
    per_device_train_batch_size=1,  # Keep small batch size for 4GB VRAM
    gradient_accumulation_steps=16,  # Maintain effective batch size
    optim="adamw_8bit",  # Switching from Adamw_32bit to 8-bit optimizer from BitsAndBytes
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    logging_steps=20,
    num_train_epochs=3,
    max_steps=60, # Change to 200 when ready to production. I'll keep smaller for developing purposes
    bf16=False,  # Disable bf16 since we're using fp16
    fp16=True,  # Use fp16 for training
    torch_compile=False,  # Disable torch compilation
    gradient_checkpointing=True,
    warmup_steps=50,
    max_grad_norm=0.3,
    ddp_find_unused_parameters=False,
    dataloader_num_workers=0,       # For pre-loading batches in background
    remove_unused_columns=False,
    group_by_length=True,  # Group similar length sequences for efficiency
    save_strategy="epoch", 
    save_total_limit=5, # Only 5 last checkpoints
    # report_to="mlflow"  # MLFlow reporting
)

# Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    peft_config=lora_config,
    args=training_args,
    data_collator=data_collator,
    callbacks=[
        # MLflowCallback()
    ]
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


---

### 6 - Training

In [23]:
# Monitoring with MLFlow    
# with mlflow.start_run():
trainer.train()

Step,Training Loss
20,0.2491
40,0.0583
60,0.0464
80,0.0431
100,0.0427
120,0.0419
140,0.0412
160,0.0417


TrainOutput(global_step=160, training_loss=0.07055153772234916, metrics={'train_runtime': 1122.2936, 'train_samples_per_second': 2.281, 'train_steps_per_second': 0.143, 'total_flos': 163050970152960.0, 'train_loss': 0.07055153772234916})

---

### 7- Merging with Base Model

In [29]:
# Get the peft_model that is already loaded in the trainer
peft_model = trainer.model  # AutoPeftModelForCausalLM with the weights of the last checkpoint

# Merge the LoRA delta-weights into the base model and remove the PEFT wrapper
merged_model = peft_model.merge_and_unload()

### 8- Saving Fine-Tuned LLM

In [None]:
# Saving merged model
os.makedirs(merged_model_dir, exist_ok=True)
merged_model.save_pretrained(merged_model_dir, safe_serialization=True)

# Saving tokenizer
tokenizer.save_pretrained(merged_model_dir)

ImportError: cannot import name 'DTensor' from 'torch.distributed.tensor' (c:\Users\Paulo\Documents\repos\fine-tuning-llms\venv\Lib\site-packages\torch\distributed\tensor\__init__.py)

### 9- Testing Fine-Tuned Model in Same Questions

In [None]:
# Assert model is in GPU
merged_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Making the same questions
for question in questions:
    prompt = f"### Prompt:\n{question}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, 
                             max_new_tokens=256, 
                             do_sample=True,
                             temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("### Response:")[-1].strip()
    print(f"\nQ: {question}")
    print(f"A: {response}\n---")