## Fine-tuning LLMs with HuggingFace, PEFT (LoRa/QLoRA)

---

### 0 - Setup

In [24]:
# HuggingFace
import transformers, peft
from datasets import load_dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from transformers.trainer_callback import TrainerCallback
from transformers.integrations import MLflowCallback
from trl import SFTTrainer
import bitsandbytes as bnb
from bitsandbytes.optim import AdamW8bit

# Models/MLOps
from ollama import chat
import torch
import mlflow
import mlflow.transformers

# System
from dotenv import load_dotenv
import os, sys, subprocess
import gc # Garbage collector

# Extras
import accelerate
from importlib.metadata import version
import warnings
from tqdm import tqdm # Progress bar
warnings.filterwarnings('ignore', category=UserWarning)

# Model and dataset configuration
# model_name = "meta-llama/Llama-2-7b-hf"
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' # More suitable for my GPU
train_dataset_path = "../data/training_dataset.jsonl"
output_model = '../models/TinyLlama-1.1b-Chat-FineTuned-v1.0'

# Loading environment variables
load_dotenv()
HF_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

Asserting that transformers and peft lib versions are compatible

In [3]:
print('transformers version:', transformers.__version__)
#print('peft version:', peft.__version__)
print('bitsandbytes version:', version('bitsandbytes'))
print('trl version:', version('trl'))
print('accelerate version:', accelerate.__version__)
print(f"PyTorch version: {torch.__version__}. - Must be a version with GPU (CUDA) support, not CPU only.")

# Asserting versions
# assert transformers.__version__ == '4.40.2', 'transformers version mismatch'
#assert peft.__version__ == '0.10.0', 'peft version mismatch'

transformers version: 4.52.4
bitsandbytes version: 0.46.0
trl version: 0.18.2
accelerate version: 1.8.1
PyTorch version: 2.2.1+cu121. - Must be a version with GPU (CUDA) support, not CPU only.


In [4]:
def check_nvidia_smi():
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
        return result.stdout
    except FileNotFoundError:
        return "nvidia-smi not found. NVIDIA drivers may not be installed."

print("=== NVIDIA Driver Check ===")
print(check_nvidia_smi())

=== NVIDIA Driver Check ===
Mon Jun 23 20:56:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.57                 Driver Version: 576.57         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1050      WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   44C    P8            N/A  / 5001W |       0MiB /   4096MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                    

In [5]:
# Get GPU memory info
gpu_memory = torch.cuda.get_device_properties(0).total_memory
max_memory = {0: f"{gpu_memory * 0.85 / 1e9:.1f} GB"}  # Use 85% of GPU memory

print(f"GPU Memory: {gpu_memory / 1e9:.1f} GB")
print(f"Max memory for model: {max_memory}")

GPU Memory: 4.3 GB
Max memory for model: {0: '3.7 GB'}


CUDA Availability Check

In [6]:
# Check CUDA availability
use_gpu = torch.cuda.is_available()
print('CUDA Available:', use_gpu)
print('Current CUDA device:', torch.cuda.current_device() if use_gpu else 'No CUDA device')
print('CUDA version:', torch.version.cuda if use_gpu else 'Not Available')

if use_gpu:
    print(f'CUDA device: {torch.cuda.get_device_name(0)}')
    print(f'CUDA capability:', torch.cuda.get_device_capability(0))

CUDA Available: True
Current CUDA device: 0
CUDA version: 12.1
CUDA device: NVIDIA GeForce GTX 1050
CUDA capability: (6, 1)


#### Releasing GPU Memory

In [21]:
torch.cuda.empty_cache()
print('Bytes collected:')
gc.collect()

Bytes collected:


620

---

### 1- Loading Train Data

In [8]:
# Training data for Fine-Tuning
dataset = load_dataset("json", data_files=train_dataset_path, split="train")


# Formatting the dataset for training
def formatting(example):
    text = f"### Prompt:\n{example['prompt']}\n\n### Response:\n{example['response']}"
    return {"text": text}

dataset = dataset.map(formatting)

### 1.1- Loading Tokenizer and transforming Train Data

In [9]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Tokenizing formatted dataset
def preprocess(examples):
    # Tokenize the texts with padding and truncation
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors=None  # Return lists instead of tensors
    )
    
    # Set up the labels for training
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized

# Apply preprocessing to the entire dataset at once
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names
)

---

### 2- Loading and Configuring Model

In [10]:
# Configure 8-bit quantization for efficient GPU usage
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=6.0,  # Threshold for outlier detection
    llm_int8_has_fp16_weight=False,  # Disabling fp16 for weights to avoid dtype mismatch
    llm_int8_enable_fp32_cpu_offload=True  # Offload to CPU
)

# Load Model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    use_cache=False,  # Important for training
    device_map="auto",  # Let accelerate handle device mapping
    torch_dtype=torch.float32  # Use float32 as base dtype
)

---

### 3- Preparing for PEFT - Applying LoRA/QLoRA

In [11]:
# Prepare for PEFT
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA Config
lora_config = LoraConfig(
    r=8, # Rank of the LoRA matrix, the number of trainable parameters (the higher the more trainable parameters, but also more memory and computation)
    lora_alpha=32, # Scaling factor for the LoRA matrix
    target_modules=["q_proj", "v_proj"], # Query and Value projection layers for TinyLlama
    lora_dropout=0.05, # Dropout rate for the LoRA matrix
    bias="none", # Bias for the LoRA matrix (not used for TinyLlama)
    task_type="CAUSAL_LM" # Task type for the LoRA matrix
)

model = get_peft_model(model, lora_config)

### Trainable Parameters Overview

In [12]:
print(model.print_trainable_parameters())

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023
None


---

### 4- Data Collator

A data collator is a crucial component in the training pipeline that prepares batches of data for the model. 
In this case, we're using DataCollatorForLanguageModeling which:
1. Pads sequences to the same length within each batch
2. Creates attention masks to handle the padding
3. Prepares the labels for language modeling

The mlm=False parameter indicates we're doing causal language modeling (predicting next token) 
rather than masked language modeling (predicting masked tokens).

This collator is necessary because:
- It ensures all sequences in a batch have the same length through padding
- It properly formats the input for the model's forward pass
- It handles the creation of labels for the language modeling task
- It optimizes memory usage by padding only within each batch rather than to a fixed length -->


In [13]:
data_collator = DataCollatorForLanguageModeling(tokenizer, 
                                                mlm=False) # As we are working with Causal Language Modeling (predicting next token), not MLM (Masked Language Modeling)

---

### 5- Configure Training Arguments

I'll  avoid Trainer, its returning error when trying to train():

"The safest path on your setup is to avoid Trainer and instead train using a custom training loop with Accelerate, which gives you more control and avoids hidden offloading. Sometimes Hugging Face's Trainer tries to offload to CPU automatically if it detects low VRAM."

In [None]:
# Training Arguments - Optimized for GTX 1050 4GB VRAM with TinyLlama 1.1B + LoRA
training_args = TrainingArguments(
    output_dir=output_model,
    per_device_train_batch_size=1,  # Keep small batch size for 4GB VRAM
    gradient_accumulation_steps=16,  # Maintain effective batch size
    optim="paged_adamw_32bit",  # Use 32-bit optimizer for better stability
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=5,
    num_train_epochs=3,
    max_steps=250,
    bf16=False,  # Disable bf16 since we're using fp16
    fp16=True,  # Use fp16 for training
    torch_compile=False,  # Disable torch compilation
    gradient_checkpointing=True,
    warmup_steps=50,
    max_grad_norm=0.3,
    ddp_find_unused_parameters=False,
    dataloader_num_workers=2,       # Pre-loading batches in background
    remove_unused_columns=False,
    group_by_length=True,  # Group similar length sequences for efficiency
    report_to="none"  # Disable reporting to save memory
)

# Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    peft_config=lora_config,
    args=training_args,
    data_collator=data_collator,
    callbacks=[
        MLflowCallback()
    ]
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


---

### 6 - Training

In [15]:
# Monitoring with MLFlow    
with mlflow.start_run():
    trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
5,3.2884
10,3.2179
15,2.9209
20,2.6409
25,2.2722
30,1.8309
35,1.3912
40,0.9536
45,0.4983
50,0.1819




TrainOutput(global_step=250, training_loss=0.4182412056922913, metrics={'train_runtime': 1353.5198, 'train_samples_per_second': 2.955, 'train_steps_per_second': 0.185, 'total_flos': 254767140864000.0, 'train_loss': 0.4182412056922913})

---

### 7- Merging with Base Model

### 8- Saving Fine-Tuned LLM