<div align="center">
  <img src="logo_branding.png" width="250" alt="kavi.ai Logo">
  <h1>QLoRA: Scaling on Consumer Hardware</h1>
  <p><b>A Premium Training Module by kavi.ai</b></p>
</div>

---

### 💎 **Smarter Overview**
QLoRA is a parameter-efficient fine-tuning method: the base model is loaded with frozen 4-bit quantized weights and small LoRA adapters are trained on top of it, aiming to match 16-bit fine-tuning accuracy at a fraction of the memory.

### 🚀 **Enterprise Use Case**
Hosting and fine-tuning 70B+ parameter models on commodity workstation hardware.

### 📈 **Strategic Advantages**
- **Hardware Independence**
- **Paged Optimizers**
- **NF4 Quantization**

---

## Step 1: Quantization Setup

### **Purpose:**
Loading the model in 4-bit precision to fit large models onto smaller GPUs.

### **Line-by-Line Breakdown:**
- `BitsAndBytesConfig`: Configure 4-bit/8-bit quantization flags.

In [None]:
!pip install transformers --upgrade
!pip install datasets
!pip install trl[peft] --upgrade
!pip install -U git+https://github.com/huggingface/trl
!pip install bitsandbytes loralib
!pip install accelerate
!pip install wandb -U
!pip install hf_transfer


In [None]:
# Environment variables for the Hugging Face Hub client and Weights & Biases.
%env HF_HUB_ENABLE_HF_TRANSFER=True
%env WANDB_PROJECT=LLM-Training-Course
# This value is read back later with os.getenv("WANDB_RUN_ID") to name the trainer's output_dir.
%env WANDB_RUN_ID=QLORA
# NOTE(review): `__vsc_ipynb_file__` looks like a VS Code-provided variable — confirm behaviour in other frontends.
%env WANDB_NOTEBOOK_NAME={__vsc_ipynb_file__}

In [None]:
import wandb
# Authenticate with Weights & Biases; the trainer configuration below reports to "wandb".
wandb.login()

In [None]:
import sys

# Extra import root; presumably where the local `helpers` module used by later cells lives.
COURSE_ROOT = '/root/llm-training-course/'
# Guarded so that re-running this cell does not keep adding duplicate entries to sys.path.
if COURSE_ROOT not in sys.path:
    sys.path.append(COURSE_ROOT)

In [None]:
from datasets import load_dataset

# Two slices of the same "train" split: the first 10% for training and the
# following 5% (10%..15%) for evaluation.
split_specs = ["train[:10%]", "train[10%:15%]"]
train_ds, eval_ds = load_dataset("mlabonne/orpo-dpo-mix-40k", split=split_specs)

In [None]:
# Display the training split's summary.
train_ds

In [None]:
def _with_messages(example):
    """Build the `messages` column for one row: the row's `prompt` as a system turn, followed by its `chosen` turns."""
    # NOTE(review): if `chosen` already starts with the prompt as a user turn, the
    # prompt text appears twice in the conversation — confirm this is intended.
    system_turn = {"role": "system", "content": example["prompt"]}
    return {"messages": [system_turn] + example["chosen"]}


train_ds = train_ds.map(_with_messages)
eval_ds = eval_ds.map(_with_messages)

In [None]:
# Strip every column except `messages` from both splits.
KEPT_COLUMNS = ("messages",)

columns_to_remove = [name for name in train_ds.column_names if name not in KEPT_COLUMNS]
train_ds = train_ds.remove_columns(columns_to_remove)

columns_to_remove = [name for name in eval_ds.column_names if name not in KEPT_COLUMNS]
eval_ds = eval_ds.remove_columns(columns_to_remove)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hub id of the base checkpoint; reused below when the model weights are loaded.
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Make "<|eot_id|>" the tokenizer's EOS token and keep the id in sync with it.
eot_token = "<|eot_id|>"
tokenizer.eos_token = eot_token
tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids(eot_token)

In [None]:
# Report GPU status via the course's local helper before the model is loaded.
from helpers import get_gpu_status
get_gpu_status()

In [None]:
# Dump the tokenizer, then its vocabulary size and current chat template,
# each section preceded by a "---" separator line.
print(tokenizer)
for label, value in (
    ("Vocab size:", tokenizer.vocab_size),
    ("Chat template:", tokenizer.chat_template),
):
    print("---")
    print(label, value)

In [None]:
# Read the chat template file and attach it to the tokenizer.
# The context manager closes the file handle as soon as the template is read, and
# the explicit encoding keeps the read independent of the platform default.
with open('../chat_templates/llama-3-chat.jinja', encoding='utf-8') as template_file:
    chat_template = template_file.read()
# Collapse the template onto one line by dropping 4-space indents and newlines.
chat_template = chat_template.replace('    ', '').replace('\n', '')
print("Chat Template", chat_template)
tokenizer.chat_template = chat_template
print("---")
# Index the row first so a single example is fetched rather than the whole `messages` column.
print(tokenizer.apply_chat_template(train_ds[0]["messages"], tokenize=False))

In [None]:
# Configure padding on the tokenizer via the course's local helper (implementation not shown here).
from helpers import set_padding_for_tokenizer
set_padding_for_tokenizer(tokenizer)

In [None]:

# 4-bit quantisation settings for loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store the base weights in 4-bit
    bnb_4bit_use_double_quant=True,         # enable bitsandbytes' double quantisation
    bnb_4bit_quant_type="nf4",              # NF4 quantisation type
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype used with the 4-bit weights
)

# Load the checkpoint onto the first GPU with the quantisation config applied.
# `trust_remote_code` is deliberately left at its default (False): this checkpoint uses an
# architecture built into transformers, so there is no reason to allow executing code
# shipped in the Hub repository.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)



In [None]:
# Print the loaded model's module tree.
print(model)

In [None]:
from helpers import stream_responses_for_sample
from transformers import GenerationConfig

# Baseline check before any fine-tuning: stream short (max 50 new tokens) answers
# from the quantised base model for a few single-turn user prompts.
generation_config = GenerationConfig(max_new_tokens=50)
sample_prompts = (
    "What is the capital of France?",
    "Write me a javascript function that check if string is palindrome.",
    "Given x^2=36-4 what is x?",
)
sample_conversations = [
    [{"role": "user", "content": prompt}] for prompt in sample_prompts
]
stream_responses_for_sample(
    model,
    tokenizer,
    sample_conversations,
    generation_config=generation_config,
)

In [None]:
from peft import LoraConfig

# LoRA adapter configuration applied to the quantised base model in the next code cell.
peft_config = LoraConfig(
    r=8,  # adapter rank
    lora_alpha=32,  # LoRA alpha (scaling) hyper-parameter
    lora_dropout=0.05,  # dropout applied inside the adapters
    target_modules="all-linear",  # attach adapters to all linear layers rather than a hand-picked list
    # Modules listed here are kept trainable in full (not via LoRA) and saved with the adapter.
    # NOTE(review): `embed_tokens` is a large matrix — confirm the extra memory is intended.
    modules_to_save=["embed_tokens", "input_layernorm", "post_attention_layernorm"],
    bias="none",  # do not train bias terms
    task_type="CAUSAL_LM",
)

## Step 2: Apply LoRA Adapters

### **Purpose:**
Injecting lightweight trainable adapters into the quantized base model.

### **Line-by-Line Breakdown:**
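- `LoraConfig` (created in the preceding cell): defines the rank (`r`) and alpha of the adapters.
- `get_peft_model`: wraps the quantized base model with the adapters described by that config.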

In [None]:
from peft import get_peft_model
# Wrap the quantised base model with the adapters described by `peft_config`.
peft_model = get_peft_model(model, peft_config)

In [None]:
# Update weights with loftq but only for what helps improve the MSE
from helpers import update_with_loftq_weights_if_useful
from tqdm.auto import tqdm

# Size the progress bar to the number of modules in the PEFT-wrapped model.
module_count = sum(1 for _ in peft_model.named_modules())
pbar = tqdm(total=module_count)
update_with_loftq_weights_if_useful(peft_model, tokenizer=tokenizer, pbar=pbar)

In [None]:
# Report how many parameters are trainable on the PEFT-wrapped model (course helper; implementation not shown here).
from helpers import print_number_of_trainable_parameters
print_number_of_trainable_parameters(peft_model)

In [None]:
# Report GPU status again, now that the quantised model and adapters have been created.
from helpers import get_gpu_status
get_gpu_status()

In [None]:
import os
from trl import SFTConfig, SFTTrainer

# Training configuration for the supervised fine-tuning run.
args = SFTConfig(
    # Checkpoints go to a directory named after the W&B run id; fall back to the same
    # literal the env cell sets so this cell still works if that cell was skipped.
    output_dir=os.getenv("WANDB_RUN_ID", "QLORA"),
    report_to="wandb",
    num_train_epochs=1.0,
    do_train=True,
    do_eval=True,
    log_level="debug",
    gradient_checkpointing=True,
    # Effective batch per device = 2 sequences x 16 accumulation steps = 32.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=1,
    lr_scheduler_type="constant",
    bf16=True,
    # `eval_strategy` is the current spelling of this option; the older
    # `evaluation_strategy` name is deprecated in recent transformers releases,
    # which the unpinned install cell pulls in.
    eval_strategy="steps",
    # Values below 1 are interpreted as a fraction of the total number of training steps.
    eval_steps=0.2,
    logging_steps=0.1,
    max_grad_norm=.3,
    learning_rate=5e-5,
)


In [None]:
# Build the supervised fine-tuning trainer around the PEFT-wrapped model and start training.
# The tokenizer is passed as `processing_class`, the keyword current TRL releases use
# in place of the former `tokenizer=` argument (the install cell pulls TRL from git main).
trainer = SFTTrainer(
    model=peft_model,
    processing_class=tokenizer,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)
trainer.train()

In [None]:

# Same prompts as the baseline check, now answered by the fine-tuned PEFT model
# with a longer generation budget and an explicit EOS id.
# NOTE(review): `length_penalty` normally only matters for beam-based decoding — confirm
# whether `stream_responses_for_sample` uses beams; otherwise it likely has no effect.
generation_config = GenerationConfig(
    max_new_tokens=250,
    eos_token_id=tokenizer.eos_token_id,
    length_penalty=-0.5,
)
sample_prompts = (
    "What is the capital of France?",
    "Write me a javascript function that check if string is palindrome.",
    "Given x^2=36-4 what is x?",
)
sample_conversations = [
    [{"role": "user", "content": prompt}] for prompt in sample_prompts
]
stream_responses_for_sample(
    peft_model,
    tokenizer,
    sample_conversations,
    generation_config=generation_config,
)

In [None]:
# Display the PEFT-wrapped model after training.
peft_model

In [None]:
# NOTE(review): this cell repeats the previous cell's display of `peft_model` — likely a leftover that can be removed.
peft_model

In [None]:
# merging the model

from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel

# Fold the trained LoRA weights into the base layers and drop the PEFT wrappers.
# `model` is rebound to the merged result.
# NOTE(review): the base model was loaded in 4-bit; merging into quantised layers can
# shift outputs slightly because of rounding. If exact fidelity matters, consider
# reloading the base weights in bf16 and merging the saved adapter into those — confirm.
model = peft_model.merge_and_unload()