# Training Llama 3.1 with QLoRA

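This notebook demonstrates QLoRA fine-tuning of Llama 3.1 8B on Google Colab or Vertex AI. Before running it, make sure your Hugging Face account has been granted access to the gated `meta-llama/Llama-3.1-8B` checkpoint and that you are logged in, and use a GPU with bfloat16 support, since both the quantization config and the training arguments request bf16.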

In [None]:
# Install dependencies (run once)
!pip install -q torch transformers peft trl bitsandbytes accelerate datasets mlflow

In [None]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTConfig, SFTTrainer

# Report whether a CUDA device is visible and, if so, its name and total memory.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    device_name = torch.cuda.get_device_name(0)
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {device_name}")
    # total_memory is divided by 1e9, so the figure is decimal gigabytes.
    print(f"Memory: {device_props.total_memory / 1e9:.1f} GB")

In [None]:
# Configuration: every value a reader is expected to tune lives in this cell.

# Model checkpoint, training data and output location.
BASE_MODEL = "meta-llama/Llama-3.1-8B"
DATASET_PATH = "sample_dataset.jsonl"  # Replace with your dataset
OUTPUT_DIR = "./llama-3.1-8b-finetuned"

# LoRA adapter settings: rank, scaling alpha and dropout.
LORA_R, LORA_ALPHA, LORA_DROPOUT = 64, 16, 0.05

# Optimisation settings.
NUM_EPOCHS = 3
BATCH_SIZE = 4  # per-device batch size
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 2048  # sequence-length cap handed to the trainer

In [None]:
# Load tokenizer
# `trust_remote_code` is intentionally not set: it would allow Python shipped
# in the checkpoint repository to execute, and the Llama tokenizer is provided
# by transformers itself, so nothing needs it.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# QLoRA configuration: 4-bit NF4 weights with double quantization; matmuls are
# computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load model
# `trust_remote_code` is intentionally not set: it would allow Python shipped
# in the checkpoint repository to execute, and the Llama architecture is
# implemented in transformers itself, so nothing needs it.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate decide device placement
    torch_dtype=torch.bfloat16,
)

# Prepare the quantized model for training, with gradient checkpointing enabled.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [None]:
# Configure LoRA
# Rank, alpha and dropout come from the configuration cell; bias terms are
# left untouched (bias="none").
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=[
        # q/k/v/o projection modules
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # gate/up/down projection modules
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the prepared model with the adapters and print the trainable-parameter summary.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Load and format dataset
def format_instruction(example):
    """Render one instruction record as a Llama 3 style training string.

    Args:
        example: Mapping with "instruction" and "output" entries and an
            optional "input" entry carrying extra context for the instruction.

    Returns:
        A dict with a single "text" key holding one user turn followed by one
        assistant turn, each terminated by ``<|eot_id|>``.

    Raises:
        KeyError: If "instruction" or "output" is missing from ``example``.
    """
    instruction = example["instruction"]
    input_text = example.get("input", "")
    output = example["output"]

    # A missing, None or empty "input" leaves the instruction as the whole user turn.
    user_content = instruction
    if input_text:
        user_content += f"\n\n{input_text}"

    # Two deliberate details of the template:
    #  * no literal <|begin_of_text|>: the Llama 3 tokenizer prepends BOS itself
    #    when special tokens are added, so spelling it out here produced two
    #    BOS tokens per sample;
    #  * each header is followed by a blank line ("\n\n"), as in the Llama 3
    #    prompt format, so the trained format matches the standard chat template.
    return {
        "text": (
            f"<|start_header_id|>user<|end_header_id|>\n\n{user_content}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n{output}<|eot_id|>"
        )
    }

# Read the JSON-lines records and render each one into a "text" column.
raw_dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
formatted_dataset = raw_dataset.map(format_instruction)

# Hold out 10% for evaluation. The seed is fixed so that the train/eval
# partition is the same on every Restart & Run All; without it the eval set
# (and therefore the "best" checkpoint) changes from run to run.
dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)

print(f"Train: {len(dataset['train'])} samples")
print(f"Eval: {len(dataset['test'])} samples")

In [None]:
# Training arguments
# SFTConfig extends TrainingArguments with the SFT-specific options. The text
# column and the sequence-length cap are set here because current TRL releases
# do not accept `dataset_text_field` / `max_seq_length` as SFTTrainer keyword
# arguments (the constructor raises TypeError on them).
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,  # effective per-device batch = BATCH_SIZE * 4
    learning_rate=LEARNING_RATE,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=10,
    # save_steps and eval_steps are kept equal so that every evaluated step
    # also has a checkpoint, which load_best_model_at_end relies on.
    save_steps=100,
    eval_steps=100,
    eval_strategy="steps",
    bf16=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    dataset_text_field="text",
    max_length=MAX_SEQ_LENGTH,  # older TRL releases name this `max_seq_length`
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
)

In [None]:
# Train!
# Runs the fine-tuning loop with the trainer built in the previous cell.
# The call is left as the cell's last expression so its return value is
# displayed as the cell output.
trainer.train()

In [None]:
# Save model
# Writes the trainer's model and the tokenizer files into OUTPUT_DIR so the
# directory can be loaded on its own later.
# NOTE(review): `model` was wrapped by get_peft_model, so this presumably
# stores the LoRA adapter rather than merged full weights — confirm before
# using the directory for deployment.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")