# Fine-tune Llama 3.1 8B on Custom Data

This notebook lets you fine-tune the Llama 3.1 8B Instruct model on your own dataset using QLoRA (Quantized Low-Rank Adaptation): the base model is loaded in 4-bit precision and only small LoRA adapter weights are trained.

**Prerequisites:**
1.  **GPU:** Ensure you are connected to a GPU runtime (Runtime > Change runtime type > T4 GPU or better).
2.  **Hugging Face Token:** You need a token with access to Llama 3.1. Accept the license on the [model page](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) first.
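3.  **Data:** Upload your `all_training_data.json` file to the Colab Files section. The file must contain a JSON array of objects, each with an `input` and an `output` field, e.g. `[{"input": "...", "output": "..."}]`.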

In [None]:
# @title 1. Install Dependencies
# Quietly (-q) installs or upgrades (-U) the training stack used by the cells below.
# NOTE(review): no versions are pinned, so each run pulls the latest releases and
# the library APIs used later in this notebook may have changed — consider pinning.
!pip install -q -U torch transformers peft datasets bitsandbytes trl accelerate

In [None]:
# @title 2. Login to Hugging Face
from huggingface_hub import notebook_login
# Authenticate this session with Hugging Face. The token must belong to an
# account that has accepted the Llama 3.1 license (see the prerequisites).
notebook_login()

In [None]:
# @title 3. Load and Format Data
import json
from datasets import Dataset

# Load your JSON file
data_file = "all_training_data.json"  # Make sure this file is uploaded!

# Fail fast when the file is missing: continuing with an empty list only
# postpones the failure to a later, far less descriptive error.
try:
    with open(data_file, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: {data_file} not found. Please upload it to Colab files."
    ) from err

# The dataset is built from a list of records, and the formatting step reads
# the 'input' and 'output' keys of every record, so validate that shape here
# where the error message can point at the offending entry.
if not isinstance(raw_data, list) or not raw_data:
    raise ValueError(
        f"{data_file} must contain a non-empty JSON array of training examples."
    )

first_bad_index = next(
    (
        index
        for index, record in enumerate(raw_data)
        if not (isinstance(record, dict) and "input" in record and "output" in record)
    ),
    None,
)
if first_bad_index is not None:
    raise ValueError(
        f"Entry {first_bad_index} in {data_file} must be an object with "
        "'input' and 'output' keys."
    )

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(raw_data)

# Define formatting function for Llama 3
def format_chat_template(row):
    """Render one training record as a single-turn Llama 3 chat string.

    Args:
        row: Mapping with an ``input`` entry (the user message) and an
            ``output`` entry (the assistant reply).

    Returns:
        A dict with a single ``text`` key holding the user turn followed by
        the assistant turn, each terminated by ``<|eot_id|>``.

    Raises:
        KeyError: If ``row`` lacks ``input`` or ``output``.
    """
    # Layout:
    # <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>
    # <|start_header_id|>assistant<|end_header_id|>\n\n{output}<|eot_id|>
    # NOTE(review): the text already starts with <|begin_of_text|>; confirm the
    # tokenizer is not configured to prepend a second one when this is tokenized.
    user_turn = (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
    ).format(row["input"])
    assistant_turn = (
        "<|start_header_id|>assistant<|end_header_id|>\n\n{}<|eot_id|>"
    ).format(row["output"])
    return {"text": user_turn + assistant_turn}

# Add a 'text' column holding the chat-formatted version of every record.
dataset = dataset.map(format_chat_template)

# Guard the preview below: indexing an empty dataset would otherwise fail with
# a bare IndexError that says nothing about the real problem.
if len(dataset) == 0:
    raise ValueError(
        "The dataset is empty; check that the training data file was uploaded "
        "and contains at least one example."
    )

print("Sample formatted data:")
print(dataset[0]['text'])

In [None]:
# @title 4. Load Model (4-bit Quantization)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# bfloat16 needs an Ampere-class GPU or newer (compute capability >= 8.0).
# The T4 named in the prerequisites is compute capability 7.5, so fall back to
# float16 there instead of hard-coding bfloat16.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

# 4-bit NF4 quantization with double quantization (the QLoRA setup).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Avoid reusing EOS as the padding token: the language-modeling collator drops
# pad-token positions from the loss, so with pad == EOS the model is never
# trained to emit its end-of-turn token and tends not to stop generating.
# Llama 3.1 ships a dedicated padding token; use it when the vocabulary has it
# and keep the previous EOS fallback otherwise.
dedicated_pad_token = "<|finetune_right_pad_id|>"
if dedicated_pad_token in tokenizer.get_vocab():
    tokenizer.pad_token = dedicated_pad_token
else:
    tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" lets the library decide where to place the weights.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
# @title 5. Configure LoRA and Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from transformers import TrainingArguments

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA Config: adapters on every attention and MLP projection.
peft_config = LoraConfig(
    r=16,       # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

# Pick the mixed-precision mode from the hardware instead of hard-coding it:
# bf16 needs an Ampere-class GPU or newer (compute capability >= 8.0), while
# the T4 named in the prerequisites (7.5) has to use fp16.
bf16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

# Training Arguments
training_args = TrainingArguments(
    output_dir="./llama-3-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size = 2 * 4 = 8
    optim="paged_adamw_32bit",
    logging_steps=10,
    learning_rate=2e-4,
    fp16=not bf16_supported,
    bf16=bf16_supported,
    max_grad_norm=0.3,
    # Adjust based on dataset size: one epoch is roughly
    # total_examples / (per_device_train_batch_size * gradient_accumulation_steps).
    max_steps=100,
    # NOTE(review): the "constant" scheduler does not apply warmup, so
    # warmup_ratio appears to have no effect here; switch to
    # "constant_with_warmup" if warmup is intended.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)

# Trainer
# NOTE(review): newer TRL releases move dataset_text_field / max_seq_length to
# SFTConfig and rename `tokenizer`; confirm against the installed TRL version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=peft_config,
)

In [None]:
# @title 6. Start Training
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()

In [None]:
# @title 7. Save Model
# Write the trained model to a directory separate from the training output_dir.
trainer.save_model("./llama-3-finetuned-final")
print("Model saved successfully!")

In [None]:
# @title 8. Test Inference
from peft import PeftModel

# Load base model again (if needed, or reuse 'model' from memory)
# Ideally, for inference, we merge adapters or load them on top
# Here we just use the trained model in memory for a quick check

prompt = "Act: IDA | Section: 10 - Grounds for dissolution of marriage | Description: A wife petitions for divorce on grounds of cruelty."
# Same single-turn layout as the training data, left open at the assistant
# header so the model writes the reply.
formatted_prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

# Training leaves the model in train mode; switch to eval mode so LoRA dropout
# is disabled and generation is not randomly perturbed.
model.eval()

# Put the inputs on whatever device the model was placed on instead of
# hard-coding "cuda".
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=100, stop_strings=["<|eot_id|>"], tokenizer=tokenizer)
# The decoded text contains the prompt followed by the generated reply.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))