<a href="https://colab.research.google.com/github/omkarwazulkar/GoogleColab/blob/main/Llama_3.2_1B_QLoRA_Finetuning_AlpacaDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    device_map="cuda"
)

print("Baseline Model Loaded On GPU")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [3]:
from datasets import load_dataset

dataset = load_dataset("tatsu-lab/alpaca", split="train")
print(dataset[0])

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


In [None]:
def build_prompt(example):
    if example["input"]:
        return f"Instruction: {example['instruction']}\nInput: {example['input']}\n\nAnswer:"
    else:
        return f"Instruction: {example['instruction']}\n\nAnswer:"

N = 2

for i in range(N):
    sample = dataset[i]

    prompt = build_prompt(sample)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    output_ids = model.generate(
        **inputs,
        max_new_tokens=40,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

    model_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"\n━━━━━━━━ SAMPLE {i} ━━━━━━━━")
    print("📌 Instruction:")
    print(sample["instruction"])

    if sample["input"]:
        print("\n📎 Input:")
        print(sample["input"])

    print("\n🟢 Original Dataset Answer:")
    print(sample["output"])

    print("\n🔵 Baseline Model Output:")
    print(model_output)

    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━\n")

In [1]:
!pip install transformers peft bitsandbytes accelerate datasets trl

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "meta-llama/Llama-3.2-1B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    dtype=torch.bfloat16,
    device_map="auto"
)

print("Loaded Quantized 4-bit LLaMA Into GPU")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Loaded Quantized 4-bit LLaMA Into GPU


In [3]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM"
)

lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


In [4]:
from datasets import load_dataset

dataset = load_dataset("tatsu-lab/alpaca", split="train")

# Take First 10,000 Examples
small_dataset = dataset.select(range(10000))

def convert(example):
    if example["input"]:
        return {"text": f"Instruction: {example['instruction']}\nInput: {example['input']}\n\nAnswer: {example['output']}"}
    else:
        return {"text": f"Instruction: {example['instruction']}\n\nAnswer: {example['output']}"}

train_dataset = small_dataset.map(convert, remove_columns=small_dataset.column_names)
print(train_dataset[0]["text"])
print("Dataset Size:", len(train_dataset))

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Instruction: Give three tips for staying healthy.

Answer: 1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 
2. Exercise regularly to keep your body active and strong. 
3. Get enough sleep and maintain a consistent sleep schedule.
Dataset Size: 10000


In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments

output_dir="llama-qlora-fast"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,         # keep memory safe
    gradient_accumulation_steps=8,         # effective batch = 16
    learning_rate=3e-4,                    # slightly higher = faster learning
    num_train_epochs=1,                    # 1 epoch is enough for fine-tune
    logging_steps=50,

    save_strategy="steps",                 # <-- REQUIRED
    save_steps=500,                        # <-- save every 500 steps
    save_total_limit=4,                    # keeps 4 most recent checkpoints

    optim="paged_adamw_8bit",
    bf16=True,
    report_to="none"
)

trainer = SFTTrainer(
    model=lora_model,
    train_dataset=train_dataset,
    args=training_args,
)

trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("Training Complete & Model Saved.")

Adding EOS to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Step,Training Loss
50,1.7933
100,1.7054
150,1.7038
200,1.6329
250,1.5667
300,1.5261
350,1.5251
400,1.5336
450,1.4977
500,1.5098


Training Complete & Model Saved.


In [7]:
trainer.save_model("LLama3.2-1B-QLoRA")           # save LoRA adapter weights
tokenizer.save_pretrained("LLama3.2-1B-QLoRA")    # save tokenizer
print("Model saved successfully as 'LLama3.2-1B-QLoRA'")

Model saved successfully as 'LLama3.2-1B-QLoRA'


In [8]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `HFT` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenti

In [10]:
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_id="omkarwazulkar/LLama3.2-1B-QLoRA", private=False)

RepoUrl('https://huggingface.co/omkarwazulkar/LLama3.2-1B-QLoRA', endpoint='https://huggingface.co', repo_type='model', repo_id='omkarwazulkar/LLama3.2-1B-QLoRA')

In [11]:
from huggingface_hub import upload_folder

upload_folder(
    folder_path="LLama3.2-1B-QLoRA",          # your saved folder
    repo_id="omkarwazulkar/LLama3.2-1B-QLoRA",     # must match above
    repo_type="model"
)

print("Model Uploaded Successfully!")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...2-1B-QLoRA/tokenizer.json:   2%|1         |  295kB / 17.2MB            

  ...adapter_model.safetensors:   3%|3         |  421kB / 13.6MB            

  ...B-QLoRA/training_args.bin:   3%|3         |   192B / 6.22kB            

Model Uploaded Successfully!


In [None]:
from peft import PeftModel

lora_model = PeftModel.from_pretrained(model, "omkarwazulkar/LLama3.2-1B-QLoRA")   # <-- OR "llama-qlora-fast/checkpoint-625"
lora_model.eval()

print("\n🟣 LoRA adapter attached successfully!\n")

In [None]:
def generate_lora(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = lora_model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
from datasets import load_dataset

dataset = load_dataset("tatsu-lab/alpaca", split="train")

for i in range(2):
    ex = dataset[i]
    prompt = f"Instruction: {ex['instruction']}\n\nAnswer:" if ex["input"]=="" else \
             f"Instruction: {ex['instruction']}\nInput: {ex['input']}\n\nAnswer:"

    print(f"\n========== SAMPLE {i} ==========")
    print("🟢 Original:", ex["output"])

    print("\n🟣 LoRA Output:")
    print(generate_lora(prompt))
    print("===============================\n")