In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
pip install transformers accelerate


Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

# =====================
# LoRA Layer Definition
# =====================
class LoRALinear(nn.Module):
    def __init__(self, original, r=8, alpha=16):
        super().__init__()
        self.original = original
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r

        self.lora_A = nn.Parameter(torch.randn(r, original.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.randn(original.out_features, r) * 0.01)

    def forward(self, x):
        return self.original(x) + ((x @ self.lora_A.T) @ self.lora_B.T) * self.scaling


In [3]:
# LoRA Injection into Attention Layers

def apply_lora(model, target_modules=("q_proj", "v_proj"), r=8, alpha=16):
    for name, module in model.named_modules():
        if name.endswith(target_modules):
            parts = name.split(".")
            parent = model
            for p in parts[:-1]:
                if p.isdigit():
                    parent = parent[int(p)]
                else:
                    parent = getattr(parent, p)
            layer_name = parts[-1]
            original = getattr(parent, layer_name)
            if isinstance(original, nn.Linear):
                setattr(parent, layer_name, LoRALinear(original, r=r, alpha=alpha))   #REPLACES THE ORIGINAL LAYER WITH LoRA LAYER


In [4]:
# ==========================
# Count Parameters Utilities
# ==========================
def print_trainable_params(model, label=""):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)  #includes parameters that are being updated during training
    percent = 100 * trainable / total   #WHAT PERCENT OF TOTAL PARAMETERS ARE TRAINABLE
    print(f"{label} Model - Total: {total:,}, Trainable: {trainable:,} ({percent:.4f}%)")
    return total, trainable


In [5]:
# ======================
# Dataset + Tokenization
# ======================
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
dataset = load_dataset("tiny_shakespeare")["train"]
print(f"Dataset size: {len(dataset)}")
dataset = dataset.select(range(min(1000, len(dataset))))


def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize)

def collate_fn(batch):
    input_ids = torch.tensor([b["input_ids"] for b in batch])
    return {"input_ids": input_ids, "labels": input_ids}

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn) 

Dataset size: 1


In [6]:
# ================
# Evaluation Logic
# ================
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    loss_fn = nn.CrossEntropyLoss()
    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            logits = outputs.logits
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = inputs["labels"][..., 1:].contiguous()

            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            total_loss += loss.item()

            preds = shift_logits.argmax(dim=-1)
            mask = shift_labels != -100
            correct += (preds == shift_labels).masked_select(mask).sum().item()
            total += mask.sum().item()

    return total_loss / len(dataloader), correct / total


In [7]:
# ====================
# Load Model Function
# ====================
def get_model():
    model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    return model

# ================
# Main Training Run
# ================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Evaluate base model ----
print("\n🔍 Evaluating Base Model")
base_model1 = get_model().to(device)
print_trainable_params(base_model1, label="Base")
base_loss, base_acc = evaluate(base_model1, dataloader, device)
print(f"✅ Base Model -> Loss: {base_loss:.4f}, Accuracy: {base_acc:.4f}")



🔍 Evaluating Base Model


2025-06-16 10:08:28.278008: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750068508.536516     160 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750068508.615366     160 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Base Model - Total: 1,100,048,384, Trainable: 1,100,048,384 (100.0000%)
✅ Base Model -> Loss: 2.4788, Accuracy: 0.5276


In [8]:
# ---- Apply LoRA ----
print("\n🧩 Applying LoRA and Fine-Tuning")
lora_model = get_model().to(device)
apply_lora(lora_model, r=8, alpha=16)

# Freeze base model
for param in lora_model.parameters():
    param.requires_grad = False
for module in lora_model.modules():
    if isinstance(module, LoRALinear):
        module.lora_A.requires_grad = True
        module.lora_B.requires_grad = True

print_trainable_params(lora_model, label="LoRA")

# ---- Train LoRA ----
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, lora_model.parameters()), lr=5e-4)

lora_model.train()
for epoch in range(3):
    print(f"\n🚀 Epoch {epoch+1}")
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = lora_model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()



🧩 Applying LoRA and Fine-Tuning
LoRA Model - Total: 1,101,174,784, Trainable: 1,126,400 (0.1023%)

🚀 Epoch 1


100%|██████████| 1/1 [00:06<00:00,  6.14s/it]



🚀 Epoch 2


100%|██████████| 1/1 [00:05<00:00,  5.32s/it]



🚀 Epoch 3


100%|██████████| 1/1 [00:05<00:00,  5.13s/it]


In [9]:
# ---- Evaluate LoRA Model ----
print("\n🔍 Evaluating LoRA Model")
lora_loss, lora_acc = evaluate(lora_model, dataloader, device)
print(f"✅ LoRA Model -> Loss: {lora_loss:.4f}, Accuracy: {lora_acc:.4f}")

# ---- Comparison ----
print("\n📊 === Final Comparison ===")
print(f"Base Model: Loss = {base_loss:.4f}, Accuracy = {base_acc:.4f}")
print(f"LoRA Model: Loss = {lora_loss:.4f}, Accuracy = {lora_acc:.4f}")


🔍 Evaluating LoRA Model
✅ LoRA Model -> Loss: 1.9137, Accuracy: 0.6614

📊 === Final Comparison ===
Base Model: Loss = 2.4788, Accuracy = 0.5276
LoRA Model: Loss = 1.9137, Accuracy = 0.6614


**PEFT**

In [10]:
pip install peft

Note: you may need to restart the kernel to use updated packages.


In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
import torch


In [12]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [13]:
# Load Base Model
# ====================
import copy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model2 = AutoModelForCausalLM.from_pretrained(model_name).to(device)
base_model_copy2 = copy.deepcopy(base_model2)

print("\n🔍 Evaluating Base Model")
print_trainable_params(base_model_copy2, label="Base")
base_loss2, base_acc2 = evaluate(base_model_copy2, dataloader, device)
print(f"✅ Base Model -> Loss: {base_loss2:.4f}, Accuracy: {base_acc2:.4f}")



🔍 Evaluating Base Model
Base Model - Total: 1,100,048,384, Trainable: 1,100,048,384 (100.0000%)
✅ Base Model -> Loss: 2.4788, Accuracy: 0.5276


In [14]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # depends on model architecture
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM  # since this is a language model
)


In [21]:
LoraConfig?

[0;31mInit signature:[0m
[0mLoraConfig[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtask_type[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mpeft[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mpeft_types[0m[0;34m.[0m[0mTaskType[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpeft_type[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mpeft[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mpeft_types[0m[0;34m.[0m[0mPeftType[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mauto_mapping[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mdict[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbase_model_name_or_path[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrevision[0m[0;34m:[0m [0mO

WRAPPING THE MODEL WITH PEFT

In [15]:
from peft import prepare_model_for_kbit_training  # if using 8-bit or 4-bit, optional
# model = prepare_model_for_kbit_training(model)  # Only if using quantized model

peft_model = get_peft_model(base_model2, lora_config)


peft_model = peft_model.to(device)
print("\n🧩 LoRA-Injected Model")
print_trainable_params(peft_model, label="LoRA")


🧩 LoRA-Injected Model
LoRA Model - Total: 1,101,174,784, Trainable: 1,126,400 (0.1023%)


(1101174784, 1126400)

In [17]:
# Training Loop
# ====================
from torch.optim import AdamW
peft_model.train()
optimizer = AdamW(peft_model.parameters(), lr=1e-4)

for epoch in range(3):
    total_loss = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = peft_model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"📚 Epoch {epoch+1} -> Training Loss: {avg_loss:.4f}")


📚 Epoch 1 -> Training Loss: 2.4788
📚 Epoch 2 -> Training Loss: 2.4510
📚 Epoch 3 -> Training Loss: 2.4219


In [18]:
# Evaluation After Training
# ====================
print("\nEvaluating Fine-Tuned LoRA Model")
final_loss, final_acc = evaluate(peft_model, dataloader, device)
print(f"Base Model: Loss = {base_loss2:.4f}, Accuracy = {base_acc2:.4f}")
print(f" LoRA Model -> Loss: {final_loss:.4f}, Accuracy: {final_acc:.4f}")


Evaluating Fine-Tuned LoRA Model
Base Model: Loss = 2.4788, Accuracy = 0.5276
 LoRA Model -> Loss: 2.3910, Accuracy: 0.5591


In [20]:
print_trainable_params(base_model1, label="Base Model")
print_trainable_params(lora_model, label="Manual LoRA Model")
print_trainable_params(peft_model, label="PEFT LoRA Model")


Base Model Model - Total: 1,100,048,384, Trainable: 1,100,048,384 (100.0000%)
Manual LoRA Model Model - Total: 1,101,174,784, Trainable: 1,126,400 (0.1023%)
PEFT LoRA Model Model - Total: 1,101,174,784, Trainable: 1,126,400 (0.1023%)


(1101174784, 1126400)