<a href="https://colab.research.google.com/github/mounikakarasu/thesis-peft-llm/blob/main/thesis_peft_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook 00: Environment Setup
# RUN THIS FIRST IN EVERY COLAB SESSION!

!pip install -q transformers datasets evaluate accelerate peft scikit-learn torch

import torch
import numpy as np
import random
from pathlib import Path

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Create necessary directories
Path('results').mkdir(exist_ok=True)
Path('kd_data').mkdir(exist_ok=True)

print('='*50)
print('Environment Setup Complete!')
print('='*50)
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
print('='*50)
print('You can now run the other notebooks!')
print('='*50)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Environment Setup Complete!
PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4
You can now run the other notebooks!


In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [3]:
from pathlib import Path
Path('results').mkdir(exist_ok=True)
Path('kd_data').mkdir(exist_ok=True)

In [4]:
# Notebook A: Teacher Baseline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch, numpy as np, json, time
from sklearn.metrics import accuracy_score, f1_score

# Use publicly available fine-tuned BERT-large SST-2 model
TEACHER_MODEL = 'yoshitomo-matsubara/bert-large-uncased-sst2'
dataset = load_dataset('glue', 'sst2')
tokenizer = AutoTokenizer.from_pretrained(TEACHER_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(TEACHER_MODEL)
if torch.cuda.is_available():
    model = model.cuda()

def tokenize(examples):
    return tokenizer(examples['sentence'], padding='max_length', truncation=True, max_length=96)

tokenized = dataset.map(tokenize, batched=True)

def metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {'accuracy': accuracy_score(labels, preds), 'f1': f1_score(labels, preds, average='binary')}

trainer = Trainer(model=model, args=TrainingArguments(output_dir='./t', per_device_eval_batch_size=16, fp16=True),
                  eval_dataset=tokenized['validation'], tokenizer=tokenizer, compute_metrics=metrics)

val_results = trainer.evaluate()
print(f"Accuracy: {val_results['eval_accuracy']:.4f}")

def gen_logits(model, tokenizer, examples, temp=2.0):
    model.eval()
    logits_list = []
    with torch.no_grad():
        for i in range(0, len(examples), 16):
            batch = examples[i:i+16]
            inputs = tokenizer(batch['sentence'], padding=True, truncation=True, max_length=96, return_tensors='pt')
            if torch.cuda.is_available():
                inputs = {k: v.cuda() for k, v in inputs.items()}
            outputs = model(**inputs)
            logits_list.extend((outputs.logits / temp).cpu().numpy().tolist())
    return logits_list

np.random.seed(42)

# Ensure directory exists
from pathlib import Path
Path('kd_data').mkdir(exist_ok=True)

idx1000 = np.random.choice(len(dataset['train']), 1000, replace=False)
data1000 = dataset['train'].select(idx1000.tolist())
logits1000 = gen_logits(model, tokenizer, data1000)

with open('kd_data/kd_1000.json', 'w') as f:
    json.dump({'sentence': list(data1000['sentence']), 'label': list(data1000['label']), 'teacher_logits': logits1000}, f)

idx500 = np.random.choice(len(dataset['train']), 500, replace=False)
data500 = dataset['train'].select(idx500.tolist())
logits500 = gen_logits(model, tokenizer, data500)

with open('kd_data/kd_500.json', 'w') as f:
    json.dump({'sentence': list(data500['sentence']), 'label': list(data500['label']), 'teacher_logits': logits500}, f)

with open('results/teacher_results.json', 'w') as f:
    json.dump({'method': 'Teacher', 'validation_accuracy': float(val_results['eval_accuracy']),
               'validation_f1': float(val_results['eval_f1']), 'trainable_params': sum(p.numel() for p in model.parameters()),
               'training_time_seconds': 0}, f, indent=2)

print('Complete!')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/304 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(model=model, args=TrainingArguments(output_dir='./t', per_device_eval_batch_size=16, fp16=True),


Accuracy: 0.9346
Complete!


In [5]:
# Notebook C1: DistilBERT Full FT
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np, json, time
from sklearn.metrics import accuracy_score, f1_score

dataset = load_dataset('glue', 'sst2')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

tokenized = dataset.map(lambda x: tokenizer(x['sentence'], padding='max_length', truncation=True, max_length=96), batched=True)

def metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {'accuracy': accuracy_score(labels, preds), 'f1': f1_score(labels, preds, average='binary')}

trainer = Trainer(model=model, args=TrainingArguments(output_dir='./dft', num_train_epochs=2, learning_rate=3e-5,
    per_device_train_batch_size=8, per_device_eval_batch_size=16, gradient_accumulation_steps=2, fp16=True,
    eval_strategy='epoch', save_strategy='no', seed=42), train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'], tokenizer=tokenizer, compute_metrics=metrics)

start = time.time()
trainer.train()
train_time = time.time() - start
val_results = trainer.evaluate()

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
with open('results/distilbert_ft_results.json', 'w') as f:
    json.dump({'method': 'DistilBERT FT', 'validation_accuracy': float(val_results['eval_accuracy']),
               'validation_f1': float(val_results['eval_f1']), 'trainable_params': trainable,
               'training_time_seconds': train_time, 'efficiency_acc_per_1M_params': float(val_results['eval_accuracy'])/(trainable/1e6),
               'efficiency_acc_per_minute': float(val_results['eval_accuracy'])/(train_time/60)}, f, indent=2)

print(f"Accuracy: {val_results['eval_accuracy']:.4f}, Time: {train_time/60:.2f} min")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(model=model, args=TrainingArguments(output_dir='./dft', num_train_epochs=2, learning_rate=3e-5,


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1786,0.288509,0.904817,0.906215
2,0.1219,0.333706,0.897936,0.90144


Accuracy: 0.8979, Time: 9.61 min


In [6]:
# Notebook C2: DistilBERT KD
# IMPORTANT: Run Teacher notebook (A) first to generate KD data!

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset, load_dataset
import torch, torch.nn.functional as F
import numpy as np, json, time
from sklearn.metrics import accuracy_score, f1_score
from pathlib import Path
import os

# Check if KD data exists
if not os.path.exists('kd_data/kd_1000.json'):
    raise FileNotFoundError("KD data not found! Please run Teacher notebook (A) first to generate kd_1000.json")

with open('kd_data/kd_1000.json', 'r') as f:
    kd_data = json.load(f)

kd_dataset = Dataset.from_dict(kd_data)
full_dataset = load_dataset('glue', 'sst2')

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

def tokenize(examples):
    # regular tokenization
    tok = tokenizer(
        examples['sentence'],
        padding='max_length',
        truncation=True,
        max_length=96
    )
    # ground-truth labels for CE loss
    tok['labels'] = examples['label']
    # KD logits (only present in kd_dataset, not in validation)
    if 'teacher_logits' in examples:
        tok['teacher_logits'] = examples['teacher_logits']
    return tok

tokenized_kd = kd_dataset.map(tokenize, batched=True)
tokenized_val = full_dataset['validation'].map(tokenize, batched=True)

# remove index column injected by HF map()
for col in ["idx"]:
    if col in tokenized_kd.column_names:
        tokenized_kd = tokenized_kd.remove_columns(col)
    if col in tokenized_val.column_names:
        tokenized_val = tokenized_val.remove_columns(col)

class KDTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):

        # labels always exist
        labels = inputs.pop("labels")

        # teacher logits exist only in KD train dataset, NOT in validation set
        teacher_logits = inputs.pop("teacher_logits", None)

        # forward pass
        outputs = model(**inputs)

        # If teacher_logits is missing → validation step → compute CE only
        if teacher_logits is None:
            ce_only = F.cross_entropy(outputs.logits, labels)
            return (ce_only, outputs) if return_outputs else ce_only

        # otherwise run KD
        soft_targets = F.softmax(teacher_logits / 2.0, dim=-1)
        soft_prob    = F.log_softmax(outputs.logits / 2.0, dim=-1)

        kd_loss = F.kl_div(soft_prob, soft_targets, reduction="batchmean") * 4.0
        ce_loss = F.cross_entropy(outputs.logits, labels)

        loss = 0.7 * kd_loss + 0.3 * ce_loss
        return (loss, outputs) if return_outputs else loss

def metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='binary')
    }

training_args = TrainingArguments(
    output_dir='./dkd',
    num_train_epochs=3,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    fp16=True,
    eval_strategy='epoch',   # correct arg name; same intent
    save_strategy='no',
    seed=42,
    remove_unused_columns=False    # <- CRITICAL: keep teacher_logits in inputs
)

trainer = KDTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_kd,
    eval_dataset=tokenized_val,
    compute_metrics=metrics,
    # NOTE: no tokenizer passed → default collator keeps all fields
)

start = time.time()
trainer.train()
train_time = time.time() - start
val_results = trainer.evaluate()

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

Path('results').mkdir(exist_ok=True)
with open('results/distilbert_kd_results.json', 'w') as f:
    json.dump(
        {
            'method': 'DistilBERT KD',
            'kd_samples': 1000,
            'validation_accuracy': float(val_results['eval_accuracy']),
            'validation_f1': float(val_results['eval_f1']),
            'trainable_params': trainable,
            'training_time_seconds': train_time,
            'efficiency_acc_per_1M_params': float(val_results['eval_accuracy']) / (trainable / 1e6),
            'efficiency_acc_per_minute': float(val_results['eval_accuracy']) / (train_time / 60)
        },
        f,
        indent=2
    )

print(f"Accuracy: {val_results['eval_accuracy']:.4f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.382407,0.838303,0.829916
2,No log,0.384941,0.841743,0.831296
3,No log,0.360117,0.856651,0.856816


Accuracy: 0.8567


In [7]:
# Notebook B: BERT-large + LoRA
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import numpy as np, json, time
from sklearn.metrics import accuracy_score, f1_score
from pathlib import Path

lora_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=32, lora_alpha=64, lora_dropout=0.1,
                         target_modules=['query', 'key', 'value'])

dataset = load_dataset('glue', 'sst2')
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
base_model = AutoModelForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

tokenized = dataset.map(lambda x: tokenizer(x['sentence'], padding='max_length', truncation=True, max_length=96), batched=True)

def metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {'accuracy': accuracy_score(labels, preds), 'f1': f1_score(labels, preds, average='binary')}

trainer = Trainer(model=model, args=TrainingArguments(output_dir='./lora', num_train_epochs=1, learning_rate=5e-4,
    per_device_train_batch_size=8, per_device_eval_batch_size=16, gradient_accumulation_steps=2,
    gradient_checkpointing=True, fp16=True, eval_strategy='epoch', save_strategy='no', seed=42),
    train_dataset=tokenized['train'], eval_dataset=tokenized['validation'], tokenizer=tokenizer, compute_metrics=metrics)

start = time.time()
trainer.train()
train_time = time.time() - start
val_results = trainer.evaluate()

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

Path('results').mkdir(exist_ok=True)
with open('results/bert_lora_results.json', 'w') as f:
    json.dump({'method': 'BERT-large LoRA', 'validation_accuracy': float(val_results['eval_accuracy']),
               'validation_f1': float(val_results['eval_f1']), 'trainable_params': trainable, 'training_time_seconds': train_time,
               'efficiency_acc_per_1M_params': float(val_results['eval_accuracy'])/(trainable/1e6),
               'efficiency_acc_per_minute': float(val_results['eval_accuracy'])/(train_time/60)}, f, indent=2)

print(f"Accuracy: {val_results['eval_accuracy']:.4f}, Time: {train_time/60:.2f} min")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,720,642 || all params: 339,864,580 || trainable%: 1.3890


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(model=model, args=TrainingArguments(output_dir='./lora', num_train_epochs=1, learning_rate=5e-4,


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.563,0.507067,0.813073,0.827513


Accuracy: 0.8131, Time: 8.94 min


In [8]:
# Notebook D: Hybrid LoRA + KD
# IMPORTANT: Run Teacher notebook (A) first to generate KD data!

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset, load_dataset
import torch, torch.nn.functional as F
import numpy as np, json, time
from sklearn.metrics import accuracy_score, f1_score
from pathlib import Path
import os

# --------- KD DATA CHECK ----------
if not os.path.exists('kd_data/kd_500.json'):
    raise FileNotFoundError("KD data not found! Please run Teacher notebook (A) first to generate kd_500.json")

# --------- LoRA CONFIG ----------
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    target_modules=["query", "key", "value"]
)

# --------- LOAD DATA ----------
with open('kd_data/kd_500.json', 'r') as f:
    kd_data = json.load(f)

kd_dataset = Dataset.from_dict(kd_data)
full_dataset = load_dataset("glue", "sst2")

# --------- TOKENIZER + MODEL ----------
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
base_model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=2)
model = get_peft_model(base_model, lora_config)

# allow training in last two encoder layers
for param in model.base_model.bert.encoder.layer[22].parameters():
    param.requires_grad = True
for param in model.base_model.bert.encoder.layer[23].parameters():
    param.requires_grad = True

# --------- TOKENIZATION ----------
def tokenize(examples):
    tok = tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=96
    )
    tok["labels"] = examples["label"]        # CRITICAL for CE loss
    if "teacher_logits" in examples:
        tok["teacher_logits"] = examples["teacher_logits"]
    return tok

tokenized_kd = kd_dataset.map(tokenize, batched=True)
tokenized_val = full_dataset["validation"].map(tokenize, batched=True)

# remove HF-added idx column if present
for col in ["idx"]:
    if col in tokenized_kd.column_names:
        tokenized_kd = tokenized_kd.remove_columns(col)
    if col in tokenized_val.column_names:
        tokenized_val = tokenized_val.remove_columns(col)

# --------- CUSTOM TRAINER ----------
class HybridTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):

        labels = inputs.pop("labels")

        # teacher logits → only in KD dataset, NOT validation set
        teacher_logits = inputs.pop("teacher_logits", None)

        outputs = model(**inputs)

        # validation forward: CE only
        if teacher_logits is None:
            ce_only = F.cross_entropy(outputs.logits, labels)
            return (ce_only, outputs) if return_outputs else ce_only

        # KD path
        soft_targets = F.softmax(teacher_logits / 2.0, dim=-1)
        soft_prob    = F.log_softmax(outputs.logits / 2.0, dim=-1)

        kd_loss = F.kl_div(soft_prob, soft_targets, reduction="batchmean") * 4.0
        ce_loss = F.cross_entropy(outputs.logits, labels)

        loss = 0.7 * kd_loss + 0.3 * ce_loss
        return (loss, outputs) if return_outputs else loss

# --------- TRAINING ARGUMENTS ----------
training_args = TrainingArguments(
    output_dir="./hybrid",
    num_train_epochs=3,
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="no",
    seed=42,
    remove_unused_columns=False    # CRITICAL so teacher_logits is not dropped
)

# --------- TRAINER ----------
trainer = HybridTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_kd,
    eval_dataset=tokenized_val,
    compute_metrics=lambda p: {
        "accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1)),
        "f1": f1_score(p.label_ids, np.argmax(p.predictions, axis=1), average="binary")
    }
)

# --------- TRAIN ----------
start = time.time()
trainer.train()
train_time = time.time() - start
val_results = trainer.evaluate()

# --------- METRICS SAVE ----------
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

Path("results").mkdir(exist_ok=True)
with open("results/hybrid_lora_kd_results.json", "w") as f:
    json.dump(
        {
            "method": "Hybrid",
            "kd_samples": 500,
            "validation_accuracy": float(val_results["eval_accuracy"]),
            "validation_f1": float(val_results["eval_f1"]),
            "trainable_params": trainable,
            "training_time_seconds": train_time,
            "efficiency_acc_per_1M_params": float(val_results["eval_accuracy"]) / (trainable / 1e6),
            "efficiency_acc_per_minute": float(val_results["eval_accuracy"]) / (train_time / 60)
        },
        f,
        indent=2
    )

print(f"Accuracy: {val_results['eval_accuracy']:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.680487,0.579128,0.681147
2,No log,0.680447,0.542431,0.684585
3,No log,0.681388,0.540138,0.684996




Accuracy: 0.5401


In [None]:
# Notebook D: Hybrid LoRA + KD
# IMPORTANT: Run Teacher notebook (A) first to generate KD data!

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset, load_dataset
import torch, torch.nn.functional as F
import numpy as np, json, time
from sklearn.metrics import accuracy_score, f1_score
from pathlib import Path
import os

# Check if KD data exists
if not os.path.exists('kd_data/kd_500.json'):
    raise FileNotFoundError("KD data not found! Please run Teacher notebook (A) first to generate kd_500.json")

lora_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=32, lora_alpha=64, lora_dropout=0.1, target_modules=['query', 'key', 'value'])

with open('kd_data/kd_500.json', 'r') as f:
    kd_data = json.load(f)

kd_dataset = Dataset.from_dict(kd_data)
full_dataset = load_dataset('glue', 'sst2')
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
base_model = AutoModelForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
model = get_peft_model(base_model, lora_config)

for param in model.base_model.bert.encoder.layer[22].parameters():
    param.requires_grad = True
for param in model.base_model.bert.encoder.layer[23].parameters():
    param.requires_grad = True

def tokenize(examples):
    tok = tokenizer(examples['sentence'], padding='max_length', truncation=True, max_length=96)
    if 'teacher_logits' in examples:
        tok['teacher_logits'] = examples['teacher_logits']
    return tok

tokenized_kd = kd_dataset.map(tokenize, batched=True)
tokenized_val = full_dataset['validation'].map(tokenize, batched=True)

class HybridTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        teacher_logits = inputs.pop('teacher_logits')
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        soft_targets = F.softmax(teacher_logits / 2.0, dim=-1)
        soft_prob = F.log_softmax(outputs.logits / 2.0, dim=-1)
        kd_loss = F.kl_div(soft_prob, soft_targets, reduction='batchmean') * 4.0
        ce_loss = F.cross_entropy(outputs.logits, labels)
        loss = 0.7 * kd_loss + 0.3 * ce_loss
        return (loss, outputs) if return_outputs else loss

def metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {'accuracy': accuracy_score(labels, preds), 'f1': f1_score(labels, preds, average='binary')}

trainer = HybridTrainer(model=model, args=TrainingArguments(output_dir='./hybrid', num_train_epochs=3, learning_rate=5e-4,
    per_device_train_batch_size=8, per_device_eval_batch_size=16, gradient_accumulation_steps=2, gradient_checkpointing=True,
    fp16=True, eval_strategy='epoch', save_strategy='no', seed=42), train_dataset=tokenized_kd,
    eval_dataset=tokenized_val, tokenizer=tokenizer, compute_metrics=metrics)

start = time.time()
trainer.train()
train_time = time.time() - start
val_results = trainer.evaluate()

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

Path('results').mkdir(exist_ok=True)
with open('results/hybrid_lora_kd_results.json', 'w') as f:
    json.dump({'method': 'Hybrid', 'kd_samples': 500, 'validation_accuracy': float(val_results['eval_accuracy']),
               'validation_f1': float(val_results['eval_f1']), 'trainable_params': trainable, 'training_time_seconds': train_time,
               'efficiency_acc_per_1M_params': float(val_results['eval_accuracy'])/(trainable/1e6),
               'efficiency_acc_per_minute': float(val_results['eval_accuracy'])/(train_time/60)}, f, indent=2)

print(f"Accuracy: {val_results['eval_accuracy']:.4f}")

In [9]:
# Results Analyzer
import json
import pandas as pd

files = {'Teacher (A)': 'results/teacher_results.json', 'DistilBERT FT (C1)': 'results/distilbert_ft_results.json',
         'DistilBERT KD (C2)': 'results/distilbert_kd_results.json', 'BERT-large LoRA (B)': 'results/bert_lora_results.json',
         'Hybrid (D)': 'results/hybrid_lora_kd_results.json'}

results = {}
for name, path in files.items():
    try:
        with open(path) as f:
            results[name] = json.load(f)
    except:
        print(f'Missing: {path}')

data = []
for name, d in results.items():
    data.append({'Method': name, 'Accuracy': f"{d.get('validation_accuracy',0):.4f}",
                 'F1': f"{d.get('validation_f1',0):.4f}", 'Params': f"{d.get('trainable_params',0):,}",
                 'Time(min)': f"{d.get('training_time_seconds',0)/60:.2f}",
                 'Acc/M': f"{d.get('efficiency_acc_per_1M_params',0):.4f}",
                 'Acc/Min': f"{d.get('efficiency_acc_per_minute',0):.4f}"})

df = pd.DataFrame(data)
print('='*100)
print('PIPELINE RESULTS')
print('='*100)
print(df.to_string(index=False))
print('='*100)

if 'Teacher (A)' in results and 'BERT-large LoRA (B)' in results:
    ta = results['Teacher (A)']['validation_accuracy']
    la = results['BERT-large LoRA (B)']['validation_accuracy']
    lp = results['BERT-large LoRA (B)']['trainable_params']
    tp = results['Teacher (A)']['trainable_params']
    print(f'Q1: Teacher {ta:.4f} vs LoRA {la:.4f} (Gap: {abs(ta-la):.4f}, Params: {100*lp/tp:.2f}%)')

if 'DistilBERT FT (C1)' in results and 'DistilBERT KD (C2)' in results:
    fa = results['DistilBERT FT (C1)']['validation_accuracy']
    ka = results['DistilBERT KD (C2)']['validation_accuracy']
    print(f'Q2: FT {fa:.4f} vs KD {ka:.4f} (KD achieves {100*ka/fa:.1f}% with 1.5% data)')

print('Complete!')

PIPELINE RESULTS
             Method Accuracy     F1      Params Time(min)  Acc/M Acc/Min
        Teacher (A)   0.9346 0.9356 335,143,938      0.00 0.0000  0.0000
 DistilBERT FT (C1)   0.8979 0.9014  66,955,010      9.61 0.0134  0.0934
 DistilBERT KD (C2)   0.8567 0.8568  66,955,010      0.26 0.0128  3.3129
BERT-large LoRA (B)   0.8131 0.8275   4,720,642      8.94 0.1722  0.0909
         Hybrid (D)   0.5401 0.6850  29,913,090      0.43 0.0181  1.2550
Q1: Teacher 0.9346 vs LoRA 0.8131 (Gap: 0.1216, Params: 1.41%)
Q2: FT 0.8979 vs KD 0.8567 (KD achieves 95.4% with 1.5% data)
Complete!
