In [9]:
# !pip install bitsandbytes accelerate transformers datasets peft
# !pip install --upgrade bitsandbytes accelerate transformers peft


In [28]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, TaskType, get_peft_model
import evaluate 

In [17]:
dataset = load_dataset("ag_news")

# "train" + "test" are the only splits by default
# We'll create a small validation set from "train"
split_ds = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_ds = split_ds["train"]
val_ds   = split_ds["test"]
test_ds  = dataset["test"]  # Official test

print("Train size:", len(train_ds))
print("Val size:", len(val_ds))
print("Test size:", len(test_ds))


Train size: 108000
Val size: 12000
Test size: 7600


In [18]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

train_ds = train_ds.map(tokenize_function, batched=True)
val_ds   = val_ds.map(tokenize_function, batched=True)
test_ds  = test_ds.map(tokenize_function, batched=True)

# Convert columns to torch tensors
train_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|██████████| 108000/108000 [00:56<00:00, 1897.70 examples/s]
Map: 100%|██████████| 12000/12000 [00:07<00:00, 1593.79 examples/s]
Map: 100%|██████████| 7600/7600 [00:04<00:00, 1614.01 examples/s]


In [19]:
base_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-large",
    num_labels=4
)

# Create LoRA config
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,            # LoRA rank
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.05
)

# Wrap base model with LoRA
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # see how many params are trainable

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,840,132 || all params: 357,203,976 || trainable%: 0.5151


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_

In [20]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable LoRA Params: {trainable_params}")


Trainable LoRA Params: 1840132


In [35]:
training_args = TrainingArguments(
    output_dir="./roberta_lora_logs",
    eval_strategy="steps",
    eval_steps=100,                  # Evaluate every 200 steps
    save_strategy="steps",           # Save at each epoch end
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=50,
    fp16=True,
    load_best_model_at_end=True
)

def compute_metrics(eval_preds):
    import numpy as np
    from datasets import load_metric
    metric = load_metric("accuracy")
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)


In [36]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss


ImportError: cannot import name 'load_metric' from 'datasets' (/home/ns6287/.local/lib/python3.9/site-packages/datasets/__init__.py)

In [None]:
metrics = trainer.evaluate(test_ds)
print("Test set metrics:", metrics)


In [None]:
trainer.save_model("./finetuned_roberta_large_lora")
tokenizer.save_pretrained("./finetuned_roberta_large_lora")

# If you want to save only LoRA adapter:
# model.save_pretrained("./lora_adapter_only")


Importing Model Later On

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig

# Step A: Load the PeftConfig to see which base model was used
peft_model_id = "finetuned_qLoRA_roberta_large"
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Step B: Load the base model in 8-bit (or 4-bit) as you wish
base_model = AutoModelForSequenceClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Step C: Attach the LoRA adapters
teacher_model = PeftModel.from_pretrained(base_model, peft_model_id)
teacher_model.eval()

# Step D: Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Confirm it’s on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
teacher_model.to(device)

print("Loaded the fine-tuned qLoRA model as teacher!")


In [None]:
# OPTIONAL

teacher_model = teacher_model.merge_and_unload()  # merges LoRA into base
teacher_model.save_pretrained("finetuned_roberta_large_merged")
# Now it’s a single set of weights