In [23]:
import argparse
import torch
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    PretrainedConfig,
    default_data_collator,
    TrainingArguments,
    Trainer,
    EvalPrediction,
    AutoConfig,
)
import logging
from peft import PeftModel, AutoPeftModelForSequenceClassification
from torch.utils.data import DataLoader
import random
from Xlora.xlora import add_xlora_to_model
from Xlora.xlora_config import xLoRAConfig
from Xlora.xlora_utils import load_model
from sklearn.metrics import accuracy_score, f1_score

In [19]:
# @TODO: fill in the test args
task_name = 'mrpc'
max_seq_length = 512
padding = "max_length"
model_name = "roberta-base"
lora_model_dir = "./xlora_finetuned_model_mrpc" 

In [9]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scipy>=1.6.0
  Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.6.0
[0mNote: yo

In [10]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Determine if this is a regression task (e.g. stsb)
is_regression = task_name == "stsb"

# Load the dataset and evaluation metric
datasets = load_dataset("glue", task_name)
if not is_regression:
    label_list = datasets["train"].features["label"].names
    num_labels = len(label_list)
else:
    num_labels = 1
metric = evaluate.load("glue", task_name)

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [11]:
    # Load base model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
max_seq_length = min(max_seq_length, tokenizer.model_max_length)
sentence1_key, sentence2_key = task_to_keys[task_name]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    if task_name is not None:
        result = metric.compute(predictions=preds, references=p.label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result
    elif is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}

In [20]:
# Handle label mapping if needed
logger = logging.getLogger(__name__)
label_to_id = None
if (
    model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
    and task_name is not None
    and not is_regression
):
    label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
    if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
        label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
    else:
        logger.warning(
            "Model labels do not match dataset labels. "
            f"Model labels: {list(sorted(label_name_to_id.keys()))}, Dataset labels: {list(sorted(label_list))}."
            " Ignoring model labels."
        )
elif task_name is None and not is_regression:
    label_to_id = {v: i for i, v in enumerate(label_list)}

# Preprocessing function for tokenization
def preprocess_function(examples):
    tokens = (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
    result = tokenizer(*tokens, padding=padding, max_length=max_seq_length, truncation=True)
    if label_to_id is not None and "label" in examples:
        result["label"] = [label_to_id[l] if l != -1 else -1 for l in examples["label"]]
    return result

In [21]:
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=True)
train_dataset = datasets["train"]
eval_dataset = datasets["validation_matched" if task_name == "mnli" else "validation"]
test_dataset = datasets["test_matched" if task_name == "mnli" else "test"]
data_collator = default_data_collator

tasks = [task_name]
eval_datasets = [eval_dataset]
if task_name == "mnli":
    tasks.append("mnli-mm")
    eval_datasets.append(datasets["validation_mismatched"])

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [22]:
XLoRA_model_name = lora_model_dir
XLoRA_model, tokenizer = load_model(
    model_name=XLoRA_model_name,
    device="cuda:0",
    dtype=torch.bfloat16,
    load_xlora=True,
    adapters={
        "adapter_1": "./lora_finetuned_model_cola",
        "adapter_2": "./lora_finetuned_model_mrpc",
        "adapter_3": "./lora_finetuned_model_qnli",
        "adapter_4": "./lora_finetuned_model_sst2",
    },
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
trainer = Trainer(
    model=XLoRA_model,
    args=TrainingArguments(
        output_dir="./peft_test_results",
        report_to="none",
    ),
    data_collator=data_collator,           
    compute_metrics=compute_metrics
)
for dataset, task in zip(eval_datasets, tasks):
    print(f"Evaluating task: {task}")
    # Use trainer.predict to get model outputs and labels
    eval_output = trainer.predict(dataset)
    # If the model output is a tuple, we assume logits are in index 1.
    logits = eval_output.predictions[1] if isinstance(eval_output.predictions, tuple) else eval_output.predictions
    # For classification, take argmax over logits to get predicted class indices.
    preds = np.argmax(logits, axis=1)
    labels = dataset["label"]
    # print(f"Predictions: {preds}")  
    # print(f"Labels: {labels}")

    # Compute accuracy and F1 score manually using sklearn
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    combined_score = (acc + f1) / 2
    print(f"Metrics for task {task}:")
    print({"accuracy": acc, "f1": f1, "combined_score": combined_score})
    print("results for xlora eval")

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Evaluating task: mrpc


Metrics for task mrpc:
{'accuracy': 0.15441176470588236, 'f1': 0.14814299287698424, 'combined_score': 0.1512773787914333}
results for xlora eval
