# LoRA RoBERTa Fine-tuning and Evaluation on SuperGLUE Tasks

## Install Necessary Libraries

In [None]:
!pip install transformers datasets peft accelerate bitsandbytes evaluate



## Imports & Setup

In [None]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import load_dataset
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from transformers import BitsAndBytesConfig, TrainerCallback
from evaluate import load
from accelerate import Accelerator
import torch
import numpy as np
import logging
import time
import os

# Opt in to the "expandable_segments" mode of PyTorch's CUDA allocator.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

# Which SuperGLUE task to fine-tune and evaluate on. Edit this single value to
# switch tasks; the supported names are:
#   'boolq', 'cb', 'copa', 'multirc', 'rte', 'wic', 'wsc', 'wsc.fixed'
TASK = "rte"

## Specify Model

In [None]:
model_name = "roberta-base"  # base checkpoint pulled from the Hugging Face Hub
new_model = f"roberta-{TASK}-finetune"  # name the fine-tuned adapter is saved under

## LoRA Parameters

In [None]:
# LoRA hyperparameters (handed to LoraConfig when the adapter is built)
lora_r = 64         # LoRA attention dimension (rank)
lora_alpha = 16     # alpha parameter for LoRA scaling
lora_dropout = 0.1  # dropout probability for the LoRA layers

## BitsAndBytes Parameters

In [None]:
# bitsandbytes 4-bit quantization settings.
# NOTE(review): nothing in the visible notebook passes these values to a
# BitsAndBytesConfig or to the model loader -- confirm whether they are still
# needed.
use_4bit = True                     # activate 4-bit precision base model loading
bnb_4bit_compute_dtype = "float16"  # compute dtype for 4-bit base models
bnb_4bit_quant_type = "nf4"         # quantization type: "fp4" or "nf4"
use_nested_quant = False            # nested (double) quantization for 4-bit models

## TrainingArguments Parameters

In [None]:
# --- Output ---------------------------------------------------------------
output_dir = f"./results_{TASK}"  # model predictions and checkpoints go here

# --- Schedule -------------------------------------------------------------
num_train_epochs = 20
learning_rate = 1e-5
lr_scheduler_type = "cosine"

# --- Precision (enable at most one of fp16 / bf16) --------------------------
fp16 = False
bf16 = True

# --- Batching -------------------------------------------------------------
per_device_train_batch_size = 4   # per-GPU batch size while training
per_device_eval_batch_size = 4    # per-GPU batch size while evaluating
gradient_accumulation_steps = 2   # update steps to accumulate gradients over
group_by_length = True            # batch together sequences of similar length

# --- Optimisation ---------------------------------------------------------
optim = "adamw_torch"
weight_decay = 0.01
max_grad_norm = 1                 # maximum gradient norm (clipping threshold)
gradient_checkpointing = False

# --- Bookkeeping ----------------------------------------------------------
save_steps = 500                  # save a checkpoint every N update steps
logging_steps = 25                # log every N update steps

## Load SuperGLUE Dataset and Preprocess

In [None]:
# Per-task settings: number of classes, the text columns the task uses, and
# (where defined) the label-name -> id mapping.
task_configs = {
    'boolq': {
        'num_labels': 2,
        'columns': ['passage', 'question'],
        'label2id': {'False': 0, 'True': 1},
    },
    'cb': {
        'num_labels': 3,
        'columns': ['premise', 'hypothesis'],
        'label2id': {'entailment': 0, 'contradiction': 1, 'neutral': 2},
    },
    'copa': {
        'num_labels': 2,
        'columns': ['premise', 'choice1', 'choice2'],
        'special': True,
    },
    'multirc': {
        'num_labels': 2,
        'columns': ['paragraph', 'question', 'answer'],
        'label2id': {'False': 0, 'True': 1},
    },
    'rte': {
        'num_labels': 2,
        'columns': ['premise', 'hypothesis'],
        'label2id': {'not_entailment': 0, 'entailment': 1},
    },
    'wic': {
        'num_labels': 2,
        'columns': ['sentence1', 'sentence2', 'word'],
        'label2id': {'False': 0, 'True': 1},
    },
    'wsc': {
        'num_labels': 2,
        'columns': ['text', 'span1_text', 'span2_text'],
        'label2id': {'False': 0, 'True': 1},
    },
    'wsc.fixed': {
        'num_labels': 2,
        'columns': ['text', 'span1_text', 'span2_text'],
        'label2id': {'False': 0, 'True': 1},
    },
}

# Look up the selected task and fail fast on an unknown name
task_config = task_configs.get(TASK)
if not task_config:
    raise ValueError(f"Task {TASK} not supported. Choose from: {list(task_configs.keys())}")

# SuperGLUE data for the selected task
dataset = load_dataset(path="super_glue", name=TASK)

# Tokenizer matching the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Register a pad token only when the tokenizer ships without one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Collator that pads each batch using the tokenizer above
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Data preprocessing function based on task type
def get_tokenize_function(task, config, max_length=512):
    """Return the batched tokenization callback for a SuperGLUE task.

    The returned callable takes a batch of examples (a mapping from column
    name to a list of values, as supplied by ``dataset.map(..., batched=True)``)
    and returns the tokenizer output with an added ``labels`` entry.

    Sequences are truncated to ``max_length`` but not padded here: padding is
    deferred to batch time (the notebook builds a ``DataCollatorWithPadding``),
    so short examples are not inflated to the maximum length.

    Args:
        task: SuperGLUE task name (e.g. ``'rte'``, ``'copa'``, ``'wic'``).
        config: Task configuration; its ``'columns'`` entry names the text
            columns used by the generic one-/two-column path.
        max_length: Maximum number of tokens kept per example.

    Returns:
        A function mapping a batch of examples to its encoding.

    Raises:
        ValueError: Raised by the returned callable when a generic task lists
            neither one nor two columns.
    """

    def encode(first, second=None):
        # Single entry point for the module-level tokenizer so every task
        # shares the same truncation settings.
        if second is None:
            return tokenizer(first, truncation=True, max_length=max_length)
        return tokenizer(first, second, truncation=True, max_length=max_length)

    def with_labels(encoded, examples):
        # Copy the dataset's own labels next to the encoded inputs.
        encoded["labels"] = list(examples["label"])
        return encoded

    if task == 'copa':
        def tokenize_copa(examples):
            # Present BOTH alternatives to the model and keep the dataset
            # label (0 -> choice1, 1 -> choice2) as the target. Encoding only
            # the correct alternative with a constant label would leak the
            # answer and leave a single class to learn.
            contexts = [
                f"{premise} {'because' if question == 'cause' else 'so'}"
                for premise, question in zip(examples['premise'], examples['question'])
            ]
            separator = tokenizer.sep_token
            alternatives = [
                f"{first_choice} {separator} {second_choice}"
                for first_choice, second_choice in zip(examples['choice1'], examples['choice2'])
            ]
            return with_labels(encode(contexts, alternatives), examples)

        return tokenize_copa

    if task == 'multirc':
        def tokenize_multirc(examples):
            # Encode (paragraph, question + answer) as a pair so truncation
            # shortens the long paragraph instead of dropping the question
            # and answer at the end of one concatenated string.
            question_answers = [
                f"{question} {answer}"
                for question, answer in zip(examples['question'], examples['answer'])
            ]
            return with_labels(encode(examples['paragraph'], question_answers), examples)

        return tokenize_multirc

    if task == 'wic':
        def tokenize_wic(examples):
            # Use the tokenizer's own separator token; a hard-coded "[SEP]"
            # is only a special token for vocabularies that define it.
            separator = tokenizer.sep_token
            inputs = [
                f"{s1} {separator} {s2} {separator} {word}"
                for s1, s2, word in zip(
                    examples['sentence1'], examples['sentence2'], examples['word']
                )
            ]
            return with_labels(encode(inputs), examples)

        return tokenize_wic

    if task in ('wsc', 'wsc.fixed'):
        def tokenize_wsc(examples):
            # The sentence followed by the two candidate spans, separated by
            # the tokenizer's real separator token.
            separator = tokenizer.sep_token
            inputs = [
                f"{text} {separator} First span: {span1} {separator} Second span: {span2}"
                for text, span1, span2 in zip(
                    examples['text'], examples['span1_text'], examples['span2_text']
                )
            ]
            return with_labels(encode(inputs), examples)

        return tokenize_wsc

    def tokenize_default(examples):
        # Generic path: one text column, or two columns encoded as a pair.
        columns = config['columns']
        if len(columns) not in (1, 2):
            raise ValueError(f"Unsupported number of columns for task {task}")
        return with_labels(encode(*(examples[name] for name in columns)), examples)

    return tokenize_default

# Build the task-specific tokenization callback, then run it over the dataset
# in batches.
tokenize_function = get_tokenize_function(TASK, task_config)
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

## Load Evaluation Metric

In [None]:
# CB is scored with F1; every other task uses plain accuracy.
metric_name = 'f1' if TASK == 'cb' else 'accuracy'

metric = load(metric_name)

# Function to compute metrics
def compute_metrics(pred):
    """Turn a (predictions, labels) pair into the metric dict for ``TASK``.

    ``pred`` is unpacked into per-example scores and reference labels; the
    predicted class is the argmax over axis 1 (assumes the scores are a 2-D
    array of per-class values -- TODO confirm against the caller).

    Returns the result of the loaded ``metric`` (weighted average for CB), or
    a hand-computed ``{'accuracy': ...}`` dict for COPA.
    """
    scores, labels = pred
    predicted_classes = scores.argmax(axis=1)

    # COPA bypasses the loaded metric and reports the mean match rate directly
    if TASK == 'copa':
        return {'accuracy': (predicted_classes == labels).mean()}

    # CB additionally asks the metric for a weighted average
    extra_kwargs = {'average': 'weighted'} if TASK == 'cb' else {}
    return metric.compute(predictions=predicted_classes, references=labels, **extra_kwargs)

## Load Model & Configure LoRA

In [None]:
# Number of output classes for the selected task (binary when unspecified)
num_labels = task_config.get('num_labels', 2)

# LoRA adapter definition: sequence classification, adapters injected into the
# attention query/key/value projections
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=["query", "key", "value"]
)

# Load the base model with a classification head sized for the task
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    trust_remote_code=True,
)

# Attach human-readable label names when the task defines them
if 'label2id' in task_config:
    label2id = task_config['label2id']
    id2label = {v: k for k, v in label2id.items()}

    model.config.id2label = id2label
    model.config.label2id = label2id

# Keep the model's padding id in sync with the tokenizer's actual pad token.
# (Pointing it at the EOS id would mark a different, non-padding token as
# padding in the model config.)
model.config.pad_token_id = tokenizer.pad_token_id

# Make the embedding matrix match the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

# Wrap the base model with the LoRA adapters
lora_model = get_peft_model(model, peft_config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Initialize Accelerator

In [None]:
# Derive the mixed-precision mode from the same bf16/fp16 flags that are
# handed to TrainingArguments, so the two settings cannot drift apart.
if bf16:
    mixed_precision = "bf16"
elif fp16:
    mixed_precision = "fp16"
else:
    mixed_precision = "no"

# Initialize the accelerator
accelerator = Accelerator(mixed_precision=mixed_precision)

# Prepare the model
lora_model = accelerator.prepare(lora_model)

# Dataset objects are used as-is: Accelerator.prepare only transforms models,
# optimizers, schedulers and dataloaders, and returns other objects unchanged.
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["validation"]

## Training / Fine-tuning

In [None]:
# Release cached, currently unused CUDA memory before training starts
torch.cuda.empty_cache()

# Training arguments. Every tunable comes from the hyperparameter cell,
# including logging_steps and group_by_length, so editing that cell is enough
# to change the run.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    group_by_length=group_by_length,
    eval_strategy="epoch",
    logging_dir=f"./logs_{TASK}",
    lr_scheduler_type=lr_scheduler_type,
    report_to="none",
    load_best_model_at_end=True,
    # Checkpoints are written once per epoch, matching eval_strategy
    save_strategy="epoch"
)

# Emit INFO-level log records while training
logging.basicConfig(level=logging.INFO)

# Assemble the Trainer from the LoRA model, the prepared splits, the padding
# collator and the metric callback
trainer = Trainer(
    args=training_args,
    model=lora_model,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Time the run with a monotonic clock so system clock adjustments cannot
# distort the measured duration
start_time = time.perf_counter()

# Train the model
trainer.train()

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Training Time: {elapsed_time:.2f} seconds")

# Peak GPU memory allocated by PyTorch, reported in GB (bytes / 1e9)
if torch.cuda.is_available():
    memory_used = torch.cuda.max_memory_allocated() / 1e9
    print(f"Maximum Memory Used: {memory_used:.2f} GB")
else:
    print("GPU not available. Memory usage not tracked.")

# Count only the parameters that still require gradients
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total Trainable Parameters: {trainable_params:,}")

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3291,0.691829,0.527076
2,0.3486,0.693211,0.498195
3,0.3535,0.692252,0.563177
4,0.3418,0.694466,0.472924
5,0.3545,0.691533,0.584838
6,0.3545,0.69269,0.487365
7,0.3262,0.689982,0.541516
8,0.3193,0.689714,0.595668
9,0.3506,0.690419,0.534296
10,0.3311,0.689108,0.570397


Training Time: 772.27 seconds
Maximum Memory Used: 2.92 GB
Total Trainable Parameters: 4,131,074


## Save & Evaluate Fine-tuned Model

In [None]:
# Persist the fine-tuned LoRA model under the configured name
lora_model.save_pretrained(new_model)

# Run a final evaluation pass and show its metrics under a header line
results = trainer.evaluate()
print(f"Evaluation results for {TASK}:", results, sep="\n")

Evaluation results for rte:
{'eval_loss': 0.6805618405342102, 'eval_accuracy': 0.6064981949458483, 'eval_runtime': 1.6618, 'eval_samples_per_second': 166.69, 'eval_steps_per_second': 42.124, 'epoch': 19.937399678972714}
