In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda as cuda
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import time
import matplotlib.pyplot as plt
import psutil
from IPython.display import clear_output
import gc

# LoRA Setup

First, we need to setup a LoRALayer class which we can then inject into our transformer model, switching out only the query and value attention layers as stated in the original paper. The following code should work on any transformer model and therefore is model-agnostic, as long as the query and value layers hold "query" and "value" in their names.

In [2]:
class LoRALayer(nn.Module):
    def __init__(self, original_layer, rank=4, alpha=1):
        super().__init__()
        self.original_layer = original_layer
        for param in self.original_layer.parameters():
            param.requires_grad = False
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank
        self.lora_A = nn.Parameter(torch.randn(original_layer.in_features, rank) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(rank, original_layer.out_features))

    def forward(self, x):
        original_output = self.original_layer(x)
        lora_output = (x @ self.lora_A @ self.lora_B) * self.scaling
        return original_output + lora_output

def apply_lora(model, rank=8, alpha=8):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and any(x in name for x in ['query', 'value']):
            parent_name = '.'.join(name.split('.')[:-1])
            child_name = name.split('.')[-1]
            parent = model.get_submodule(parent_name)
            lora_layer = LoRALayer(module, rank, alpha)
            setattr(parent, child_name, lora_layer)
    
    for name, param in model.named_parameters():
        if 'lora_A' in name or 'lora_B' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False
    
    return model

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [3]:
from sklearn.metrics import matthews_corrcoef, f1_score
import numpy as np

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    start_time = time.time()
    
    for batch_idx, batch in enumerate(tqdm(dataloader, desc="Training")):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct += (predictions == batch['labels']).sum().item()
        total_samples += batch['labels'].size(0)
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        if (batch_idx + 1) % 100 == 0:
            current_loss = total_loss / (batch_idx + 1)
            current_accuracy = total_correct / total_samples
            print(f"  Batch {batch_idx+1}: Loss: {current_loss:.4f}, Accuracy: {current_accuracy:.4f}")
    
    epoch_time = time.time() - start_time
    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_samples
    
    return avg_loss, accuracy, epoch_time

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    start_time = time.time()
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    
    eval_time = time.time() - start_time
    avg_loss = total_loss / len(dataloader)
    
    # Convert lists to numpy arrays
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)
    
    # Calculate accuracy
    accuracy = (all_predictions == all_labels).mean()
    
    # Calculate Matthews Correlation Coefficient
    mcc = matthews_corrcoef(all_labels, all_predictions)
    
    # Calculate F1 score
    f1 = f1_score(all_labels, all_predictions, average='binary')
    
    return avg_loss, accuracy, mcc, f1, eval_time

In [4]:
def train_model(model, train_dataloader, eval_dataloader, learning_rate, num_epochs, device):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=250, num_training_steps=len(train_dataloader) * num_epochs)
    
    train_losses = []
    train_accuracies = []
    eval_losses = []
    eval_accuracies = []
    
    total_train_time = 0
    total_eval_time = 0
    peak_memory_usage = 0
    
    for epoch in range(num_epochs):
        # Training
        train_loss, train_accuracy, train_time = train_epoch(model, train_dataloader, optimizer, scheduler, device)
        total_train_time += train_time
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        
        # Update peak memory usage
        current_memory = torch.cuda.memory_allocated() / 1e9  # Convert to GB
        peak_memory_usage = max(peak_memory_usage, current_memory)
        
        # Evaluation
        eval_loss, eval_accuracy, eval_mcc, eval_f1, eval_time = evaluate(model, eval_dataloader, device)
        total_eval_time += eval_time
        eval_losses.append(eval_loss)
        eval_accuracies.append(eval_accuracy)
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}, Time: {train_time:.2f}s")
        print(f"  Eval Loss: {eval_loss:.4f}, Accuracy: {eval_accuracy:.4f}, Time: {eval_time:.2f}s")
    
    return {
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'eval_losses': eval_losses,
        'eval_accuracies': eval_accuracies,
        'total_train_time': total_train_time,
        'total_eval_time': total_eval_time,
        'peak_memory_usage': peak_memory_usage,
        'final_train_accuracy': train_accuracies[-1],
        'final_eval_accuracy': eval_accuracies[-1]
    }

def print_train_results(name, results):
    print(f"{name}:")
    print(f"  Total training time: {results['total_train_time']:.2f}s")
    print(f"  Total evaluation time: {results['total_eval_time']:.2f}s")
    print(f"  Peak GPU memory usage: {results['peak_memory_usage']:.2f}GB")
    print(f"  Final evaluation accuracy: {results['eval_accuracies'][-1]:.4f}")

# Reproducing MRPC with RoBeRTa-base

We restrict ourselves to comparing full fine-tuning with LoRA on RoBeRTa-base, testing MRPC (this notebook) and SST-2 and CoLA (other notebooks) of the GLUE benchmark. Because we do not have the resources to create a proper submission for GLUE and therefore also cannot do a proper evaluation on the test split, we create our own test split from part of the train split.

In this notebook, we reproduce the Microsoft Research Paraphrase Corpus (MRPC), which contains sentences and a label whether they are a paraphrase of each other, therefore being a binary classification task. As in the paper, we will use a batch size of 16, a learning rate of 4e-4 for LoRA, a rank of 8, alpha of 8 and a maximum sequence length of 512.

### Setup

In [5]:
batch_size = 16
num_epochs = 5
full_learning_rate = 1e-5
lora_learning_rate = 4e-4
rank = 8
alpha = 8
max_sequence_length = 512

In [6]:
raw_dataset = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, padding="max_length", max_length=max_sequence_length)
    
split = raw_dataset["train"].train_test_split(test_size=0.1)
dataset = split
dataset["validation"] = raw_dataset["validation"]

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=batch_size)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size)

Map:   0%|          | 0/3301 [00:00<?, ? examples/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Full fine-tuning setup
full_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)
full_params = sum(p.numel() for p in full_model.parameters() if p.requires_grad)
print(f"Full fine-tuning trainable parameters: {full_params}")

# LoRA setup
lora_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
lora_model = apply_lora(lora_model, rank=rank, alpha=alpha).to(device)
lora_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
print(f"LoRA trainable parameters: {lora_params}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Full fine-tuning trainable parameters: 124647170


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA trainable parameters: 294912


### Training

In [8]:
print("Running full fine-tuning experiment...")
gc.collect()
torch.cuda.empty_cache()
full_results = train_model(full_model, train_dataloader, eval_dataloader, full_learning_rate, num_epochs, device)

Running full fine-tuning experiment...


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.6566, Accuracy: 0.6706
  Batch 200: Loss: 0.6285, Accuracy: 0.6700


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 1/5
  Train Loss: 0.6249, Accuracy: 0.6707, Time: 294.75s
  Eval Loss: 0.4920, Accuracy: 0.7157, Time: 12.69s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.4634, Accuracy: 0.7875
  Batch 200: Loss: 0.4285, Accuracy: 0.8072


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 2/5
  Train Loss: 0.4242, Accuracy: 0.8095, Time: 298.04s
  Eval Loss: 0.2894, Accuracy: 0.8873, Time: 12.70s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.2704, Accuracy: 0.8919
  Batch 200: Loss: 0.2570, Accuracy: 0.8931


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 3/5
  Train Loss: 0.2563, Accuracy: 0.8922, Time: 298.04s
  Eval Loss: 0.2947, Accuracy: 0.8824, Time: 12.73s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.1463, Accuracy: 0.9456
  Batch 200: Loss: 0.1592, Accuracy: 0.9359


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 4/5
  Train Loss: 0.1579, Accuracy: 0.9364, Time: 298.15s
  Eval Loss: 0.3013, Accuracy: 0.8995, Time: 12.69s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.0997, Accuracy: 0.9656
  Batch 200: Loss: 0.0929, Accuracy: 0.9706


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 5/5
  Train Loss: 0.0935, Accuracy: 0.9697, Time: 297.98s
  Eval Loss: 0.3179, Accuracy: 0.8922, Time: 12.70s


In [9]:
print("\nRunning LoRA experiment...")
torch.cuda.empty_cache()
gc.collect()
lora_results = train_model(lora_model, train_dataloader, eval_dataloader, lora_learning_rate, num_epochs, device)


Running LoRA experiment...


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.6670, Accuracy: 0.6675
  Batch 200: Loss: 0.6357, Accuracy: 0.6731


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 1/5
  Train Loss: 0.6334, Accuracy: 0.6722, Time: 224.99s
  Eval Loss: 0.5502, Accuracy: 0.6838, Time: 12.67s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.5392, Accuracy: 0.6744
  Batch 200: Loss: 0.5010, Accuracy: 0.7034


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 2/5
  Train Loss: 0.4984, Accuracy: 0.7052, Time: 226.51s
  Eval Loss: 0.3926, Accuracy: 0.8186, Time: 12.66s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.3958, Accuracy: 0.8206
  Batch 200: Loss: 0.3908, Accuracy: 0.8272


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 3/5
  Train Loss: 0.3936, Accuracy: 0.8243, Time: 226.48s
  Eval Loss: 0.3351, Accuracy: 0.8505, Time: 12.67s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.3383, Accuracy: 0.8500
  Batch 200: Loss: 0.3315, Accuracy: 0.8538


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 4/5
  Train Loss: 0.3296, Accuracy: 0.8543, Time: 226.39s
  Eval Loss: 0.3023, Accuracy: 0.8603, Time: 12.65s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.3290, Accuracy: 0.8562
  Batch 200: Loss: 0.3156, Accuracy: 0.8622


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 5/5
  Train Loss: 0.3121, Accuracy: 0.8637, Time: 226.46s
  Eval Loss: 0.3019, Accuracy: 0.8676, Time: 12.67s


In [11]:
lora_results2 = train_model(lora_model, train_dataloader, eval_dataloader, lora_learning_rate, 2, device)

Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.2639, Accuracy: 0.8900
  Batch 200: Loss: 0.2746, Accuracy: 0.8878


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 1/2
  Train Loss: 0.2750, Accuracy: 0.8876, Time: 223.72s
  Eval Loss: 0.3197, Accuracy: 0.8676, Time: 12.68s


Training:   0%|          | 0/207 [00:00<?, ?it/s]

  Batch 100: Loss: 0.2561, Accuracy: 0.9038
  Batch 200: Loss: 0.2589, Accuracy: 0.8972


Evaluating:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 2/2
  Train Loss: 0.2594, Accuracy: 0.8964, Time: 226.91s
  Eval Loss: 0.3127, Accuracy: 0.8578, Time: 12.71s


In [10]:
print("\nPerformance Comparison:")
print_train_results("Full", full_results)
print_train_results("LoRA", lora_results)


Performance Comparison:
Full:
  Total training time: 3436.54s
  Total evaluation time: 159.54s
  Peak GPU memory usage: 2.02GB
  Final evaluation accuracy: 0.8389
LoRA:
  Total training time: 2610.91s
  Total evaluation time: 161.06s
  Peak GPU memory usage: 1.02GB
  Final evaluation accuracy: 0.8207


### Evaluate and Save to Disk

In [12]:
print("Evaluating full fine-tuned model:")
full_loss, full_accuracy, full_mcc, full_f1, full_eval_time = evaluate(full_model, test_dataloader, device)
print(f"Full model - Accuracy: {full_accuracy:.4f}, Loss: {full_loss:.4f}, MCC: {full_mcc:.4f}, F1: {full_f1:.4f}, Eval time: {full_eval_time:.2f}s")

print("\nEvaluating LoRA model:")
lora_loss, lora_accuracy, lora_mcc, lora_f1, lora_eval_time = evaluate(lora_model, test_dataloader, device)
print(f"LoRA model - Accuracy: {lora_accuracy:.4f}, Loss: {lora_loss:.4f}, MCC: {lora_mcc:.4f}, F1: {lora_f1:.4f}, Eval time: {lora_eval_time:.2f}s")

Evaluating full fine-tuned model:


Evaluating:   0%|          | 0/23 [00:00<?, ?it/s]

Full model - Accuracy: 0.8856, Loss: 0.3648, MCC: 0.7433, F1: 0.9139, Eval time: 10.48s

Evaluating LoRA model:


Evaluating:   0%|          | 0/23 [00:00<?, ?it/s]

LoRA model - Accuracy: 0.8856, Loss: 0.3076, MCC: 0.7417, F1: 0.9146, Eval time: 10.76s


In [13]:
import os

full_save_path = "./mrpc-full_finetuned_model.pt"
torch.save(full_model.state_dict(), full_save_path)
full_model.save_pretrained("mrpc-full_finetuned_model")
full_model_size = os.path.getsize(full_save_path) / (1024 * 1024)  # Size in MB
print(f"\nFull fine-tuned model saved to {full_save_path}")
print(f"Full model size: {full_model_size:.2f} MB")

lora_save_path = "./mrpc-lora_and_classifier.pt"
lora_state_dict = {name: param for name, param in lora_model.named_parameters() 
                   if 'lora_A' in name or 'lora_B' in name or 'classifier' in name}
torch.save(lora_state_dict, lora_save_path)
lora_model_size = os.path.getsize(lora_save_path) / (1024 * 1024)  # Size in MB
print(f"LoRA layers and classifier saved to {lora_save_path}")
print(f"LoRA model size: {lora_model_size:.2f} MB")

print(f"\nSize reduction: {full_model_size / lora_model_size:.2f}x")


Full fine-tuned model saved to ./mrpc-full_finetuned_model.pt
Full model size: 475.57 MB
LoRA layers and classifier saved to ./mrpc-lora_and_classifier.pt
LoRA model size: 3.40 MB

Size reduction: 139.75x


### Load Model from Disk

In [15]:
# Load full fine-tuned model
loaded_full_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
loaded_full_model.load_state_dict(torch.load(full_save_path))
loaded_full_model.to(device)

# Evaluate loaded full model
full_loss, full_accuracy, full_eval_time = evaluate(loaded_full_model, test_dataloader, device)
print(f"Loaded full model - Accuracy: {full_accuracy:.4f}, Loss: {full_loss:.4f}, Eval time: {full_eval_time:.2f}s")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

Loaded full model - Accuracy: 0.9538, Loss: 0.1755, Eval time: 191.10s


In [17]:
# Load LoRA model
base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
loaded_lora_model = apply_lora(base_model, rank=8, alpha=8)
loaded_lora_model.load_state_dict(torch.load(lora_save_path), strict=False)
loaded_lora_model.to(device)

# Evaluate loaded LoRA model
lora_loss, lora_accuracy, lora_eval_time = evaluate(loaded_lora_model, test_dataloader, device)
print(f"Loaded LoRA model - Accuracy: {lora_accuracy:.4f}, Loss: {lora_loss:.4f}, Eval time: {lora_eval_time:.2f}s")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

Loaded LoRA model - Accuracy: 0.9468, Loss: 0.1553, Eval time: 201.92s


In [18]:
# Demonstrate that LoRA is there and that without it the model performs poorly

print("\nVerifying LoRA layers:")
for name, module in loaded_lora_model.named_modules():
    if isinstance(module, LoRALayer):
        print(f"LoRA layer found: {name}")
        print(f"lora_A shape: {module.lora_A.shape}")
        print(f"lora_B shape: {module.lora_B.shape}")
        print()

print("Evaluating model with zeroed LoRA weights:")
for name, param in loaded_lora_model.named_parameters():
    if 'lora_A' in name or 'lora_B' in name:
        param.data.zero_()

zero_loss, zero_accuracy, zero_eval_time = evaluate(loaded_lora_model, test_dataloader, device)
print(f"Zeroed LoRA model - Accuracy: {zero_accuracy:.4f}, Loss: {zero_loss:.4f}, Eval time: {zero_eval_time:.2f}s")


Verifying LoRA layers:
LoRA layer found: roberta.encoder.layer.0.attention.self.query
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.0.attention.self.value
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.1.attention.self.query
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.1.attention.self.value
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.2.attention.self.query
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.2.attention.self.value
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.3.attention.self.query
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.laye

Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

Zeroed LoRA model - Accuracy: 0.5543, Loss: 0.6874, Eval time: 200.53s
