In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda as cuda
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import time
import matplotlib.pyplot as plt
import psutil
from IPython.display import clear_output
import gc

# LoRA Setup

First, we need to setup a LoRALayer class which we can then inject into our transformer model, switching out only the query and value attention layers as stated in the original paper. The following code should work on any transformer model and therefore is model-agnostic, as long as the query and value layers hold "query" and "value" in their names.

In [2]:
class LoRALayer(nn.Module):
    def __init__(self, original_layer, rank=4, alpha=1):
        super().__init__()
        self.original_layer = original_layer
        for param in self.original_layer.parameters():
            param.requires_grad = False
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank
        self.lora_A = nn.Parameter(torch.randn(original_layer.in_features, rank) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(rank, original_layer.out_features))

    def forward(self, x):
        original_output = self.original_layer(x)
        lora_output = (x @ self.lora_A @ self.lora_B) * self.scaling
        return original_output + lora_output

def apply_lora(model, rank=8, alpha=8):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and any(x in name for x in ['query', 'value']):
            parent_name = '.'.join(name.split('.')[:-1])
            child_name = name.split('.')[-1]
            parent = model.get_submodule(parent_name)
            lora_layer = LoRALayer(module, rank, alpha)
            setattr(parent, child_name, lora_layer)
    
    for name, param in model.named_parameters():
        if 'lora_A' in name or 'lora_B' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False
    
    return model

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [16]:
from sklearn.metrics import matthews_corrcoef, f1_score
import numpy as np

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    start_time = time.time()
    
    for batch_idx, batch in enumerate(tqdm(dataloader, desc="Training")):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct += (predictions == batch['labels']).sum().item()
        total_samples += batch['labels'].size(0)
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        if (batch_idx + 1) % 100 == 0:
            current_loss = total_loss / (batch_idx + 1)
            current_accuracy = total_correct / total_samples
            print(f"  Batch {batch_idx+1}: Loss: {current_loss:.4f}, Accuracy: {current_accuracy:.4f}")
    
    epoch_time = time.time() - start_time
    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_samples
    
    return avg_loss, accuracy, epoch_time

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    start_time = time.time()
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    
    eval_time = time.time() - start_time
    avg_loss = total_loss / len(dataloader)
    
    # Convert lists to numpy arrays
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)
    
    # Calculate accuracy
    accuracy = (all_predictions == all_labels).mean()
    
    # Calculate Matthews Correlation Coefficient
    mcc = matthews_corrcoef(all_labels, all_predictions)
    
    # Calculate F1 score
    f1 = f1_score(all_labels, all_predictions, average='binary')
    
    return avg_loss, accuracy, mcc, f1, eval_time

In [4]:
def train_model(model, train_dataloader, eval_dataloader, learning_rate, num_epochs, device):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=250, num_training_steps=len(train_dataloader) * num_epochs)
    
    train_losses = []
    train_accuracies = []
    eval_losses = []
    eval_accuracies = []
    
    total_train_time = 0
    total_eval_time = 0
    peak_memory_usage = 0
    
    for epoch in range(num_epochs):
        # Training
        train_loss, train_accuracy, train_time = train_epoch(model, train_dataloader, optimizer, scheduler, device)
        total_train_time += train_time
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        
        # Update peak memory usage
        current_memory = torch.cuda.memory_allocated() / 1e9  # Convert to GB
        peak_memory_usage = max(peak_memory_usage, current_memory)
        
        # Evaluation
        eval_loss, eval_accuracy, eval_time = evaluate(model, eval_dataloader, device)
        total_eval_time += eval_time
        eval_losses.append(eval_loss)
        eval_accuracies.append(eval_accuracy)
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}, Time: {train_time:.2f}s")
        print(f"  Eval Loss: {eval_loss:.4f}, Accuracy: {eval_accuracy:.4f}, Time: {eval_time:.2f}s")
    
    return {
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'eval_losses': eval_losses,
        'eval_accuracies': eval_accuracies,
        'total_train_time': total_train_time,
        'total_eval_time': total_eval_time,
        'peak_memory_usage': peak_memory_usage,
        'final_train_accuracy': train_accuracies[-1],
        'final_eval_accuracy': eval_accuracies[-1]
    }

def print_train_results(name, results):
    print(f"{name}:")
    print(f"  Total training time: {results['total_train_time']:.2f}s")
    print(f"  Total evaluation time: {results['total_eval_time']:.2f}s")
    print(f"  Peak GPU memory usage: {results['peak_memory_usage']:.2f}GB")
    print(f"  Final evaluation accuracy: {results['eval_accuracies'][-1]:.4f}")

# Reproducing CoLA with RoBeRTa-base

We restrict ourselves to comparing full fine-tuning with LoRA on RoBeRTa-base, testing CoLA (this notebook) and SST-2 MRPC (other notebooks) of the GLUE benchmark. Because we do not have the resources to create a proper submission for GLUE and therefore also cannot do a proper evaluation on the test split, we create our own test split from part of the train split.

## SST-2

In this notebook, we reproduce the Corpus of Linguistic Acceptability (CoLA), which contains sentences and a label whether they are linguistically acceptable or not, therefore being a binary classification task. As in the paper, we will use  a learning rate of 4e-4 for LoRA, a rank of 8, alpha of 8 and a maximum sequence length of 512. Different to the paper (because of our memory limitations), we use a batch size of 16 instead of 32.

### Setup

In [5]:
batch_size = 16
num_epochs = 5
full_learning_rate = 1e-5
lora_learning_rate = 4e-4
rank = 8
alpha = 8
max_sequence_length = 512

In [6]:
raw_dataset = load_dataset("glue", "cola")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=max_sequence_length)

split = raw_dataset["train"].train_test_split(test_size=0.1)
dataset = split
dataset["validation"] = raw_dataset["validation"]

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=batch_size)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size)

Map:   0%|          | 0/7695 [00:00<?, ? examples/s]

Map:   0%|          | 0/856 [00:00<?, ? examples/s]

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Full fine-tuning setup
full_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)
full_params = sum(p.numel() for p in full_model.parameters() if p.requires_grad)
print(f"Full fine-tuning trainable parameters: {full_params}")

# LoRA setup
lora_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
lora_model = apply_lora(lora_model, rank=rank, alpha=alpha).to(device)
lora_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
print(f"LoRA trainable parameters: {lora_params}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Full fine-tuning trainable parameters: 124647170


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA trainable parameters: 294912


### Training

In [8]:
print("Running full fine-tuning experiment...")
gc.collect()
torch.cuda.empty_cache()
full_results = train_model(full_model, train_dataloader, eval_dataloader, full_learning_rate, num_epochs, device)

Running full fine-tuning experiment...


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.7176, Accuracy: 0.4363
  Batch 200: Loss: 0.6476, Accuracy: 0.5722
  Batch 300: Loss: 0.5981, Accuracy: 0.6367
  Batch 400: Loss: 0.5671, Accuracy: 0.6725


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 1/5
  Train Loss: 0.5468, Accuracy: 0.6923, Time: 684.18s
  Eval Loss: 0.4822, Accuracy: 0.7958, Time: 31.92s


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.3447, Accuracy: 0.8544
  Batch 200: Loss: 0.3526, Accuracy: 0.8519
  Batch 300: Loss: 0.3567, Accuracy: 0.8502
  Batch 400: Loss: 0.3529, Accuracy: 0.8528


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 2/5
  Train Loss: 0.3489, Accuracy: 0.8535, Time: 688.15s
  Eval Loss: 0.3890, Accuracy: 0.8341, Time: 31.90s


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.2351, Accuracy: 0.9113
  Batch 200: Loss: 0.2489, Accuracy: 0.9059
  Batch 300: Loss: 0.2428, Accuracy: 0.9079
  Batch 400: Loss: 0.2388, Accuracy: 0.9094


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 3/5
  Train Loss: 0.2360, Accuracy: 0.9101, Time: 687.97s
  Eval Loss: 0.4907, Accuracy: 0.8303, Time: 31.88s


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.1470, Accuracy: 0.9425
  Batch 200: Loss: 0.1576, Accuracy: 0.9413
  Batch 300: Loss: 0.1601, Accuracy: 0.9429
  Batch 400: Loss: 0.1617, Accuracy: 0.9408


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 4/5
  Train Loss: 0.1608, Accuracy: 0.9414, Time: 688.06s
  Eval Loss: 0.5087, Accuracy: 0.8370, Time: 31.91s


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.1162, Accuracy: 0.9656
  Batch 200: Loss: 0.1135, Accuracy: 0.9647
  Batch 300: Loss: 0.1158, Accuracy: 0.9608
  Batch 400: Loss: 0.1213, Accuracy: 0.9591


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 5/5
  Train Loss: 0.1211, Accuracy: 0.9593, Time: 688.19s
  Eval Loss: 0.5679, Accuracy: 0.8389, Time: 31.92s


In [9]:
print("\nRunning LoRA experiment...")
torch.cuda.empty_cache()
gc.collect()
lora_results = train_model(lora_model, train_dataloader, eval_dataloader, lora_learning_rate, num_epochs, device)


Running LoRA experiment...


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.7213, Accuracy: 0.3875
  Batch 200: Loss: 0.6588, Accuracy: 0.5469
  Batch 300: Loss: 0.6146, Accuracy: 0.6110
  Batch 400: Loss: 0.5813, Accuracy: 0.6514


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 1/5
  Train Loss: 0.5606, Accuracy: 0.6747, Time: 522.26s
  Eval Loss: 0.4513, Accuracy: 0.8025, Time: 32.21s


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.4105, Accuracy: 0.8094
  Batch 200: Loss: 0.4223, Accuracy: 0.8075
  Batch 300: Loss: 0.4197, Accuracy: 0.8085
  Batch 400: Loss: 0.4212, Accuracy: 0.8069


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 2/5
  Train Loss: 0.4182, Accuracy: 0.8092, Time: 522.05s
  Eval Loss: 0.5260, Accuracy: 0.8063, Time: 32.22s


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.3617, Accuracy: 0.8444
  Batch 200: Loss: 0.3853, Accuracy: 0.8287
  Batch 300: Loss: 0.3757, Accuracy: 0.8344
  Batch 400: Loss: 0.3790, Accuracy: 0.8298


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 3/5
  Train Loss: 0.3799, Accuracy: 0.8296, Time: 522.23s
  Eval Loss: 0.4819, Accuracy: 0.8092, Time: 32.22s


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.3446, Accuracy: 0.8494
  Batch 200: Loss: 0.3547, Accuracy: 0.8500
  Batch 300: Loss: 0.3502, Accuracy: 0.8496
  Batch 400: Loss: 0.3537, Accuracy: 0.8494


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 4/5
  Train Loss: 0.3510, Accuracy: 0.8498, Time: 522.33s
  Eval Loss: 0.4745, Accuracy: 0.8236, Time: 32.24s


Training:   0%|          | 0/481 [00:00<?, ?it/s]

  Batch 100: Loss: 0.3550, Accuracy: 0.8544
  Batch 200: Loss: 0.3510, Accuracy: 0.8516
  Batch 300: Loss: 0.3445, Accuracy: 0.8558
  Batch 400: Loss: 0.3440, Accuracy: 0.8556


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Epoch 5/5
  Train Loss: 0.3429, Accuracy: 0.8555, Time: 522.04s
  Eval Loss: 0.4703, Accuracy: 0.8207, Time: 32.18s


In [10]:
print("\nPerformance Comparison:")
print_train_results("Full", full_results)
print_train_results("LoRA", lora_results)


Performance Comparison:
Full:
  Total training time: 3436.54s
  Total evaluation time: 159.54s
  Peak GPU memory usage: 2.02GB
  Final evaluation accuracy: 0.8389
LoRA:
  Total training time: 2610.91s
  Total evaluation time: 161.06s
  Peak GPU memory usage: 1.02GB
  Final evaluation accuracy: 0.8207


### Evaluate and Save to Disk

In [19]:
print("Evaluating full fine-tuned model:")
full_loss, full_accuracy, full_mcc, full_f1, full_eval_time = evaluate(full_model, test_dataloader, device)
print(f"Full model - Accuracy: {full_accuracy:.4f}, Loss: {full_loss:.4f}, MCC: {full_mcc:.4f}, F1: {full_f1:.4f}, Eval time: {full_eval_time:.2f}s")

print("\nEvaluating LoRA model:")
lora_loss, lora_accuracy, lora_mcc, lora_f1, lora_eval_time = evaluate(lora_model, test_dataloader, device)
print(f"LoRA model - Accuracy: {lora_accuracy:.4f}, Loss: {lora_loss:.4f}, MCC: {lora_mcc:.4f}, F1: {lora_f1:.4f}, Eval time: {lora_eval_time:.2f}s")

Evaluating full fine-tuned model:


Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]

Full model - Accuracy: 0.8481, Loss: 0.5417, MCC: 0.6285, F1: 0.8962, Eval time: 24.92s

Evaluating LoRA model:


Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]

LoRA model - Accuracy: 0.8107, Loss: 0.4678, MCC: 0.5289, F1: 0.8728, Eval time: 25.92s


In [20]:
import os

full_save_path = "./cola-full_finetuned_model.pt"
torch.save(full_model.state_dict(), full_save_path)
full_model_size = os.path.getsize(full_save_path) / (1024 * 1024)  # Size in MB
print(f"\nFull fine-tuned model saved to {full_save_path}")
print(f"Full model size: {full_model_size:.2f} MB")

lora_save_path = "./cola-lora_and_classifier.pt"
lora_state_dict = {name: param for name, param in lora_model.named_parameters() 
                   if 'lora_A' in name or 'lora_B' in name or 'classifier' in name}
torch.save(lora_state_dict, lora_save_path)
lora_model_size = os.path.getsize(lora_save_path) / (1024 * 1024)  # Size in MB
print(f"LoRA layers and classifier saved to {lora_save_path}")
print(f"LoRA model size: {lora_model_size:.2f} MB")

print(f"\nSize reduction: {full_model_size / lora_model_size:.2f}x")


Full fine-tuned model saved to ./cola-full_finetuned_model.pt
Full model size: 475.57 MB
LoRA layers and classifier saved to ./cola-lora_and_classifier.pt
LoRA model size: 3.40 MB

Size reduction: 139.75x


### Load Model from Disk

In [None]:
# Load full fine-tuned model
loaded_full_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
loaded_full_model.load_state_dict(torch.load(full_save_path))
loaded_full_model.to(device)

# Evaluate loaded full model
full_loss, full_accuracy, full_eval_time = evaluate(loaded_full_model, test_dataloader, device)
print(f"Loaded full model - Accuracy: {full_accuracy:.4f}, Loss: {full_loss:.4f}, Eval time: {full_eval_time:.2f}s")

In [None]:
# Load LoRA model
base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
loaded_lora_model = apply_lora(base_model, rank=8, alpha=8)
loaded_lora_model.load_state_dict(torch.load(lora_save_path), strict=False)
loaded_lora_model.to(device)

# Evaluate loaded LoRA model
lora_loss, lora_accuracy, lora_eval_time = evaluate(loaded_lora_model, test_dataloader, device)
print(f"Loaded LoRA model - Accuracy: {lora_accuracy:.4f}, Loss: {lora_loss:.4f}, Eval time: {lora_eval_time:.2f}s")

In [None]:
# Demonstrate that LoRA is there and that without it the model performs poorly

print("\nVerifying LoRA layers:")
for name, module in loaded_lora_model.named_modules():
    if isinstance(module, LoRALayer):
        print(f"LoRA layer found: {name}")
        print(f"lora_A shape: {module.lora_A.shape}")
        print(f"lora_B shape: {module.lora_B.shape}")
        print()

print("Evaluating model with zeroed LoRA weights:")
for name, param in loaded_lora_model.named_parameters():
    if 'lora_A' in name or 'lora_B' in name:
        param.data.zero_()

zero_loss, zero_accuracy, zero_eval_time = evaluate(loaded_lora_model, test_dataloader, device)
print(f"Zeroed LoRA model - Accuracy: {zero_accuracy:.4f}, Loss: {zero_loss:.4f}, Eval time: {zero_eval_time:.2f}s")