In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda as cuda
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import time
import matplotlib.pyplot as plt
import psutil
from IPython.display import clear_output
import gc

# LoRA Setup

First, we need to setup a LoRALayer class which we can then inject into our transformer model, switching out only the query and value attention layers as stated in the original paper. The following code should work on any transformer model and therefore is model-agnostic, as long as the query and value layers hold "query" and "value" in their names.

In [2]:
class LoRALayer(nn.Module):
    def __init__(self, original_layer, rank=4, alpha=1):
        super().__init__()
        self.original_layer = original_layer
        for param in self.original_layer.parameters():
            param.requires_grad = False
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank
        self.lora_A = nn.Parameter(torch.randn(original_layer.in_features, rank) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(rank, original_layer.out_features))

    def forward(self, x):
        original_output = self.original_layer(x)
        lora_output = (x @ self.lora_A @ self.lora_B) * self.scaling
        return original_output + lora_output

def apply_lora(model, rank=4, alpha=1):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and any(x in name for x in ['query', 'value']):
            parent_name = '.'.join(name.split('.')[:-1])
            child_name = name.split('.')[-1]
            parent = model.get_submodule(parent_name)
            lora_layer = LoRALayer(module, rank, alpha)
            setattr(parent, child_name, lora_layer)
    
    for name, param in model.named_parameters():
        if 'lora_A' in name or 'lora_B' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False
    
    return model

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [3]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    start_time = time.time()
    
    for batch_idx, batch in enumerate(tqdm(dataloader, desc="Training")):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct += (predictions == batch['labels']).sum().item()
        total_samples += batch['labels'].size(0)
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        if (batch_idx + 1) % 100 == 0:
            current_loss = total_loss / (batch_idx + 1)
            current_accuracy = total_correct / total_samples
            print(f"  Batch {batch_idx+1}: Loss: {current_loss:.4f}, Accuracy: {current_accuracy:.4f}")
    
    epoch_time = time.time() - start_time
    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_samples
    
    return avg_loss, accuracy, epoch_time

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    start_time = time.time()
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_correct += (predictions == batch["labels"]).sum().item()
            total_samples += batch["labels"].size(0)
    
    eval_time = time.time() - start_time
    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_samples
    
    return avg_loss, accuracy, eval_time

In [4]:
def train_model(model, train_dataloader, eval_dataloader, learning_rate, num_epochs, device):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=250, num_training_steps=len(train_dataloader) * num_epochs)
    
    train_losses = []
    train_accuracies = []
    eval_losses = []
    eval_accuracies = []
    
    total_train_time = 0
    total_eval_time = 0
    peak_memory_usage = 0
    
    for epoch in range(num_epochs):
        # Training
        train_loss, train_accuracy, train_time = train_epoch(model, train_dataloader, optimizer, scheduler, device)
        total_train_time += train_time
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        
        # Update peak memory usage
        current_memory = torch.cuda.memory_allocated() / 1e9  # Convert to GB
        peak_memory_usage = max(peak_memory_usage, current_memory)
        
        # Evaluation
        eval_loss, eval_accuracy, eval_time = evaluate(model, eval_dataloader, device)
        total_eval_time += eval_time
        eval_losses.append(eval_loss)
        eval_accuracies.append(eval_accuracy)
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}, Time: {train_time:.2f}s")
        print(f"  Eval Loss: {eval_loss:.4f}, Accuracy: {eval_accuracy:.4f}, Time: {eval_time:.2f}s")
    
    return {
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'eval_losses': eval_losses,
        'eval_accuracies': eval_accuracies,
        'total_train_time': total_train_time,
        'total_eval_time': total_eval_time,
        'peak_memory_usage': peak_memory_usage,
        'final_train_accuracy': train_accuracies[-1],
        'final_eval_accuracy': eval_accuracies[-1]
    }

def print_train_results(name, results):
    print(f"{name}:")
    print(f"  Total training time: {results['total_train_time']:.2f}s")
    print(f"  Total evaluation time: {results['total_eval_time']:.2f}s")
    print(f"  Peak GPU memory usage: {results['peak_memory_usage']:.2f}GB")
    print(f"  Final evaluation accuracy: {results['eval_accuracies'][-1]:.4f}")

# Reproducing GLUE with RoBeRTa-base

One of the central experiments of the paper that we want to reproduce is the GLUE benchmark on several models with different finetuning strategies. We restrict ourselves to comparing full fine-tuning with LoRA on RoBeRTa-base, testing SST-2, XXX and XXX of the GLUE benchmark. Because we do not have the resources to create a proper submission for GLUE and therefore also cannot do a proper evaluation on the test split, we create our own test split from part of the train split.

## SST-2

The first task we reproduce is the Stanford Sentiment Treebank (SST-2). It contains sentences and phrases annotated for sentiment analysis, therefore being a binary classification task. As in the paper, we will use a batch size of 16, a learning rate of 5e-4 for LoRA, a rank of 8, alpha of 8 and a maximum sequence length of 512.

### Setup

In [5]:
batch_size = 16
num_epochs = 5
full_learning_rate = 1e-5
lora_learning_rate = 5e-4
rank = 8
alpha = 8
max_sequence_length = 512

In [6]:
raw_dataset = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=max_sequence_length)

split = raw_dataset["train"].train_test_split(test_size=0.1)
dataset = split
dataset["validation"] = raw_dataset["validation"]

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=batch_size)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size)

Map:   0%|          | 0/60614 [00:00<?, ? examples/s]

Map:   0%|          | 0/6735 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Full fine-tuning setup
full_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)
full_params = sum(p.numel() for p in full_model.parameters() if p.requires_grad)
print(f"Full fine-tuning trainable parameters: {full_params}")

# LoRA setup
lora_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
lora_model = apply_lora(lora_model, rank=rank, alpha=alpha).to(device)
lora_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
print(f"LoRA trainable parameters: {lora_params}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Full fine-tuning trainable parameters: 124647170


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA trainable parameters: 294912


In [8]:
lora_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): LoRALayer(
                (original_layer): Linear(in_features=768, out_features=768, bias=True)
              )
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): LoRALayer(
                (original_layer): Linear(in_features=768, out_features=768, bias=True)
              )
              (dropout): Dropout(p=0.1, inplace=False)
            )
        

### Training

In [9]:
print("Running full fine-tuning experiment...")
torch.cuda.empty_cache()
gc.collect()
full_results = train_model(full_model, train_dataloader, eval_dataloader, full_learning_rate, num_epochs, device)

Running full fine-tuning experiment...


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.6894, Accuracy: 0.5463
  Batch 200: Loss: 0.6287, Accuracy: 0.6122
  Batch 300: Loss: 0.5339, Accuracy: 0.6933
  Batch 400: Loss: 0.4724, Accuracy: 0.7406
  Batch 500: Loss: 0.4359, Accuracy: 0.7672
  Batch 600: Loss: 0.4062, Accuracy: 0.7884
  Batch 700: Loss: 0.3873, Accuracy: 0.8035
  Batch 800: Loss: 0.3703, Accuracy: 0.8152
  Batch 900: Loss: 0.3563, Accuracy: 0.8239
  Batch 1000: Loss: 0.3464, Accuracy: 0.8311
  Batch 1100: Loss: 0.3346, Accuracy: 0.8386
  Batch 1200: Loss: 0.3243, Accuracy: 0.8449
  Batch 1300: Loss: 0.3162, Accuracy: 0.8501
  Batch 1400: Loss: 0.3096, Accuracy: 0.8549
  Batch 1500: Loss: 0.3037, Accuracy: 0.8587
  Batch 1600: Loss: 0.2997, Accuracy: 0.8609
  Batch 1700: Loss: 0.2938, Accuracy: 0.8641
  Batch 1800: Loss: 0.2900, Accuracy: 0.8671
  Batch 1900: Loss: 0.2863, Accuracy: 0.8693
  Batch 2000: Loss: 0.2804, Accuracy: 0.8728
  Batch 2100: Loss: 0.2764, Accuracy: 0.8752
  Batch 2200: Loss: 0.2727, Accuracy: 0.8771
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 1/5
  Train Loss: 0.2393, Accuracy: 0.8973, Time: 5293.65s
  Eval Loss: 0.1840, Accuracy: 0.9461, Time: 25.52s


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.1362, Accuracy: 0.9450
  Batch 200: Loss: 0.1377, Accuracy: 0.9472
  Batch 300: Loss: 0.1371, Accuracy: 0.9475
  Batch 400: Loss: 0.1377, Accuracy: 0.9469
  Batch 500: Loss: 0.1368, Accuracy: 0.9475
  Batch 600: Loss: 0.1377, Accuracy: 0.9475
  Batch 700: Loss: 0.1414, Accuracy: 0.9464
  Batch 800: Loss: 0.1414, Accuracy: 0.9459
  Batch 900: Loss: 0.1410, Accuracy: 0.9465
  Batch 1000: Loss: 0.1416, Accuracy: 0.9464
  Batch 1100: Loss: 0.1416, Accuracy: 0.9470
  Batch 1200: Loss: 0.1422, Accuracy: 0.9466
  Batch 1300: Loss: 0.1409, Accuracy: 0.9471
  Batch 1400: Loss: 0.1414, Accuracy: 0.9468
  Batch 1500: Loss: 0.1410, Accuracy: 0.9466
  Batch 1600: Loss: 0.1402, Accuracy: 0.9472
  Batch 1700: Loss: 0.1401, Accuracy: 0.9474
  Batch 1800: Loss: 0.1401, Accuracy: 0.9474
  Batch 1900: Loss: 0.1411, Accuracy: 0.9474
  Batch 2000: Loss: 0.1404, Accuracy: 0.9477
  Batch 2100: Loss: 0.1400, Accuracy: 0.9478
  Batch 2200: Loss: 0.1393, Accuracy: 0.9480
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 2/5
  Train Loss: 0.1389, Accuracy: 0.9477, Time: 5313.38s
  Eval Loss: 0.1801, Accuracy: 0.9369, Time: 25.88s


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.0820, Accuracy: 0.9712
  Batch 200: Loss: 0.0875, Accuracy: 0.9691
  Batch 300: Loss: 0.0920, Accuracy: 0.9665
  Batch 400: Loss: 0.0941, Accuracy: 0.9663
  Batch 500: Loss: 0.0975, Accuracy: 0.9651
  Batch 600: Loss: 0.0999, Accuracy: 0.9643
  Batch 700: Loss: 0.1008, Accuracy: 0.9637
  Batch 800: Loss: 0.0991, Accuracy: 0.9648
  Batch 900: Loss: 0.0993, Accuracy: 0.9644
  Batch 1000: Loss: 0.1008, Accuracy: 0.9639
  Batch 1100: Loss: 0.1016, Accuracy: 0.9636
  Batch 1200: Loss: 0.1026, Accuracy: 0.9635
  Batch 1300: Loss: 0.1020, Accuracy: 0.9638
  Batch 1400: Loss: 0.1009, Accuracy: 0.9642
  Batch 1500: Loss: 0.1007, Accuracy: 0.9642
  Batch 1600: Loss: 0.1008, Accuracy: 0.9643
  Batch 1700: Loss: 0.1002, Accuracy: 0.9643
  Batch 1800: Loss: 0.1007, Accuracy: 0.9639
  Batch 1900: Loss: 0.1009, Accuracy: 0.9637
  Batch 2000: Loss: 0.1020, Accuracy: 0.9637
  Batch 2100: Loss: 0.1015, Accuracy: 0.9639
  Batch 2200: Loss: 0.1017, Accuracy: 0.9636
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 3/5
  Train Loss: 0.0997, Accuracy: 0.9640, Time: 5316.09s
  Eval Loss: 0.2085, Accuracy: 0.9438, Time: 25.91s


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.0666, Accuracy: 0.9762
  Batch 200: Loss: 0.0654, Accuracy: 0.9747
  Batch 300: Loss: 0.0659, Accuracy: 0.9754
  Batch 400: Loss: 0.0662, Accuracy: 0.9745
  Batch 500: Loss: 0.0689, Accuracy: 0.9738
  Batch 600: Loss: 0.0706, Accuracy: 0.9734
  Batch 700: Loss: 0.0696, Accuracy: 0.9738
  Batch 800: Loss: 0.0724, Accuracy: 0.9728
  Batch 900: Loss: 0.0713, Accuracy: 0.9736
  Batch 1000: Loss: 0.0702, Accuracy: 0.9740
  Batch 1100: Loss: 0.0718, Accuracy: 0.9736
  Batch 1200: Loss: 0.0722, Accuracy: 0.9733
  Batch 1300: Loss: 0.0737, Accuracy: 0.9728
  Batch 1400: Loss: 0.0743, Accuracy: 0.9728
  Batch 1500: Loss: 0.0750, Accuracy: 0.9722
  Batch 1600: Loss: 0.0751, Accuracy: 0.9720
  Batch 1700: Loss: 0.0745, Accuracy: 0.9724
  Batch 1800: Loss: 0.0745, Accuracy: 0.9726
  Batch 1900: Loss: 0.0743, Accuracy: 0.9730
  Batch 2000: Loss: 0.0755, Accuracy: 0.9726
  Batch 2100: Loss: 0.0761, Accuracy: 0.9724
  Batch 2200: Loss: 0.0753, Accuracy: 0.9728
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 4/5
  Train Loss: 0.0743, Accuracy: 0.9734, Time: 5283.27s
  Eval Loss: 0.2127, Accuracy: 0.9427, Time: 25.61s


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.0483, Accuracy: 0.9812
  Batch 200: Loss: 0.0492, Accuracy: 0.9816
  Batch 300: Loss: 0.0502, Accuracy: 0.9812
  Batch 400: Loss: 0.0530, Accuracy: 0.9794
  Batch 500: Loss: 0.0577, Accuracy: 0.9782
  Batch 600: Loss: 0.0572, Accuracy: 0.9785
  Batch 700: Loss: 0.0574, Accuracy: 0.9786
  Batch 800: Loss: 0.0578, Accuracy: 0.9786
  Batch 900: Loss: 0.0589, Accuracy: 0.9782
  Batch 1000: Loss: 0.0575, Accuracy: 0.9790
  Batch 1100: Loss: 0.0588, Accuracy: 0.9788
  Batch 1200: Loss: 0.0590, Accuracy: 0.9789
  Batch 1300: Loss: 0.0588, Accuracy: 0.9791
  Batch 1400: Loss: 0.0585, Accuracy: 0.9791
  Batch 1500: Loss: 0.0589, Accuracy: 0.9790
  Batch 1600: Loss: 0.0586, Accuracy: 0.9791
  Batch 1700: Loss: 0.0583, Accuracy: 0.9789
  Batch 1800: Loss: 0.0585, Accuracy: 0.9790
  Batch 1900: Loss: 0.0582, Accuracy: 0.9791
  Batch 2000: Loss: 0.0577, Accuracy: 0.9793
  Batch 2100: Loss: 0.0570, Accuracy: 0.9795
  Batch 2200: Loss: 0.0561, Accuracy: 0.9798
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 5/5
  Train Loss: 0.0564, Accuracy: 0.9801, Time: 5285.39s
  Eval Loss: 0.2318, Accuracy: 0.9427, Time: 25.61s

Running LoRA experiment...


TypeError: train_model() got an unexpected keyword argument 'is_lora'

In [11]:
print("\nRunning LoRA experiment...")
torch.cuda.empty_cache()
gc.collect()
lora_results = train_model(lora_model, train_dataloader, eval_dataloader, lora_learning_rate, num_epochs, device)


Running LoRA experiment...


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.6871, Accuracy: 0.5513
  Batch 200: Loss: 0.6515, Accuracy: 0.5922
  Batch 300: Loss: 0.5872, Accuracy: 0.6558
  Batch 400: Loss: 0.5237, Accuracy: 0.7077
  Batch 500: Loss: 0.4787, Accuracy: 0.7419
  Batch 600: Loss: 0.4455, Accuracy: 0.7666
  Batch 700: Loss: 0.4231, Accuracy: 0.7826
  Batch 800: Loss: 0.4064, Accuracy: 0.7951
  Batch 900: Loss: 0.3900, Accuracy: 0.8064
  Batch 1000: Loss: 0.3748, Accuracy: 0.8165
  Batch 1100: Loss: 0.3649, Accuracy: 0.8244
  Batch 1200: Loss: 0.3563, Accuracy: 0.8308
  Batch 1300: Loss: 0.3490, Accuracy: 0.8359
  Batch 1400: Loss: 0.3415, Accuracy: 0.8404
  Batch 1500: Loss: 0.3358, Accuracy: 0.8442
  Batch 1600: Loss: 0.3314, Accuracy: 0.8471
  Batch 1700: Loss: 0.3257, Accuracy: 0.8506
  Batch 1800: Loss: 0.3206, Accuracy: 0.8538
  Batch 1900: Loss: 0.3158, Accuracy: 0.8562
  Batch 2000: Loss: 0.3115, Accuracy: 0.8589
  Batch 2100: Loss: 0.3087, Accuracy: 0.8610
  Batch 2200: Loss: 0.3046, Accuracy: 0.8633
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 1/5
  Train Loss: 0.2709, Accuracy: 0.8832, Time: 4023.52s
  Eval Loss: 0.2108, Accuracy: 0.9278, Time: 26.22s


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.1758, Accuracy: 0.9263
  Batch 200: Loss: 0.1838, Accuracy: 0.9234
  Batch 300: Loss: 0.1911, Accuracy: 0.9187
  Batch 400: Loss: 0.1913, Accuracy: 0.9206
  Batch 500: Loss: 0.1957, Accuracy: 0.9186
  Batch 600: Loss: 0.1962, Accuracy: 0.9183
  Batch 700: Loss: 0.1988, Accuracy: 0.9176
  Batch 800: Loss: 0.1986, Accuracy: 0.9177
  Batch 900: Loss: 0.1985, Accuracy: 0.9192
  Batch 1000: Loss: 0.1994, Accuracy: 0.9194
  Batch 1100: Loss: 0.1975, Accuracy: 0.9203
  Batch 1200: Loss: 0.1974, Accuracy: 0.9199
  Batch 1300: Loss: 0.1972, Accuracy: 0.9205
  Batch 1400: Loss: 0.1981, Accuracy: 0.9207
  Batch 1500: Loss: 0.1990, Accuracy: 0.9206
  Batch 1600: Loss: 0.1984, Accuracy: 0.9210
  Batch 1700: Loss: 0.1990, Accuracy: 0.9207
  Batch 1800: Loss: 0.1988, Accuracy: 0.9209
  Batch 1900: Loss: 0.1987, Accuracy: 0.9211
  Batch 2000: Loss: 0.1981, Accuracy: 0.9216
  Batch 2100: Loss: 0.1969, Accuracy: 0.9221
  Batch 2200: Loss: 0.1973, Accuracy: 0.9220
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 2/5
  Train Loss: 0.1959, Accuracy: 0.9226, Time: 4037.13s
  Eval Loss: 0.1972, Accuracy: 0.9358, Time: 26.20s


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.1769, Accuracy: 0.9263
  Batch 200: Loss: 0.1726, Accuracy: 0.9322
  Batch 300: Loss: 0.1648, Accuracy: 0.9360
  Batch 400: Loss: 0.1695, Accuracy: 0.9353
  Batch 500: Loss: 0.1683, Accuracy: 0.9356
  Batch 600: Loss: 0.1682, Accuracy: 0.9354
  Batch 700: Loss: 0.1688, Accuracy: 0.9356
  Batch 800: Loss: 0.1685, Accuracy: 0.9343
  Batch 900: Loss: 0.1689, Accuracy: 0.9347
  Batch 1000: Loss: 0.1684, Accuracy: 0.9345
  Batch 1100: Loss: 0.1684, Accuracy: 0.9343
  Batch 1200: Loss: 0.1687, Accuracy: 0.9342
  Batch 1300: Loss: 0.1681, Accuracy: 0.9346
  Batch 1400: Loss: 0.1678, Accuracy: 0.9346
  Batch 1500: Loss: 0.1681, Accuracy: 0.9349
  Batch 1600: Loss: 0.1699, Accuracy: 0.9342
  Batch 1700: Loss: 0.1710, Accuracy: 0.9343
  Batch 1800: Loss: 0.1687, Accuracy: 0.9352
  Batch 1900: Loss: 0.1701, Accuracy: 0.9346
  Batch 2000: Loss: 0.1697, Accuracy: 0.9346
  Batch 2100: Loss: 0.1705, Accuracy: 0.9342
  Batch 2200: Loss: 0.1705, Accuracy: 0.9342
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 3/5
  Train Loss: 0.1694, Accuracy: 0.9352, Time: 4033.99s
  Eval Loss: 0.2074, Accuracy: 0.9300, Time: 26.23s


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.1506, Accuracy: 0.9425
  Batch 200: Loss: 0.1473, Accuracy: 0.9450
  Batch 300: Loss: 0.1502, Accuracy: 0.9456
  Batch 400: Loss: 0.1500, Accuracy: 0.9447
  Batch 500: Loss: 0.1488, Accuracy: 0.9447
  Batch 600: Loss: 0.1467, Accuracy: 0.9454
  Batch 700: Loss: 0.1467, Accuracy: 0.9452
  Batch 800: Loss: 0.1466, Accuracy: 0.9456
  Batch 900: Loss: 0.1453, Accuracy: 0.9455
  Batch 1000: Loss: 0.1443, Accuracy: 0.9453
  Batch 1100: Loss: 0.1457, Accuracy: 0.9448
  Batch 1200: Loss: 0.1464, Accuracy: 0.9445
  Batch 1300: Loss: 0.1477, Accuracy: 0.9437
  Batch 1400: Loss: 0.1478, Accuracy: 0.9441
  Batch 1500: Loss: 0.1477, Accuracy: 0.9440
  Batch 1600: Loss: 0.1478, Accuracy: 0.9439
  Batch 1700: Loss: 0.1476, Accuracy: 0.9439
  Batch 1800: Loss: 0.1475, Accuracy: 0.9438
  Batch 1900: Loss: 0.1482, Accuracy: 0.9434
  Batch 2000: Loss: 0.1490, Accuracy: 0.9430
  Batch 2100: Loss: 0.1490, Accuracy: 0.9430
  Batch 2200: Loss: 0.1494, Accuracy: 0.9430
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 4/5
  Train Loss: 0.1511, Accuracy: 0.9426, Time: 4113.23s
  Eval Loss: 0.2243, Accuracy: 0.9381, Time: 26.88s


Training:   0%|          | 0/3789 [00:00<?, ?it/s]

  Batch 100: Loss: 0.1248, Accuracy: 0.9506
  Batch 200: Loss: 0.1290, Accuracy: 0.9525
  Batch 300: Loss: 0.1304, Accuracy: 0.9494
  Batch 400: Loss: 0.1356, Accuracy: 0.9473
  Batch 500: Loss: 0.1379, Accuracy: 0.9461
  Batch 600: Loss: 0.1369, Accuracy: 0.9472
  Batch 700: Loss: 0.1338, Accuracy: 0.9486
  Batch 800: Loss: 0.1330, Accuracy: 0.9488
  Batch 900: Loss: 0.1333, Accuracy: 0.9494
  Batch 1000: Loss: 0.1332, Accuracy: 0.9503
  Batch 1100: Loss: 0.1331, Accuracy: 0.9502
  Batch 1200: Loss: 0.1326, Accuracy: 0.9499
  Batch 1300: Loss: 0.1337, Accuracy: 0.9496
  Batch 1400: Loss: 0.1353, Accuracy: 0.9489
  Batch 1500: Loss: 0.1358, Accuracy: 0.9485
  Batch 1600: Loss: 0.1350, Accuracy: 0.9490
  Batch 1700: Loss: 0.1353, Accuracy: 0.9489
  Batch 1800: Loss: 0.1343, Accuracy: 0.9489
  Batch 1900: Loss: 0.1352, Accuracy: 0.9487
  Batch 2000: Loss: 0.1350, Accuracy: 0.9487
  Batch 2100: Loss: 0.1357, Accuracy: 0.9483
  Batch 2200: Loss: 0.1350, Accuracy: 0.9486
  Batch 2300: Loss:

Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch 5/5
  Train Loss: 0.1376, Accuracy: 0.9480, Time: 4154.43s
  Eval Loss: 0.2179, Accuracy: 0.9369, Time: 26.81s


In [12]:
print("\nPerformance Comparison:")
print_train_results("Full", full_results)
print_train_results("LoRA", lora_results)


Performance Comparison:
Full:
  Total training time: 26491.80s
  Total evaluation time: 128.54s
  Peak GPU memory usage: 2.02GB
  Final evaluation accuracy: 0.9427
LoRA:
  Total training time: 20362.31s
  Total evaluation time: 132.35s
  Peak GPU memory usage: 1.02GB
  Final evaluation accuracy: 0.9369


### Evaluate and Save to Disk

In [13]:
print("Evaluating full fine-tuned model:")
full_loss, full_accuracy, full_eval_time = evaluate(full_model, test_dataloader, device)
print(f"Full model - Accuracy: {full_accuracy:.4f}, Loss: {full_loss:.4f}, Eval time: {full_eval_time:.2f}s")

print("\nEvaluating LoRA model:")
lora_loss, lora_accuracy, lora_eval_time = evaluate(lora_model, test_dataloader, device)
print(f"LoRA model - Accuracy: {lora_accuracy:.4f}, Loss: {lora_loss:.4f}, Eval time: {lora_eval_time:.2f}s")

Evaluating full fine-tuned model:


Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

Full model - Accuracy: 0.9538, Loss: 0.1755, Eval time: 197.12s

Evaluating LoRA model:


Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

LoRA model - Accuracy: 0.9468, Loss: 0.1553, Eval time: 205.08s


In [14]:
import os

full_save_path = "./sst2-full_finetuned_model.pt"
torch.save(full_model.state_dict(), full_save_path)
full_model.save_pretrained("sst2-full_finetuned_model")
full_model_size = os.path.getsize(full_save_path) / (1024 * 1024)  # Size in MB
print(f"\nFull fine-tuned model saved to {full_save_path}")
print(f"Full model size: {full_model_size:.2f} MB")

lora_save_path = "./sst2-lora_and_classifier.pt"
lora_state_dict = {name: param for name, param in lora_model.named_parameters() 
                   if 'lora_A' in name or 'lora_B' in name or 'classifier' in name}
torch.save(lora_state_dict, lora_save_path)
lora_model_size = os.path.getsize(lora_save_path) / (1024 * 1024)  # Size in MB
print(f"LoRA layers and classifier saved to {lora_save_path}")
print(f"LoRA model size: {lora_model_size:.2f} MB")

print(f"\nSize reduction: {full_model_size / lora_model_size:.2f}x")


Full fine-tuned model saved to ./sst2-full_finetuned_model.pt
Full model size: 475.57 MB
LoRA layers and classifier saved to ./sst2-lora_and_classifier.pt
LoRA model size: 3.40 MB

Size reduction: 139.75x


### Load Model from Disk

In [15]:
# Load full fine-tuned model
loaded_full_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
loaded_full_model.load_state_dict(torch.load(full_save_path))
loaded_full_model.to(device)

# Evaluate loaded full model
full_loss, full_accuracy, full_eval_time = evaluate(loaded_full_model, test_dataloader, device)
print(f"Loaded full model - Accuracy: {full_accuracy:.4f}, Loss: {full_loss:.4f}, Eval time: {full_eval_time:.2f}s")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

Loaded full model - Accuracy: 0.9538, Loss: 0.1755, Eval time: 191.10s


In [17]:
# Load LoRA model
base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
loaded_lora_model = apply_lora(base_model, rank=8, alpha=8)
loaded_lora_model.load_state_dict(torch.load(lora_save_path), strict=False)
loaded_lora_model.to(device)

# Evaluate loaded LoRA model
lora_loss, lora_accuracy, lora_eval_time = evaluate(loaded_lora_model, test_dataloader, device)
print(f"Loaded LoRA model - Accuracy: {lora_accuracy:.4f}, Loss: {lora_loss:.4f}, Eval time: {lora_eval_time:.2f}s")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

Loaded LoRA model - Accuracy: 0.9468, Loss: 0.1553, Eval time: 201.92s


In [18]:
# Demonstrate that LoRA is there and that without it the model performs poorly

print("\nVerifying LoRA layers:")
for name, module in loaded_lora_model.named_modules():
    if isinstance(module, LoRALayer):
        print(f"LoRA layer found: {name}")
        print(f"lora_A shape: {module.lora_A.shape}")
        print(f"lora_B shape: {module.lora_B.shape}")
        print()

print("Evaluating model with zeroed LoRA weights:")
for name, param in loaded_lora_model.named_parameters():
    if 'lora_A' in name or 'lora_B' in name:
        param.data.zero_()

zero_loss, zero_accuracy, zero_eval_time = evaluate(loaded_lora_model, test_dataloader, device)
print(f"Zeroed LoRA model - Accuracy: {zero_accuracy:.4f}, Loss: {zero_loss:.4f}, Eval time: {zero_eval_time:.2f}s")


Verifying LoRA layers:
LoRA layer found: roberta.encoder.layer.0.attention.self.query
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.0.attention.self.value
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.1.attention.self.query
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.1.attention.self.value
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.2.attention.self.query
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.2.attention.self.value
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.layer.3.attention.self.query
lora_A shape: torch.Size([768, 8])
lora_B shape: torch.Size([8, 768])

LoRA layer found: roberta.encoder.laye

Evaluating:   0%|          | 0/421 [00:00<?, ?it/s]

Zeroed LoRA model - Accuracy: 0.5543, Loss: 0.6874, Eval time: 200.53s


In [3]:
import gc
import torch

torch.cuda.empty_cache()
gc.collect()

0