In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
!pip install -Uq transformers datasets torch numpy

In [3]:
import torch, transformers, datasets
import numpy as np
torch.__version__, transformers.__version__, datasets.__version__, np.__version__

('2.1.2', '4.36.2', '2.16.1', '1.26.3')

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, set_seed
from torch import nn
from torch.nn import functional as F
import math

hf_ckp = 'roberta-base'
set_seed(100)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {f"accuracy": (predictions == labels).mean()}

def count_parameters(m, verbose=True):
    total_count = 0
    learnable_count = 0
    if verbose:
        print("Parameters (name, tunable, count):")

    output_width = max([len(n) for n, _ in m.named_parameters()])
    for n, p in m.named_parameters():
        count = p.data.numel()
        if verbose:
            print(f" {n:{output_width}} {p.requires_grad:5b} {count:>11d}")
        total_count += count
        if p.requires_grad:
            learnable_count += count

    print(
        f"Total parameters: {total_count:,}, "
        f"thereof learnable: {learnable_count:,} "
        f"({learnable_count/total_count*100.:5.4f}%)"
    )

    return total_count, learnable_count

def adapt_model(model):
    
    # Minimalized example in place of the original LoRA-from-Scratch 
    # implementation from the article: 
    # https://towardsdatascience.com/dive-into-lora-adapters-38f4da488ede
    class MinimalLoRAAdapter(nn.Module): 
        def __init__(self, 
                     adaptee):
            super().__init__()

            self.adaptee = adaptee

            self.orig_forward = adaptee.forward
            adaptee.forward = self.forward
            
            r = 1
            adaptee.lora_A = nn.Parameter(
                torch.randn(adaptee.in_features, r) / math.sqrt(adaptee.in_features)
            )
            adaptee.lora_B = nn.Parameter(torch.zeros(r, adaptee.out_features))

        def forward(self, x, *args, **kwargs):
            return (
                self.orig_forward(x, *args, **kwargs)
                + F.dropout(x, 0.1) @ self.adaptee.lora_A @ self.adaptee.lora_B
            )
   
    # freeze all layers, incl. embeddings, except for the classifier
    for m in model.roberta.modules():    
        m.requires_grad_(False)

    # Adapt linear modules in transformer layers
    for m in model.roberta.encoder.modules():    
        if isinstance(m, nn.Linear):
            MinimalLoRAAdapter(m)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [5]:
%%time

tokenizer = AutoTokenizer.from_pretrained(hf_ckp)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

datasets.logging.disable_progress_bar()
dataset = datasets.load_dataset("glue", "sst2")
train = dataset["train"]
valid = dataset["validation"]

def preprocess_function(examples):
        return tokenizer(examples['sentence'], padding=False, truncation=True)

tokenized_train = train.map(preprocess_function, batched=False)
tokenized_valid = valid.map(preprocess_function, batched=False)

def train(cp_enabled, model):
     
    model = AutoModelForSequenceClassification.from_pretrained(hf_ckp, num_labels=2)   

    
    training_args = TrainingArguments(
        gradient_checkpointing=cp_enabled,
        output_dir="out",    
        per_device_train_batch_size=10,
        learning_rate=3e-5,
        save_steps=10_000,
        eval_steps=   250,
        max_steps = 1_500,
        evaluation_strategy="steps",
        save_strategy="steps",
        save_total_limit=1,
        disable_tqdm=True,
        metric_for_best_model='eval_accuracy',
        report_to="none", # Disable wandb, tensorboard
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )
    print(f'{model.is_gradient_checkpointing=}')
    total, learnable = count_parameters(model, verbose=False)
    
    adapt_model(model)
    if cp_enabled:
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant":False})


    trainer.train()
    trainer.evaluate()
    del(model) # essential!


print('\n---- without gradient checkpointing ----\n')
train(False, None)

print('\n---- with gradient checkpointing ----\n')
train(True, None)


---- without gradient checkpointing ----



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


model.is_gradient_checkpointing=False
Total parameters: 124,647,170, thereof learnable: 124,647,170 (100.0000%)
{'eval_loss': 0.6934486627578735, 'eval_accuracy': 0.5091743119266054, 'eval_runtime': 15.9581, 'eval_samples_per_second': 54.643, 'eval_steps_per_second': 6.83, 'epoch': 0.04}
{'loss': 0.6809, 'learning_rate': 1.9999999999999998e-05, 'epoch': 0.07}
{'eval_loss': 0.6518974900245667, 'eval_accuracy': 0.536697247706422, 'eval_runtime': 4.8602, 'eval_samples_per_second': 179.415, 'eval_steps_per_second': 22.427, 'epoch': 0.07}
{'eval_loss': 0.2987501919269562, 'eval_accuracy': 0.8922018348623854, 'eval_runtime': 4.71, 'eval_samples_per_second': 185.138, 'eval_steps_per_second': 23.142, 'epoch': 0.11}
{'loss': 0.4068, 'learning_rate': 9.999999999999999e-06, 'epoch': 0.15}
{'eval_loss': 0.2835868299007416, 'eval_accuracy': 0.8956422018348624, 'eval_runtime': 4.8738, 'eval_samples_per_second': 178.915, 'eval_steps_per_second': 22.364, 'epoch': 0.15}
{'eval_loss': 0.2968225777149200

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.is_gradient_checkpointing=False
Total parameters: 124,647,170, thereof learnable: 124,647,170 (100.0000%)




{'eval_loss': 0.6929524540901184, 'eval_accuracy': 0.5091743119266054, 'eval_runtime': 4.5995, 'eval_samples_per_second': 189.586, 'eval_steps_per_second': 23.698, 'epoch': 0.04}
{'loss': 0.6861, 'learning_rate': 1.9999999999999998e-05, 'epoch': 0.07}
{'eval_loss': 0.6892897486686707, 'eval_accuracy': 0.5091743119266054, 'eval_runtime': 4.6894, 'eval_samples_per_second': 185.951, 'eval_steps_per_second': 23.244, 'epoch': 0.07}
{'eval_loss': 0.6857690215110779, 'eval_accuracy': 0.5091743119266054, 'eval_runtime': 4.6746, 'eval_samples_per_second': 186.54, 'eval_steps_per_second': 23.317, 'epoch': 0.11}
{'loss': 0.6789, 'learning_rate': 9.999999999999999e-06, 'epoch': 0.15}
{'eval_loss': 0.6886543035507202, 'eval_accuracy': 0.5091743119266054, 'eval_runtime': 4.6602, 'eval_samples_per_second': 187.117, 'eval_steps_per_second': 23.39, 'epoch': 0.15}
{'eval_loss': 0.6868622303009033, 'eval_accuracy': 0.5091743119266054, 'eval_runtime': 4.7011, 'eval_samples_per_second': 185.488, 'eval_step