<a href="https://colab.research.google.com/github/philipstevens/notebooks/blob/master/Lora_ablation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers datasets huggingface_hub fsspec --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.3/489.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvi

In [None]:
import os
# Send signal 9 to this kernel's own process so the runtime is torn down and
# restarted; presumably this is so the packages upgraded by the install cell
# are imported fresh afterwards. Nothing placed after this line in the cell runs.
os.kill(os.getpid(), 9)  # Force Colab to restart

In [None]:
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
import torch
import numpy as np
from sklearn.metrics import accuracy_score

# --- Config ---
model_name = "distilbert-base-uncased"
dataset_name = "glue"
task_name = "sst2"
lora_rank = 4
# Derived from lora_rank so the results directory can never disagree with the
# rank that was actually trained.
output_dir = f"./results/distilbert_sst2_r{lora_rank}"

# --- Load dataset ---
# Dataset configuration and tokenizer are both selected by the config constants above.
dataset = load_dataset(path=dataset_name, name=task_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example, max_length=128):
    """Tokenize the ``sentence`` field of an example (or batch of examples).

    Args:
        example: Mapping with a ``"sentence"`` entry; with ``batched=True`` in
            ``Dataset.map`` this is a batch whose ``"sentence"`` is a list.
        max_length: Fixed sequence length every example is truncated and
            padded to. Defaults to 128, the value previously hard-coded.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these inputs.
    """
    return tokenizer(
        example["sentence"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize every split in batches, then expose only the listed columns as torch tensors.
encoded = dataset.map(tokenize, batched=True)
encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Training uses the "train" split; evaluation uses "validation".
train_dataset, eval_dataset = encoded["train"], encoded["validation"]

# --- Load model ---
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# --- Freeze base model ---
# Module-level switch; same effect as clearing requires_grad on each backbone parameter.
model.base_model.requires_grad_(False)

# --- Apply LoRA ---
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=lora_rank,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_lin", "v_lin"],  # for DistilBERT, adjust as needed
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# --- Metrics ---
def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction pair.

    Args:
        eval_pred: Two-item iterable ``(predictions, labels)``. ``predictions``
            is either the logits array or a tuple whose first element is the
            logits (extra model outputs after it are ignored).

    Returns:
        ``{"accuracy": float}`` - the fraction of argmax predictions that equal
        the labels.

    Raises:
        ValueError: If predictions and labels do not have the same shape.
    """
    logits, labels = eval_pred
    # A tuple cannot be argmax-ed directly; the logits are its first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(np.asarray(logits), axis=-1)
    labels = np.asarray(labels)
    # Refuse to broadcast: a silent (n, 1) vs (n,) comparison would give a wrong score.
    if preds.shape != labels.shape:
        raise ValueError(
            f"predictions {preds.shape} and labels {labels.shape} have different shapes"
        )
    return {"accuracy": float(np.mean(preds == labels))}

# --- Training ---
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Trainer warns that it cannot infer label names for the PEFT-wrapped model
    # and would fall back to an empty list; naming the label input explicitly
    # keeps the labels available to compute_metrics during evaluation.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# trainer.train()
#trainer.evaluate()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 665,858 || all params: 67,620,868 || trainable%: 0.9847


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0.
gpu_available = torch.cuda.is_available()
print(gpu_available)  # Should be True
if gpu_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU")


False
No GPU


In [None]:
# Collect the dotted name of every submodule of the wrapped model, then print the list.
module_names = []
for module_name, _module in model.named_modules():
    module_names.append(module_name)
print(module_names)

['', 'base_model', 'base_model.model', 'base_model.model.distilbert', 'base_model.model.distilbert.embeddings', 'base_model.model.distilbert.embeddings.word_embeddings', 'base_model.model.distilbert.embeddings.position_embeddings', 'base_model.model.distilbert.embeddings.LayerNorm', 'base_model.model.distilbert.embeddings.dropout', 'base_model.model.distilbert.transformer', 'base_model.model.distilbert.transformer.layer', 'base_model.model.distilbert.transformer.layer.0', 'base_model.model.distilbert.transformer.layer.0.attention', 'base_model.model.distilbert.transformer.layer.0.attention.dropout', 'base_model.model.distilbert.transformer.layer.0.attention.q_lin', 'base_model.model.distilbert.transformer.layer.0.attention.q_lin.base_layer', 'base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_dropout', 'base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_dropout.default', 'base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_A', 'base_

In [None]:
from transformers import AutoConfig
# Load only the configuration of the checkpoint and show its `model_type` field.
checkpoint_id = "distilbert-base-uncased"
config = AutoConfig.from_pretrained(checkpoint_id)
print(config.model_type)

distilbert


In [None]:
from huggingface_hub import HfApi
from transformers import AutoModel
from transformers import AutoConfig

In [None]:
import importlib

def get_model_mro_from_architecture(architecture_name):
    """Return the method-resolution order of a Transformers architecture class.

    The class is looked up in two steps:

    1. In the top-level ``transformers`` namespace. This also resolves names
       the path guess below cannot handle, such as names without ``"For"``
       (``GPT2LMHeadModel``) or models whose package name contains an
       underscore (``XLMRobertaForMaskedLM`` lives in ``xlm_roberta``).
    2. In ``transformers.models.<root>.modeling_<root>``, where ``<root>`` is
       the lower-cased part of the name before ``"For"``.

    Args:
        architecture_name: Class name, e.g. ``"BertForSequenceClassification"``.

    Returns:
        The class's ``__mro__`` tuple, or ``None`` (after printing the reason)
        when the class cannot be resolved.
    """
    try:
        try:
            model_class = getattr(
                importlib.import_module("transformers"), architecture_name, None
            )
        except Exception:
            # Top-level lookup is only a shortcut; fall through to the path guess,
            # which reports the failure if it cannot resolve the name either.
            model_class = None

        if model_class is None:
            model_name_root = architecture_name.split("For")[0].lower()
            module = importlib.import_module(
                f"transformers.models.{model_name_root}.modeling_{model_name_root}"
            )
            model_class = getattr(module, architecture_name)
        return model_class.__mro__
    except Exception as e:
        print(f"Could not resolve {architecture_name}: {e}")
        return None

# Example: show the inheritance chain of BERT's sequence-classification class.
bert_seq_cls_mro = get_model_mro_from_architecture("BertForSequenceClassification")
print(bert_seq_cls_mro)


(<class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>, <class 'transformers.models.bert.modeling_bert.BertPreTrainedModel'>, <class 'transformers.modeling_utils.PreTrainedModel'>, <class 'torch.nn.modules.module.Module'>, <class 'transformers.modeling_utils.ModuleUtilsMixin'>, <class 'transformers.generation.utils.GenerationMixin'>, <class 'transformers.utils.hub.PushToHubMixin'>, <class 'transformers.integrations.peft.PeftAdapterMixin'>, <class 'object'>)


In [None]:
# Choose one base model, standardize/control all variables, run ablations, evaluate and produce plots
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelForQuestionAnswering,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer
)
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import pandas as pd

# Checkpoint shared by every task so that only the head and the LoRA rank vary.
BASE_MODEL = "bert-base-uncased"
# LoRA ranks swept for each task.
RANKS = [1, 2, 4, 8, 16, 32]

# One entry per task:
#   model_class - Auto* class used to load BASE_MODEL with the task head
#   task_type   - PEFT task type handed to LoraConfig
#   dataset     - positional arguments for load_dataset (path, config name)
#   tokenize    - callable(batch, tokenizer) producing the model inputs
#   metric_key  - key read from the Trainer's evaluation result
TASKS = {
    "sequence_classification": {
        "model_class": AutoModelForSequenceClassification,
        "task_type": TaskType.SEQ_CLS,
        "dataset": ("glue", "sst2"),
        "tokenize": lambda ex, tok: tok(ex["sentence"], truncation=True, padding="max_length", max_length=128),
        "metric_key": "eval_accuracy",
    },
    "token_classification": {
        "model_class": AutoModelForTokenClassification,
        "task_type": TaskType.TOKEN_CLS,
        "dataset": ("conll2003", None),
        "tokenize": lambda ex, tok: tok(ex["tokens"], is_split_into_words=True, truncation=True, padding="max_length", max_length=128),
        "metric_key": "eval_f1",
    },
    "question_answering": {
        "model_class": AutoModelForQuestionAnswering,
        "task_type": TaskType.QUESTION_ANS,
        "dataset": ("squad", None),
        "tokenize": lambda ex, tok: tok(ex["question"], ex["context"], truncation=True, padding="max_length", max_length=128),
        "metric_key": "eval_exact_match",
    },
    "masked_lm": {
        "model_class": AutoModelForMaskedLM,
        # PEFT's TaskType has no masked-LM member, and tagging an MLM head as
        # CAUSAL_LM wraps it in the causal-LM adapter class. Leaving the task
        # type unset makes get_peft_model use the generic PeftModel wrapper.
        "task_type": None,
        "dataset": ("wikitext", "wikitext-2-raw-v1"),
        "tokenize": lambda ex, tok: tok(ex["text"], truncation=True, padding="max_length", max_length=128),
        "metric_key": "eval_loss",
    }
}

# Imported here so this cell does not depend on names left behind by earlier cells.
import numpy as np
from transformers import set_seed

SUBSET_SIZE = 200  # rows taken from each split; deliberately tiny so the sweep is quick
SEED = 42          # re-applied before every run so runs differ only in the LoRA rank

# Label inputs per PEFT task type. They are passed to TrainingArguments
# explicitly because Trainer cannot infer them for a PEFT-wrapped model and
# would otherwise fall back to an empty list.
LABEL_NAMES = {TaskType.QUESTION_ANS: ["start_positions", "end_positions"]}


def _argmax_accuracy(eval_pred):
    """Return {"accuracy": ...} for argmax predictions; Trainer reports it as eval_accuracy."""
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(preds == labels))}


# Metric functions keyed by the `metric_key` a task reports. Keys without an
# entry get no compute_metrics, so only the Trainer's built-in values
# (e.g. eval_loss) are available for them.
METRIC_FNS = {"eval_accuracy": _argmax_accuracy}

# NOTE(review): the token-classification, QA and MLM tokenize lambdas in TASKS
# only tokenize the inputs; they do not appear to build label columns (aligned
# tags, answer spans, masked tokens). Confirm those tasks can produce a loss
# and their metric before relying on their rows in `results`.

results = []

# The tokenizer depends only on BASE_MODEL, so it is loaded once for all tasks.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

for task, cfg in TASKS.items():
    print(f"\nRunning task: {task}")
    model_class = cfg["model_class"]
    task_type = cfg["task_type"]
    dataset = load_dataset(*cfg["dataset"])
    tokenize_fn = lambda ex: cfg["tokenize"](ex, tokenizer)

    # Use the validation split when present, otherwise fall back to test; clamp
    # the subset so a split shorter than SUBSET_SIZE does not raise.
    train_split = dataset["train"]
    eval_split = dataset["validation"] if "validation" in dataset else dataset["test"]
    train_data = train_split.select(range(min(SUBSET_SIZE, len(train_split))))
    eval_data = eval_split.select(range(min(SUBSET_SIZE, len(eval_split))))
    train_data = train_data.map(tokenize_fn, batched=True)
    eval_data = eval_data.map(tokenize_fn, batched=True)

    for r in RANKS:
        print(f"  > Training with LoRA rank r={r}")
        # Seed before the model is built so the freshly initialised head and
        # adapter weights start from the same random state for every rank.
        set_seed(SEED)
        model = model_class.from_pretrained(BASE_MODEL)
        lora_config = LoraConfig(
            r=r,
            lora_alpha=16,
            lora_dropout=0.0,
            bias="none",
            task_type=task_type
        )
        model = get_peft_model(model, lora_config)

        training_args = TrainingArguments(
            output_dir=f"./results/{task}_r{r}",
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=1,
            eval_strategy="epoch",
            logging_strategy="epoch",
            save_strategy="no",
            learning_rate=2e-5,
            report_to="none",
            seed=SEED,
            label_names=LABEL_NAMES.get(task_type, ["labels"]),
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=eval_data,
            # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`.
            processing_class=tokenizer,
            compute_metrics=METRIC_FNS.get(cfg["metric_key"]),
        )

        trainer.train()
        eval_result = trainer.evaluate()

        results.append({
            "task": task,
            "r": r,
            "metric": cfg["metric_key"].replace("eval_", ""),
            # Stays None when the evaluation result does not contain the metric.
            "value": eval_result.get(cfg["metric_key"], None)
        })

# Convert results to DataFrame (explicit columns keep the frame well-formed
# even when the sweep produced no rows).
df = pd.DataFrame(results, columns=["task", "r", "metric", "value"])

# Metrics that were never computed are stored as None. Coerce to numbers and
# drop tasks without a single value so plotting does not fail on non-numeric data.
rank_by_task = (
    df.assign(value=pd.to_numeric(df["value"], errors="coerce"))
    .pivot(index="r", columns="task", values="value")
    .dropna(axis="columns", how="all")
)

# One panel per task: the tasks report different metrics (accuracy, F1, exact
# match, loss), so a shared y-axis would not be comparable.
if not rank_by_task.empty:
    rank_by_task.plot(title="LoRA Rank Ablation", subplots=True, marker="o")
df



Running task: sequence_classification


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  > Training with LoRA rank r=1


  trainer = Trainer(


{'loss': 0.7022, 'grad_norm': 2.8404700756073, 'learning_rate': 8.000000000000001e-07, 'epoch': 1.0}
{'eval_loss': 0.6963599920272827, 'eval_runtime': 76.8593, 'eval_samples_per_second': 2.602, 'eval_steps_per_second': 0.325, 'epoch': 1.0}
{'train_runtime': 253.668, 'train_samples_per_second': 0.788, 'train_steps_per_second': 0.099, 'train_loss': 0.7022010040283203, 'epoch': 1.0}
{'eval_loss': 0.6963599920272827, 'eval_runtime': 75.4676, 'eval_samples_per_second': 2.65, 'eval_steps_per_second': 0.331, 'epoch': 1.0}
  > Training with LoRA rank r=2


KeyboardInterrupt: 

In [None]:
from huggingface_hub import HfApi
from transformers import AutoConfig
from transformers.utils import logging
import traceback

# Suppress warnings from transformers: only error-level log messages are shown
# from here on, for every later cell in this kernel session as well.
logging.set_verbosity_error()

# Hub client used below to fetch per-model metadata via model_info().
api = HfApi()

# Checkpoints to probe, grouped by the task they were picked to represent.
MODELS_BY_CATEGORY = {
    "Text Classification": [
        "bert-base-uncased",
        "textattack/bert-base-uncased-SST-2",
        "distilbert-base-uncased-finetuned-sst-2-english",
        "cardiffnlp/twitter-roberta-base-sentiment",
    ],
    "Question Answering": [
        "deepset/roberta-base-squad2",
        "distilbert-base-cased-distilled-squad",
    ],
    "Token Classification": [
        "dslim/bert-base-NER",
        "Davlan/bert-base-multilingual-cased-ner-hrl",
    ],
    "Causal Language Modeling": [
        "gpt2",
        "tiiuae/falcon-7b",
        #"mistralai/Mistral-7B-v0.1",
    ],
    "Seq2Seq": [
        "facebook/bart-large-cnn",
        "t5-small",
        "Helsinki-NLP/opus-mt-en-de",
    ],
    "Masked Language Modeling": [
        "roberta-base",
    ],
    "Edge cases": [
        "allenai/longformer-base-4096",
        "google/mt5-small",
        "Salesforce/codegen-350M-multi",
    ],
}

# Flat list in category order; this is what the probing loop iterates over.
model_names = [
    checkpoint
    for checkpoints in MODELS_BY_CATEGORY.values()
    for checkpoint in checkpoints
]

# For each checkpoint, print the Hub pipeline tag next to the model type and
# architectures declared in its config. A failure is reported with its
# traceback and the loop moves on to the next checkpoint.
separator = "=" * 80
for model_name in model_names:
    print(separator)
    print(f"Model: {model_name}")
    try:
        info = api.model_info(model_name)
        config = AutoConfig.from_pretrained(model_name)
        print(f"Pipeline tag:      {info.pipeline_tag}")
        print(f"Model type from config: {config.model_type}")
        print(f"Model architecture: {config.architectures}")
    except Exception as e:
        print(f"Error: {e}")
        traceback.print_exc()


Model: bert-base-uncased
Pipeline tag:      fill-mask
Model type from config: bert
Model architecture: ['BertForMaskedLM']
Model: textattack/bert-base-uncased-SST-2
Pipeline tag:      text-classification
Model type from config: bert
Model architecture: ['BertForSequenceClassification']
Model: distilbert-base-uncased-finetuned-sst-2-english
Pipeline tag:      text-classification
Model type from config: distilbert
Model architecture: ['DistilBertForSequenceClassification']
Model: cardiffnlp/twitter-roberta-base-sentiment
Pipeline tag:      text-classification
Model type from config: roberta
Model architecture: ['RobertaForSequenceClassification']
Model: deepset/roberta-base-squad2
Pipeline tag:      question-answering
Model type from config: roberta
Model architecture: ['RobertaForQuestionAnswering']
Model: distilbert-base-cased-distilled-squad
Pipeline tag:      question-answering
Model type from config: distilbert
Model architecture: ['DistilBertForQuestionAnswering']
Model: dslim/bert

In [None]:
from transformers import CONFIG_MAPPING

# Every model-type key registered in CONFIG_MAPPING, in alphabetical order.
registered_model_types = sorted(CONFIG_MAPPING.keys())
print(registered_model_types)


['albert', 'align', 'altclip', 'aria', 'aria_text', 'audio-spectrogram-transformer', 'autoformer', 'aya_vision', 'bamba', 'bark', 'bart', 'beit', 'bert', 'bert-generation', 'big_bird', 'bigbird_pegasus', 'biogpt', 'bit', 'bitnet', 'blenderbot', 'blenderbot-small', 'blip', 'blip-2', 'blip_2_qformer', 'bloom', 'bridgetower', 'bros', 'camembert', 'canine', 'chameleon', 'chinese_clip', 'chinese_clip_vision_model', 'clap', 'clip', 'clip_text_model', 'clip_vision_model', 'clipseg', 'clvp', 'code_llama', 'codegen', 'cohere', 'cohere2', 'colpali', 'conditional_detr', 'convbert', 'convnext', 'convnextv2', 'cpmant', 'csm', 'ctrl', 'cvt', 'd_fine', 'dab-detr', 'dac', 'data2vec-audio', 'data2vec-text', 'data2vec-vision', 'dbrx', 'deberta', 'deberta-v2', 'decision_transformer', 'deepseek_v3', 'deformable_detr', 'deit', 'depth_anything', 'depth_pro', 'deta', 'detr', 'diffllama', 'dinat', 'dinov2', 'dinov2_with_registers', 'distilbert', 'donut-swin', 'dpr', 'dpt', 'efficientformer', 'efficientnet', '

In [None]:
from transformers.pipelines import SUPPORTED_TASKS
from transformers import AutoConfig
import pandas as pd

def extract_default_pt_model(task_info):
    """Return the default PyTorch checkpoint id declared for a pipeline task.

    Two layouts of ``task_info["default"]`` are understood:

    * flat:   ``{"model": {"pt": <entry>}}``
    * nested: ``{<tuple key>: {"model": {"pt": <entry>}}, ...}`` - the first
      tuple-keyed value that yields a checkpoint wins.

    ``<entry>`` may be a non-empty tuple whose first item is the checkpoint id,
    or a plain non-empty string holding the id itself.

    Args:
        task_info: One value of a pipeline task registry (a dict).

    Returns:
        The checkpoint id, or ``None`` when the input is not shaped as
        described above or declares no PyTorch default.
    """

    def _model_id(spec):
        # Pull the checkpoint id out of a {"model": {"pt": ...}} mapping.
        if not isinstance(spec, dict):
            return None
        model_spec = spec.get("model")
        if not isinstance(model_spec, dict):
            return None
        pt_entry = model_spec.get("pt")
        if isinstance(pt_entry, str):
            return pt_entry or None
        if isinstance(pt_entry, tuple) and pt_entry:
            return pt_entry[0]
        return None

    # Malformed input yields None through explicit shape checks rather than a
    # blanket exception handler, so genuine programming errors are not hidden.
    if not isinstance(task_info, dict):
        return None
    default = task_info.get("default")
    if not isinstance(default, dict):
        return None

    model_id = _model_id(default)
    if model_id is not None:
        return model_id

    for key, value in default.items():
        if isinstance(key, tuple):
            model_id = _model_id(value)
            if model_id is not None:
                return model_id
    return None

# Pipeline records bucketed by the "type" field of each SUPPORTED_TASKS entry.
grouped_by_type = {}

for pipeline_tag, task_info in SUPPORTED_TASKS.items():
    task_type = task_info.get("type", "unknown")
    model_classes = []
    for cls in task_info.get("pt", ()):
        model_classes.append(cls.__name__)
    pt_model_id = extract_default_pt_model(task_info)

    # Resolve the first architecture named in the default checkpoint's config;
    # any failure while loading that config is recorded in place of a name.
    try:
        config = AutoConfig.from_pretrained(pt_model_id) if pt_model_id else None
        if config and config.architectures:
            architecture = config.architectures[0]
        else:
            architecture = "Unknown"
    except Exception:
        architecture = "Error loading config"

    record = {
        "pipeline_tag": pipeline_tag,
        "auto_model_class": model_classes,
        "default_model": pt_model_id,
        "architecture": architecture
    }
    if task_type not in grouped_by_type:
        grouped_by_type[task_type] = []
    grouped_by_type[task_type].append(record)

# Flatten the per-type buckets into one record per pipeline, tagging each
# record with its type, then tabulate sorted by type and pipeline tag.
flat_data = [
    {"type": task_type, **task}
    for task_type, tasks in grouped_by_type.items()
    for task in tasks
]

df = pd.DataFrame(flat_data).sort_values(["type", "pipeline_tag"])
print(df.to_string(index=False))




      type                   pipeline_tag                                                  auto_model_class                                              default_model                           architecture
     audio           audio-classification                                 [AutoModelForAudioClassification]                             superb/wav2vec2-base-superb-ks      Wav2Vec2ForSequenceClassification
     image               depth-estimation                                     [AutoModelForDepthEstimation]                                            Intel/dpt-large                  DPTForDepthEstimation
     image           image-classification                                 [AutoModelForImageClassification]                                google/vit-base-patch16-224              ViTForImageClassification
     image       image-feature-extraction                                                       [AutoModel]                                google/vit-base-patch16-224          

In [None]:
# Same table rendered as Markdown, without the index column.
markdown_table = df.to_markdown(index=False)
print(markdown_table)

| type       | pipeline_tag                   | auto_model_class                                                      | default_model                                              | architecture                           |
|:-----------|:-------------------------------|:----------------------------------------------------------------------|:-----------------------------------------------------------|:---------------------------------------|
| audio      | audio-classification           | ['AutoModelForAudioClassification']                                   | superb/wav2vec2-base-superb-ks                             | Wav2Vec2ForSequenceClassification      |
| image      | depth-estimation               | ['AutoModelForDepthEstimation']                                       | Intel/dpt-large                                            | DPTForDepthEstimation                  |
| image      | image-classification           | ['AutoModelForImageClassification']                             

In [None]:
from transformers import (
    AutoModel, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoModelForQuestionAnswering,
    CONFIG_MAPPING
)

# Auto classes to probe, keyed by their own class name.
auto_model_classes = {
    auto_cls.__name__: auto_cls
    for auto_cls in (
        AutoModel,
        AutoModelForMaskedLM,
        AutoModelForCausalLM,
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
        AutoModelForTokenClassification,
        AutoModelForQuestionAnswering,
    )
}

# Limit to common examples of encoder, decoder, and encoder-decoder models
selected_model_types = ["bert", "roberta", "gpt2", "opt", "t5", "bart"]

# For every (model type, auto class) pair, build a model from the default
# config and print the concrete class the auto class resolved to.
for model_type in selected_model_types:
    try:
        config_class = CONFIG_MAPPING[model_type]
        config = config_class()
    except Exception as err:
        # Say which model type was skipped instead of dropping it silently.
        print(f"Skipping {model_type}: could not build a default config ({err})")
        continue

    for name, cls in auto_model_classes.items():
        try:
            model = cls.from_config(config)
            print(f"{model_type:<10} + {name:<35} → {model.__class__.__name__}")
        except Exception:
            # A combination that cannot be built is simply left out of the listing.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the cell
            # while models are being instantiated.
            continue


bert       + AutoModel                           → BertModel


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


bert       + AutoModelForMaskedLM                → BertForMaskedLM
bert       + AutoModelForCausalLM                → BertLMHeadModel
bert       + AutoModelForSequenceClassification  → BertForSequenceClassification
bert       + AutoModelForTokenClassification     → BertForTokenClassification
bert       + AutoModelForQuestionAnswering       → BertForQuestionAnswering
roberta    + AutoModel                           → RobertaModel


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


roberta    + AutoModelForMaskedLM                → RobertaForMaskedLM
roberta    + AutoModelForCausalLM                → RobertaForCausalLM
roberta    + AutoModelForSequenceClassification  → RobertaForSequenceClassification
roberta    + AutoModelForTokenClassification     → RobertaForTokenClassification
roberta    + AutoModelForQuestionAnswering       → RobertaForQuestionAnswering
gpt2       + AutoModel                           → GPT2Model
gpt2       + AutoModelForCausalLM                → GPT2LMHeadModel
gpt2       + AutoModelForSequenceClassification  → GPT2ForSequenceClassification
gpt2       + AutoModelForTokenClassification     → GPT2ForTokenClassification
gpt2       + AutoModelForQuestionAnswering       → GPT2ForQuestionAnswering
opt        + AutoModel                           → OPTModel
opt        + AutoModelForCausalLM                → OPTForCausalLM
opt        + AutoModelForSequenceClassification  → OPTForSequenceClassification
opt        + AutoModelForQuestionAnswering     

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification

model1 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model2 = AutoModelForTokenClassification.from_pretrained("bert-base-uncased")

# Compare the two backbones by parameter name. Pairing them positionally with
# zip() stops at the shorter iterator and never checks the names, so parameters
# that exist in only one backbone would be skipped without notice.
params1 = dict(model1.bert.named_parameters())
params2 = dict(model2.bert.named_parameters())

shared_names = [name for name in params1 if name in params2]
assert shared_names, "The two backbones have no parameter names in common"
for name in shared_names:
    assert torch.equal(params1[name], params2[name]), f"Backbone weights differ for {name}"

print(f"{len(shared_names)} shared backbone parameters are identical")
unshared_names = sorted(set(params1) ^ set(params2))
if unshared_names:
    print("Parameters present in only one backbone (not compared):", unshared_names)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
