# Model Architecture

> Implementation of BERT model variants for rank manipulation experiments

In [None]:
#| default_exp models.base_models

In [1]:
#| hide
# Colab-only setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder on the mounted Drive.
%cd /content/drive/MyDrive/rank-bert

Mounted at /content/drive
/content/drive/MyDrive/rank-bert


In [2]:
#| hide
!pip install -q nbdev datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
#| hide
from nbdev.showdoc import *

In [4]:
#| export
import torch
import torch.nn as nn
import numpy as np
from transformers import AutoConfig, AutoModelForSequenceClassification, BertConfig, BertForSequenceClassification
from fastai.text.all import *

## BERT Model Variants

We'll implement several BERT model variants for our experiments. According to our technical specification, we need:

1. **BERT-tiny**: A very small BERT model with 2 layers, 128 hidden size, and 2 attention heads
2. **BERT-mini**: 4 layers, 256 hidden size, 4 attention heads
3. **BERT-small**: 4 layers, 512 hidden size, 8 attention heads

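We'll use the HuggingFace Transformers library to initialize these models. Note that the three variants listed above are built from a `BertConfig`, so they start from freshly initialized weights rather than a downloaded checkpoint; any other model name is loaded with `from_pretrained`.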

In [5]:
#| export
# Number of output classes for each supported GLUE task, keyed by task name.
GLUE_NUM_LABELS = dict(sst2=2, mrpc=2, rte=2)

In [6]:
#| export
# Architecture hyperparameters for each supported BERT variant, keyed by model name.
# Each row below is (model name, hidden size, hidden layers, attention heads);
# the intermediate (feed-forward) size is four times the hidden size in every row.
BERT_CONFIGS = {
    variant: {
        'hidden_size': hidden,
        'num_hidden_layers': layers,
        'num_attention_heads': heads,
        'intermediate_size': 4 * hidden,
    }
    for variant, hidden, layers, heads in (
        ('prajjwal1/bert-tiny', 128, 2, 2),
        ('prajjwal1/bert-mini', 256, 4, 4),
        ('prajjwal1/bert-small', 512, 4, 8),
    )
}

In [7]:
#| export
def get_pretrained_model(model_name, task_name, num_labels=None, pretrained=False):
    """
    Build a BERT sequence-classification model for a specific task.

    Note on weights: when `model_name` is a key of `BERT_CONFIGS`, the model is
    by default constructed from a fresh `BertConfig`, which means its weights
    are newly initialized and NOT loaded from the published checkpoint of the
    same name. Pass `pretrained=True` to load that checkpoint instead. Any
    `model_name` that is not a key of `BERT_CONFIGS` is always loaded with
    `from_pretrained`.

    Args:
        model_name (str): A key of `BERT_CONFIGS` (e.g., 'prajjwal1/bert-tiny')
            or any other HuggingFace model name or path
        task_name (str): GLUE task name ('sst2', 'mrpc', 'rte'); names missing
            from `GLUE_NUM_LABELS` fall back to 2 labels
        num_labels (int, optional): Number of output labels; when omitted
            (or falsy) the task default is used
        pretrained (bool, optional): If True, load checkpoint weights with
            `from_pretrained` even when `model_name` is a key of
            `BERT_CONFIGS`. Defaults to False, which keeps the original
            build-from-config behaviour for those names. Has no effect for
            other model names.

    Returns:
        PreTrainedModel: Initialized model
    """
    # None (or any falsy value) means "use the task default"; tasks that are
    # not listed fall back to binary classification.
    if not num_labels:
        num_labels = GLUE_NUM_LABELS.get(task_name, 2)

    if model_name in BERT_CONFIGS and not pretrained:
        # Known architecture: instantiate from its configuration only.
        # No checkpoint is read on this path.
        config = BertConfig(
            **BERT_CONFIGS[model_name],
            num_labels=num_labels,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1
        )
        return BertForSequenceClassification(config)

    # Either an arbitrary model name/path, or an explicit request for the
    # checkpoint weights of a known variant.
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

In [8]:
#| export
def count_parameters(model):
    """
    Return how many scalar weights of a model are trainable.

    Args:
        model: PyTorch model

    Returns:
        Total element count over all parameters that have `requires_grad` set
    """
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters are excluded from the count
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [10]:
#| export
class BertWrapper(Module):
    """
    Thin adapter that exposes a BERT classifier as a fastai-compatible module.

    It unpacks a tokenizer-style batch, forwards the relevant entries to the
    wrapped model, and returns only the logits.

    Intended as the base for rank-constrained models, so that model behavior
    is easier to modify and monitor.
    """

    def __init__(self, model):
        """
        Store the model that all calls are delegated to.

        Args:
            model: Pretrained BERT model
        """
        # NOTE(review): there is no explicit super().__init__() call here;
        # presumably `Module` is fastai's base class, which takes care of
        # that automatically — confirm if the base class ever changes.
        self.model = model

    def forward(self, x):
        """
        Run the wrapped model on one batch and return its logits.

        Args:
            x: Dictionary of inputs from tokenizer, or a tuple whose first
               element is that dictionary

        Returns:
            The `logits` attribute of the wrapped model's output
        """
        # A tuple batch carries the input dictionary in its first slot
        batch = x[0] if isinstance(x, tuple) else x

        # These two entries are mandatory; a missing key raises KeyError
        model_kwargs = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask'],
        }

        # Segment ids are optional and are only passed on when present,
        # so the wrapped model never receives an explicit None for them
        segment_ids = batch.get('token_type_ids', None)
        if segment_ids is not None:
            model_kwargs['token_type_ids'] = segment_ids

        return self.model(**model_kwargs).logits

In [11]:
#| export
def get_wrapped_model(model_name, task_name, num_labels=None):
    """
    Build a model via `get_pretrained_model` and wrap it for fastai.

    Args:
        model_name (str): HuggingFace model name or path
        task_name (str): GLUE task name
        num_labels (int, optional): Number of output labels

    Returns:
        BertWrapper: Wrapped model
    """
    return BertWrapper(get_pretrained_model(model_name, task_name, num_labels))

## Example Usage

Here's how we can create different BERT model variants and check their parameter counts.

In [12]:
# Example: build every BERT variant for SST-2, keyed by variant name
models = {
    variant: get_pretrained_model(variant, 'sst2')
    for variant in ('prajjwal1/bert-tiny', 'prajjwal1/bert-mini', 'prajjwal1/bert-small')
}

# Report the trainable-parameter count of each variant, one per line
for name, model in models.items():
    print(f"{name}: {count_parameters(model):,} parameters")

prajjwal1/bert-tiny: 4,386,178 parameters
prajjwal1/bert-mini: 11,171,074 parameters
prajjwal1/bert-small: 28,764,674 parameters


In [13]:
# Example: wrap the smallest variant for fastai (MRPC task) and report its size
wrapped_model = get_wrapped_model(model_name='prajjwal1/bert-tiny', task_name='mrpc')
print(f"Wrapped model has {count_parameters(wrapped_model):,} parameters")

Wrapped model has 4,386,178 parameters


In [14]:
#| hide
# Run nbdev's export step (this notebook declares `default_exp models.base_models` at the top).
import nbdev; nbdev.nbdev_export()