# Fine-Tuning Models
Helper classes and functions that take a base model and the necessary hyperparameters and return a model set up for the desired fine-tuning method

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from transformers import AutoModel

### Classification Model
Class that takes a DistilBERT model and simply adds a classification head layer on top of it,
so that it is ready for classification problems

In [2]:
class ClassificationModel(nn.Module):
    """Encoder followed by a dropout + linear classification head.

    The head reads the hidden state at sequence position 0 (the [CLS] token
    for BERT-style encoders) and maps it to ``num_labels`` logits.

    Args:
        base_model: Encoder called as ``base_model(input_ids, attention_mask=...)``
            whose result exposes a ``last_hidden_state`` attribute that can be
            indexed as ``[:, 0, :]``.
        num_labels: Number of output classes. Defaults to 2, the value that
            used to be hard-coded.
        hidden_size: Width of the encoder's hidden states. When ``None`` it is
            read from ``base_model.config.hidden_size`` if that attribute
            exists, otherwise it falls back to 768 (the previously hard-coded
            value).
        dropout_rate: Dropout probability applied to the pooled embedding.
    """

    def __init__(self, base_model, num_labels=2, hidden_size=None, dropout_rate=0.1):
        super().__init__()

        if hidden_size is None:
            # Prefer the encoder's own configuration so that encoders of other
            # widths work without extra arguments; keep 768 as the fallback.
            config = getattr(base_model, "config", None)
            hidden_size = getattr(config, "hidden_size", None) or 768

        self.base_model = base_model
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attn_mask=None):
        """Return classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attn_mask: Optional mask forwarded to the encoder as
                ``attention_mask``.

        Returns:
            The output of the linear head applied to the position-0 hidden
            state of every sequence.
        """
        encoder_output = self.base_model(input_ids, attention_mask=attn_mask)
        cls_embedding = encoder_output.last_hidden_state[:, 0, :]  # [CLS] token representation
        return self.linear(self.dropout(cls_embedding))

## Full fine-tuning
In full fine-tuning all of the model's layers are set to trainable<br>

In [3]:
def get_full_classification_model(base_model):
    """Return a classifier in which every parameter is trainable.

    Args:
        base_model: Encoder to wrap. Its parameters are modified in place.

    Returns:
        A ``ClassificationModel`` wrapping ``base_model``.
    """
    # The other builders in this file freeze ``base_model`` in place. Turn
    # gradients back on so that an encoder which was frozen earlier is really
    # fine-tuned in full; for a freshly loaded model this is a no-op.
    for param in base_model.parameters():
        param.requires_grad = True

    # Simply add the classification head
    return ClassificationModel(base_model)

## Parameter efficient fine-tuning

Instead of training all of the model's parameters, we choose a subset of them, or add a small set of new ones, to be adjusted while the rest stay frozen. The goal is faster and computationally cheaper training.
There are many different approaches that can be used here.


## Classification head model
Freezes all parameters except the ones in the last classification layer

In [4]:
def get_classification_head_model(base_model):
    """Build a classifier whose encoder is frozen, leaving only the head to train.

    Args:
        base_model: Encoder to wrap. Every one of its parameters gets
            ``requires_grad = False``; the change is made in place.

    Returns:
        A ``ClassificationModel`` wrapping the frozen ``base_model``.
    """
    for frozen_weight in base_model.parameters():
        frozen_weight.requires_grad = False

    # The head is created after the freeze, so it is not touched by the loop.
    return ClassificationModel(base_model)

In [5]:
# Demo: load DistilBERT, freeze it behind a classification head and report
# how many of the resulting model's parameters are trainable.
base_model = AutoModel.from_pretrained(
    "distilbert-base-uncased",
    torch_dtype=torch.float32,
)
classification_model = get_classification_head_model(base_model)
print(classification_model)

total_params = sum(weight.numel() for weight in classification_model.parameters())
trainable_params = sum(
    weight.numel()
    for weight in classification_model.parameters()
    if weight.requires_grad
)
print(f"Trainable parameters: {trainable_params} / {total_params}")

2025-09-01 08:35:31.531957: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ClassificationModel(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin

## Adapters

Adding adapter layers after the attention and feed-forward layers

In [6]:
class BottleneckAdapter(nn.Module):
    """Bottleneck adapter: down-project, GELU, up-project, dropout, residual, LayerNorm.

    The module maps hidden states to a narrower space and back, adds the
    result to its input and layer-normalises the sum, so the output has the
    same trailing dimension as the input.

    Args:
        hidden_size: Size of the last dimension of the incoming hidden states
            (e.g., 768 for BERT-base).
        adapter_size: Size of the bottleneck (e.g., 64).
        dropout_rate: Dropout probability applied to the up-projected signal.
    """

    def __init__(self, hidden_size, adapter_size, dropout_rate=0.1):
        super().__init__()

        self.down_project = nn.Linear(hidden_size, adapter_size)  # d -> b
        self.activation = nn.GELU()
        self.up_project = nn.Linear(adapter_size, hidden_size)    # b -> d
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm = nn.LayerNorm(hidden_size)

        # The adapter has no pretrained weights, so both projections are
        # explicitly re-initialised: Xavier-uniform weights, zero biases.
        for projection in (self.down_project, self.up_project):
            nn.init.xavier_uniform_(projection.weight)
            nn.init.zeros_(projection.bias)

    def forward(self, hidden_states):
        """Apply the adapter and return a tensor shaped like ``hidden_states``."""
        bottleneck = self.activation(self.down_project(hidden_states))
        delta = self.dropout(self.up_project(bottleneck))

        # Residual connection around the bottleneck, then normalisation.
        return self.layer_norm(hidden_states + delta)


In [7]:
class AdapterTransformerLayer(nn.Module):
    """Drop-in replacement for a DistilBERT ``TransformerBlock`` with adapters.

    The wrapped block is frozen and a trainable ``BottleneckAdapter`` is
    applied after each of its two sub-layers (self-attention and FFN).

    The call contract mirrors ``TransformerBlock.forward`` from Hugging Face
    Transformers releases that ship ``DistilBertSdpaAttention``: the layer is
    called with up to four positional arguments and returns a tuple whose last
    element is the hidden state. NOTE(review): confirm against the installed
    ``transformers`` version if it differs from the one used in this notebook.

    Args:
        transformer_layer: Block to wrap. Must expose ``attention`` (with a
            ``q_lin`` linear layer), ``sa_layer_norm``, ``ffn`` and
            ``output_layer_norm``.
        adapter_size: Bottleneck width of both adapters.
    """

    def __init__(self, transformer_layer, adapter_size):
        super().__init__()
        self.layer = transformer_layer
        self.hidden_size = transformer_layer.attention.q_lin.in_features

        # Freeze the original transformer block; only the adapters train.
        for param in self.layer.parameters():
            param.requires_grad = False

        self.attention_adapter = BottleneckAdapter(self.hidden_size, adapter_size)
        self.ffn_adapter = BottleneckAdapter(self.hidden_size, adapter_size)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        """Run attention -> add & norm -> adapter -> FFN -> add & norm -> adapter.

        Args:
            hidden_states: Input hidden states of the block.
            attention_mask: Mask passed to the attention module as ``mask``.
            head_mask: Optional head mask passed through to the attention module.
            output_attentions: When true, the attention weights returned by
                the attention module are included in the result.

        Returns:
            ``(hidden_state,)`` or, when ``output_attentions`` is true,
            ``(attention_weights, hidden_state)``. A tuple is returned because
            the enclosing DistilBERT stack takes the last element of the
            layer's output as the next hidden state.
        """
        # 1. Attention sublayer. The DistilBERT attention module takes
        #    query/key/value/mask rather than (x, attn_mask=...).
        attention_outputs = self.layer.attention(
            query=hidden_states,
            key=hidden_states,
            value=hidden_states,
            mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        sa_output = attention_outputs[0]

        # Add + Norm (frozen), then the adapter after attention
        sa_output = self.layer.sa_layer_norm(sa_output + hidden_states)
        sa_output = self.attention_adapter(sa_output)

        # 2. FFN sublayer, Add + Norm (frozen), then the adapter after FFN
        ffn_output = self.layer.ffn(sa_output)
        ffn_output = self.layer.output_layer_norm(ffn_output + sa_output)
        output = self.ffn_adapter(ffn_output)

        outputs = (output,)
        if output_attentions:
            outputs = (attention_outputs[1],) + outputs
        return outputs



In [8]:
def get_adapters_model(base_model, adapter_size=64):
    """Insert bottleneck adapters into every transformer block and add a head.

    Args:
        base_model: Model exposing its blocks as ``base_model.transformer.layer``.
            It is modified in place: all of its parameters are frozen and each
            block is replaced by an ``AdapterTransformerLayer``.
        adapter_size: Bottleneck width used for every adapter.

    Returns:
        A ``ClassificationModel`` wrapping the adapted ``base_model``.
    """
    # Freeze the whole encoder first. AdapterTransformerLayer only freezes the
    # block it wraps, which would otherwise leave the rest of the encoder
    # (e.g. the embeddings) trainable, unlike get_lora_model.
    for param in base_model.parameters():
        param.requires_grad = False

    blocks = base_model.transformer.layer
    # Iterate over a snapshot so the container is not replaced while it is
    # being traversed.
    for index, original_layer in enumerate(list(blocks)):
        blocks[index] = AdapterTransformerLayer(original_layer, adapter_size)

    return ClassificationModel(base_model)

In [9]:
# Demo: load DistilBERT, insert adapters and report how many of the resulting
# model's parameters are trainable.
base_model = AutoModel.from_pretrained(
    "distilbert-base-uncased",
    torch_dtype=torch.float32,
)
classification_model = get_adapters_model(base_model)
print(classification_model)

total_params = sum(weight.numel() for weight in classification_model.parameters())
trainable_params = sum(
    weight.numel()
    for weight in classification_model.parameters()
    if weight.requires_grad
)
print(f"Trainable parameters: {trainable_params} / {total_params}")

ClassificationModel(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x AdapterTransformerLayer(
          (layer): TransformerBlock(
            (attention): DistilBertSdpaAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FFN(
       

## LoRA

Apply LoRA to query and value matrices inside attention layers

In [10]:
class LoRALayer(nn.Module):
    """Low-rank update ``scaling * (x @ A) @ B`` used by LoRA.

    ``A`` has shape ``(in_features, rank)`` and is Kaiming-uniform
    initialised; ``B`` has shape ``(rank, out_features)`` and starts at zero,
    so a freshly built layer outputs zeros.

    Args:
        in_features: Input dimension
        out_features: Output dimension
        rank: Rank of the low-rank decomposition
        alpha: Scaling factor; the output is multiplied by ``alpha / rank``
    """

    def __init__(self, in_features, out_features, rank=8, alpha=32):
        super().__init__()
        self.rank = rank
        self.scaling = alpha / rank

        # Build and initialise the two factors, then register them as
        # parameters (A first, B second).
        factor_a = torch.zeros(in_features, rank)
        nn.init.kaiming_uniform_(factor_a, a=math.sqrt(5))
        factor_b = torch.zeros(rank, out_features)

        self.lora_A = nn.Parameter(factor_a)
        self.lora_B = nn.Parameter(factor_b)

    def forward(self, x):
        """Return the LoRA contribution for ``x``."""
        # The scale is applied to the rank-sized projection before the second
        # matrix product.
        projected = self.scaling * (x @ self.lora_A)
        return projected @ self.lora_B


In [11]:
class LoRALinear(nn.Module):
    """A linear layer plus a trainable LoRA branch.

    The wrapped layer's weight (and bias, when it has one) is frozen in
    place; the output is the wrapped layer's output plus the LoRA branch's
    output for the same input.

    Args:
        linear_layer: The pre-trained nn.Linear module to adapt
        rank: Rank of the low-rank decomposition
        alpha: Scaling factor
    """

    def __init__(self, linear_layer, rank=8, alpha=32):
        super().__init__()
        self.linear = linear_layer

        # Freeze the original weights; the bias may be absent.
        for frozen in (linear_layer.weight, linear_layer.bias):
            if frozen is not None:
                frozen.requires_grad = False

        self.lora = LoRALayer(
            linear_layer.in_features,
            linear_layer.out_features,
            rank=rank,
            alpha=alpha,
        )

    def forward(self, x):
        """Return ``linear(x) + lora(x)``."""
        base_output = self.linear(x)
        lora_output = self.lora(x)
        return base_output + lora_output


In [12]:
def get_lora_model(base_model, rank=8, alpha=32, target_modules=("q_lin", "v_lin")):
    """Apply LoRA to selected linear layers of a model and add a head.

    Args:
        base_model: Model to adapt. It is modified in place: all of its
            parameters are frozen and every matching ``nn.Linear`` is replaced
            by a ``LoRALinear`` wrapping it.
        rank: Rank for LoRA decomposition
        alpha: Scaling factor
        target_modules: Name fragments selecting the layers to adapt. A linear
            layer is adapted when any fragment occurs in its dotted module
            name. A single string is treated as one fragment.

    Returns:
        A ``ClassificationModel`` wrapping the adapted ``base_model``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(target_modules, str):
        target_modules = (target_modules,)
    target_modules = tuple(target_modules)

    # First, freeze all parameters
    for param in base_model.parameters():
        param.requires_grad = False

    # Collect the layers to replace before touching the module tree, so the
    # tree is not mutated while named_modules() is still walking it.
    matches = [
        (name, module)
        for name, module in base_model.named_modules()
        if isinstance(module, nn.Linear)
        and any(target_name in name for target_name in target_modules)
    ]

    for name, module in matches:
        # Split "a.b.q_lin" into the parent path "a.b" and the attribute "q_lin".
        parent_name, _, child_name = name.rpartition('.')
        parent_module = base_model.get_submodule(parent_name)

        # Replace with LoRA version
        setattr(parent_module, child_name, LoRALinear(module, rank=rank, alpha=alpha))

    return ClassificationModel(base_model)

In [13]:
# Demo: load DistilBERT, apply LoRA and report how many of the resulting
# model's parameters are trainable.
base_model = AutoModel.from_pretrained(
    "distilbert-base-uncased",
    torch_dtype=torch.float32,
)
classification_model = get_lora_model(base_model)
print(classification_model)

total_params = sum(weight.numel() for weight in classification_model.parameters())
trainable_params = sum(
    weight.numel()
    for weight in classification_model.parameters()
    if weight.requires_grad
)
print(f"Trainable parameters: {trainable_params} / {total_params}")

ClassificationModel(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LoRALinear(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): LoRALinear(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
    