In [6]:
from torch import nn
from transformers import AutoModelForSequenceClassification, DistilBertTokenizer
from functools import partial
import torch

class LoraLayer(nn.Module):
    """Low-rank adapter that maps ``in_dim`` features to ``out_dim`` features.

    The layer holds two trainable matrices, ``A`` of shape ``(in_dim, rank)``
    and ``B`` of shape ``(rank, out_dim)``, and computes
    ``alpha * (x @ A @ B)``.

    Args:
        in_dim: Size of the last dimension of the input.
        out_dim: Size of the last dimension of the output.
        rank: Inner dimension shared by ``A`` and ``B``.
        alpha: Scalar multiplier applied to the low-rank product.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        # A is initialized with random normal values scaled by std_dev.
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = torch.nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        # B starts at zero, so A @ B is zero and the layer outputs zeros
        # until B has been updated.
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
        # forward() reads self.alpha, so it has to be stored here; without
        # this assignment every forward pass raises AttributeError.
        self.alpha = alpha

    def forward(self, tensor):
        """Return ``alpha * (tensor @ A @ B)``.

        Args:
            tensor: Input whose last dimension equals ``in_dim``.

        Returns:
            A tensor whose last dimension equals ``out_dim``.
        """
        return self.alpha * (tensor @ self.A @ self.B)

class LinearWithLora(nn.Module):
    """Pair an existing linear layer with a ``LoraLayer`` of matching size.

    The wrapped layer is kept as ``self.linear``; the adapter is built from
    the wrapped layer's ``in_features`` / ``out_features`` and stored as
    ``self.lora``. The forward pass returns the sum of both outputs.

    Args:
        linear: Layer to wrap; must expose ``in_features`` and
            ``out_features``.
        rank: Rank forwarded to ``LoraLayer``.
        alpha: Scaling factor forwarded to ``LoraLayer`` (also kept on this
            wrapper as ``self.alpha``).
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        in_dim, out_dim = linear.in_features, linear.out_features
        self.linear = linear
        self.alpha = alpha
        self.lora = LoraLayer(in_dim, out_dim, rank, alpha)

    def forward(self, tensor):
        """Return ``linear(tensor) + lora(tensor)``."""
        base_out = self.linear(tensor)
        lora_out = self.lora(tensor)
        return base_out + lora_out



In [4]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Load the "distilbert-base-uncased" checkpoint with a 2-label
# sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Freeze every parameter of the loaded model.
# NOTE(review): this also freezes pre_classifier / classifier, which the load
# report lists as MISSING (newly initialized) — confirm that is intended.
for param in model.parameters():
    param.requires_grad_(False)

print(model)

cuda


Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1351.38it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]
DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | Details
------------------------+------------+--------
vocab_transform.bias    | UNEXPECTED |        
vocab_layer_norm.bias   | UNEXPECTED |        
vocab_transform.weight  | UNEXPECTED |        
vocab_projector.bias    | UNEXPECTED |        
vocab_layer_norm.weight | UNEXPECTED |        
classifier.bias         | MISSING    |        
pre_classifier.weight   | MISSING    |        
pre_classifier.bias     | MISSING    |        
classifier.weight       | MISSING    |        

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because mis

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSelfAttention(
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [5]:
# LoRA hyperparameters.
lora_rank = 8
lora_alpha = 16
# NOTE(review): lora_dropout is not consumed anywhere in this cell
# (LinearWithLora is built from rank and alpha only) — confirm whether it is
# still needed.
lora_dropout = 0.5

# Switches selecting which linear layers receive a LoRA adapter.
lora_query = True
lora_value = True
lora_key = False
lora_mlp = False
lora_projection = False
lora_head = False

layers = []  # not used in this cell; kept in case later cells rely on it

assign_lora = partial(LinearWithLora, rank=lora_rank, alpha=lora_alpha)  # Freeze Rank and Alpha params


def _wrap_with_lora(module):
    """Return ``module`` wrapped in ``LinearWithLora``.

    Modules that are already wrapped are returned unchanged so that
    re-running this cell does not stack a second adapter on top of the first
    (which would fail, because the wrapper has no ``in_features``).
    """
    if isinstance(module, LinearWithLora):
        return module
    return assign_lora(module)


# Wrap only the layers whose switch is enabled. With the values above this is
# q_lin and v_lin in every transformer block.
for layer in model.distilbert.transformer.layer:
    attention = layer.attention
    if lora_query:
        attention.q_lin = _wrap_with_lora(attention.q_lin)
    if lora_key:
        attention.k_lin = _wrap_with_lora(attention.k_lin)
    if lora_value:
        attention.v_lin = _wrap_with_lora(attention.v_lin)
    if lora_projection:
        attention.out_lin = _wrap_with_lora(attention.out_lin)
    if lora_mlp:
        # NOTE(review): lin1 / lin2 are the FFN linear names in Hugging Face
        # DistilBERT; the printed model in this notebook is truncated before
        # them — confirm against the installed transformers version.
        layer.ffn.lin1 = _wrap_with_lora(layer.ffn.lin1)
        layer.ffn.lin2 = _wrap_with_lora(layer.ffn.lin2)

if lora_head:
    model.pre_classifier = _wrap_with_lora(model.pre_classifier)
    model.classifier = _wrap_with_lora(model.classifier)

print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSelfAttention(
            (q_lin): LinearWithLora(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoraLayer()
            )
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): LinearWithLora(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoraLayer()
            )
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace

In [None]:
# Load the DistilBERT tokenizer from the "distilbert-base-uncased" checkpoint
# with lower-casing explicitly enabled.
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
    do_lower_case=True,
)