<a href="https://colab.research.google.com/github/mitran27/GenerativeNetworks/blob/main/LoraConfig.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from torch.nn import Module,Linear
import torch

In [7]:
type(Linear)

type

In [43]:
class LoraLayer(Module): #adapter layer aaden to the required weight of the real Model
  def __init__(self, in_dim, out_dim, rank, alpha):
    super().__init__()
    self.lora_A = Linear(in_dim, rank, bias=False)
    self.lora_B =  Linear(in_dim, rank, bias=False)


    #we initialized B with zeros beause at he initial iteration the training, before A and B are updated via backpropagation(not learnt learnt), the LoRALayer influence the output and loss in a noise way instabilizing the training so not to have impact the original weights because AB=0 if B=0.

    # we want to learn but not contribute to the output
    with torch.no_grad():
      self.lora_B.weight.copy_(torch.zeros(rank, out_dim))

    self.alpha = alpha # the scaling factor alpha More eight of lora to op real weights

  def forward(self, X):

    return ( X @ self.lora_A  @ self.lora_B ) * self.alpha

class LinearWithLora(Module):
  def __init__(self, linear:Linear, rank, alpha):
    super().__init__()
    self.base_layer = linear # any Layer
    self.lora_layer = LoraLayer(linear.in_features, linear.out_features, rank, alpha)
  def forward(self,X):
    return self.base_layer(X) + self.lora_layer(X)



In [44]:
from functools import partial

class LoraConfig(Module):
  def __init__(self, rank, alpha, target_modules):
    super().__init__()
    self.r =rank
    self.alpha = alpha
    self.target_modules = target_modules
  def get_peft_model(self, model:Module):
    # take one layer of the model and pass it to lorawithlinear along with lora config to add adpter to that layer in the model


    assign_lora = partial(LinearWithLora, rank=self.r, alpha=self.alpha) # just to avoid writing LinearWithLora(linear,rank,alpha) eah time

    for layer in model.distilbert.transformer.layer:

      if("q_proj" in self.target_modules):
            layer.attention.q_lin = assign_lora(layer.attention.q_lin)
      if("k_proj" in self.target_modules):
            layer.attention.k_lin = assign_lora(layer.attention.k_lin)
      if("v_proj" in self.target_modules):
            layer.attention.v_lin = assign_lora(layer.attention.v_lin)
      if("out_proj" in self.target_modules):
            layer.attention.out_lin = assign_lora(layer.attention.out_lin)

    return model




In [45]:
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:

# Step 1 : freeze the Model
for param in model.parameters():
    param.requires_grad = False

print(model)

# step 2 Lora configs
lora_configs = LoraConfig(rank=4,alpha=16,target_modules=["q_proj","v_proj"])
peftmodel = lora_configs.get_peft_model(model)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [47]:
print(peftmodel)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LinearWithLora(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_layer): LoraLayer(
                (lora_A): Linear(in_features=768, out_features=4, bias=False)
                (lora_B): Linear(in_features=768, out_features=4, bias=False)
              )
            )
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): LinearWithLora(
             