In [None]:
!pip install torch
!pip install transformers
!pip install tqdm


In [18]:
#Import libraries
import math
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
from torch.nn.parameter import Parameter
from transformers import AutoModel

**LoRA linear layer**


In [4]:
class LL(nn.Module):
    """Low-rank (LoRA) adapter that computes ``alpha * (x @ A @ B)``.

    Note that the product is scaled by ``alpha`` itself, not by
    ``alpha / rank``.

    Args:
        indim: Size of the last dimension of the input.
        outdim: Size of the last dimension of the output.
        rank: Inner dimension shared by the two factors.
        alpha: Constant multiplier applied to the low-rank product.
    """

    def __init__(self, indim, outdim, rank, alpha):
        super().__init__()
        # A: (indim, rank). Allocate, then fill with the Kaiming-uniform init.
        self.A = Parameter(torch.empty(indim, rank))
        nn.init.kaiming_uniform_(self.A, a = math.sqrt(5))
        # B: (rank, outdim). Starts at exactly zero so the adapter contributes
        # nothing until it is trained. torch.Tensor(rank, outdim) would only
        # allocate memory and leave arbitrary (possibly NaN/inf) values in it.
        self.B = Parameter(torch.zeros(rank, outdim))
        self.alpha = alpha #scaling factor

    def forward(self, x):
        """Return the scaled low-rank update for ``x`` (last dim ``indim``)."""
        return self.alpha * (x @ self.A @ self.B)

class LinearL(nn.Module): #linearLoRA
    """A linear layer with a parallel ``LL`` low-rank branch added to its output.

    The wrapped layer is kept as ``self.linear`` and the adapter as
    ``self.lora``; the adapter is sized from the layer's ``in_features`` and
    ``out_features``. This class does not touch ``requires_grad`` on the
    wrapped layer, so freezing it is left to the caller.

    Args:
        linear: Layer to wrap; must expose ``in_features`` / ``out_features``.
        rank: Rank passed to the adapter.
        alpha: Scaling factor passed to the adapter.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        in_dim, out_dim = linear.in_features, linear.out_features
        self.lora = LL(in_dim, out_dim, rank, alpha)

    def forward(self, x):
        """Return ``linear(x) + lora(x)``."""
        base_out = self.linear(x)
        low_rank_out = self.lora(x)
        return base_out + low_rank_out

In [5]:
def replace_lora(model,rank,alpha):
    """Recursively wrap every ``nn.Linear`` below ``model`` in a ``LinearL``, in place.

    Only child modules are replaced; if ``model`` itself is an ``nn.Linear``
    nothing happens because there is no parent attribute to rebind.

    The function is idempotent: a module that is already a ``LinearL`` is left
    untouched. Without that guard, a second call (for example re-running the
    notebook cell) would descend into each ``LinearL``, find its inner
    ``linear`` and wrap it again, stacking a second adapter on every layer and
    doubling the trainable-parameter count.

    Args:
        model: Root module whose descendants are rewritten.
        rank: Rank given to each adapter.
        alpha: Scaling factor given to each adapter.
    """
    if isinstance(model, LinearL):
        # Already adapted; its inner `linear` must not be wrapped a second time.
        return
    # Iterate over a snapshot because setattr() rebinds entries of the very
    # mapping that named_children() walks.
    for name, module in list(model.named_children()):
        if isinstance(module, nn.Linear):
            setattr(model, name, LinearL(module, rank, alpha))
        else:
            replace_lora(module, rank, alpha)


In [7]:
# Load the pretrained 'microsoft/deberta-v3-large' checkpoint; the first call
# downloads config.json and pytorch_model.bin (progress bars shown below).
model = AutoModel.from_pretrained('microsoft/deberta-v3-large')

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

In [8]:
# Baseline: number of scalar weights that are trainable before anything is frozen.
original_params = sum(
    p.numel()
    for p in model.parameters()
    if p.requires_grad
)
original_params

434012160

In [9]:
# Freeze every existing weight so that only parameters created afterwards
# (the LoRA factors) remain trainable.
for param in model.parameters():
    param.requires_grad_(False)


In [11]:
# Inject adapters into the frozen model, then count what is still trainable.
replace_lora(model=model, rank=8, alpha=16)
params_with_lora = sum(
    p.numel()
    for p in model.parameters()
    if p.requires_grad
)
params_with_lora

7077888

In [12]:
# Relative reduction in trainable parameters, as a percentage.
(original_params - params_with_lora) / original_params * 100

98.3691959229898

As can be seen, applying LoRA reduces the number of trainable parameters by about 98% (from 434,012,160 to 7,077,888).

**LoRA on a simple Transformer**

In [43]:
class LoRaTFM(nn.Module):
    """Wrap a linear layer and add a low-rank update scaled by ``alpha / rank``.

    With ``layer.weight`` of shape ``(out_features, in_features)`` the factors
    are ``A`` of shape ``(out_features, rank)`` and ``B`` of shape
    ``(rank, in_features)``; the effective weight is
    ``layer.weight + (alpha / rank) * A @ B``.

    ``B`` starts at zero so that the wrapped layer's output is unchanged at
    initialisation; ``A`` is drawn from a standard normal. The wrapped layer's
    own parameters are left as they are (not frozen here).

    Args:
        layer: Module exposing a 2-D ``weight`` (e.g. ``nn.Linear``).
        rank: Rank of the update; must be positive.
        alpha: Numerator of the scaling factor.

    Raises:
        ValueError: If ``rank`` is not positive.
    """

    def __init__(self, layer, rank, alpha):
        super().__init__()
        if rank <= 0:
            # forward() divides by rank, so reject it up front.
            raise ValueError(f"rank must be positive, got {rank}")
        self.rank = rank
        self.alpha = alpha
        self.layer = layer
        out_features, in_features = layer.weight.shape
        # Create the factors next to the wrapped weight (same device / dtype).
        factory = {"device": layer.weight.device, "dtype": layer.weight.dtype}
        self.A = nn.Parameter(torch.randn(out_features, rank, **factory))
        self.B = nn.Parameter(torch.zeros(rank, in_features, **factory))

    def forward(self, x):
        """Return ``layer(x)`` plus the scaled low-rank correction."""
        scale = self.alpha / self.rank
        # x @ (A @ B).T == (x @ B.T) @ A.T; the right-hand form never builds
        # the full (out_features, in_features) matrix.
        low_rank = (x @ self.B.T) @ self.A.T
        return self.layer(x) + scale * low_rank





In [44]:
class Block(nn.Module):
    """Pre-norm Transformer block with a LoRA-augmented feed-forward network.

    Two residual sub-layers are applied in order: multi-head self-attention,
    then a two-layer feed-forward network whose linear layers are wrapped in
    ``LoRaTFM``. A ReLU sits between the two feed-forward layers; without a
    non-linearity they would collapse into a single linear map.

    ``nn.MultiheadAttention`` is constructed with its default layout
    (``batch_first=False``), so ``x`` is interpreted as
    ``(seq_len, batch, dim)``.

    Args:
        dim: Model (embedding) dimension.
        num_heads: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Dropout probability used in attention and both sub-layers.
        rank: LoRA rank for the feed-forward layers.
        alpha: LoRA scaling numerator for the feed-forward layers.
    """

    def __init__(self, dim, num_heads,dim_feedforward, dropout = 0.1, rank = 4, alpha=32):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(dim, num_heads, dropout=dropout)

        self.linear1 = LoRaTFM(nn.Linear(dim, dim_feedforward), rank, alpha)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = LoRaTFM(nn.Linear(dim_feedforward, dim), rank, alpha)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self,x):
        """Apply attention and feed-forward sub-layers; output has the shape of ``x``."""
        # Self-attention sub-layer (normalise first, then add the residual).
        attn_in = self.norm1(x)
        attn_out, _ = self.self_attn(attn_in, attn_in, attn_in)
        x = x + self.dropout1(attn_out)

        # Feed-forward sub-layer: linear -> ReLU -> dropout -> linear.
        hidden = self.norm2(x)
        hidden = torch.relu(self.linear1(hidden))
        hidden = self.dropout(hidden)
        hidden = self.linear2(hidden)
        x = x + self.dropout2(hidden)

        return x




In [45]:
class SimpleTransformerModel(nn.Module):
    """Token embedding, a stack of ``Block`` layers, and a linear output head.

    ``input_dim`` is used both as the embedding table size and as the width of
    the output layer, so the model maps integer token ids to one score per
    vocabulary entry.

    Args:
        input_dim: Vocabulary size (embedding rows and output classes).
        d_model: Embedding / hidden dimension.
        nhead: Attention heads per block.
        num_layers: Number of stacked blocks.
        dim_feedforward: Feed-forward hidden width inside each block.
        dropout: Dropout probability passed to each block.
        rank: LoRA rank passed to each block.
        alpha: LoRA scaling numerator passed to each block.
    """

    def __init__(self, input_dim, d_model, nhead, num_layers, dim_feedforward, dropout=0.1, rank=4, alpha=32):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.transformer_blocks = nn.ModuleList()
        for _ in range(num_layers):
            self.transformer_blocks.append(
                Block(d_model, nhead, dim_feedforward, dropout, rank, alpha)
            )
        self.fc_out = nn.Linear(d_model, input_dim)

    def forward(self, x):
        """Embed token ids, run them through every block, and project to logits."""
        hidden = self.embedding(x)
        for layer in self.transformer_blocks:
            hidden = layer(hidden)
        logits = self.fc_out(hidden)
        return logits


In [46]:
def train(model, dataloader, criterion, optimizer, num_epochs):
    """Run a basic supervised training loop and print the mean loss per epoch.

    Args:
        model: Module called as ``model(src)``; its output's last dimension is
            treated as the class dimension.
        dataloader: Iterable yielding ``(src, tgt)`` pairs. It does not need to
            support ``len()``; batches are counted while iterating.
        criterion: Loss called as ``criterion(flat_output, flat_target)``.
        optimizer: Optimizer stepped once per batch.
        num_epochs: Number of passes over ``dataloader``.

    An epoch that yields no batches reports a loss of ``nan`` instead of
    raising ``ZeroDivisionError``.
    """
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)
        for src, tgt in progress_bar:
            optimizer.zero_grad()
            output = model(src)
            # reshape (not view) so non-contiguous outputs/targets also work.
            flat_output = output.reshape(-1, output.size(-1))
            flat_target = tgt.reshape(-1)
            loss = criterion(flat_output, flat_target)
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            epoch_loss += batch_loss
            num_batches += 1
            progress_bar.set_postfix(loss=batch_loss)
        mean_loss = epoch_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss}")



In [None]:
# Hyperparameters. input_dim is both the embedding vocabulary size and the
# number of output classes of SimpleTransformerModel.
input_dim = 10000
d_model = 512
nhead = 8
num_layers = 6
dim_feedforward = 2048
dropout = 0.1
rank = 4
alpha = 32

# Fix the RNG so weight initialisation, the synthetic batches and dropout are
# reproducible across Restart & Run All.
torch.manual_seed(0)

model = SimpleTransformerModel(input_dim, d_model, nhead, num_layers, dim_feedforward, dropout, rank, alpha)
criterion = nn.CrossEntropyLoss()
# Adam receives every model parameter, so the wrapped Linear weights are
# trained together with the LoRA factors (nothing is frozen here).
optimizer = Adam(model.parameters(), lr=0.001)

# Example data: 100 batches of random (10, 20) token-id tensors for both inputs
# and targets. This only exercises the training loop.
dataloader = [(torch.randint(0, input_dim, (10, 20)), torch.randint(0, input_dim, (10, 20))) for _ in range(100)]

train(model, dataloader, criterion, optimizer, num_epochs=5)