In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
class LoRALayer(nn.Module):
  """Linear layer with an optional low-rank (LoRA) update added to its weight.

  The layer wraps an ``nn.Linear(in_features, out_features)``. When
  ``rank > 0`` two extra parameters are created, ``lora_b`` of shape
  ``(out_features, rank)`` and ``lora_a`` of shape ``(rank, in_features)``,
  and the base linear weight is frozen (its bias is left untouched).

  Args:
    in_features: Input feature size of the wrapped linear layer.
    out_features: Output feature size of the wrapped linear layer.
    merge: Switch. When true (and ``rank > 0``) the forward pass uses
      ``weight + (lora_b @ lora_a) * scale`` as the effective weight;
      otherwise only the plain linear layer is applied.
    rank: Inner dimension of the low-rank factors. A value ``<= 0`` disables
      the LoRA parameters entirely and the layer behaves as a plain linear
      layer followed by dropout.
    lora_alpha: Numerator of the scaling factor ``lora_alpha / rank`` applied
      to the low-rank product before it is added to the base weight.
    dropout: Dropout probability applied to the layer output. A value
      ``<= 0`` replaces dropout with ``nn.Identity``.
  """

  def __init__(self, in_features, out_features, merge, rank=16, lora_alpha=16, dropout=0.5):
    super().__init__()
    self.in_features = in_features
    self.out_features = out_features
    self.rank = rank
    self.merge = merge
    self.lora_alpha = lora_alpha
    self.dropout_rate = dropout

    self.linear = nn.Linear(in_features, out_features)

    if rank > 0:
      self.lora_b = nn.Parameter(torch.zeros(out_features, rank))  # (out, r)
      self.lora_a = nn.Parameter(torch.zeros(rank, in_features))  # (r, in)
      # Factor by which the low-rank product is scaled before being added
      # to the base weight.
      self.scale = self.lora_alpha / self.rank
      # Freeze the base weight; only the weight is frozen here, not the bias.
      self.linear.weight.requires_grad = False

    if self.dropout_rate > 0:
      self.dropout = nn.Dropout(p=self.dropout_rate)
    else:
      self.dropout = nn.Identity()

    # Initialise the LoRA factors (no-op when rank <= 0).
    self.initial_weights()

  def initial_weights(self):
    """Initialise ``lora_a`` with Kaiming-uniform values and ``lora_b`` with zeros.

    With ``lora_b`` at zero the product ``lora_b @ lora_a`` is zero, so the
    effective weight initially equals the base weight. Does nothing when
    ``rank <= 0`` because the LoRA parameters do not exist in that case.
    """
    if self.rank <= 0:
      return
    nn.init.kaiming_uniform_(self.lora_a, a=math.sqrt(5))
    nn.init.zeros_(self.lora_b)

  def forward(self, x):
    """Apply the (optionally LoRA-adjusted) linear map, then dropout.

    Args:
      x: Input tensor accepted by ``F.linear`` / ``nn.Linear`` for
        ``in_features`` input features.

    Returns:
      The dropout-processed output of the linear transformation.
    """
    if self.rank > 0 and self.merge:
      # linear.weight and (lora_b @ lora_a) both have shape
      # (out_features, in_features), so they can be summed directly.
      delta = (self.lora_b @ self.lora_a) * self.scale
      output = F.linear(x, self.linear.weight + delta, self.linear.bias)
    else:
      output = self.linear(x)
    return self.dropout(output)