In [31]:
# QLoRA consists of:
# - A quantized base model (4-bit NF4)
# - Trainable low-rank matrices A and B, so the effective weight is W + A·B
# - Only A and B are updated via backpropagation


In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Checkpoint identifier passed to `from_pretrained` for both the model and the tokenizer.
model_id = "sshleifer/tiny-gpt2"


In [33]:
# Load the tokenizer and the causal-LM weights from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id)

In [34]:
# Freeze the pretrained model weights.
# The attribute is `requires_grad`; a misspelled name would silently attach an
# unrelated attribute to each parameter and leave every weight trainable.
for param in model.parameters():
    param.requires_grad = False

In [35]:
# Display the module tree to locate the layers that can be wrapped.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 2)
    (wpe): Embedding(1024, 2)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-1): 2 x GPT2Block(
        (ln_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=6, nx=2)
          (c_proj): Conv1D(nf=2, nx=2)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=8, nx=2)
          (c_proj): Conv1D(nf=2, nx=8)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=2, out_features=50257, bias=False)
)

In [36]:
# I will now try to modify the h0 attention layer

import bitsandbytes as bnb
import torch.nn as nn
import math
class QLoRALinear(nn.Module):
    """Linear layer made of a frozen bitsandbytes base plus a trainable LoRA update.

    The forward pass computes ``base(x) + lora_B(lora_A(x)) * (alpha / r)``.

    Note: the base layer is ``bnb.nn.Linear8bitLt`` (8-bit), not the 4-bit NF4
    quantization described in the notebook's introduction.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        r: Rank of the low-rank update; must be positive.
        alpha: Scaling numerator; the LoRA branch is multiplied by ``alpha / r``.

    Raises:
        ValueError: If ``r`` is not positive.
    """

    def __init__(self, in_features, out_features, r=4, alpha =8):
        super().__init__()
        # Reject r <= 0 up front instead of failing with a bare ZeroDivisionError below.
        if r <= 0:
            raise ValueError(f"LoRA rank r must be positive, got {r}")
        self.r = r
        self.scaling = alpha / r
        self.base = bnb.nn.Linear8bitLt(
            in_features, out_features, bias=True, has_fp16_weights=False
        )
        # Only the adapter is meant to train: freeze the base (including its bias)
        # here so the module does not depend on an external freezing pass.
        self.base.requires_grad_(False)

        self.lora_A = nn.Linear(in_features, r, bias=False)
        self.lora_B = nn.Linear(r, out_features, bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise the adapter so that it contributes nothing at first.

        ``lora_A`` gets a Kaiming-uniform init and ``lora_B`` is zeroed, so
        ``lora_B(lora_A(x))`` is exactly zero until ``lora_B`` is updated.
        """
        nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B.weight)

    def forward(self, x):
        """Return the base output plus the scaled low-rank update for ``x``."""
        return self.base(x) + self.lora_B(self.lora_A(x)) * self.scaling

In [37]:
# Inspect the layer that the next cell replaces.
model.transformer.h[0].attn.c_attn

Conv1D(nf=6, nx=2)

In [38]:
# Swap the attention projection of block 0 for a QLoRA-wrapped layer.
old_layer = model.transformer.h[0].attn.c_attn

# Conv1D stores its weight as (in_features, out_features): here nx=2 -> nf=6.
in_features, out_features = old_layer.weight.shape

qlora_layer = QLoRALinear(in_features, out_features)

# The base is a Linear-style layer whose weight is (out_features, in_features),
# so the Conv1D weight must be transposed before it is copied across.
qlora_layer.base.weight.data = old_layer.weight.data.t().clone(
    memory_format=torch.contiguous_format
)
# Carry the pretrained bias over as well; otherwise the base keeps its own
# freshly initialised bias and no longer reproduces the original layer.
qlora_layer.base.bias.data = old_layer.bias.data.clone()

model.transformer.h[0].attn.c_attn = qlora_layer

In [39]:
# Re-display the model: h[0].attn.c_attn should now show up as a QLoRALinear.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 2)
    (wpe): Embedding(1024, 2)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): QLoRALinear(
            (base): Linear8bitLt(in_features=2, out_features=6, bias=True)
            (lora_A): Linear(in_features=2, out_features=4, bias=False)
            (lora_B): Linear(in_features=4, out_features=6, bias=False)
          )
          (c_proj): Conv1D(nf=2, nx=2)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=8, nx=2)
          (c_proj): Conv1D(nf=2, nx=8)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1):

In [40]:
# lora_A projects the input down to the rank: its weight is (r, in_features).
qlora_layer.lora_A.weight.shape

torch.Size([4, 2])

In [41]:
# lora_B projects the rank back up: its weight is (out_features, r).
qlora_layer.lora_B.weight.shape

torch.Size([6, 4])

In [44]:
# Leave only the LoRA adapter matrices trainable; every other parameter is frozen.
for name, param in model.named_parameters():
    param.requires_grad = ("lora_A" in name) or ("lora_B" in name)

In [45]:
# Optimise the adapter weights only (A's parameters first, then B's).
optimizer = torch.optim.AdamW(
    [*qlora_layer.lora_A.parameters(), *qlora_layer.lora_B.parameters()],
    lr=1e-3,
)

In [46]:
# Tally all parameters, listing each trainable tensor as it is encountered.
total_params = trainable_params = 0
for name, param in model.named_parameters():
    param_count = param.numel()
    total_params += param_count
    if not param.requires_grad:
        continue  # frozen: counts toward the total only
    trainable_params += param_count
    print(f"{name:<60} | shape={list(param.shape)}")

print(f"Total Params: {total_params}")
print(f"trainable_params : {trainable_params}")

transformer.h.0.attn.c_attn.lora_A.weight                    | shape=[4, 2]
transformer.h.0.attn.c_attn.lora_B.weight                    | shape=[6, 4]
Total Params: 102746
trainable_params : 32


In [47]:
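# All weights are now frozen except lora_A and lora_B. The base of the replaced c_attn layer is a quantized (8-bit Linear8bitLt) layer, which reduces the memory needed for its weights during forward propagation; the rest of the model is left unquantized.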