In [33]:
import torch
import torch.nn as nn

class LoRALayer(nn.Module):
    def __init__(self, weight, r, alpha):
        super(LoRALayer, self).__init__()
        self.weight = weight
        self.weight.requires_grad = False
        self.r = r
        self.alpha = alpha
        out_features = self.weight.shape[0]
        in_features = self.weight.shape[1]
        self.A = nn.Parameter(self.weight.new_zeros(self.r, in_features))
        self.B = nn.Parameter(self.weight.new_zeros(out_features, r))
    
    def forward(self, x):
        result = x @ self.weight.T
        result += x @ (self.A.T @ self.B.T)
        return result

In [34]:
class FFN(nn.Module):
    def __init__(self, in_channels, hidden_dim, out_channels):
        super(FFN, self).__init__()
        self.linear1 = nn.Linear(in_channels, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, out_channels)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return self.sigmoid(x)

In [35]:
from torch.utils.data import DataLoader, TensorDataset

ffn = FFN(2, 16, 1)
x_xor = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y_xor = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)

dataset_xor = TensorDataset(x_xor, y_xor)
dataloader_xor = DataLoader(dataset_xor, batch_size=1, shuffle=True)

def train_xor_model(model, dataloader):
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(400):
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

def validate_xor_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # No gradients needed for predictions
        for inputs, labels in dataloader:
            outputs = model(inputs)
            print(f"Input: {inputs.numpy()}, Predicted: {outputs.numpy()}")

train_xor_model(ffn, dataloader_xor)
validate_xor_model(ffn, dataloader_xor)


Epoch 10, Loss: 0.2370571345090866
Epoch 20, Loss: 0.18878374993801117
Epoch 30, Loss: 0.14447425305843353
Epoch 40, Loss: 0.15330176055431366
Epoch 50, Loss: 0.03857145458459854
Epoch 60, Loss: 0.028042878955602646
Epoch 70, Loss: 0.04426877945661545
Epoch 80, Loss: 0.017143048346042633
Epoch 90, Loss: 0.02256743796169758
Epoch 100, Loss: 0.009024682454764843
Epoch 110, Loss: 0.012838989496231079
Epoch 120, Loss: 0.005366366356611252
Epoch 130, Loss: 0.008101089857518673
Epoch 140, Loss: 0.006729419343173504
Epoch 150, Loss: 0.0031156884506344795
Epoch 160, Loss: 0.004780036862939596
Epoch 170, Loss: 0.005099200643599033
Epoch 180, Loss: 0.002020863350480795
Epoch 190, Loss: 0.0017750142142176628
Epoch 200, Loss: 0.0027535418048501015
Epoch 210, Loss: 0.0013972821179777384
Epoch 220, Loss: 0.0012848370242863894
Epoch 230, Loss: 0.002415280556306243
Epoch 240, Loss: 0.001051163300871849
Epoch 250, Loss: 0.0009390150080434978
Epoch 260, Loss: 0.0018143825000151992
Epoch 270, Loss: 0.001

In [36]:
ffn_weight = ffn.linear1.weight.detach().clone()
lora_layer = LoRALayer(ffn_weight, 1, 0.1)
setattr(ffn, 'linear1', lora_layer)

y_or = torch.tensor([[0], [1], [1], [1]], dtype=torch.float32)

dataset_xor = TensorDataset(x_xor, y_or)
dataloader_xor = DataLoader(dataset_xor, batch_size=1, shuffle=True)

train_xor_model(ffn, dataloader_xor)
validate_xor_model(ffn, dataloader_xor)

Epoch 10, Loss: 0.6913596391677856
Epoch 20, Loss: 1.1999181879218668e-05
Epoch 30, Loss: 0.21014522016048431
Epoch 40, Loss: 3.653912372669765e-08
Epoch 50, Loss: 0.16833238303661346
Epoch 60, Loss: 1.4353318533721904e-08
Epoch 70, Loss: 0.03315728157758713
Epoch 80, Loss: 9.094947017729282e-09
Epoch 90, Loss: 2.1443704554258147e-06
Epoch 100, Loss: 2.111504272761522e-06
Epoch 110, Loss: 0.08623956143856049
Epoch 120, Loss: 2.0042900814587483e-06
Epoch 130, Loss: 5.150070592208067e-09
Epoch 140, Loss: 1.8691039258555975e-06
Epoch 150, Loss: 0.010944991372525692
Epoch 160, Loss: 1.7177943618662539e-06
Epoch 170, Loss: 1.6434642020612955e-06
Epoch 180, Loss: 0.008251631632447243
Epoch 190, Loss: 3.2741809263825417e-09
Epoch 200, Loss: 0.006990970578044653
Epoch 210, Loss: 1.3510579037756543e-06
Epoch 220, Loss: 2.7138327141074114e-09
Epoch 230, Loss: 0.026804056018590927
Epoch 240, Loss: 0.005128184799104929
Epoch 250, Loss: 2.3194388631964102e-09
Epoch 260, Loss: 0.021439911797642708
E

In [37]:
ffn.modules

<bound method Module.modules of FFN(
  (linear1): LoRALayer()
  (linear2): Linear(in_features=16, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)>

In [119]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
print(model.modules)

loading file vocab.json from cache at /home/piragi/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/vocab.json
loading file merges.txt from cache at /home/piragi/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/piragi/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer_config.json
loading file tokenizer.json from cache at /home/piragi/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer.json
loading configuration file config.json from cache at /home/piragi/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectu

<bound method Module.modules of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>


In [120]:
for name, module in model.named_modules():
    print(name, module.__class__.__name__)

 GPT2LMHeadModel
transformer GPT2Model
transformer.wte Embedding
transformer.wpe Embedding
transformer.drop Dropout
transformer.h ModuleList
transformer.h.0 GPT2Block
transformer.h.0.ln_1 LayerNorm
transformer.h.0.attn GPT2Attention
transformer.h.0.attn.c_attn Conv1D
transformer.h.0.attn.c_proj Conv1D
transformer.h.0.attn.attn_dropout Dropout
transformer.h.0.attn.resid_dropout Dropout
transformer.h.0.ln_2 LayerNorm
transformer.h.0.mlp GPT2MLP
transformer.h.0.mlp.c_fc Conv1D
transformer.h.0.mlp.c_proj Conv1D
transformer.h.0.mlp.act NewGELUActivation
transformer.h.0.mlp.dropout Dropout
transformer.h.1 GPT2Block
transformer.h.1.ln_1 LayerNorm
transformer.h.1.attn GPT2Attention
transformer.h.1.attn.c_attn Conv1D
transformer.h.1.attn.c_proj Conv1D
transformer.h.1.attn.attn_dropout Dropout
transformer.h.1.attn.resid_dropout Dropout
transformer.h.1.ln_2 LayerNorm
transformer.h.1.mlp GPT2MLP
transformer.h.1.mlp.c_fc Conv1D
transformer.h.1.mlp.c_proj Conv1D
transformer.h.1.mlp.act NewGELUActiva

In [121]:
from transformers.pytorch_utils import Conv1D

class LoRAConv1D(nn.Module):
    def __init__(self, weight, bias, r, alpha):
        super(LoRAConv1D, self).__init__()
        self.nf, self.nx = weight.shape 
        self.weight = weight
        self.weight.requires_grad = False
        self.bias = bias
        self.r = r
        self.alpha = alpha
        self.A = nn.Parameter(self.weight.new_zeros(self.r, self.nx))
        self.B = nn.Parameter(self.weight.new_zeros(self.nf, self.r))
    
    def forward(self, x):
        print(x.shape)
        size_out = x.size()[:-1] + (self.nf,)
        result = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
        low_rank = self.B @ self.A
        result += x.view(-1, x.size(-1)) @ low_rank
        x = x.view(size_out)
        return x

In [122]:
for name, module in model.named_modules():
    if isinstance(module, Conv1D):
        print(module.nf)
        print(module.weight.shape)
        print(module.bias.shape)
        break

print("------")
weight = torch.randn([768, 2304])
x = torch.randn([4,473,768])
size_out = x.size()[:-1] + (2304, )
A = torch.randn([8, 2304])
B = torch.randn([768, 8])
result_1 = x.view(-1, x.size(-1)) @ weight
result_2 = B @ A
print((result_1).shape)
print((result_2).shape)
result_2 = x.view(-1, x.size(-1)) @ result_2
print((result_2).shape)
print((result_1 + result_2).shape)


2304
torch.Size([768, 2304])
torch.Size([2304])
------
torch.Size([1892, 2304])
torch.Size([768, 2304])
torch.Size([1892, 2304])
torch.Size([1892, 2304])


In [124]:
r = 8
alpha = 0
for name, module in model.named_modules():
    if isinstance(module, Conv1D):
        lora_layer = LoRAConv1D(module.weight, module.bias, r, alpha)
        #lora_layer = LoRAConv1D(module.nf, module.weight.shape[0])
        # Replace the module directly in the parent's _modules dictionary
        parent_name, child_name = name.rsplit('.', 1)
        parent_module = dict(model.named_modules())[parent_name]
        parent_module._modules[child_name] = lora_layer

In [125]:
model.named_modules

<bound method Module.named_modules of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): LoRAConv1D()
          (c_proj): LoRAConv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): LoRAConv1D()
          (c_proj): LoRAConv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>

In [126]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

# Load dataset
dataset = load_dataset('wikitext', 'wikitext-103-raw-v1')
texts = dataset['train']['text'][:500]  # Directly access the text data

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize data
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Setup device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Prepare data for training
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
for epoch in range(3):  # 3 epochs
    for batch in dataloader:
        input_ids, attention_mask = batch[0].to(device), batch[1].to(device)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 9.75 GiB of which 18.19 MiB is free. Including non-PyTorch memory, this process has 9.63 GiB memory in use. Of the allocated memory 8.82 GiB is allocated by PyTorch, and 569.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)