In [11]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
print(model.modules)

<bound method Module.modules of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>


In [14]:
from transformers.pytorch_utils import Conv1D
import torch.nn as nn

class LoRAConv1D(nn.Module):
    def __init__(self, weight, bias, r, alpha):
        super(LoRAConv1D, self).__init__()
        self.nx, self.nf = weight.shape 
        self.weight = weight
        self.weight.requires_grad = False
        self.bias = bias
        self.r = r
        self.alpha = alpha
        self.A = nn.Parameter(self.weight.new_zeros(self.r, self.nx))
        self.B = nn.Parameter(self.weight.new_zeros(self.nf, self.r))
    
    def forward(self, x):
        size_out = x.size()[:-1] + (self.nf,)
        result = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
        low_rank = self.B @ self.A
        result += x.view(-1, x.size(-1)) @ low_rank.T
        result = result.view(size_out)
        return result

In [27]:
#replace all the attention layers in model with LoRA layers
r = 64
alpha = 0
for name, module in model.named_modules():
    if isinstance(module, Conv1D) and "c_attn" in str(name):
        lora_layer = LoRAConv1D(module.weight, module.bias, r, alpha)
        # Replace the module directly in the parent's _modules dictionary
        parent_name, child_name = name.rsplit('.', 1)
        parent_module = dict(model.named_modules())[parent_name]
        parent_module._modules[child_name] = lora_layer

In [28]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): LoRAConv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [8]:
for _, param in model.named_parameters():
    param.requires_grad = False

for name, module in model.named_modules():
    if isinstance(module, LoRAConv1D):
        for param in module.parameters():
            param.requires_grad = True

for name, param in model.named_parameters():
    if "attn.c_attn" in name: assert param.requires_grad == True
    else: assert param.requires_grad == False


AssertionError: 

In [12]:
r=0
for _, param in model.named_parameters():
    param.requires_grad = False

for name, module in model.named_modules():
    if "attn.c_" in name:
        for param in module.parameters():
            param.requires_grad = True

for name, param in model.named_parameters():
    if "attn.c_" in name: assert param.requires_grad == True
    else: assert param.requires_grad == False


In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
import torch
from torch.cuda.amp import GradScaler, autocast
import tqdm

# Load dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
texts = dataset['train']['text']  # Using a small slice for quick training

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize data
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'load model with total params: {pytorch_total_params} for r= {r}')
model.to(device)

# Prepare data for training
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(dataloader)*5)

# Setup for mixed-precision training
scaler = GradScaler()

# Training loop
model.train()
progress_bar = tqdm.tqdm(range(len(dataloader) * 5), desc="Training")
for epoch in range(5):  # 5 epochs
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids, attention_mask = batch[0].to(device), batch[1].to(device)

        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        progress_bar.update(1)
        progress_bar.set_postfix(loss=loss.item())

progress_bar.close()
print(f"Final loss: {loss.item()}")

load model with total params: 28348416 for r= 0


Training: 100%|██████████| 11475/11475 [37:03<00:00,  5.16it/s, loss=0.478] 

Final loss: 0.47788745164871216





In [14]:
torch.save(model, f"./gpt2_r{r}_16b_512.pt")

In [17]:
import torch
from datasets import load_dataset
import numpy as np
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm.auto import tqdm  # Use tqdm.auto for a progress bar that automatically adjusts to the environment
from transformers import GPT2Tokenizer, GPT2LMHeadModel


def compute_perplexity(model, tokenizer, dataset, batch_size=16):
    max_length = model.config.n_positions
    stride = 1024
    encodings = tokenizer("\n\n".join(dataset['text']), return_tensors="pt")
    seq_len = encodings.input_ids.size(1)
    
    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100
    
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
    
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss
    
        nlls.append(neg_log_likelihood)
    
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break
    
    return torch.exp(torch.stack(nlls).mean())

    
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Load the WikiText validation dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")

models_r = [64, 4, 2, 0]
for r in models_r:
    model_name = f'gpt2_r{r}_16b_512.pt'
    model = torch.load(f'./{model_name}')
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'load {model_name} with total params: {pytorch_total_params} for r={r}')    
    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    perplexity = compute_perplexity(model, tokenizer, dataset)
    print(f"Perplexity: {perplexity}")

load gpt2_r64_16b_512.pt with total params: 23620608 for r=64


Token indices sequence length is longer than the specified maximum sequence length for this model (251048 > 1024). Running this sequence through the model will result in indexing errors


  0%|          | 0/246 [00:00<?, ?it/s]

Perplexity: 59.72255325317383
load gpt2_r4_16b_512.pt with total params: 21408768 for r=4


  0%|          | 0/246 [00:00<?, ?it/s]

Perplexity: 58.167808532714844
load gpt2_r2_16b_512.pt with total params: 21335040 for r=2


  0%|          | 0/246 [00:00<?, ?it/s]

Perplexity: 59.17245864868164
load gpt2_r0_16b_512.pt with total params: 28348416 for r=0


  0%|          | 0/246 [00:00<?, ?it/s]

Perplexity: 81.9655990600586


In [19]:
import torch
from tqdm import tqdm
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

max_length = model.config.n_positions
stride = 1024
encodings = tokenizer("\n\n".join(dataset['text']), return_tensors="pt")
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
print(ppl)

Token indices sequence length is longer than the specified maximum sequence length for this model (251048 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████▉| 245/246 [00:03<00:00, 77.76it/s]

tensor(31.0423, device='cuda:0')



