In [49]:
from pathlib import Path
import os
import sys
import gzip
import wget
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd


# Tokenizer for the label text (BioGPT checkpoint)
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

# Encoder for the label text, same checkpoint, weights requested as float16
model = AutoModel.from_pretrained("microsoft/biogpt", torch_dtype=torch.float16)
print(model)

# Footprint in bytes = element count x bytes per element,
# summed separately over parameters and over registered buffers.
mem_params = sum(p.nelement() * p.element_size() for p in model.parameters())
mem_bufs = sum(b.nelement() * b.element_size() for b in model.buffers())
mem = mem_bufs + mem_params

# Report the total converted from bytes to MiB (1 << 20 bytes)
print(mem / (1 << 20))

BioGptModel(
  (embed_tokens): Embedding(42384, 1024, padding_idx=1)
  (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-23): 24 x BioGptDecoderLayer(
      (self_attn): BioGptAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
661.3984375


In [50]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()`` and sums ``numel()`` for every
    parameter; a parameter is counted as trainable when its ``requires_grad``
    attribute is truthy.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        None. A single summary line is written to stdout. A model without any
        parameters is reported as ``trainable%: 0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

Bad pipe message: %s [b'&~\xbb\xd7!\xb6\xf5JqH5\xc2\xdd\x0e\xdc2\xa3T qu\xdekb\xe9\xb8\xd4\x9d\xba`\xc1\xa2\x1c\x1c1\xd9Z\x1c\xa6\xca\x8a\xfe79\x12\xb7\x95\x1f\xe3\xd8\xfc\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06', b'\x05\x01\x06', b'']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \x10$\x1c\x92oV\xf5\xd2\x19\x91#\xe4H\xddA\xfa\x94\xb8\xb6O\x1b\xd2']
Bad pipe message: %s [b'']
Bad pipe message: %s [b"\xf3A\xd8\x0b\x08\x89b\x9d\xa9\x84\x8c~\x90\xf5N\x8a\x02<\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$

In [43]:
# Parameter summary for `model` as it stands before this cell wraps it.
print_trainable_parameters(model)

from peft import LoraConfig, get_peft_model 

# LoRA settings: rank-8 adapters with alpha 16 and 5% adapter dropout, bias handling "none".
# NOTE(review): task_type is "CAUSAL_LM" while `model` is loaded through AutoModel
# elsewhere in this notebook -- confirm this is the intended PEFT task type.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["k_proj", "v_proj"], # TODO(review): fc1/fc2 are mentioned as further targets but are not listed here
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# `model` is rebound to the value returned by get_peft_model, so re-running this
# cell passes the already-wrapped object in again; reload the base model first.
model = get_peft_model(model, config)
# Same summary again, now for the wrapped model.
print_trainable_parameters(model)

trainable params: 346763264 || all params: 346763264 || trainable%: 100.0
trainable params: 786432 || all params: 347549696 || trainable%: 0.22627900672944337


In [44]:
# Print the module tree of whatever `model` currently refers to, for inspection.
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BioGptModel(
      (embed_tokens): Embedding(42384, 1024, padding_idx=1)
      (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-23): 24 x BioGptDecoderLayer(
          (self_attn): BioGptAttention(
            (k_proj): Linear(
              in_features=1024, out_features=1024, bias=True
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=8, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=8, out_features=1024, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
            )
            (v_proj): Linear(
              in_features=1024, out_features=1024, bias=True
          

In [1]:
# Base Case

# base

In [3]:
from transformers import AutoModel
from peft import PeftModelForFeatureExtraction, get_peft_config

# LoRA settings kept as a plain mapping; get_peft_config receives it as-is.
config = dict(
    peft_type="LORA",
    task_type="FEATURE_EXTRACTION",
    inference_mode=False,
    r=16,
    target_modules=["query", "value"],
    lora_alpha=32,
    lora_dropout=0.05,
    fan_in_fan_out=False,
    bias="none",
)
peft_config = get_peft_config(config)
model = AutoModel.from_pretrained("bert-base-cased")
# Wrapping the model is intentionally left disabled in this cell:
# peft_model = PeftModelForFeatureExtraction(model, peft_config)
# peft_model.print_trainable_parameters()


In [None]:
import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from transformers import AutoTokenizer
from transformers import AutoModel
from peft import PeftModelForFeatureExtraction, get_peft_config

# Setup as provided: LoRA settings as a plain dict, turned into a PEFT config,
# then used to wrap a freshly loaded bert-base-cased model.
config = {
    "peft_type": "LORA",
    "task_type": "FEATURE_EXTRACTION",
    "inference_mode": False,
    "r": 16,
    "target_modules": ["query", "value"],
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "fan_in_fan_out": False,
    "bias": "none",
}
peft_config = get_peft_config(config)
model = AutoModel.from_pretrained("bert-base-cased")
peft_model = PeftModelForFeatureExtraction(model, peft_config)
# NOTE(review): nothing in this cell moves `peft_model` to a GPU, yet the probe
# below looks for CUDA out-of-memory errors -- confirm where the model is
# expected to live before trusting the reported batch size.

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
MAX_LENGTH = 512  # Choose according to your requirements

# Get sample data from IMDb
NUM_SAMPLES = 1000  # size of the subset used for testing
train_iter = IMDB(split='train')
# Pair the dataset with a bounded range instead of calling next() on it:
# this works when IMDB returns an iterable that is not itself an iterator,
# and it stops cleanly if fewer than NUM_SAMPLES items are available.
data = [sample for _, sample in zip(range(NUM_SAMPLES), train_iter)]
# assumes each sample is a (label, text) pair -- TODO confirm against the installed torchtext
texts = [t[1] for t in data]

def test_batch_size(batch_size, texts):
    """Test a given batch size"""
    input_ids = tokenizer(texts[:batch_size], return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)["input_ids"]
    with torch.no_grad():
        peft_model(input_ids)

def find_max_batch_size(max_batch_size=None):
    """Incrementally test batch sizes, doubling until one runs out of memory.

    Starts at 1 and doubles after every successful ``test_batch_size`` call on
    the module-level ``texts``. The search never goes past ``len(texts)``:
    a larger batch cannot be built from the available data, and without this
    bound the loop would never terminate when no out-of-memory error occurs.

    Args:
        max_batch_size: Optional extra upper bound on the batch sizes tried.
            ``None`` (default) means only ``len(texts)`` limits the search.

    Returns:
        The largest batch size that completed, or 0 if none did.

    Raises:
        RuntimeError: Any runtime error that is not an out-of-memory error is
            re-raised with its original traceback.
    """
    limit = len(texts)
    if max_batch_size is not None:
        limit = min(limit, max_batch_size)

    largest_ok = 0
    batch_size = 1
    while batch_size <= limit:
        try:
            test_batch_size(batch_size, texts)
        except RuntimeError as e:
            # Case-insensitive match covers "CUDA out of memory" as well as
            # the "CUDA error: out of memory" wording.
            if "out of memory" in str(e).lower():
                return largest_ok
            raise
        print(f"Batch size {batch_size} succeeded")
        largest_ok = batch_size
        batch_size *= 2
    return largest_ok

# Run the doubling search and report the batch size it returns.
max_batch = find_max_batch_size()
print(f"Maximum supported batch size without OOM error: {max_batch}")
