# Take Two

In [1]:
import numpy as np
from datasets import Dataset
import os
from pathlib import Path
from pynvml import *

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

from src.utils.data import read_pickle

# seq_len, dataset_size = 512, 512
# dummy_data = {
#     "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
#     "labels": np.random.randint(0, 1, (dataset_size)),
# }
# ds = Dataset.from_dict(dummy_data)
# ds.set_format("pt")

# Load test dataset
GO_ANNOTATIONS_PATH = "/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl"
go_annotations = read_pickle(GO_ANNOTATIONS_PATH)

# Get first 1000 labels as a list
text = go_annotations.iloc[:1000, 0].tolist()

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

print_gpu_utilization()
checkpoint = 'microsoft/biogpt'
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    # torch_dtype=torch.float16,
).to("cuda")

# Initialize label tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print_gpu_utilization()

default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "no",
    "do_eval": False,
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

GPU memory occupied: 47 MB.


Some weights of BioGptForSequenceClassification were not initialized from the model checkpoint at microsoft/biogpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU memory occupied: 2292 MB.


In [3]:
import loralib as lora
for layer in model.biogpt.layers:
    # This assumes in_features and out_features are 1024 as per the model details you shared
    in_features, out_features = 1024, 1024
    layer.self_attn.q_proj = lora.Linear(in_features, out_features, r=16)  # Choose rank r as per requirement
    layer.self_attn.v_proj = lora.Linear(in_features, out_features, r=16)  # Choose rank r as per requirement

In [4]:
lora.mark_only_lora_as_trainable(model)
print_trainable_parameters(model)

trainable params: 1572864 || all params: 348338176 || trainable%: 0.45


In [5]:
from datasets import Dataset
import numpy as np
from peft import LoraConfig, get_peft_model

# Tokenize the go_annotations list
tokenized_data = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

# Create random labels for the new dataset
random_labels = np.random.randint(0, 1, (len(text)))

# Create the dataset from the tokenized data and random labels
dummy_dataset = Dataset.from_dict({
    "input_ids": tokenized_data["input_ids"],
    "attention_mask": tokenized_data["attention_mask"], 
    "labels": random_labels
})

# Set the format to PyTorch tensors
dummy_dataset.set_format("pt")

print_trainable_parameters(model)

# config = LoraConfig(
#     r=4,
#     lora_alpha=8,
#     target_modules=["k_proj", "v_proj"],
#     lora_dropout=0.5,
#     bias="none",
#     modules_to_save=["classifier"],
# )
# lora_model = get_peft_model(model, config)
# print_trainable_parameters(lora_model)

# # Set requires grad to true for all parameters
# for param in lora_model.parameters():
#     param.requires_grad = True

trainable params: 1572864 || all params: 348338176 || trainable%: 0.45


In [6]:
from transformers import TrainingArguments, Trainer, logging
import transformers
import accelerate
import peft

logging.set_verbosity_error()
# model = lora_model

model.enable_input_require_grads()

test_model = model

# Check if model is on cuda
print(f"Model on cuda: {next(test_model.parameters()).is_cuda}")

batch_size = 150
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size, 
    remove_unused_columns=False,
    fp16=True,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    **default_args
)
trainer = Trainer(model=test_model, args=training_args, train_dataset=dummy_dataset)
result = trainer.train()
print_summary(result)

# 40 GB with batch size = 40, float16, trains in 5 seconds
# 73 GB with batch size = 40, trains in 34 seconds

# With LoRA, batch size = 40: 30974  MB
# Without LoRA, batch size = 40: 39228 MB.


Model on cuda: True
{'train_runtime': 18.9111, 'train_samples_per_second': 52.879, 'train_steps_per_second': 0.264, 'train_loss': 1.65550537109375, 'epoch': 1.0}
Time: 18.91
Samples/second: 52.88
GPU memory occupied: 34948 MB.


: 