# Take Two

In [1]:
import numpy as np
from datasets import Dataset
import os
from pathlib import Path
from pynvml import *

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

from src.utils.data import read_pickle

# Load test dataset
GO_ANNOTATIONS_PATH = "/home/samirchar/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl"
go_annotations = read_pickle(GO_ANNOTATIONS_PATH)

# Get first 1000 labels as a list
text = go_annotations.iloc[:1000, 0].tolist()

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

print_gpu_utilization()
checkpoint = 'microsoft/biogpt'
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    # torch_dtype=torch.float16,
).to("cuda")

# Initialize label tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print_gpu_utilization()

default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "no",
    "do_eval": False,
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

GPU memory occupied: 5131 MB.


Some weights of BioGptForSequenceClassification were not initialized from the model checkpoint at microsoft/biogpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU memory occupied: 7370 MB.


In [2]:
from datasets import Dataset
import numpy as np

# Tokenize the go_annotations list
tokenized_data = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

# Create random labels for the new dataset
random_labels = np.random.randint(0, 1, (len(text)))

# Create the dataset from the tokenized data and random labels
dummy_dataset = Dataset.from_dict({
    "input_ids": tokenized_data["input_ids"],
    "attention_mask": tokenized_data["attention_mask"], 
    "labels": random_labels
})

# Set the format to PyTorch tensors
dummy_dataset.set_format("pt")

print_trainable_parameters(model)


NameError: name 'tokenizer' is not defined

In [4]:
import re
def biogpt_train_last_n_layers(model,n):
    if n>0:
        max_layer_num = len(model.biogpt.layers)-1
        for param_name,param in model.named_parameters():
            layer_num = re.search(r'biogpt.layers\.(\d+)', param_name)

            if layer_num:
                number = int(layer_num.group(1))
                if number>max_layer_num-n:
                    param.requires_grad = True

        for param in model.biogpt.layer_norm.parameters():
            param.requires_grad = True
        

In [5]:
from transformers import TrainingArguments, Trainer, logging

logging.set_verbosity_error()

# freeze all layers
for param in model.parameters():
    param.requires_grad = False

biogpt_train_last_n_layers(model,4)


test_model = model

# Check if model is on cuda
print(f"Model on cuda: {next(test_model.parameters()).is_cuda}")

batch_size = 40
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size, 
    remove_unused_columns=False,
    fp16=True,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    **default_args
)
trainer = Trainer(model=test_model, args=training_args, train_dataset=dummy_dataset)
result = trainer.train()
print_summary(result)



Model on cuda: True
{'train_runtime': 7.7045, 'train_samples_per_second': 129.795, 'train_steps_per_second': 3.245, 'train_loss': 0.029159345626831056, 'epoch': 1.0}
Time: 7.70
Samples/second: 129.79
GPU memory occupied: 22582 MB.


train full model: memory 62150 MB

train last layer: memory 16900 MB

train last 2 layers: memory 18820  MB

train last 3 layers: memory 20620  MB

train last 4 layers: memory 22582  MB

train full model + lora: memory 54080 MB



LoRa:

Model on cuda: True
{'train_runtime': 14.3185, 'train_samples_per_second': 69.84, 'train_steps_per_second': 1.746, 'train_loss': 1.344638671875, 'epoch': 1.0}
Time: 14.32
Samples/second: 69.84
GPU memory occupied: 55019 MB.