# Take Two

In [2]:
import numpy as np
from datasets import Dataset
import os
from pathlib import Path
from pynvml import *
import loralib as lora
import torch

# `sys` was previously reachable only as a side effect of `from pynvml import *`;
# import it explicitly so this cell does not depend on pynvml's namespace.
import sys

# Put the parent of the current working directory on the import path so that
# the `src.*` imports below resolve (presumably the repository root -- confirm).
curdir = Path(os.getcwd())
repo_root = str(curdir.parent.absolute())
# Guard against duplicate entries when the cell is re-run in the same kernel.
if repo_root not in sys.path:
    sys.path.append(repo_root)

from src.utils.data import read_pickle


# Load the GO annotations table from a pickle file.
# NOTE(review): absolute, machine-specific path -- consider a configurable DATA_DIR.
GO_ANNOTATIONS_PATH = "/home/samirchar/ProteinFunctions/data/annotations/go_annotations_2019_07_01_updated.pkl"

# `go_annotations` is later indexed with ['label'], .loc[...] and .index, so it
# is presumably a pandas DataFrame keyed by GO id -- confirm against read_pickle.
go_annotations = read_pickle(GO_ANNOTATIONS_PATH)
# NOTE(review): read_json is imported here but not used in the visible cells.
from src.utils.data import read_json
# Precomputed label embeddings and the index describing each embedding row.
embeddings_path ="/home/samirchar/ProteinFunctions/data/embeddings/frozen_BioGPT_label_embeddings_last_token.pt" 
embeddings_idx_path ="/home/samirchar/ProteinFunctions/data/embeddings/frozen_BioGPT_label_embeddings_last_token_index.pt" 

# NOTE: `embeddings` is rebound to an empty list by a later cell, so this
# loaded object is only available until that cell runs.
embeddings = torch.load(embeddings_path)
embeddings_index = torch.load(embeddings_idx_path)

# Collect every entry of the 'label' column as a Python list of strings to tokenize.
text = go_annotations['label'].tolist()

def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, in MB, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which
            matches the previously hard-coded device.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Balance the nvmlInit() above so repeated calls from many cells do not
        # leave NVML initialised after the query is done.
        nvmlShutdown()


def print_summary(result):
    """Report runtime and throughput from ``result.metrics``, then GPU memory use.

    Args:
        result: object whose ``metrics`` mapping contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    metrics = result.metrics

    runtime = metrics['train_runtime']
    print(f"Time: {runtime:.2f}")

    throughput = metrics['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")

    print_gpu_utilization()

def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()`` and prints the number of elements in
    parameters with ``requires_grad`` set, the total number of elements, and
    the trainable share as a percentage with two decimals.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [3]:
embeddings_index


Unnamed: 0,id,description_type
0,GO:0000001,name
1,GO:0000001,label
2,GO:0000001,synonym_exact
3,GO:0000002,name
4,GO:0000002,label
...,...,...
185239,GO:2001317,synonym_exact
185240,GO:2001317,synonym_exact
185241,GO:2001317,synonym_exact
185242,GO:2001317,synonym_exact


In [None]:
[-1.2802,  0.4720,  2.4675,  ..., -1.8398, -1.0484,  0.5001]
[-0.5067,  0.1378, -0.5433,  ..., -0.1567,  0.1737, -0.3209]

In [8]:
go_annotations.loc['GO:2001317']['label']

'The chemical reactions and pathways resulting in the formation of kojic acid.'

In [4]:
from src.utils.data import read_fasta
import torch

In [30]:
text=[text[47400]]

In [33]:
text =[text]

In [34]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel

# Report GPU memory before and after loading, to see the cost of the model itself.
print_gpu_utilization()
checkpoint = 'microsoft/biogpt'
# Load the base model (no task head) and move it to the GPU. The half-precision
# option is left commented out, so the library's default dtype is used.
model = AutoModel.from_pretrained(
    checkpoint,
    # torch_dtype=torch.float16,
).to("cuda")

# Initialize label tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print_gpu_utilization()

# Shared keyword arguments that the benchmark cell unpacks into
# TrainingArguments: a single epoch, no evaluation, quiet logging, no reporting.
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "no",
    "do_eval": False,
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

# Tokenize the label descriptions in `text`, padding every entry to the longest one
tokenized_data = tokenizer(text, padding="longest", truncation=True, max_length=512, return_tensors="pt")

# Create random binary labels for the dummy dataset.
# np.random.randint's upper bound is exclusive, so the previous (0, 1) range
# always produced 0; (0, 2) draws from {0, 1}.
random_labels = np.random.randint(0, 2, (len(text)))

# Create the dataset from the tokenized data and random labels
dummy_dataset = Dataset.from_dict({
    "input_ids": tokenized_data["input_ids"],
    "attention_mask": tokenized_data["attention_mask"], 
    "labels": random_labels
})

# Set the format to PyTorch tensors
dummy_dataset.set_format("pt")

GPU memory occupied: 48427 MB.


GPU memory occupied: 48427 MB.


In [10]:

import re 
def apply_lora_biogpt_attention(layer,rank,alpha,device,in_features= 1024, out_features= 1024):
    """Swap a decoder layer's attention output projection for a LoRA linear layer.

    Only ``layer.self_attn.out_proj`` is adapted; the q/k/v projections and the
    feed-forward layers (``fc1``/``fc2``) are left untouched.

    The weight (and bias, when present) of the projection being replaced are
    copied into the new ``lora.Linear``. Without that copy the replacement
    starts from a fresh random initialisation and the pretrained projection
    is lost.

    Args:
        layer: module exposing ``self_attn.out_proj`` (a BioGPT decoder layer
            in this notebook). It is modified in place.
        rank: LoRA rank, passed as ``r``.
        alpha: LoRA scaling factor, passed as ``lora_alpha``.
        device: device the whole layer is moved to after the swap.
        in_features: input width of the projection.
        out_features: output width of the projection.

    Raises:
        ValueError: if ``(out_features, in_features)`` does not match the shape
            of the existing projection weight, since the pretrained weight
            could not be carried over.
    """
    pretrained = layer.self_attn.out_proj
    if tuple(pretrained.weight.shape) != (out_features, in_features):
        raise ValueError(
            "out_proj weight has shape "
            f"{tuple(pretrained.weight.shape)}, expected "
            f"({out_features}, {in_features})"
        )

    pretrained_bias = getattr(pretrained, 'bias', None)
    adapted = lora.Linear(
        in_features, out_features, r=rank, lora_alpha=alpha,
        bias=pretrained_bias is not None)

    # Carry the pretrained projection over; the LoRA matrices keep their own
    # initialisation from lora.Linear.
    with torch.no_grad():
        adapted.weight.copy_(pretrained.weight)
        if pretrained_bias is not None:
            adapted.bias.copy_(pretrained_bias)

    layer.self_attn.out_proj = adapted

    # Module.to() moves parameters in place, so no rebinding is needed.
    layer.to(device)
    
def biogpt_train_last_n_layers(model,n,lora_params=None):
    """Freeze ``model`` and re-enable training for its last ``n`` decoder layers.

    Every parameter is frozen first. When ``n > 0``:

    * the parameters of the last ``n`` entries of ``model.layers`` are made
      trainable (all layers when ``n`` exceeds the number of layers);
    * if ``lora_params`` is given, ``apply_lora_biogpt_attention`` is called
      exactly once for each of those layers, and afterwards
      ``lora.mark_only_lora_as_trainable`` is applied to the whole model;
    * the parameters of ``model.layer_norm`` are always made trainable.

    When ``n <= 0`` nothing is unfrozen: the model stays fully frozen,
    including ``model.layer_norm``, and ``lora_params`` is ignored.

    Args:
        model: module exposing ``layers`` (indexable sequence of decoder
            layers) and ``layer_norm``.
        n: number of trailing decoder layers to train.
        lora_params: optional dict of keyword arguments for
            ``apply_lora_biogpt_attention`` (everything except ``layer``).
    """
    for param in model.parameters():
        param.requires_grad = False

    if n>0:
        num_layers = len(model.layers)
        first_trainable = max(num_layers - n, 0)

        # Snapshot the selected layers up front so that swapping sub-modules
        # for LoRA never happens while a parameter iterator is being consumed.
        selected_layers = [model.layers[idx]
                           for idx in range(first_trainable, num_layers)]

        for layer in selected_layers:
            for param in layer.parameters():
                param.requires_grad = True
            if lora_params is not None:
                # Once per layer (not once per parameter of the layer).
                apply_lora_biogpt_attention(**{**lora_params,
                                               'layer': layer})

        if lora_params is not None:
            lora.mark_only_lora_as_trainable(model)

        #Always train last layer norm.
        for param in model.layer_norm.parameters():
            param.requires_grad = True


In [11]:
model

BioGptModel(
  (embed_tokens): Embedding(42384, 1024, padding_idx=1)
  (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-23): 24 x BioGptDecoderLayer(
      (self_attn): BioGptAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)

In [12]:
# With n=0 the `if n>0` branch of biogpt_train_last_n_layers is skipped, so the
# LoRA parameters passed here are not used and every parameter stays frozen
# (the recorded output below reports 0 trainable parameters).
biogpt_train_last_n_layers(model,
                           0,
                           lora_params={'rank':8,'alpha':4,'in_features':1024,'out_features':1024,'device':'cuda:0'}
                           )

# List the names of the parameters that are still trainable, then the totals.
for param_name,param in model.named_parameters():
    if param.requires_grad:
        print(param_name)
print_trainable_parameters(model)



trainable params: 0 || all params: 346763264 || trainable%: 0.00


In [13]:
def pool_embeddings(last_hidden_states,attention_mask,method):
    '''
    Reduce per-token hidden states to one embedding per sequence.

    Args:
        last_hidden_states: tensor indexed as (batch, token, hidden).
        attention_mask: (batch, token) tensor that is non-zero at real tokens
            and 0 at padding. Position 0 is treated as the SOS token.
        method: one of
            'mean'       - average over attended tokens, excluding position 0;
            'last_token' - hidden state at index (number of attended tokens - 1),
                           i.e. the last real token when padding is on the right;
            'all'        - return ``last_hidden_states`` unchanged.

    Returns:
        (batch, hidden) tensor for 'mean' and 'last_token' (the batch
        dimension is kept even when it is 1); the input tensor for 'all'.

    Raises:
        ValueError: if ``method`` is not one of the values above.
    '''
    sequence_length = attention_mask.sum(dim=1, keepdim=True) #including SOS token

    if method=='mean':
        #Account for SOS token
        adjusted_attention_mask = attention_mask.clone()
        adjusted_attention_mask[:,0]=0

        # Mask the last_hidden_state tensor and compute the sum
        sum_hidden_states = (last_hidden_states *
                                adjusted_attention_mask.unsqueeze(-1)).sum(dim=1)

        # Number of tokens that actually contributed to the sum. Clamped to 1
        # so that a sequence holding only the SOS token gives zeros, not NaN.
        num_tokens = adjusted_attention_mask.sum(dim=1, keepdim=True).clamp(min=1)

        # Compute the mean of the last hidden state
        sequence_embedding = sum_hidden_states / num_tokens

    elif method == 'last_token':
        # gather() needs integer (int64) indices shaped (batch, 1, hidden).
        last_token_indices = (sequence_length - 1)\
            .long()\
            .unsqueeze(-1)\
            .expand(-1, -1, last_hidden_states.size(-1))

        # squeeze(1) drops only the token dimension; a bare squeeze() would
        # also drop the batch dimension for a batch of one.
        sequence_embedding = last_hidden_states.gather(1, last_token_indices).squeeze(1)

    elif method == 'all':
        sequence_embedding = last_hidden_states

    else:
        raise ValueError(
            f"Unknown pooling method {method!r}; expected 'mean', 'last_token' or 'all'"
        )

    return sequence_embedding

In [None]:
raw_attn_scorer = torch.nn.Linear(1024,1, bias=True).to('cuda:0')

def additive_attention(hidden_states,attention_mask,scorer=None):
    """Pool token hidden states into one vector per sequence with learned weights.

    Each token gets a scalar score from ``scorer``; scores at masked positions
    are set to -inf, the rest are softmax-normalised over the token dimension,
    and the result is the weighted sum of the hidden states.

    Args:
        hidden_states: tensor indexed as (batch, token, hidden).
        attention_mask: (batch, token) tensor; positions equal to 0 are
            excluded from the softmax.
        scorer: callable mapping (batch, token, hidden) to (batch, token, 1).
            Defaults to the notebook-level ``raw_attn_scorer`` so existing
            two-argument calls behave as before.

    Returns:
        (batch, hidden) tensor. A sequence whose mask is entirely 0 yields NaN,
        because the softmax is taken over scores that are all -inf.
    """
    if scorer is None:
        # Fall back to the module defined in the notebook's global scope.
        scorer = raw_attn_scorer

    raw_attn_scores = scorer(hidden_states).squeeze(-1)

    #Mask scores for softmax
    raw_attn_scores = raw_attn_scores.masked_fill(attention_mask==0,float('-inf'))

    #Normalized attention weights
    attn_weights = torch.softmax(raw_attn_scores,dim=-1)

    #Get final label embedding: (batch,1,token) x (batch,token,hidden)
    return torch.bmm(attn_weights.unsqueeze(1),hidden_states).squeeze(1)

In [None]:
def _get_joint_embeddings(P_e, L_e, num_sequences,num_labels):

    sequence_embedding_dim = P_e.shape[1]
    label_embedding_dim = L_e.shape[1]

    # Use broadcasting so we don't have to expand the tensor dimensions
    joint_embeddings = torch.cat([
        P_e[:, None, :].expand(
            num_sequences, num_labels, sequence_embedding_dim),
        L_e[None, :, :].expand(
            num_sequences, num_labels, label_embedding_dim)
    ], dim=2).reshape(-1, sequence_embedding_dim + label_embedding_dim)

    return joint_embeddings

In [None]:
from torchvision.ops import MLP


In [None]:
# Random stand-in for a batch of 32 sequence feature vectors of width 1100.
P_f = torch.rand((32,1100)).to('cuda:0')

#L_e = torch.rand((32102,1024))
# NOTE(review): `label_embeddings` and `masks` are not defined in any visible
# cell -- this line relies on kernel state from elsewhere.
L_f = additive_attention(label_embeddings,masks)
# Single-hidden-layer projections (with BatchNorm1d, no bias) mapping sequence
# features (1100) and label features (1024) to a common width of 1024.
W_p = MLP(1100,[1024]*1,bias=False,norm_layer=torch.nn.BatchNorm1d).to('cuda:0')
W_l = MLP(1024,[1024]*1,bias=False,norm_layer=torch.nn.BatchNorm1d).to('cuda:0')

# Projected label and sequence embeddings used by _get_joint_embeddings below.
L_e = W_l(L_f)
P_e = W_p(P_f)

In [None]:
joint=_get_joint_embeddings(P_e.to('cuda:0'), L_e, 32,32102)

In [None]:
joint.shape,P_e.shape,L_e.shape,P_f.shape,L_f.shape

In [None]:
# Synthetic batch for the memory experiments: 1000 sequences of 60 random token ids.
tokenized_data['input_ids']=torch.randint(0,10000,(1000,60)).to('cuda:0')
# torch.randint's upper bound is exclusive, so randint(0,1,...) produced an
# all-zero mask (no token attended, and a last-token index of -1 downstream).
# Mark every position as a real token instead, which also matches the
# contiguous-mask assumption of the pooling helpers.
tokenized_data['attention_mask']=torch.ones((1000,60),dtype=torch.long).to('cuda:0')

In [None]:
tokenized_data['input_ids'].shape

In [None]:
torch.cuda.memory_allocated('cuda:0')

In [20]:
np.where(go_annotations.index=='GO:2001317')

(array([47400]),)

In [22]:
tokenized_data['input_ids'][[47400]]

tensor([[    2,    18,   919,  1263,     8,   885,   946,    10,     6,   381,
             5, 15527,  1777,  1047,   156,     4,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [35]:
import torch.autograd.profiler as profiler
import torch
from torch.cuda.amp import autocast

# Number of rows of tokenized_data to push through the model in one forward pass.
bs = 1000
# NOTE: rebinds `embeddings`, replacing the tensor loaded from disk in the first cell.
embeddings = []

# Single forward pass (range(1)) used to measure inference memory.
for i in range(1):
    print(i)
    # Mixed precision with gradient tracking switched off.
    with autocast(), torch.set_grad_enabled(False):

        # Run the model on the first `bs` rows and keep one pooled vector per
        # row, taken at the last attended token.
        embeddings.append(
            pool_embeddings(
                model(
                    input_ids = tokenized_data['input_ids'][:bs,:].to('cuda:0'),
                    attention_mask = tokenized_data['attention_mask'][:bs,:].to('cuda:0')).last_hidden_state,
                    tokenized_data['attention_mask'][:bs,:].to('cuda:0'),
                    method='last_token'
                    )
                )   
    # Bytes currently allocated by PyTorch tensors on the GPU.
    print(torch.cuda.memory_allocated('cuda:0'))

0
3408222720


In [38]:
embeddings[0].sum()

tensor(-44.6542, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
zero_shot_embeddings

In [None]:
zero_shot_vocabularies

In [None]:
len(embeddings)

In [None]:
81.2/2744

In [None]:
len(embeddings)

In [None]:
torch.cuda.memory_allocated('cuda:0')

In [None]:
torch.cuda.memory_allocated('cuda:0')

In [None]:
(13235055616-1387511808)/1e9

In [None]:
import torch.autograd.profiler as profiler
import torch
# Profile 13 forward passes with stack traces and memory tracking enabled.
with profiler.profile(with_stack=True, profile_memory=True) as prof:

    # Pooled outputs are kept in `outs`, so with gradients enabled each pass's
    # result (and whatever it references) stays alive across iterations.
    outs = []
    with torch.set_grad_enabled(True):
        for i in range(13):
            print(i)
            # NOTE: `out` is the last_hidden_state tensor itself, not the model output object.
            out = model(input_ids = tokenized_data['input_ids'].to('cuda:0'),attention_mask = tokenized_data['attention_mask'].to('cuda:0')).last_hidden_state
            sequence_embedding = pool_embeddings(out,tokenized_data['attention_mask'].to('cuda:0'),method='last_token')
            outs.append(sequence_embedding)

In [None]:
print_gpu_utilization()
torch.cuda.memory_allocated('cuda:0')

In [None]:
print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cuda_memory_usage', row_limit=50))


In [None]:
print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5))


In [None]:
torch.cuda.memory_allocated('cuda:0')

In [None]:
5600617472/1606418432

In [None]:
torch.cuda.memory_allocated('cuda:0')

In [None]:
5600617472 - 5599797248

In [None]:
print_gpu_utilization()

In [None]:
# method='all' returns the per-token hidden states unchanged.
# NOTE(review): the only visible assignment of `out` (profiler cell) already
# stores `.last_hidden_state`, so `out.last_hidden_state` here presumably relies
# on `out` being a model output object from another cell -- confirm.
sequence_embedding = pool_embeddings(out.last_hidden_state,tokenized_data['attention_mask'].to('cuda:0'),method='all')

In [None]:
# Linear layer producing one raw attention score per token.
# `nn` is not imported in any visible cell; use the fully qualified `torch.nn`
# path, as the other scorer definition in this notebook does.
raw_attn_scorer = torch.nn.Linear(sequence_embedding.shape[-1],1, bias=True).to('cuda:0')


In [None]:
 
# One raw score per token: (batch, token, 1) -> (batch, token).
raw_attn_scores = raw_attn_scorer(sequence_embedding).squeeze(-1)
# Additive mask: -inf at padding positions, the mask value itself (1) elsewhere.
# Adding the same constant to every attended score does not change the softmax.
softmax_mask = torch.where(tokenized_data['attention_mask'].to('cuda:0')==0,-torch.inf,tokenized_data['attention_mask'].to('cuda:0'))
attn_weights = torch.softmax(raw_attn_scores+softmax_mask,dim=-1)
# Weighted sum over the token dimension via broadcasting.
sequence_embedding_ = (sequence_embedding*attn_weights.unsqueeze(-1)).sum(dim=1)

In [None]:
sequence_embedding_.sum()

In [None]:
 
# Same pooling as the previous cell, but masking with masked_fill and doing the
# weighted sum with bmm, to check that both formulations agree.
raw_attn_scores = raw_attn_scorer(sequence_embedding).squeeze(-1)

#Mask scores for softmax
raw_attn_scores = raw_attn_scores.masked_fill(tokenized_data['attention_mask'].to('cuda:0')==0,float('-inf'))

#softmax_mask = torch.where(tokenized_data['attention_mask'].to('cuda:0')==0,-torch.inf,tokenized_data['attention_mask'].to('cuda:0'))
# NOTE(review): `softmax_mask` is still added here although its definition above
# is commented out, so this relies on the value left by the previous cell.
# Padding is already -inf after masked_fill, so the addition is redundant.
attn_weights = torch.softmax(raw_attn_scores+softmax_mask,dim=-1)
# (batch,1,token) x (batch,token,hidden) -> (batch,1,hidden); not squeezed here.
sequence_embedding___ = torch.bmm(attn_weights.unsqueeze(1),sequence_embedding)

In [None]:
(sequence_embedding_-sequence_embedding___.squeeze(1)).sum()

In [None]:
sequence_embedding___.sum()

In [None]:
torch.softmax(raw_attn_scores[0][:28],dim=0)

In [None]:
raw_attn_scores

In [None]:
torch.softmax(raw_attn_scores+softmax_mask,dim=-1)

In [None]:
def compute_mean_hidden_states(last_hidden_states, attention_mask):
    """Average each sequence's hidden states over its attended positions.

    Args:
        last_hidden_states: tensor indexed as (batch, token, hidden).
        attention_mask: (batch, token) tensor used as a multiplicative weight;
            positions with value 0 do not contribute.

    Returns:
        (batch, hidden) tensor: the masked sum over the token dimension
        divided by the per-sequence sum of the mask.
    """
    # Zero out hidden states at masked positions, then add up what is left.
    mask_column = attention_mask[..., None]
    masked_total = torch.sum(last_hidden_states * mask_column, dim=1)

    # Per-sequence count of attended tokens, kept 2-D so it broadcasts over hidden.
    token_counts = torch.sum(attention_mask, dim=1, keepdim=True)

    return masked_total / token_counts

compute_mean_hidden_states(out.last_hidden_state,tokenized_data['attention_mask'].to('cuda:0'))


In [None]:
score(out.last_hidden_state).shape

In [None]:
# NOTE(review): `score` is not defined in any visible cell, and the only visible
# assignment of `out` is already a tensor -- this cell relies on kernel state
# from elsewhere.
linear_output = score(out.last_hidden_state)
# Index of the last non-padding token in each row: count of ids different from
# the pad token id, minus one.
idxs = (torch.ne(tokenized_data['input_ids'].to('cuda:0'), model.config.pad_token_id).sum(-1) - 1).to('cuda:0')
# Select the output at that position for each row; arange(2) hard-codes a batch of two rows.
linear_output[torch.arange(2, device='cuda:0'), idxs.squeeze()]


In [None]:
from transformers import TrainingArguments, Trainer, logging

# Silence transformers logging below error level for the benchmark run.
logging.set_verbosity_error()


# Benchmark whatever `model` currently is (with whichever layers / LoRA
# adapters were made trainable in earlier cells).
test_model = model

# Check if model is on cuda
print(f"Model on cuda: {next(test_model.parameters()).is_cuda}")

# One epoch over the dummy dataset with fp16, no gradient accumulation and no
# gradient checkpointing; the remaining options come from `default_args`.
batch_size = 40
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size, 
    remove_unused_columns=False,
    fp16=True,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    **default_args
)
trainer = Trainer(model=test_model, args=training_args, train_dataset=dummy_dataset)
result = trainer.train()
# Print runtime, throughput and GPU memory for this configuration.
print_summary(result)



train full model: memory 62150 MB

train last layer: memory 16900 MB

train last 2 layers: memory 18820  MB

train last 3 layers: memory 20620  MB

train last 4 layers: memory 22582  MB

train full model + lora: memory 54080 MB



LoRA:

Model on cuda: True
{'train_runtime': 14.3185, 'train_samples_per_second': 69.84, 'train_steps_per_second': 1.746, 'train_loss': 1.344638671875, 'epoch': 1.0}
Time: 14.32
Samples/second: 69.84
GPU memory occupied: 55019 MB.