# Take Two

In [1]:
import os
import sys
from pathlib import Path

# Star import comes first so every explicit import below wins on a name clash.
from pynvml import *

import loralib as lora
import numpy as np
import torch
from datasets import Dataset

# Make the parent of the working directory importable so `src.*` resolves.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
curdir = Path(os.getcwd())
_repo_root = str(curdir.parent.absolute())
if _repo_root not in sys.path:
    sys.path.append(_repo_root)

from src.utils.data import read_pickle


# Load the GO annotations table from a pickle file.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# on the machine it was written on — consider a configurable data directory.
GO_ANNOTATIONS_PATH = "/home/samirchar/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl"
go_annotations = read_pickle(GO_ANNOTATIONS_PATH)

# First column of every row as a plain Python list (no row limit is applied).
text = go_annotations.iloc[:, 0].tolist()

def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which is
            what the original zero-argument helper always used.

    The figure comes from the NVML device memory info, not from PyTorch's
    allocator, so it does not have to match ``torch.cuda.memory_allocated``.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Pair every nvmlInit with a shutdown so repeated calls do not leave
        # NVML initialised.
        nvmlShutdown()
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    """Report runtime, throughput and GPU memory after a training run.

    Args:
        result: object exposing a ``metrics`` mapping that contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    metrics = result.metrics
    print("Time: {:.2f}".format(metrics['train_runtime']))
    print("Samples/second: {:.2f}".format(metrics['train_samples_per_second']))
    print_gpu_utilization()

def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` has ``numel()`` and ``requires_grad``.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports 0.00% instead of
    raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: a parameter-free module has all_param == 0.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.utils.data import read_fasta
import torch

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel

# Baseline reading before any weights are moved to the GPU.
print_gpu_utilization()
checkpoint = 'microsoft/biogpt'
# AutoModel loads the checkpoint without a task head; its forward pass returns
# hidden states only (no loss), which matters for the Trainer cell further down.
model = AutoModel.from_pretrained(
    checkpoint,
    # Half-precision loading is left disabled: torch_dtype=torch.float16,
).to("cuda")

# Tokenizer matching the checkpoint; used below to tokenize the label texts.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Second reading: the difference to the first is the cost of loading the model.
print_gpu_utilization()

# Shared keyword arguments, later unpacked into TrainingArguments:
# one epoch, no evaluation, quiet logging, no external reporting.
default_args = dict(
    output_dir="tmp",
    evaluation_strategy="no",
    do_eval=False,
    num_train_epochs=1,
    log_level="error",
    report_to="none",
)

# Tokenize every entry of `text`, padding to the longest sequence in the batch
# and truncating anything beyond 512 tokens.
tokenized_data = tokenizer(text, padding="longest", truncation=True, max_length=512, return_tensors="pt")

# Random binary labels for the dummy dataset. np.random.randint excludes the
# upper bound, so it has to be 2 to draw from {0, 1}; the previous (0, 1) call
# could only ever return 0.
random_labels = np.random.randint(0, 2, (len(text)))

# Assemble the dataset from the tokenized inputs and the random labels.
dummy_dataset = Dataset.from_dict({
    "input_ids": tokenized_data["input_ids"],
    "attention_mask": tokenized_data["attention_mask"],
    "labels": random_labels
})

# Have the dataset hand out PyTorch tensors.
dummy_dataset.set_format("pt")

GPU memory occupied: 47 MB.


GPU memory occupied: 2290 MB.


In [31]:
(tokenized_data['attention_mask'].sum(axis=1)*1.0).quantile(.90)

tensor(57.)

In [29]:
np.array(text)[((tokenized_data['attention_mask']\
 .sum(axis=1)*1.0)>103).numpy()]

array(['A SAGA-type histone acetyltransferase complex that contains Spt8 (in budding yeast) or a homolog thereof; additional polypeptides include Spt group, consisting of Spt7, Spt3, and Spt20/Ada5, which interact with the TATA-binding protein (TBP); the Ada group, consisting of Ada1, Ada2, Ada3, Ada4/Gcn5, and Ada5/Spt20, which is functionally linked to the nucleosomal HAT activity; Tra1, an ATM/PI-3 kinase-related protein that targets DNA-bound activators for recruitment to promoters; the TBP-associated factor (TAF) proteins, consisting of Taf5, Taf6, Taf9, Taf10, and Taf12, which mediate nucleosomal HAT activity and are thought to help recruit the basal transcription machinery; the ubiquitin specifc protease Ubp-8.',
       'A ribonuclease complex that has 3-prime to 5-prime processive and distributive hydrolytic exoribonuclease activity and endoribonuclease activity, producing 5-prime-phosphomonoesters. Participates in a multitude of cellular RNA processing and degradation events p

In [28]:
text[[False,False]]

TypeError: list indices must be integers or slices, not list

In [4]:

import re 
def _lora_linear_from(linear, in_features, out_features, rank, alpha):
    """Build a ``lora.Linear`` of the requested size, seeded from ``linear``.

    A new ``lora.Linear`` starts from freshly initialised base weights, so the
    weight (and bias, when both modules have one) of the module being replaced
    is copied over whenever the shapes agree. If the requested size does not
    match the existing module, the new layer keeps its fresh initialisation,
    which is what the previous implementation always did.
    """
    lora_linear = lora.Linear(in_features, out_features, r=rank, lora_alpha=alpha)

    pretrained_weight = getattr(linear, "weight", None)
    if pretrained_weight is not None and pretrained_weight.shape == lora_linear.weight.shape:
        with torch.no_grad():
            lora_linear.weight.copy_(pretrained_weight)
            pretrained_bias = getattr(linear, "bias", None)
            if pretrained_bias is not None and lora_linear.bias is not None:
                lora_linear.bias.copy_(pretrained_bias)
    return lora_linear


def apply_lora_biogpt_attention(layer,rank,alpha,device,in_features= 1024, out_features= 1024):
    """Swap the linear sub-modules of one decoder layer for LoRA versions.

    Replaces ``self_attn.{q,v,k,out}_proj`` (``in_features -> out_features``),
    ``fc1`` (``in_features -> 4 * out_features``) and ``fc2``
    (``4 * in_features -> out_features``) in place, then moves the layer to
    ``device``. Despite the name, the two feed-forward projections are wrapped
    as well.

    Args:
        layer: decoder layer exposing ``self_attn`` and ``fc1`` / ``fc2``.
        rank: LoRA rank ``r``.
        alpha: LoRA scaling factor ``lora_alpha``.
        device: target device for the modified layer.
        in_features: input width of the attention projections.
        out_features: output width of the attention projections.

    Returns:
        None. ``layer`` is modified in place.
    """
    attn = layer.self_attn
    # Same replacement order as before: q, v, k, out, then fc1, fc2.
    for proj_name in ("q_proj", "v_proj", "k_proj", "out_proj"):
        setattr(
            attn,
            proj_name,
            _lora_linear_from(getattr(attn, proj_name), in_features, out_features, rank, alpha),
        )
    layer.fc1 = _lora_linear_from(layer.fc1, in_features, out_features * 4, rank, alpha)
    layer.fc2 = _lora_linear_from(layer.fc2, in_features * 4, out_features, rank, alpha)

    # Module.to moves the parameters in place; the freshly built LoRA modules
    # would otherwise stay on the CPU.
    layer.to(device)
    
def biogpt_train_last_n_layers(model,n,lora_params=None):
    """Freeze ``model`` and unfreeze only its last ``n`` decoder layers.

    Args:
        model: module exposing ``layers`` (indexable, with ``len``) and
            ``layer_norm``.
        n: number of trailing layers to train. ``n <= 0`` leaves the whole
            model frozen; values above the layer count unfreeze every layer.
        lora_params: optional keyword arguments for
            ``apply_lora_biogpt_attention`` (everything except ``layer``).
            When given, each selected layer is converted to LoRA exactly once
            and only LoRA parameters stay trainable inside the layers.

    Returns:
        None. ``requires_grad`` flags (and, with LoRA, sub-modules) are changed
        in place.
    """
    for param in model.parameters():
        param.requires_grad = False

    if n>0:
        num_layers = len(model.layers)
        first_trainable = max(num_layers - n, 0)

        # Walk the selected layers directly. The earlier version looped over
        # named_parameters() and re-applied LoRA once per parameter, rebuilding
        # each layer's adapters many times while the module tree was still
        # being iterated.
        for layer_idx in range(first_trainable, num_layers):
            layer = model.layers[layer_idx]
            if lora_params is not None:
                apply_lora_biogpt_attention(**{**lora_params, 'layer': layer})
            for param in layer.parameters():
                param.requires_grad = True

        if lora_params is not None:
            # Re-freezes everything except the LoRA matrices.
            lora.mark_only_lora_as_trainable(model)

        # The final layer norm is trained too (only reached when n > 0, as before).
        for param in model.layer_norm.parameters():
            param.requires_grad = True


In [5]:
# Convert only the last decoder layer to LoRA (rank 8, alpha 4) on cuda:0.
lora_settings = {'rank':8,'alpha':4,'in_features':1024,'out_features':1024,'device':'cuda:0'}
biogpt_train_last_n_layers(model, 1, lora_params=lora_settings)

# List every parameter that is still trainable, then print the totals.
for param_name in (name for name, param in model.named_parameters() if param.requires_grad):
    print(param_name)
print_trainable_parameters(model)



layers.23.self_attn.k_proj.lora_A
layers.23.self_attn.k_proj.lora_B
layers.23.self_attn.v_proj.lora_A
layers.23.self_attn.v_proj.lora_B
layers.23.self_attn.q_proj.lora_A
layers.23.self_attn.q_proj.lora_B
layers.23.self_attn.out_proj.lora_A
layers.23.self_attn.out_proj.lora_B
layers.23.fc1.lora_A
layers.23.fc1.lora_B
layers.23.fc2.lora_A
layers.23.fc2.lora_B
layer_norm.weight
layer_norm.bias
trainable params: 149504 || all params: 346910720 || trainable%: 0.04


In [6]:
def pool_embeddings(last_hidden_states,attention_mask,method):
    '''Reduce per-token hidden states to one embedding per sequence.

    Args:
        last_hidden_states: tensor indexed as (batch, token, feature).
        attention_mask: (batch, token) mask, non-zero for attended tokens.
            It is not modified.
        method: one of
            'mean'       - average over attended tokens, skipping position 0
                           (the start-of-sequence token);
            'last_token' - hidden state at index (attended count - 1), which
                           is the last attended token only if the attended
                           tokens form a prefix (right padding);
            'all'        - return ``last_hidden_states`` unchanged.

    Returns:
        (batch, feature) tensor for 'mean' and 'last_token'; the input tensor
        for 'all'.

    Raises:
        ValueError: if ``method`` is not one of the names above.
    '''
    if method == 'all':
        return last_hidden_states
    if method not in ('mean', 'last_token'):
        raise ValueError(
            f"Unknown pooling method {method!r}; expected 'mean', 'last_token' or 'all'."
        )

    # Attended tokens per sequence, including the SOS token.
    sequence_length = attention_mask.sum(dim=1, keepdim=True)

    if method == 'mean':
        # Exclude the SOS position from the average without touching the input.
        content_mask = attention_mask.clone()
        content_mask[:, 0] = 0

        sum_hidden_states = (last_hidden_states *
                             content_mask.unsqueeze(-1)).sum(dim=1)

        # Subtract 1 for the SOS token. The clamp keeps rows with no content
        # tokens at zero rather than dividing by zero (NaN).
        num_content_tokens = (sequence_length - 1).clamp(min=1)
        return sum_hidden_states / num_content_tokens

    # method == 'last_token'
    last_token_indices = (sequence_length - 1)\
        .unsqueeze(-1)\
        .expand(-1, -1, last_hidden_states.size(-1))

    # Squeeze only the token axis: a bare squeeze() would also drop the batch
    # axis when the batch holds a single sequence.
    return last_hidden_states.gather(1, last_token_indices).squeeze(1)

In [6]:
# NOTE(review): torch.load unpickles the file, so only load embedding files
# that come from a trusted source.
label_embeddings = torch.load('../data/embeddings/frozen_BioGPT_label_embeddings_me.pkl')

In [7]:
label_embeddings = label_embeddings.to('cuda:0')

In [8]:
masks=tokenized_data['attention_mask'][0].repeat(32102,1).to('cuda:0')

In [9]:
masks=masks[:,:252]

In [10]:
raw_attn_scorer = torch.nn.Linear(1024,1, bias=True).to('cuda:0')

def additive_attention(hidden_states,attention_mask,scorer=None):
    """Pool token states into one vector per sequence with learned weights.

    Each token gets a scalar score from ``scorer``; masked positions are
    excluded, scores are softmax-normalised over the token axis, and the
    output is the weighted sum of ``hidden_states``.

    Args:
        hidden_states: tensor indexed as (batch, token, feature).
        attention_mask: (batch, token) mask; positions equal to 0 are ignored.
        scorer: callable mapping (batch, token, feature) to (batch, token, 1).
            Defaults to the notebook-level ``raw_attn_scorer`` so existing
            two-argument calls behave as before.

    Returns:
        (batch, feature) tensor. Rows whose mask is entirely 0 yield a zero
        vector instead of NaN.
    """
    if scorer is None:
        scorer = raw_attn_scorer

    raw_attn_scores = scorer(hidden_states).squeeze(-1)

    # Masked positions get -inf so they receive zero weight after the softmax.
    is_masked = attention_mask == 0
    raw_attn_scores = raw_attn_scores.masked_fill(is_masked, float('-inf'))

    # Normalised attention weights over the token axis.
    attn_weights = torch.softmax(raw_attn_scores, dim=-1)

    # A softmax over a row of only -inf is NaN; give such rows zero weight.
    fully_masked = is_masked.all(dim=-1, keepdim=True)
    attn_weights = attn_weights.masked_fill(fully_masked, 0.0)

    # (batch, 1, token) @ (batch, token, feature) -> (batch, feature)
    return torch.bmm(attn_weights.unsqueeze(1), hidden_states).squeeze(1)

In [11]:
def _get_joint_embeddings(P_e, L_e, num_sequences,num_labels):

    sequence_embedding_dim = P_e.shape[1]
    label_embedding_dim = L_e.shape[1]

    # Use broadcasting so we don't have to expand the tensor dimensions
    joint_embeddings = torch.cat([
        P_e[:, None, :].expand(
            num_sequences, num_labels, sequence_embedding_dim),
        L_e[None, :, :].expand(
            num_sequences, num_labels, label_embedding_dim)
    ], dim=2).reshape(-1, sequence_embedding_dim + label_embedding_dim)

    return joint_embeddings

In [13]:
from torchvision.ops import MLP


In [14]:
# Random stand-in for a batch of 32 sequence feature vectors of width 1100.
P_f = torch.rand((32,1100)).to('cuda:0')

# Earlier stand-in for the label embeddings, kept for reference:
#L_e = torch.rand((32102,1024))
# Pool the per-token label embeddings into one vector per label.
L_f = additive_attention(label_embeddings,masks)
# Projection heads mapping sequence features (1100) and label features (1024)
# to a shared width of 1024.
# NOTE(review): with a single entry in hidden_channels, torchvision's MLP may
# build only the final Linear, so BatchNorm1d might never be used — confirm.
W_p = MLP(1100,[1024]*1,bias=False,norm_layer=torch.nn.BatchNorm1d).to('cuda:0')
W_l = MLP(1024,[1024]*1,bias=False,norm_layer=torch.nn.BatchNorm1d).to('cuda:0')

L_e = W_l(L_f)
P_e = W_p(P_f)

In [15]:
joint=_get_joint_embeddings(P_e.to('cuda:0'), L_e, 32,32102)

In [16]:
joint.shape,P_e.shape,L_e.shape,P_f.shape,L_f.shape

(torch.Size([1027264, 2048]),
 torch.Size([32, 1024]),
 torch.Size([32102, 1024]),
 torch.Size([32, 1100]),
 torch.Size([32102, 1024]))

In [7]:
# Overwrite the tokenizer output with synthetic inputs (1000 sequences of 60
# tokens) for the memory experiments below. This discards the real tokens.
tokenized_data['input_ids']=torch.randint(0,10000,(1000,60)).to('cuda:0')
# torch.randint excludes the upper bound: (0, 1) produced an all-zero mask,
# i.e. every token masked out. (0, 2) draws random 0/1 entries instead.
tokenized_data['attention_mask']=torch.randint(0,2,(1000,60)).to('cuda:0')

In [8]:
tokenized_data['input_ids'].shape

torch.Size([1000, 60])

In [8]:
torch.cuda.memory_allocated('cuda:0')

1389062144

In [11]:
(4232723968-1389062144)/1e9

2.843661824

In [9]:
import torch.autograd.profiler as profiler
import torch
from torch.cuda.amp import autocast

# Memory-growth experiment: run the same batch through the model repeatedly
# with gradients enabled and keep every pooled output. Each kept output holds
# on to its autograd graph, so allocated memory rises on every iteration.
bs = 1000
embeddings = []

# The batch never changes, so slice and place the inputs once rather than
# three times per iteration.
batch_input_ids = tokenized_data['input_ids'][:bs,:].to('cuda:0')
batch_attention_mask = tokenized_data['attention_mask'][:bs,:].to('cuda:0')

for i in range(32):
    print(i)
    with autocast(), torch.set_grad_enabled(True):
        # The hidden states are deliberately not bound to a name, so nothing
        # beyond the autograd graph keeps them alive.
        embeddings.append(
            pool_embeddings(
                model(
                    input_ids=batch_input_ids,
                    attention_mask=batch_attention_mask).last_hidden_state,
                batch_attention_mask,
                method='mean'
            )
        )
    print(torch.cuda.memory_allocated('cuda:0'))

0


4232723968
1
7068151296
2
9903480320
3
12738810880
4
15573959680
5
18409290240
6
21244619264
7
24079096320
8
26913573376
9
29748050432
10
32582527488
11
35417004544
12
38251481600
13
41085958656
14
43920435712
15
46754912768
16
49589389824
17
52423866880
18
55258343936
19
58092820992
20
60928150016
21
63762627072
22
66597956096
23
69433285120
24
72267762176
25


OutOfMemoryError: CUDA out of memory. Tried to allocate 470.00 MiB (GPU 0; 79.10 GiB total capacity; 73.97 GiB already allocated; 310.38 MiB free; 77.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [36]:
len(embeddings)

8

In [10]:
len(embeddings)

4

In [14]:
81.2/2744

0.02959183673469388

In [16]:
len(embeddings)

68

In [12]:
torch.cuda.memory_allocated('cuda:0')

83420813312

In [24]:
torch.cuda.memory_allocated('cuda:0')

12202863616

In [10]:
(13235055616-1387511808)/1e9

11.847543808

In [6]:
import torch.autograd.profiler as profiler
import torch
# Profile 13 forward passes with memory tracking and call stacks enabled.
with profiler.profile(with_stack=True, profile_memory=True) as prof:

    # Every pooled output is kept, so each pass's autograd graph stays alive.
    outs = []
    with torch.set_grad_enabled(True):
        for i in range(13):
            print(i)
            # The full tokenized_data tensors are fed on every pass (no batching).
            out = model(input_ids = tokenized_data['input_ids'].to('cuda:0'),attention_mask = tokenized_data['attention_mask'].to('cuda:0')).last_hidden_state
            # 'last_token' pooling: one vector per sequence.
            sequence_embedding = pool_embeddings(out,tokenized_data['attention_mask'].to('cuda:0'),method='last_token')
            outs.append(sequence_embedding)

STAGE:2023-12-11 12:37:18 3191885:3191885 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-12-11 12:37:46 3191885:3191885 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-12-11 12:37:46 3191885:3191885 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [8]:
print_gpu_utilization()
torch.cuda.memory_allocated('cuda:0')

GPU memory occupied: 63014 MB.


53233038336

In [7]:
print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cuda_memory_usage', row_limit=50))


-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        aten::add         0.04%      12.096ms         0.15%      41.053ms      41.552us           0 b           0 b     613.19 Gb     613.19 Gb           988  
                                        aten::bmm         0.04%      10.209ms         0.08%      21.617ms      34.643us           0 b           0 b     548.44 Gb     548.44 Gb           624  
                                      a

In [7]:
print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5))


-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            cudaStreamSynchronize        72.84%        2.145s        72.84%        2.145s     715.110ms           0 b           0 b           0 b           0 b             3  
                                         cudaFree        24.41%     718.970ms        24.41%     718.970ms     359.485ms           0 b           0 b           0 b           0 b             2  
                                       

In [11]:
torch.cuda.memory_allocated('cuda:0')

5600207872

In [9]:
5600617472/1606418432

3.4864001560460185

In [7]:
torch.cuda.memory_allocated('cuda:0')

5600617472

In [9]:
5600617472 - 5599797248

820224

In [10]:
print_gpu_utilization()

GPU memory occupied: 17398 MB.


In [92]:
sequence_embedding = pool_embeddings(out.last_hidden_state,tokenized_data['attention_mask'].to('cuda:0'),method='all')

In [93]:
# One scalar score per token. `nn` is never imported in this notebook, so use
# the fully qualified torch.nn. This rebinds the global scorer that
# additive_attention falls back to.
raw_attn_scorer = torch.nn.Linear(sequence_embedding.shape[-1],1, bias=True).to('cuda:0')


In [94]:
 
# Variant 1: masking through an additive term.
# One raw score per token.
raw_attn_scores = raw_attn_scorer(sequence_embedding).squeeze(-1)
# -inf where the attention mask is 0, otherwise the mask value itself.
softmax_mask = torch.where(tokenized_data['attention_mask'].to('cuda:0')==0,-torch.inf,tokenized_data['attention_mask'].to('cuda:0'))
# Adding the term sends masked scores to -inf, i.e. zero weight after softmax.
attn_weights = torch.softmax(raw_attn_scores+softmax_mask,dim=-1)
# Weighted sum over the token axis.
sequence_embedding_ = (sequence_embedding*attn_weights.unsqueeze(-1)).sum(dim=1)

In [95]:
sequence_embedding_.sum()

tensor(-24.2735, device='cuda:0', grad_fn=<SumBackward0>)

In [96]:
 
# Variant 2: masking with masked_fill, pooling with a batched matmul.
raw_attn_scores = raw_attn_scorer(sequence_embedding).squeeze(-1)

# Masked positions get -inf so the softmax gives them zero weight.
raw_attn_scores = raw_attn_scores.masked_fill(tokenized_data['attention_mask'].to('cuda:0')==0,float('-inf'))

# masked_fill already applies the mask. The previous version still added
# `softmax_mask`, a variable that only exists if another cell was run first.
attn_weights = torch.softmax(raw_attn_scores,dim=-1)
sequence_embedding___ = torch.bmm(attn_weights.unsqueeze(1),sequence_embedding)

In [97]:
(sequence_embedding_-sequence_embedding___.squeeze(1)).sum()

tensor(-4.6718e-06, device='cuda:0', grad_fn=<SumBackward0>)

In [98]:
sequence_embedding___.sum()

tensor(-24.2735, device='cuda:0', grad_fn=<SumBackward0>)

In [347]:
torch.softmax(raw_attn_scores[0][:28],dim=0)

tensor([0.0223, 0.0329, 0.0190, 0.0210, 0.0325, 0.0150, 0.0556, 0.0831, 0.3534,
        0.0090, 0.0100, 0.0238, 0.0070, 0.0072, 0.0189, 0.0219, 0.0217, 0.0133,
        0.0118, 0.0154, 0.0192, 0.0197, 0.0158, 0.0951, 0.0076, 0.0084, 0.0092,
        0.0301], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [340]:
raw_attn_scores

tensor([[ 0.3822,  0.7712,  0.2224,  ..., -0.7430, -0.7430, -0.7430],
        [ 0.3822, -0.0762,  0.4468,  ..., -0.3420, -0.3420, -0.3420]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

In [None]:
torch.softmax(raw_attn_scores+softmax_mask,dim=-1)

In [250]:
def compute_mean_hidden_states(last_hidden_states, attention_mask):
    """Average the hidden states over the attended tokens of each sequence.

    Args:
        last_hidden_states: tensor indexed as (batch, token, feature).
        attention_mask: (batch, token) mask, non-zero for tokens to include.

    Returns:
        (batch, feature) tensor of per-sequence means. A sequence with no
        attended tokens yields a zero vector instead of NaN.
    """
    # Number of relevant tokens per sequence. Clamped so a fully masked row
    # divides by 1 (its sum is already zero) rather than by 0.
    num_relevant_tokens = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
    # Zero out the masked positions, then sum over the token axis.
    sum_hidden_states = (last_hidden_states *
                         attention_mask.unsqueeze(-1)).sum(dim=1)
    return sum_hidden_states / num_relevant_tokens

compute_mean_hidden_states(out.last_hidden_state,tokenized_data['attention_mask'].to('cuda:0'))


tensor([[-1.3425,  0.1925,  0.4544,  ..., -0.0675,  0.1730,  0.8976],
        [-0.6936, -0.4513,  0.6367,  ..., -1.5723,  1.7188,  0.1801]],
       device='cuda:0', grad_fn=<DivBackward0>)

In [268]:
score(out.last_hidden_state).shape

torch.Size([2, 512, 2])

In [247]:
# Per-token logits from the `score` head.
linear_output = score(out.last_hidden_state)
# Index of the last non-pad token of each sequence (non-pad count - 1).
input_ids = tokenized_data['input_ids'].to(linear_output.device)
idxs = torch.ne(input_ids, model.config.pad_token_id).sum(-1) - 1
# Pick that token's logits for every row. The batch size is read from the
# tensor instead of being hard-coded to 2.
linear_output[torch.arange(linear_output.shape[0], device=linear_output.device), idxs]


tensor([[ 0.9007, -0.7836],
        [-0.5462, -1.2105]], device='cuda:0', grad_fn=<IndexBackward0>)

In [8]:
from transformers import TrainingArguments, Trainer, logging

# Silence transformers logging below error level.
logging.set_verbosity_error()


# NOTE(review): `model` was loaded with AutoModel, whose output has no loss.
# Trainer.train() therefore fails with the ValueError recorded below this cell;
# a model with a task head is needed for this cell to run.
test_model = model

# Check if model is on cuda
print(f"Model on cuda: {next(test_model.parameters()).is_cuda}")

batch_size = 40
# Mixed precision on; gradient accumulation and checkpointing effectively off.
# The remaining settings come from `default_args`.
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size, 
    remove_unused_columns=False,
    fp16=True,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    **default_args
)
trainer = Trainer(model=test_model, args=training_args, train_dataset=dummy_dataset)
result = trainer.train()
# Runtime, throughput and GPU memory for the run.
print_summary(result)



Model on cuda: True


ValueError: The model did not return a loss from the inputs, only the following keys: last_hidden_state,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

GPU memory occupied per training configuration:

train full model: memory 62150 MB

train last layer: memory 16900 MB

train last 2 layers: memory 18820 MB

train last 3 layers: memory 20620 MB

train last 4 layers: memory 22582 MB

train full model + LoRA: memory 54080 MB



LoRA:

Model on cuda: True
{'train_runtime': 14.3185, 'train_samples_per_second': 69.84, 'train_steps_per_second': 1.746, 'train_loss': 1.344638671875, 'epoch': 1.0}
Time: 14.32
Samples/second: 69.84
GPU memory occupied: 55019 MB.