# Testing Embedding Speed

In [1]:
import os
import datetime
import sys
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

import logging
from src.utils.data import (
    seed_everything,
    read_pickle,
)
from src.data.datasets import ProteinDataset, calculate_pos_weight, create_multiple_loaders
from src.models.ProTCLTrainer import ProTCLTrainer
from src.models.ProTCL import ProTCL
from src.models.protein_encoders import ProteInfer
from src.utils.evaluation import EvalMetrics, save_evaluation_results
from src.utils.models import count_parameters_by_layer, get_label_embeddings
from src.utils.configs import get_setup
import torch
import wandb
import os
import argparse
import json
from transformers import AutoTokenizer, AutoModel, BatchEncoding
from src.utils.main_utils import get_or_generate_vocabularies,  get_or_generate_label_embeddings, get_or_generate_sequence_embeddings, validate_arguments
from tqdm import tqdm
import os

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# TEST EMBEDDING DISTRIBUTIONS #
import os
import datetime
import sys
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))
label_embedding_path = "/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_BioGPT_label_embeddings.pkl"
sequence_embedding_path = "/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_proteinfer_sequence_embeddings.pkl"


In [12]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import torch

def read_pickle(file_path: str):
    with open(file_path, "rb") as p:
        item = pickle.load(p)
    return item

# Load the embeddings
label_embeddings = torch.load('/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_BioGPT_label_embeddings.pkl')
sequence_embeddings = read_pickle('/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_proteinfer_sequence_embeddings.pkl')

In [13]:
# Function to calculate statistical measures
def calculate_statistics(embeddings):
    embeddings_array = np.array(embeddings)
    means = np.mean(embeddings_array, axis=0)
    variances = np.var(embeddings_array, axis=0)
    std_devs = np.std(embeddings_array, axis=0)
    return means, variances, std_devs

# Function to plot distributions
def plot_distributions(embeddings1, embeddings2):
    plt.figure(figsize=(10, 6))
    sns.kdeplot(embeddings1.ravel(), label='Label Embeddings')
    sns.kdeplot(embeddings2.ravel(), label='Sequence Embeddings')
    plt.legend()
    plt.title('Distribution of Embedding Vectors')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.show()

# Calculate statistics
label_means, label_variances, label_std_devs = calculate_statistics(label_embeddings)
sequence_means, sequence_variances, sequence_std_devs = calculate_statistics(sequence_embeddings)

# Print the statistics
print("Label Embeddings:")
print("Means:", label_means)
print("Variances:", label_variances)
print("Standard Deviations:", label_std_devs)
print("\nSequence Embeddings:")
print("Means:", sequence_means)
print("Variances:", sequence_variances)
print("Standard Deviations:", sequence_std_devs)

# Print the mean of the means, variances, and standard deviations
print("\nLabel Embeddings:")
print("Mean of Means:", np.mean(label_means))
print("Mean of Variances:", np.mean(label_variances))
print("Mean of Standard Deviations:", np.mean(label_std_devs))
print("\nSequence Embeddings:")
print("Mean of Means:", np.mean(sequence_means))
print("Mean of Variances:", np.mean(sequence_variances))
print("Mean of Standard Deviations:", np.mean(sequence_std_devs))

# Plot the distributions
# plot_distributions(label_embeddings, sequence_embeddings)

Label Embeddings:
Means: [-1.00499    -0.19348931  0.07630822 ...  0.11141217  0.4188767
  0.54361475]
Variances: [0.39269638 0.42584723 0.27725244 ... 0.2506793  0.388853   0.31982717]
Standard Deviations: [0.6266549  0.6525697  0.5265477  ... 0.50067884 0.6235808  0.5655326 ]

Sequence Embeddings:
Means: [-0.20901579 -0.04067202 -0.08404726 ...  1.6288333   0.0574921
  0.1346581 ]
Variances: [0.62677294 0.84094894 0.7439483  ... 2.7185543  0.7942297  0.89135265]
Standard Deviations: [0.79168993 0.91703266 0.8625244  ... 1.6488038  0.89119565 0.94411474]

Label Embeddings:
Mean of Means: -0.005486639
Mean of Variances: 0.36615825
Mean of Standard Deviations: 0.5966133

Sequence Embeddings:
Mean of Means: -0.06397131
Mean of Variances: 1.160447
Mean of Standard Deviations: 0.9797921


Bad pipe message: %s [b'\xdd\xad\xb3\x110v\x90r\x8d\x83\xa2\xa8%\xffq\xa2P\xf3 \xf4\xc6\x18\xe7\xc3\xd5/\xf4]\xe5\xe5\x97\x02\x0e\xcc~\xa1\xd8r\xcfa!\xb2\x8d\x98\x7f\xab\xd9\xfd%\xe9\x82\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19']
Bad pipe message: %s [b'\xba|\x1d\xe6Zk\tZ\xd7;\xf4>%\x80\xa7Un\x91 n\xa3m\xf98m\x7f\xe7l\x17\xdfja\x06\xfdBV\xe3]\xbboq\xd2\xb8\xd1eN\x15\xefdE\x0b\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \xdd\xc18w\xb3\x1e\xfbd\x82\xafl\xc1c\xb

In [2]:
os.environ["ROOT_PATH"] = "/home/ncorley/protein/ProteinFunctions"

# Unpack and process the config file
config = get_setup(
    config_path='/home/ncorley/protein/ProteinFunctions/configs/base_config.yaml',
    run_name="Test",
    overrides=None,
    train_path_name="TRAIN_DATA_PATH",
    val_path_name="VAL_DATA_PATH",
    test_paths_names=["TEST_DATA_PATH"],
    amlt=False,
    is_master=True,
)
params, paths, timestamp, logger = config["params"], config[
    "paths"], config["timestamp"], config["logger"]


Logging to /home/ncorley/protein/ProteinFunctions/outputs/logs/2023-10-08_14-01-25_Test.log and console...


In [3]:
# Load or generate the vocabularies
vocabularies = get_or_generate_vocabularies(
    paths["FULL_DATA_PATH"], paths["VOCABULARIES_DIR"], logger)

# Initialize label tokenizer
label_tokenizer = AutoTokenizer.from_pretrained(
    params['LABEL_ENCODER_CHECKPOINT'])

# Create datasets
datasets = ProteinDataset.create_multiple_datasets(
    config['dataset_paths_list'],
    label_tokenizer=label_tokenizer,
    vocabularies=vocabularies,
    subset_fractions={
        "train": params["TRAIN_SUBSET_FRACTION"],
        "validation": params["VALIDATION_SUBSET_FRACTION"],
        "test": params["TEST_SUBSET_FRACTION"]}
)

2023-10-08 14:01:28 PDT INFO Loaded amino_acid_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/amino_acid_vocab.json
2023-10-08 14:01:28 PDT INFO Loaded GO_label_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/GO_label_vocab.json
2023-10-08 14:01:28 PDT INFO Loaded GO_label_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/GO_label_vocab.json
2023-10-08 14:01:29 PDT INFO Loaded sequence_id_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/sequence_id_vocab.json


In [4]:
# Define label sample sizes for train, validation, and test loaders
label_sample_sizes = {
    "train": params["TRAIN_LABEL_SAMPLE_SIZE"],
    "validation": params["VALIDATION_LABEL_SAMPLE_SIZE"],
    "test": None  # No sampling for the test set
}

# Define data loaders
loaders = create_multiple_loaders(
    datasets,
    params,
    label_sample_sizes=label_sample_sizes,
    num_workers=params["NUM_WORKERS"],
    world_size=1,
    rank=0,
)

In [5]:
label2int = datasets[list(datasets.keys())[0]][0].label2int
int2label = datasets[list(datasets.keys())[0]][0].int2label
label_annotation_map = datasets[list(datasets.keys())[
    0]][0].label_annotation_map

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load label encoder
label_encoder = AutoModel.from_pretrained(
    params['LABEL_ENCODER_CHECKPOINT'])

# # Generate all label embeddings upfront, if not training the label encoder
# label_embedding_matrix = None
# if not params["TRAIN_LABEL_ENCODER"]:
#     # Create a list of text labels
#     sorted_labels = sorted(
#         vocabularies["GO_label_vocab"], key=lambda x: label2int[x])
#     label_annotations = [label_annotation_map[label_id]
#                             for label_id in sorted_labels]
#     label_encoder = label_encoder.to(device)
#     paths["LABEL_EMBEDDING_PATH"] = None  
#     label_embedding_matrix = get_or_generate_label_embeddings(
#         paths,
#         device,
#         label_annotations,
#         label_tokenizer,
#         label_encoder,
#         logger,
#         LABEL_BATCH_SIZE_LIMIT_NO_GRAD=params["LABEL_BATCH_SIZE_LIMIT_NO_GRAD"]
#     )
#     # Move the label encoder to CPU
#     label_encoder = label_encoder.cpu()

#     import time

# FILEPATH: /home/ncorley/protein/ProteinFunctions/notebooks/datasets.ipynb
from src.utils.models import generate_label_embeddings_from_text
sorted_labels = sorted(
        vocabularies["GO_label_vocab"], key=lambda x: label2int[x])
label_annotations = [label_annotation_map[label_id]
                            for label_id in sorted_labels]
print(len(label_annotations))


32102


In [None]:
label_id = "GO:0000001"

In [6]:
from src.utils.models import tokenize_labels
label_annotations = [label_annotation_map[label_id]
                            for label_id in sorted_labels]
sampled_label_annotations = label_annotations[:2000]
print(len(sampled_label_annotations))
tokenized_labels = tokenize_labels(sampled_label_annotations, label_tokenizer)

2000


In [7]:
# Clear the cache
torch.cuda.empty_cache()

# Print memory usage
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_allocated() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_reserved() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3:.2f} GB")

tokenized_labels["input_ids"] = tokenized_labels["input_ids"].to(device)
tokenized_labels["attention_mask"] = tokenized_labels["attention_mask"].to(device)
label_encoder = label_encoder.to(device)

# Print memory usage
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_allocated() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_reserved() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3:.2f} GB")

Memory allocated: 0.00 GB (0.00%)
Memory reserved: 0.00 GB (0.00%)
Total memory: 79.10 GB
Memory allocated: 1.30 GB (1.64%)
Memory reserved: 1.31 GB (1.65%)
Total memory: 79.10 GB


In [9]:
from src.utils.models import tokenize_labels, get_label_embeddings
from torch.cuda.amp import autocast
import psutil
import time

# Clear the cache
torch.cuda.empty_cache()

start_time = time.time()
with torch.no_grad():
    with autocast():
        label_embeddings = label_encoder(**tokenized_labels).last_hidden_state.mean(dim=1)
end_time = time.time()

print(label_embeddings.shape)
# print label_embeddings memory usage
print("Memory usage of result variable: ", label_embeddings.element_size() *
        label_embeddings.nelement() / (1024 * 1024), " MB")

torch.cuda.empty_cache()

# Print memory usage
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_allocated() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_reserved() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3:.2f} GB")

# 38 seconds for 20k labels w/ batch size of 600

torch.Size([2000, 1024])
Memory usage of result variable:  7.8125  MB
Memory allocated: 1.31 GB (1.66%)
Memory reserved: 1.64 GB (2.08%)
Total memory: 79.10 GB


In [10]:
from src.utils.models import generate_label_embeddings_from_text
# Move model to GPU
label_encoder = label_encoder.to(device)
sampled_label_annotations = label_annotations[:32000]
with torch.no_grad():
    embeddings = generate_label_embeddings_from_text(
        sampled_label_annotations,
        label_tokenizer,
        label_encoder,
        batch_size_limit=1500,
    )

In [None]:
# Initialize ProteInfer
sequence_encoder = ProteInfer.from_pretrained(
    weights_path=paths["PROTEINFER_WEIGHTS_PATH"],
    num_labels=config["embed_sequences_params"]["PROTEINFER_NUM_LABELS"],
    input_channels=config["embed_sequences_params"]["INPUT_CHANNELS"],
    output_channels=config["embed_sequences_params"]["OUTPUT_CHANNELS"],
    kernel_size=config["embed_sequences_params"]["KERNEL_SIZE"],
    activation=torch.nn.ReLU,
    dilation_base=config["embed_sequences_params"]["DILATION_BASE"],
    num_resnet_blocks=config["embed_sequences_params"]["NUM_RESNET_BLOCKS"],
    bottleneck_factor=config["embed_sequences_params"]["BOTTLENECK_FACTOR"],
)

# Generate all sequence embeddings upfront, if not training the sequence encoder
sequence_embedding_dict = None
if not params["TRAIN_SEQUENCE_ENCODER"]:
    sequence_embedding_dict = get_or_generate_sequence_embeddings(
        paths,
        device,
        sequence_encoder,
        datasets,
        params,
        logger,
    )

2023-10-04 15:03:37 PDT INFO Loaded sequence embeddings from /home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer/frozen_proteinfer_sequence_embeddings.pkl


In [None]:
# sequence_embedding_dict = read_pickle(paths["SEQUENCE_EMBEDDING_PATH"])
label_embedding_matrix = torch.load(paths["LABEL_EMBEDDING_PATH"])

In [None]:

# Initialize label tokenizer
label_tokenizer = AutoTokenizer.from_pretrained(
    params['LABEL_ENCODER_CHECKPOINT'])


# Create datasets
datasets = ProteinDataset.create_multiple_datasets(paths_list, label_tokenizer=label_tokenizer)

# Initialize new run
logger.info(
    f"################## {timestamp} RUNNING train.py ##################")

# Define label sample sizes for train, validation, and test loaders
label_sample_sizes = {
    # Assuming you have this parameter in your params dictionary
    "train": 2000,
    # Assuming you have this parameter in your params dictionary
    "validation": 100,
    "test": None  # No sampling for the test set
}

# Define data loaders
loaders = create_multiple_loaders(
    datasets,
    params,
    label_sample_sizes=label_sample_sizes,
    num_workers=0
)

In [24]:
# Load GO annotations
GO_ANNOTATIONS_PATH_OLD = "/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl"
GO_ANNOTATIONS_PATH_NEW = "/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2023_07_23.pkl"
go_annotations = read_pickle(GO_ANNOTATIONS_PATH_OLD)

In [109]:
import torch
from transformers import BioGptTokenizer, BioGptModel, set_seed

label_tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
label_encoder = BioGptModel.from_pretrained("microsoft/biogpt")
from src.utils.models import tokenize_labels, get_label_embeddings

set_seed(42)

# Get the value corresponding to the key "GO:0000001" from the df
text = [go_annotations.loc["GO:0000001"][0]]
print(text)

# checkpoint = "microsoft/biogpt"
# label_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# label_encoder = AutoModel.from_pretrained(checkpoint)

# Print the tokenized label
tokens = tokenize_labels(text, label_tokenizer)

input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]
# Get the label embeddings (average across all tokens of the last hidden state)
with torch.no_grad():
    mean_hidden_states = get_label_embeddings(
        tokens,
        label_encoder,
        1
    )
print("OUTPUT:")
print(mean_hidden_states[0])
# print(output.last_hidden_state.shape)
# print(output.last_hidden_state)
# tensor([-0.4258, -0.6187, -0.2033,  ..., -0.8992,  0.1572, -0.0666])
# tensor([[[-0.1223,  0.0990, -0.9851,  ..., -0.8175,  0.1483, -0.5747],
#          [-1.9245,  0.7768,  1.8563,  ..., -0.4368,  0.2228,  0.6075],
#          [ 0.7696, -2.7319, -1.4810,  ..., -1.4431,  0.1005, -0.2327]]])

['The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.']
OUTPUT:
tensor([-1.3425,  0.1925,  0.4544,  ..., -0.0675,  0.1730,  0.8976])


  text = [go_annotations.loc["GO:0000001"][0]]


In [30]:
# Load the label embedding matrix
path = "/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_BioGPT_label_embeddings.pkl"
label_embedding_matrix = torch.load(path)
label_embedding_matrix[0]

tensor([ 0.6444,  0.2435, -0.9323,  ..., -3.6236, -0.5964, -0.8596])

In [36]:
from src.utils.main_utils import get_or_generate_label_embeddings
import logging
paths = {
    "LABEL_EMBEDDING_PATH_3": "/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_BioGPT_label_embeddings.pkl"
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get a list of all label text from the df


matrix = get_or_generate_label_embeddings(
    paths,
    device,
    go_annotations,
    label_tokenizer,
    label_encoder,
)

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [33]:
matrix[0]

tensor([ 0.6444,  0.2435, -0.9323,  ..., -3.6236, -0.5964, -0.8596],
       device='cuda:0')

# Testing LoRa

In [1]:
import os
import sys
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

from transformers import AutoTokenizer, AutoModel, set_seed
import torch
from src.utils.models import generate_label_embeddings_from_text
from src.utils.data import read_pickle
import copy

# Initialize label tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

# Load label encoder
model = AutoModel.from_pretrained(
    "microsoft/biogpt",
    torch_dtype=torch.float16,
)
print(model)

def print_model_size(model):
    mem_params = sum([param.nelement()*param.element_size() for param in model.parameters()])
    mem_bufs = sum([buf.nelement()*buf.element_size() for buf in model.buffers()])
    mem = mem_params + mem_bufs # in bytes
    # Print in GB to 2 decimal places
    print("Model size:", round(mem / 1024**3, 2), "GB")

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

set_seed(42)

# Load GO annotations
GO_ANNOTATIONS_PATH_OLD = "/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl"
GO_ANNOTATIONS_PATH_NEW = "/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2023_07_23.pkl"
go_annotations = read_pickle(GO_ANNOTATIONS_PATH_OLD)

# Get first 1000 labels
text = go_annotations.iloc[:, 0].tolist()
print(len(text))
print(text[0])

tokenized_labels = tokenizer(
    text, padding='longest', truncation=True, max_length=512, return_tensors="pt"
)

# Move to GPU
tokenized_labels["input_ids"] = tokenized_labels["input_ids"].to(
    'cuda')
tokenized_labels["attention_mask"] = tokenized_labels["attention_mask"].to(
    'cuda')

# Move model to gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to('cuda')

# Copy original model
original_model = copy.deepcopy(model)

  from .autonotebook import tqdm as notebook_tqdm


BioGptModel(
  (embed_tokens): Embedding(42384, 1024, padding_idx=1)
  (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-23): 24 x BioGptDecoderLayer(
      (self_attn): BioGptAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
47401
The distribution of mitochondria, i

In [2]:
def compute_mean_hidden_states(last_hidden_states, attention_mask):
    """Compute the mean of the last hidden state for only the relevant tokens."""
    # Compute the number of relevant tokens for each sequence
    num_relevant_tokens = attention_mask.sum(dim=1, keepdim=True)
    # Mask the last_hidden_state tensor and compute the sum
    sum_hidden_states = (last_hidden_states *
                         attention_mask.unsqueeze(-1)).sum(dim=1)
    # Compute the mean of the last hidden state
    return sum_hidden_states / num_relevant_tokens

In [4]:
# WITHOUT LORA

from torch.cuda.amp import autocast
import gc

limit = 100
torch.cuda.reset_peak_memory_stats()
print_trainable_parameters(model)

# Note the size of the model 
a = torch.cuda.memory_allocated(device=device) / 1024 ** 3
model.to('cpu')
b = torch.cuda.memory_allocated(device=device) / 1024 ** 3
print("Model size:", a - b, "GB")
model.to('cuda')

with autocast():

    # Note the size of the outputs, including their activations and gradients
    a = torch.cuda.memory_allocated(device=device) / 1024 ** 3

    last_hidden_states = model(
        input_ids=tokenized_labels["input_ids"][:limit],
        attention_mask=tokenized_labels["attention_mask"][:limit]
    ).last_hidden_state

    b = torch.cuda.memory_allocated(device=device) / 1024 ** 3
    print("Last hidden states size:", b - a, "GB")

    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

    # Compute only the mean of the last hidden states
    a = torch.cuda.memory_allocated(device=device) / 1024 ** 3
    result = compute_mean_hidden_states(last_hidden_states, tokenized_labels["attention_mask"][:limit])
    b = torch.cuda.memory_allocated(device=device) / 1024 ** 3
    print("Result size:", b - a, "GB")

    # Delete the last hidden states
    del last_hidden_states
 
print(result)
# del last_hidden_states
max_usage = torch.cuda.max_memory_allocated(device=device) / 1024 ** 3
print("Max memory usage:", max_usage, "GB")
print("Memory allocated:", torch.cuda.memory_allocated(device=device) / 1024 ** 3, "GB")

trainable params: 346763264 || all params: 346763264 || trainable%: 100.0
Model size: 0.08339691162109375 GB
Last hidden states size: 37.00778388977051 GB
Result size: -19.01560688018799 GB
tensor([[-1.3432,  0.1918,  0.4550,  ..., -0.0691,  0.1732,  0.8977],
        [-0.5855,  0.0037,  0.9944,  ...,  0.7357,  1.3796,  1.2025],
        [-0.4864, -0.2691,  0.1813,  ..., -0.3580,  0.8959,  0.8707],
        ...,
        [-1.9372,  1.0084,  0.4448,  ...,  0.6840, -0.2855,  0.2953],
        [-1.5028,  0.3953, -0.2451,  ...,  0.4273,  0.1786, -0.1587],
        [-1.1314,  0.0936, -1.0735,  ...,  0.7942, -0.4111, -0.1751]],
       device='cuda:0', grad_fn=<DivBackward0>)
Max memory usage: 57.664806842803955 GB
Memory allocated: 38.39514446258545 GB


In [12]:
# WITH LORA
from torch import nn

print_trainable_parameters(original_model)

from peft import LoraConfig, get_peft_model 

config = LoraConfig(
    r=8,
    # lora_alpha=8,
    target_modules=["k_proj", "v_proj"], # Also target fc1, fc2 if we wanted to
    # lora_dropout=0.05,
    # bias="none",
    # task_type="FEATURE_EXTRACTION"
)

lora_model = get_peft_model(model, config)

optimizer = torch.optim.Adam(lora_model.parameters(), lr=0.00003)
criterion = nn.CrossEntropyLoss()

print_trainable_parameters(lora_model)

print(lora_model)

trainable params: 346763264 || all params: 346763264 || trainable%: 100.0
trainable params: 1179648 || all params: 347942912 || trainable%: 0.3390349276607767
PeftModel(
  (base_model): LoraModel(
    (model): BioGptModel(
      (embed_tokens): Embedding(42384, 1024, padding_idx=1)
      (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-23): 24 x BioGptDecoderLayer(
          (self_attn): BioGptAttention(
            (k_proj): Linear(
              in_features=1024, out_features=1024, bias=True
              (lora_dropout): ModuleDict(
                (default): Identity()
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=8, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=8, out_features=1024, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): Par

In [13]:
# WITH LORA

from torch.cuda.amp import autocast

limit = 100
torch.cuda.reset_peak_memory_stats()
print_trainable_parameters(lora_model)

# Note the size of the model 
a = torch.cuda.memory_allocated(device=device) / 1024 ** 3
lora_model.to('cpu')
b = torch.cuda.memory_allocated(device=device) / 1024 ** 3
print("Model size:", a - b, "GB")
lora_model.to('cuda')

with autocast():
    # Note the size of the outputs, including their activations and gradients
    a = torch.cuda.memory_allocated(device=device) / 1024 ** 3

    torch.cuda.empty_cache()
    last_hidden_states = lora_model(
        input_ids=tokenized_labels["input_ids"][:limit],
        attention_mask=tokenized_labels["attention_mask"][:limit]
    ).last_hidden_state
    b = torch.cuda.memory_allocated(device=device) / 1024 ** 3
    print("Last hidden states size:", b - a, "GB")

    # Compute only the mean of the last hidden states
    a = torch.cuda.memory_allocated(device=device) / 1024 ** 3
    result = compute_mean_hidden_states(last_hidden_states, tokenized_labels["attention_mask"][:limit])
    b = torch.cuda.memory_allocated(device=device) / 1024 ** 3
    print("Result size:", b - a, "GB")

    # Delete the last hidden states
    del last_hidden_states
 
print(result)
# del last_hidden_states
max_usage = torch.cuda.max_memory_allocated(device=device) / 1024 ** 3
print("Max memory usage:", max_usage, "GB")
print("Memory allocated:", torch.cuda.memory_allocated(device=device) / 1024 ** 3, "GB")

trainable params: 1179648 || all params: 347942912 || trainable%: 0.3390349276607767
Model size: 0.09365081787109375 GB


OutOfMemoryError: CUDA out of memory. Tried to allocate 292.00 MiB (GPU 0; 79.10 GiB total capacity; 74.59 GiB already allocated; 264.38 MiB free; 77.62 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [20]:
# Print which parameters require gradients, and how many parameters are in the model
print_model_size(lora_model)
print_trainable_parameters(lora_model)

with torch.no_grad():
    print(f"{get_max_memory(lora_model):.2f} GB")
print(f"{get_max_memory(lora_model):.2f} GB")

Model size: 0.65 GB
trainable params: 196608 || all params: 346959872 || trainable%: 0.05666591899134664


NameError: name 'get_max_memory' is not defined

In [4]:
text_subset_lora = text[:100]

print(len(text_subset_lora))
print(text_subset_lora[0])
lora_model.eval()
embeddings = generate_label_embeddings_from_text(
    text_subset_lora,
    tokenizer,
    lora_model,
    batch_size_limit=1000,
)

print_trainable_parameters(lora_model)
    
print(embeddings.shape)
print(embeddings[0])
# Print mean of first embedding
print(embeddings[0].mean(dim=0))
# Print max GPU usage
print("Max allocated (GB):", torch.cuda.max_memory_allocated(device=device) / 1024 ** 3)

# tensor([-1.3431,  0.1927,  0.4538,  ..., -0.0671,  0.1733,  0.8978],
#        grad_fn=<SelectBackward0>)

100
The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.
Label embeddings device: cuda:0
Label annotations device: cuda:0
Label attention mask device: cuda:0
Label embeddings memory usage: 40.234375 MB
GPU memory usage: 11138.88427734375 MB
GPU reserved memory: 13604.0 MB
Peak bytes requirement:  13175.6904296875  MB
trainable params: 98304 || all params: 346861568 || trainable%: 0.028340989336702763
torch.Size([100, 1024])
tensor([-1.3431,  0.1927,  0.4538,  ..., -0.0671,  0.1733,  0.8978],
       grad_fn=<SelectBackward0>)
tensor(0.0012, grad_fn=<MeanBackward1>)
Max allocated (GB): 12.8668851852417


# Testing Embedding Separation

In [5]:
%load_ext autoreload
%autoreload 2

import os
import datetime
import sys
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from src.utils.models import tokenize_labels
from transformers import AutoTokenizer, AutoModel

checkpoint = "microsoft/biogpt"

label_encoder = AutoModel.from_pretrained(checkpoint)

# Initialize label tokenizer
label_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
{'input_ids': tensor([[    2,    56,    80,    21,   396,    10,     6,   370,     5,    49,
           718,     4],
        [    2,    39,    80,    22,  1603,    14,   151,    10,    49,   718,
             4,     1],
        [    2,    39,    80,  1295,    16,   360,  2829,     4,     1,     1,
             1,     1],
        [    2,    56,    80, 11648,    10,   360,  2829,     4,     1,     1,
             1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}


In [13]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def print_cosine_similarity_matrix(protein_descriptions, label_tokenizer, label_encoder, batch_size_limit=2):
    """
    Compute and print the cosine similarity matrix for a list of protein descriptions.

    Args:
    - protein_descriptions (list): List of protein descriptions.
    - label_tokenizer (Tokenizer): Tokenizer for the descriptions.
    - label_encoder (torch.nn.Module): Pretrained model to generate embeddings.
    - batch_size_limit (int): Batch size limit for embedding generation.
    """

    # Get embeddings for the tokenized descriptions
    with torch.no_grad():
        embeddings = generate_label_embeddings_from_text(protein_descriptions, label_tokenizer, label_encoder, batch_size_limit=batch_size_limit)

    # Convert the embeddings to numpy for cosine similarity computation
    embeddings_matrix = embeddings.cpu().numpy()

    # Compute the cosine similarity between the embeddings
    cosine_sim = cosine_similarity(embeddings_matrix)

    # Print as a matrix
    # Print the key for protein descriptions
    print("Key:")
    for idx, desc in enumerate(protein_descriptions, 1):
        print(f"{idx}. {desc}")
    print("\nCosine Similarity Matrix:")

    # Print the matrix header
    print("    ", end="")
    for i in range(len(protein_descriptions)):
        print(f"{i+1:4}", end="")
    print()

    # Print the matrix
    for i in range(len(protein_descriptions)):
        print(f"{i+1:2} |", end="")
        for j in range(len(protein_descriptions)):
            print(f"{cosine_sim[i][j]:.2f} ", end="")
        print()


Key:
1. This protein is involved in the process of cell signaling.
2. A protein that plays a role in cell signaling.
3. A protein responsible for muscle contraction.
4. This protein aids in muscle contraction.

Cosine Similarity Matrix:
       1   2   3   4
 1 |1.00 0.86 0.58 0.71 
 2 |0.86 1.00 0.73 0.74 
 3 |0.58 0.73 1.00 0.89 
 4 |0.71 0.74 0.89 1.00 


In [16]:
from src.utils.models import generate_label_embeddings_from_text
import torch

# protein_descriptions = [
#     "This protein is involved in the process of cell signaling.",
#     "A protein that plays a role in cell signaling.",
#     "A protein responsible for muscle contraction.",
#     "This protein aids in muscle contraction."
# ]


# Raw protein descriptions
raw_protein_descriptions = [
    "This gene product is crucial for intracellular communication mediated by signaling molecules.",
    "The protein encoded by this gene plays a pivotal role in the transmission of signals within cells.",
    "The protein expressed from this locus is involved in the correction of mismatches in DNA sequences.",
    "This gene's product is essential for repairing inaccuracies in the DNA helix."
]

# GPT-optimized protein descriptions
"""
Hello, Assistant! I'm providing you with a protein GO annotation description from SwissProt. Your task is to condense this description in a way that, when embedded with a pretrained language model, it will create a distinct and separate representation in the embedding latent space. 
- This condensed form should retain the core essence and meaning of the original description but be more concise and distinctive. 
- The most relevant and distinctive pieces of information should come at the beginning of the sentence, if possible. 
- These are all protein descriptions, so the condensed form should be a sentence that describes the function of the protein. You do not need to include the word "protein" in your condensed form.
- Do not use non-critical words like "pivotal" or "essential"
Here are the descriptions. Create one summary for each description below.
###
DESCRIPTION #1: This gene product is crucial for intracellular communication mediated by signaling molecules.
DESCRIPTION #2: The protein encoded by this gene plays a pivotal role in the transmission of signals within cells.
DESCRIPTION #3: The protein expressed from this locus is involved in the correction of mismatches in DNA sequences.
DESCRIPTION #4: This gene's product is essential for repairing inaccuracies in the DNA helix.
###
Please provide the condensed form of all descriptions.

Here is an example:
INPUT_PROTEIN_DESCRIPTION: Any process that activates or increases the frequency, rate or extent of AIM2 inflammasome complex assembly.
MODEL_OUTPUT: AIM2 inflammasome complex assembly activator or accelerator.
"""

# Raw protein descriptions
processed_protein_descriptions = [
    "Facilitates intracellular communication via signaling molecules.",
    "Key player in transmitting signals within cells.",
    "Corrects DNA sequence mismatches.",
    "Essential for DNA helix repair."
]

print_cosine_similarity_matrix(processed_protein_descriptions, label_tokenizer, label_encoder)

Key:
1. Facilitates intracellular communication via signaling molecules.
2. Key player in transmitting signals within cells.
3. Corrects DNA sequence mismatches.
4. Essential for DNA helix repair.

Cosine Similarity Matrix:
       1   2   3   4
 1 |1.00 0.76 0.57 0.62 
 2 |0.76 1.00 0.54 0.63 
 3 |0.57 0.54 1.00 0.79 
 4 |0.62 0.63 0.79 1.00 


# LoRA Baseline

In [3]:
from peft import LoraConfig, TaskType

peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

In [4]:
from transformers import AutoModelForSeq2SeqLM

model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

Downloading (…)lve/main/config.json: 100%|██████████| 800/800 [00:00<00:00, 5.75MB/s]
Downloading model.safetensors: 100%|██████████| 4.92G/4.92G [00:40<00:00, 122MB/s] 


: 

In [None]:
from peft import get_peft_model

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()