# Testing Embedding Speed

In [1]:
import os
import datetime
import sys
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

import logging
from src.utils.data import (
    seed_everything,
    read_pickle,
)
from src.data.datasets import ProteinDataset, calculate_pos_weight, create_multiple_loaders
from src.models.ProTCLTrainer import ProTCLTrainer
from src.models.ProTCL import ProTCL
from src.models.protein_encoders import ProteInfer
from src.utils.evaluation import EvalMetrics, save_evaluation_results
from src.utils.models import count_parameters_by_layer, get_label_embeddings
from src.utils.configs import get_setup
import torch
import wandb
import os
import argparse
import json
from transformers import AutoTokenizer, AutoModel, BatchEncoding
from src.utils.main_utils import get_or_generate_vocabularies,  get_or_generate_label_embeddings, get_or_generate_sequence_embeddings, validate_arguments
from tqdm import tqdm
import os

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["ROOT_PATH"] = "/home/ncorley/protein/ProteinFunctions"

# Unpack and process the config file
config = get_setup(
    config_path='/home/ncorley/protein/ProteinFunctions/configs/base_config.yaml',
    run_name="Test",
    overrides=None,
    train_path_name="TRAIN_DATA_PATH",
    val_path_name="VAL_DATA_PATH",
    test_paths_names=["TEST_DATA_PATH"],
    amlt=False,
    is_master=True,
)
params, paths, timestamp, logger = config["params"], config[
    "paths"], config["timestamp"], config["logger"]


Logging to /home/ncorley/protein/ProteinFunctions/outputs/logs/2023-10-08_14-01-25_Test.log and console...


In [3]:
# Load or generate the vocabularies
vocabularies = get_or_generate_vocabularies(
    paths["FULL_DATA_PATH"], paths["VOCABULARIES_DIR"], logger)

# Initialize label tokenizer
label_tokenizer = AutoTokenizer.from_pretrained(
    params['LABEL_ENCODER_CHECKPOINT'])

# Create datasets
datasets = ProteinDataset.create_multiple_datasets(
    config['dataset_paths_list'],
    label_tokenizer=label_tokenizer,
    vocabularies=vocabularies,
    subset_fractions={
        "train": params["TRAIN_SUBSET_FRACTION"],
        "validation": params["VALIDATION_SUBSET_FRACTION"],
        "test": params["TEST_SUBSET_FRACTION"]}
)

2023-10-08 14:01:28 PDT INFO Loaded amino_acid_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/amino_acid_vocab.json
2023-10-08 14:01:28 PDT INFO Loaded GO_label_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/GO_label_vocab.json
2023-10-08 14:01:28 PDT INFO Loaded GO_label_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/GO_label_vocab.json
2023-10-08 14:01:29 PDT INFO Loaded sequence_id_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/sequence_id_vocab.json


In [4]:
# Define label sample sizes for train, validation, and test loaders
label_sample_sizes = {
    "train": params["TRAIN_LABEL_SAMPLE_SIZE"],
    "validation": params["VALIDATION_LABEL_SAMPLE_SIZE"],
    "test": None  # No sampling for the test set
}

# Define data loaders
loaders = create_multiple_loaders(
    datasets,
    params,
    label_sample_sizes=label_sample_sizes,
    num_workers=params["NUM_WORKERS"],
    world_size=1,
    rank=0,
)

In [5]:
label2int = datasets[list(datasets.keys())[0]][0].label2int
int2label = datasets[list(datasets.keys())[0]][0].int2label
label_annotation_map = datasets[list(datasets.keys())[
    0]][0].label_annotation_map

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load label encoder
label_encoder = AutoModel.from_pretrained(
    params['LABEL_ENCODER_CHECKPOINT'])

# # Generate all label embeddings upfront, if not training the label encoder
# label_embedding_matrix = None
# if not params["TRAIN_LABEL_ENCODER"]:
#     # Create a list of text labels
#     sorted_labels = sorted(
#         vocabularies["GO_label_vocab"], key=lambda x: label2int[x])
#     label_annotations = [label_annotation_map[label_id]
#                             for label_id in sorted_labels]
#     label_encoder = label_encoder.to(device)
#     paths["LABEL_EMBEDDING_PATH"] = None  
#     label_embedding_matrix = get_or_generate_label_embeddings(
#         paths,
#         device,
#         label_annotations,
#         label_tokenizer,
#         label_encoder,
#         logger,
#         LABEL_BATCH_SIZE_LIMIT_NO_GRAD=params["LABEL_BATCH_SIZE_LIMIT_NO_GRAD"]
#     )
#     # Move the label encoder to CPU
#     label_encoder = label_encoder.cpu()

#     import time

# FILEPATH: /home/ncorley/protein/ProteinFunctions/notebooks/datasets.ipynb
from src.utils.models import generate_label_embeddings_from_text
sorted_labels = sorted(
        vocabularies["GO_label_vocab"], key=lambda x: label2int[x])
label_annotations = [label_annotation_map[label_id]
                            for label_id in sorted_labels]
print(len(label_annotations))


32102


In [None]:
label_id = "GO:0000001"

In [6]:
from src.utils.models import tokenize_labels
label_annotations = [label_annotation_map[label_id]
                            for label_id in sorted_labels]
sampled_label_annotations = label_annotations[:2000]
print(len(sampled_label_annotations))
tokenized_labels = tokenize_labels(sampled_label_annotations, label_tokenizer)

2000


In [7]:
# Clear the cache
torch.cuda.empty_cache()

# Print memory usage
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_allocated() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_reserved() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3:.2f} GB")

tokenized_labels["input_ids"] = tokenized_labels["input_ids"].to(device)
tokenized_labels["attention_mask"] = tokenized_labels["attention_mask"].to(device)
label_encoder = label_encoder.to(device)

# Print memory usage
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_allocated() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_reserved() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3:.2f} GB")

Memory allocated: 0.00 GB (0.00%)
Memory reserved: 0.00 GB (0.00%)
Total memory: 79.10 GB
Memory allocated: 1.30 GB (1.64%)
Memory reserved: 1.31 GB (1.65%)
Total memory: 79.10 GB


In [9]:
from src.utils.models import tokenize_labels, get_label_embeddings
from torch.cuda.amp import autocast
import psutil
import time

# Clear the cache
torch.cuda.empty_cache()

start_time = time.time()
with torch.no_grad():
    with autocast():
        label_embeddings = label_encoder(**tokenized_labels).last_hidden_state.mean(dim=1)
end_time = time.time()

print(label_embeddings.shape)
# print label_embeddings memory usage
print("Memory usage of result variable: ", label_embeddings.element_size() *
        label_embeddings.nelement() / (1024 * 1024), " MB")

torch.cuda.empty_cache()

# Print memory usage
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_allocated() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024 ** 3:.2f} GB ({100 * torch.cuda.memory_reserved() / torch.cuda.get_device_properties(0).total_memory:.2f}%)")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3:.2f} GB")

# 38 seconds for 20k labels w/ batch size of 600

torch.Size([2000, 1024])
Memory usage of result variable:  7.8125  MB
Memory allocated: 1.31 GB (1.66%)
Memory reserved: 1.64 GB (2.08%)
Total memory: 79.10 GB


In [10]:
from src.utils.models import generate_label_embeddings_from_text
# Move model to GPU
label_encoder = label_encoder.to(device)
sampled_label_annotations = label_annotations[:32000]
with torch.no_grad():
    embeddings = generate_label_embeddings_from_text(
        sampled_label_annotations,
        label_tokenizer,
        label_encoder,
        batch_size_limit=1500,
    )

In [None]:
# Initialize ProteInfer
sequence_encoder = ProteInfer.from_pretrained(
    weights_path=paths["PROTEINFER_WEIGHTS_PATH"],
    num_labels=config["embed_sequences_params"]["PROTEINFER_NUM_LABELS"],
    input_channels=config["embed_sequences_params"]["INPUT_CHANNELS"],
    output_channels=config["embed_sequences_params"]["OUTPUT_CHANNELS"],
    kernel_size=config["embed_sequences_params"]["KERNEL_SIZE"],
    activation=torch.nn.ReLU,
    dilation_base=config["embed_sequences_params"]["DILATION_BASE"],
    num_resnet_blocks=config["embed_sequences_params"]["NUM_RESNET_BLOCKS"],
    bottleneck_factor=config["embed_sequences_params"]["BOTTLENECK_FACTOR"],
)

# Generate all sequence embeddings upfront, if not training the sequence encoder
sequence_embedding_dict = None
if not params["TRAIN_SEQUENCE_ENCODER"]:
    sequence_embedding_dict = get_or_generate_sequence_embeddings(
        paths,
        device,
        sequence_encoder,
        datasets,
        params,
        logger,
    )

2023-10-04 15:03:37 PDT INFO Loaded sequence embeddings from /home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer/frozen_proteinfer_sequence_embeddings.pkl


In [None]:
# sequence_embedding_dict = read_pickle(paths["SEQUENCE_EMBEDDING_PATH"])
label_embedding_matrix = torch.load(paths["LABEL_EMBEDDING_PATH"])

In [None]:

# Initialize label tokenizer
label_tokenizer = AutoTokenizer.from_pretrained(
    params['LABEL_ENCODER_CHECKPOINT'])


# Create datasets
datasets = ProteinDataset.create_multiple_datasets(paths_list, label_tokenizer=label_tokenizer)

# Initialize new run
logger.info(
    f"################## {timestamp} RUNNING train.py ##################")

# Define label sample sizes for train, validation, and test loaders
label_sample_sizes = {
    # Assuming you have this parameter in your params dictionary
    "train": 2000,
    # Assuming you have this parameter in your params dictionary
    "validation": 100,
    "test": None  # No sampling for the test set
}

# Define data loaders
loaders = create_multiple_loaders(
    datasets,
    params,
    label_sample_sizes=label_sample_sizes,
    num_workers=0
)

In [24]:
# Load GO annotations
GO_ANNOTATIONS_PATH_OLD = "/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl"
GO_ANNOTATIONS_PATH_NEW = "/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2023_07_23.pkl"
go_annotations = read_pickle(GO_ANNOTATIONS_PATH_OLD)

In [109]:
import torch
from transformers import BioGptTokenizer, BioGptModel, set_seed

label_tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
label_encoder = BioGptModel.from_pretrained("microsoft/biogpt")
from src.utils.models import tokenize_labels, get_label_embeddings

set_seed(42)

# Get the value corresponding to the key "GO:0000001" from the df
text = [go_annotations.loc["GO:0000001"][0]]
print(text)

# checkpoint = "microsoft/biogpt"
# label_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# label_encoder = AutoModel.from_pretrained(checkpoint)

# Print the tokenized label
tokens = tokenize_labels(text, label_tokenizer)

input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]
# Get the label embeddings (average across all tokens of the last hidden state)
with torch.no_grad():
    mean_hidden_states = get_label_embeddings(
        tokens,
        label_encoder,
        1
    )
print("OUTPUT:")
print(mean_hidden_states[0])
# print(output.last_hidden_state.shape)
# print(output.last_hidden_state)
# tensor([-0.4258, -0.6187, -0.2033,  ..., -0.8992,  0.1572, -0.0666])
# tensor([[[-0.1223,  0.0990, -0.9851,  ..., -0.8175,  0.1483, -0.5747],
#          [-1.9245,  0.7768,  1.8563,  ..., -0.4368,  0.2228,  0.6075],
#          [ 0.7696, -2.7319, -1.4810,  ..., -1.4431,  0.1005, -0.2327]]])

['The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.']
OUTPUT:
tensor([-1.3425,  0.1925,  0.4544,  ..., -0.0675,  0.1730,  0.8976])


  text = [go_annotations.loc["GO:0000001"][0]]


In [30]:
# Load the label embedding matrix
path = "/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_BioGPT_label_embeddings.pkl"
label_embedding_matrix = torch.load(path)
label_embedding_matrix[0]

tensor([ 0.6444,  0.2435, -0.9323,  ..., -3.6236, -0.5964, -0.8596])

In [36]:
from src.utils.main_utils import get_or_generate_label_embeddings
import logging
paths = {
    "LABEL_EMBEDDING_PATH_3": "/home/ncorley/protein/ProteinFunctions/data/embeddings/frozen_BioGPT_label_embeddings.pkl"
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get a list of all label text from the df


matrix = get_or_generate_label_embeddings(
    paths,
    device,
    go_annotations,
    label_tokenizer,
    label_encoder,
)

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [33]:
matrix[0]

tensor([ 0.6444,  0.2435, -0.9323,  ..., -3.6236, -0.5964, -0.8596],
       device='cuda:0')

# Testing Embedding Separation

In [5]:
%load_ext autoreload
%autoreload 2

import os
import datetime
import sys
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from src.utils.models import tokenize_labels
from transformers import AutoTokenizer, AutoModel

checkpoint = "microsoft/biogpt"

label_encoder = AutoModel.from_pretrained(checkpoint)

# Initialize label tokenizer
label_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
{'input_ids': tensor([[    2,    56,    80,    21,   396,    10,     6,   370,     5,    49,
           718,     4],
        [    2,    39,    80,    22,  1603,    14,   151,    10,    49,   718,
             4,     1],
        [    2,    39,    80,  1295,    16,   360,  2829,     4,     1,     1,
             1,     1],
        [    2,    56,    80, 11648,    10,   360,  2829,     4,     1,     1,
             1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}


In [13]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def print_cosine_similarity_matrix(protein_descriptions, label_tokenizer, label_encoder, batch_size_limit=2):
    """
    Compute and print the cosine similarity matrix for a list of protein descriptions.

    Args:
    - protein_descriptions (list): List of protein descriptions.
    - label_tokenizer (Tokenizer): Tokenizer for the descriptions.
    - label_encoder (torch.nn.Module): Pretrained model to generate embeddings.
    - batch_size_limit (int): Batch size limit for embedding generation.
    """

    # Get embeddings for the tokenized descriptions
    with torch.no_grad():
        embeddings = generate_label_embeddings_from_text(protein_descriptions, label_tokenizer, label_encoder, batch_size_limit=batch_size_limit)

    # Convert the embeddings to numpy for cosine similarity computation
    embeddings_matrix = embeddings.cpu().numpy()

    # Compute the cosine similarity between the embeddings
    cosine_sim = cosine_similarity(embeddings_matrix)

    # Print as a matrix
    # Print the key for protein descriptions
    print("Key:")
    for idx, desc in enumerate(protein_descriptions, 1):
        print(f"{idx}. {desc}")
    print("\nCosine Similarity Matrix:")

    # Print the matrix header
    print("    ", end="")
    for i in range(len(protein_descriptions)):
        print(f"{i+1:4}", end="")
    print()

    # Print the matrix
    for i in range(len(protein_descriptions)):
        print(f"{i+1:2} |", end="")
        for j in range(len(protein_descriptions)):
            print(f"{cosine_sim[i][j]:.2f} ", end="")
        print()


Key:
1. This protein is involved in the process of cell signaling.
2. A protein that plays a role in cell signaling.
3. A protein responsible for muscle contraction.
4. This protein aids in muscle contraction.

Cosine Similarity Matrix:
       1   2   3   4
 1 |1.00 0.86 0.58 0.71 
 2 |0.86 1.00 0.73 0.74 
 3 |0.58 0.73 1.00 0.89 
 4 |0.71 0.74 0.89 1.00 


In [16]:
from src.utils.models import generate_label_embeddings_from_text
import torch

# protein_descriptions = [
#     "This protein is involved in the process of cell signaling.",
#     "A protein that plays a role in cell signaling.",
#     "A protein responsible for muscle contraction.",
#     "This protein aids in muscle contraction."
# ]


# Raw protein descriptions
raw_protein_descriptions = [
    "This gene product is crucial for intracellular communication mediated by signaling molecules.",
    "The protein encoded by this gene plays a pivotal role in the transmission of signals within cells.",
    "The protein expressed from this locus is involved in the correction of mismatches in DNA sequences.",
    "This gene's product is essential for repairing inaccuracies in the DNA helix."
]

# GPT-optimized protein descriptions
"""
Hello, Assistant! I'm providing you with a protein GO annotation description from SwissProt. Your task is to condense this description in a way that, when embedded with a pretrained language model, it will create a distinct and separate representation in the embedding latent space. 
- This condensed form should retain the core essence and meaning of the original description but be more concise and distinctive. 
- The most relevant and distinctive pieces of information should come at the beginning of the sentence, if possible. 
- These are all protein descriptions, so the condensed form should be a sentence that describes the function of the protein. You do not need to include the word "protein" in your condensed form.
- Do not use non-critical words like "pivotal" or "essential"
Here are the descriptions. Create one summary for each description below.
###
DESCRIPTION #1: This gene product is crucial for intracellular communication mediated by signaling molecules.
DESCRIPTION #2: The protein encoded by this gene plays a pivotal role in the transmission of signals within cells.
DESCRIPTION #3: The protein expressed from this locus is involved in the correction of mismatches in DNA sequences.
DESCRIPTION #4: This gene's product is essential for repairing inaccuracies in the DNA helix.
###
Please provide the condensed form of all descriptions.

Here is an example:
INPUT_PROTEIN_DESCRIPTION: Any process that activates or increases the frequency, rate or extent of AIM2 inflammasome complex assembly.
MODEL_OUTPUT: AIM2 inflammasome complex assembly activator or accelerator.
"""

# Raw protein descriptions
processed_protein_descriptions = [
    "Facilitates intracellular communication via signaling molecules.",
    "Key player in transmitting signals within cells.",
    "Corrects DNA sequence mismatches.",
    "Essential for DNA helix repair."
]

print_cosine_similarity_matrix(processed_protein_descriptions, label_tokenizer, label_encoder)

Key:
1. Facilitates intracellular communication via signaling molecules.
2. Key player in transmitting signals within cells.
3. Corrects DNA sequence mismatches.
4. Essential for DNA helix repair.

Cosine Similarity Matrix:
       1   2   3   4
 1 |1.00 0.76 0.57 0.62 
 2 |0.76 1.00 0.54 0.63 
 3 |0.57 0.54 1.00 0.79 
 4 |0.62 0.63 0.79 1.00 
