In [44]:
import os
import datetime
import sys
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

import logging
from src.utils.data import (
    load_model_weights,
    seed_everything,
    read_pickle,
)
from src.data.datasets import ProteinDataset, calculate_pos_weight, create_multiple_loaders
from src.models.ProTCLTrainer import ProTCLTrainer
from src.models.ProTCL import ProTCL
from src.models.protein_encoders import ProteInfer
from src.utils.evaluation import EvalMetrics, save_evaluation_results
from src.utils.models import count_parameters_by_layer, get_label_embeddings
from src.utils.configs import get_setup
import torch
import wandb
import os
import argparse
import json
from transformers import AutoTokenizer, AutoModel, BatchEncoding

%load_ext autoreload
%autoreload 2

# Data paths
TRAIN_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/train_GO.fasta'
VAL_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/dev_GO.fasta'
TEST_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/test_GO.fasta'
AMINO_ACID_VOCAB_PATH = '/home/ncorley/protein/ProteinFunctions/data/vocabularies/amino_acid_vocab.json'
GO_LABEL_VOCAB_PATH = '/home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json'

import os

os.environ["ROOT_PATH"] = "/home/ncorley/protein/ProteinFunctions"

(config, params, paths, paths_list, timestamp, logger, device, ROOT_PATH) = get_setup(
    config_path='/home/ncorley/protein/ProteinFunctions/configs/base_config.yaml',
    log_to_console=True,
    run_name="Test",
    overrides=None,
    train_path_name="TRAIN_DATA_PATH",
    val_path_name="VAL_DATA_PATH",
    test_paths_names=["TEST_DATA_PATH"],
)

# Initialize label tokenizer
label_tokenizer = AutoTokenizer.from_pretrained(
    params['LABEL_ENCODER_CHECKPOINT'])


# Create datasets
datasets = ProteinDataset.create_multiple_datasets(paths_list, label_tokenizer=label_tokenizer)

# Initialize new run
logger.info(
    f"################## {timestamp} RUNNING train.py ##################")

# Define label sample sizes for train, validation, and test loaders
label_sample_sizes = {
    # Assuming you have this parameter in your params dictionary
    "train": 2000,
    # Assuming you have this parameter in your params dictionary
    "validation": 100,
    "test": None  # No sampling for the test set
}

# Define data loaders
loaders = create_multiple_loaders(
    datasets,
    params,
    label_sample_sizes=label_sample_sizes,
    num_workers=0
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Logging to console...
2023-09-23 18:20:42 PDT INFO Using device: cuda


2023-09-23 18:21:22 PDT INFO ################## 2023-09-23_18-20-42 RUNNING train.py ##################


In [45]:

%load_ext autoreload
%autoreload 2

# Perform basic testing on train dataset
train_dataset = datasets["train"][0]

# Get one item from the dataset
sample = train_dataset[0]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:
train_loader = loaders['train'][0]
val_loader = loaders['validation'][0]
test_loader = loaders['test'][0]

%load_ext autoreload
%autoreload 2

# Get one batch from train loader
result = next(iter(train_loader))


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [66]:
result

{'sequence_onehots': tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         ...,
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0.

Bad pipe message: %s [b'\xbc]H\x15', b'd(C\xa8E\x06n\xec\xe8/X\xbf \xe1f\x1d\x8f\xc9\xc8\x07\x00\xc05\xb4iS\x11\x02B\xf9\\\x92\xabw\xab\x90\xfe\xc0\x9dK\xfc\xabaO\xa9\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003']
Bad pipe message: %s [b'\xe6\x87[\xb5\xfe\xc7\x0c\x86\xc9\xa9\xb8\xe0\x98\xaf_\r\xe6\xdf \xbb%\xc3\x03\xcag\xce8[4"\x83\xf7\xaa\xa6\xa7\xd2\xc6']
Bad pipe message: %s [b"\xd7)\x95\xc7\xe3jT\xbe:\x97nu\x89\xacgB\xc1;\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0

: 

In [11]:
print(sequence_ids.shape)
print(sequence_multihots.shape)
print(label_multihots.shape)
print(sequence_lengths.shape)
print(sampled_label_indices)

torch.Size([2])
torch.Size([2, 20, 315])
torch.Size([2, 2000])
torch.Size([2])
torch.Size([2000])


Bad pipe message: %s [b"\xb7>\x1d+\xd0\x9d\x80~\x00\xff\x02Un3\xebd\xf8\x94\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t1"]
Bad pipe message: %s [b'.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04', b'\x03\x06', b'\x07\x08']
Bad pipe message: %s [b'\t\x08\n\x08\x0b\x08\x04']
Bad pipe message: %s [b"\xc1\xab\xaad(\xae6\xeb\xd5\x9c\xa1\xfdn\x13|\xf7\xa1\xd2\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S

In [None]:
import json
import logging
from src.utils.data import read_pickle
from src.utils.losses import contrastive_loss
from src.data.collators import collate_variable_sequence_length
from src.data.datasets import ProteinDataset
from src.models.ProTCL import ProTCL
import torch
import wandb


# Data paths
TRAIN_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/train_GO.fasta'
VAL_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/dev_GO.fasta'
TEST_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/test_GO.fasta'
AMINO_ACID_VOCAB_PATH = '/home/ncorley/protein/ProteinFunctions/data/vocabularies/amino_acid_vocab.json'
GO_LABEL_VOCAB_PATH = '/home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json'

# Embedding paths
LABEL_EMBEDDING_PATH = "/home/ncorley/protein/ProteinFunctions/data/embeddings/label_embeddings.pk1"
SEQUENCE_EMBEDDING_PATH = "/home/ncorley/protein/ProteinFunctions/data/embeddings/sequence_embeddings.pk1"

# Load datasets
train_dataset, val_dataset, test_dataset = ProteinDataset\
    .create_multiple_datasets(data_paths=[TRAIN_DATA_PATH, VAL_DATA_PATH, TEST_DATA_PATH],
                              sequence_vocabulary_path=AMINO_ACID_VOCAB_PATH)

# Create label voculabary by merging sets from train_loader, val_loader, and test_loader
master_label_vocabulary = list(set(train_dataset.label_vocabulary) | set(val_dataset.label_vocabulary) | set(test_dataset.label_vocabulary))

# Save master_label_vocabulary to JSON file
with open('/home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json', 'w') as f:
    json.dump(master_label_vocabulary, f)

In [11]:
import os
import datetime
import sys
from pathlib import Path
import torch

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

# Import read_pickle
from src.utils.data import read_pickle

# Load /home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer_sequence_id_map.pkl
sequence_id_map = read_pickle('/home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer_sequence_id_map.pkl')

# Print the shape and column names
print(len(sequence_id_map))

# Print the first 10 keys
print(list(sequence_id_map.keys())[:10])

# Check if the keys are unique
print(len(sequence_id_map) == len(set(sequence_id_map.keys())))

# Load sequence embeddings 
sequence_embeddings = read_pickle('/home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer_sequence_embeddings.pkl')
print(len(sequence_embeddings))

# Load old embeddings from /home/ncorley/protein/ProteinFunctions/data/embeddings/sequence_embeddings.pth with pytorch
old_sequence_embeddings = torch.load('/home/ncorley/protein/ProteinFunctions/data/embeddings/sequence_embeddings.pth')

# Print number of old embeddings
print(len(old_sequence_embeddings))

522607
['A0A023GPI8', 'A0A023IWD9', 'A0A023IWE0', 'A0A023IWE1', 'A0A023IWE2', 'A0A023IWE3', 'A0A023IWG1', 'A0A023IWG2', 'A0A023IWG3', 'A0A023IWG4']
True
522607
522607
439540


# Create new map from label ID to tokens

In [32]:
from pathlib import Path
import os
import sys
import gzip
import wget

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))


In [33]:
# Load vocab from /home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json
import json
with open('/home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json', 'r') as f:
    label_vocab = json.load(f)

# Print the first 10 labels
print(label_vocab[:10])

['GO:0000001', 'GO:0000002', 'GO:0000003', 'GO:0000006', 'GO:0000007', 'GO:0000009', 'GO:0000010', 'GO:0000011', 'GO:0000012', 'GO:0000014']


In [57]:
# Load annotations from /home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl
from src.utils.data import read_pickle
annotations = read_pickle('/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl')

In [72]:
# Filter the annotations df to be only the labels in label_vocab. In annotations, the go id is the index
annotations = annotations[annotations.index.isin(label_vocab)]

In [73]:
# Load PubMedBERT model
from src.utils.models import load_PubMedBERT
tokenizer, model = load_PubMedBERT()

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
# Load PubMedBERT tokenizer
inputs = tokenizer(["Test text This is what it is like to live! I will run forever!", "BLAH"], return_tensors="pt",
                       truncation=True, padding=True, max_length=512)
print(inputs)
print(type(inputs))

{'input_ids': tensor([[    2,  2648,  5316,  2052,  1977,  4663,  2176,  1977,  3189,  1942,
          6922,     5,    51,  2832,  4747,  9023,  2004,     5,     3],
        [    2, 18208,  1023,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [62]:
annotations.loc[:, 'tokenized_label_text'] = annotations['label'].apply(lambda x: tokenizer(x, return_tensors="pt", truncation=True, padding=True, max_length=512))

In [81]:
annotations.columns

Index(['label', 'tokenized_label_text', 'input_ids', 'attention_mask',
       'token_type_ids'],
      dtype='object')

In [74]:
# Tokenize all labels in the dataframe in a batched manner
tokenized_outputs = tokenizer(annotations['label'].tolist(), return_tensors="pt", truncation=True, padding=True, max_length=512)


275

In [None]:
# Assign the tokenized outputs to the dataframe
annotations['input_ids'] = tokenized_outputs['input_ids']
annotations['attention_mask'] = tokenized_outputs['attention_mask']
if 'token_type_ids' in tokenized_outputs:
    annotations['token_type_ids'] = tokenized_outputs['token_type_ids']

In [66]:
import torch

# Extract input_ids and attention_mask as tensors and create new columns
annotations['input_ids'] = annotations['tokenized_label_text'].apply(lambda x: x['input_ids'].clone().detach())
annotations['attention_mask'] = annotations['tokenized_label_text'].apply(lambda x: x['attention_mask'].clone().detach())
annotations['token_type_ids'] = annotations['tokenized_label_text'].apply(lambda x: x.get('token_type_ids', torch.zeros_like(x['input_ids'])).clone().detach())

# # Drop the tokenized_label_text column
tokenized_annotations = annotations.drop(columns=['label', 'tokenized_label_text'])

# Save tokenized_annotations
tokenized_annotations.to_pickle('/home/ncorley/protein/ProteinFunctions/data/annotations/tokenized_go_annotations_2019_07_01.pkl')


In [68]:
import pandas as pd

# Load the pickled DataFrame
df = pd.read_pickle('/home/ncorley/protein/ProteinFunctions/data/annotations/tokenized_go_annotations_2019_07_01.pkl')

# Create dictionaries for each column using the DataFrame's index as the key

input_ids_dict = df['input_ids'].to_dict()
attention_mask_dict = df['attention_mask'].to_dict()
token_type_ids_dict = df['token_type_ids'].to_dict()

32102


In [71]:
print(len(token_type_ids_dict))

32102


# Transfomers

In [18]:
checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

In [25]:
# Step 1: Tokenize
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life. What a relief. Wow. Crazy.",
    "Meh",
    "I wonder how many I can use before it breaks."
]

inputs = tokenizer(raw_inputs, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(inputs)
type(inputs)

{'input_ids': tensor([[    2,    51,    11,  3428,  2252, 15215,  1958,    43,  5897,  3312,
          1941,  6016,  1972,  4914,  3057,  4292,  3308,    18,  4663,    43,
         13104,    18, 26334,  1018,    18, 24422,  1011,  1012,    18,     3],
        [    2,  2142,  1023,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    2,    51, 24184,  2307,  3086,    51,  2112,  2485,  3119,  2176,
         13065,    18,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0

transformers.tokenization_utils_base.BatchEncoding

In [26]:
# Step 2: Go through the model
from transformers import AutoModel

model = AutoModel.from_pretrained(checkpoint)

outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([3, 30, 768])


In [32]:
# Embeddings
embeddings = outputs.last_hidden_state[:,0,:]
embeddings.shape

torch.Size([3, 768])

# Example

In [3]:
from pathlib import Path
import os
import sys
curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

# Intiialize a simple ProteinDataset
from src.data.datasets import ProteinDataset
from src.utils.data import read_pickle, get_vocab_mappings, read_json

label_vocab = read_json('/home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json')

label2int, int2label = get_vocab_mappings(label_vocab)

# Assert label2int values start at 0 and do not skip any integers
assert len(label2int) == max(label2int.values()) + 1, "label2int values must be sequential and start at 0"

In [4]:
from src.utils.data import read_pickle, read_json

# Load the label vocabulary and corresponding free-text annotations
annotations = read_pickle(
    '/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl')

# Filter the annotations df to be only the labels in label_vocab. In annotations, the go id is the index
filtered_annotations = annotations[annotations.index.isin(label_vocab)].copy()


# Add a new column 'numeric_id' to the dataframe based on the id_map
filtered_annotations.loc[:, 'numeric_id'] = filtered_annotations.index.map(label2int)

In [9]:
# Sort the dataframe by 'numeric_id'
annotations_sorted = filtered_annotations.sort_values(by='numeric_id')

# Extract the "label" column as a list
sorted_labels = annotations_sorted['label'].tolist()

checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

In [10]:
from src.utils.models import load_model_and_tokenizer, tokenize_inputs, get_cls_embeddings_from_tokenss
from torch.utils.data import DataLoader, TensorDataset

# Example usage:

tokenizer, model = load_model_and_tokenizer(checkpoint, freeze_weights=True)
model_inputs = tokenize_inputs(tokenizer, sorted_labels)

# Define the batch size
batch_size = 200  # Try increasing the batch size if your GPU has enough memory

# Create a DataLoader to iterate over the dataset in batches
dataloader = DataLoader(TensorDataset(*model_inputs.values()), batch_size=batch_size)

embeddings = get_cls_embeddings_from_tokenss(model, dataloader, train_model=False)

In [28]:
%load_ext autoreload
%autoreload 2

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from src.utils.models import get_cls_embeddings_from_tokens, load_HF_tokenizer, load_HF_model, tokenize_inputs

protein_descriptions = [
    "This protein is involved in the process of cell signaling.",
    "A protein that plays a role in cell signaling.",
    "A protein responsible for muscle contraction.",
    "This protein aids in muscle contraction."
]

# protein_descriptions = [
#     "This gene product is crucial for intracellular communication mediated by signaling molecules.",
#     "The protein encoded by this gene plays a pivotal role in the transmission of signals within cells.",
#     "The protein expressed from this locus is involved in the correction of mismatches in DNA sequences.",
#     "This gene's product is essential for repairing inaccuracies in the DNA helix."
# ]

# Load the tokenizer
checkpoint = "microsoft/biogpt"
tokenizer = load_HF_tokenizer(checkpoint)

# Load the model
model = load_HF_model(checkpoint)

# Tokenize the protein descriptions
tokens = tokenize_inputs(tokenizer, protein_descriptions)

# print(tokens)
# print(type(tokens))

# # Get embeddings for the tokenized descriptions
embeddings = get_cls_embeddings_from_tokens(model, tokens)

# Convert the embeddings to numpy for cosine similarity computation
embeddings_matrix = embeddings.cpu().numpy()

# Compute the cosine similarity between the embeddings
cosine_sim = cosine_similarity(embeddings_matrix)

# Print all the cosine similarities and their corresponding protein descriptions
for i in range(len(protein_descriptions)):
    for j in range(i + 1, len(protein_descriptions)):
        print("Similarity between: \n\t{} \n\t{} \n\tis: {}".format(protein_descriptions[i],
                                                                        protein_descriptions[j],
                                                                        cosine_sim[i][j]))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Similarity between: 
	This protein is involved in the process of cell signaling. 
	A protein that plays a role in cell signaling. 
	is: 1.0000001192092896
Similarity between: 
	This protein is involved in the process of cell signaling. 
	A protein responsible for muscle contraction. 
	is: 1.0000001192092896
Similarity between: 
	This protein is involved in the process of cell signaling. 
	This protein aids in muscle contraction. 
	is: 1.0000001192092896
Similarity between: 
	A protein that plays a role in cell signaling. 
	A protein responsible for muscle contraction. 
	is: 1.0000001192092896
Similarity between: 
	A protein that plays a role in cell signaling. 
	This protein aids in muscle contraction. 
	is: 1.0000001192092896
Similarity between: 
	A protein responsible for muscle contraction. 
	This protein aids in muscle contraction. 
	is: 1.0000001192092896


In [35]:
from transformers import BioGptTokenizer, BioGptForCausalLM

# Load tokenizer and model
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

# Tokenize the input text
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')

# Get the model's output
output = model(**encoded_input)

# Extract the logits (token embeddings) from the output
logits = output.logits

# Compute the average embedding
average_embedding = logits.mean(dim=1)

# The result is a tensor of shape (batch_size, hidden_size), where hidden_size is the size of the embeddings (e.g., 768 for base models).
print(average_embedding.shape)


torch.Size([1, 42384])


In [57]:
# Load /home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl
from src.utils.data import read_pickle
annotations = read_pickle('/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl')
annotations.head(20)

Unnamed: 0,label
GO:0000001,"The distribution of mitochondria, including th..."
GO:0000002,The maintenance of the structure and integrity...
GO:0000003,The production of new individuals that contain...
GO:0000005,OBSOLETE. Assists in the correct assembly of r...
GO:0000006,Enables the transfer of zinc ions (Zn2+) from ...
GO:0000007,Enables the transfer of a solute or solutes fr...
GO:0000008,OBSOLETE. A small disulfide-containing redox p...
GO:0000009,Catalysis of the transfer of a mannose residue...
GO:0000010,Catalysis of the reaction: all-trans-hexapreny...
GO:0000011,The distribution of vacuoles into daughter cel...


In [59]:
%load_ext autoreload
%autoreload 2

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import os
import sys
import torch 

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

from src.utils.models import get_aggregated_embeddings_from_tokens

protein_descriptions = [
    "Catalysis of the reaction: lactose + H2O = D-glucose + D-galactose",
    "Lactose galactohydrolase activity",
    "The chemical reactions and pathways resulting in the formation of L-glutamate, the L enantiomer anion of 2-aminopentanedioic acid",
    "L-glutamate anabolism",
]

# Load the tokenizer
checkpoint = "microsoft/biogpt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the model
model = AutoModel.from_pretrained(checkpoint)

with torch.no_grad():
    # Tokenize the protein descriptions
    tokens = tokenizer(protein_descriptions, padding=True, truncation=True, return_tensors="pt")

    outputs = model(**tokens)

    # Get embeddings for the tokenized descriptions
    embeddings = outputs['last_hidden_state']

    # Take the last token
    averaged_embeddings = embeddings.mean(dim=1)

    print(averaged_embeddings.shape)  # This should print torch.Size([4, 1024])

    # Convert the averaged embeddings to numpy for cosine similarity computation
    embeddings_matrix = averaged_embeddings.cpu().numpy()

# Compute the cosine similarity between the embeddings
cosine_sim = cosine_similarity(embeddings_matrix)

# Print all the cosine similarities and their corresponding protein descriptions
for i in range(len(protein_descriptions)):
    for j in range(i + 1, len(protein_descriptions)):
        print("Similarity between: \n\t{} \n\t{} \n\tis: {}".format(protein_descriptions[i],
                                                                        protein_descriptions[j],
                                                                        cosine_sim[i][j]))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
torch.Size([4, 1024])
Similarity between: 
	Catalysis of the reaction: lactose + H2O = D-glucose + D-galactose 
	Lactose galactohydrolase activity 
	is: 0.622012197971344
Similarity between: 
	Catalysis of the reaction: lactose + H2O = D-glucose + D-galactose 
	The chemical reactions and pathways resulting in the formation of L-glutamate, the L enantiomer anion of 2-aminopentanedioic acid 
	is: 0.5596150755882263
Similarity between: 
	Catalysis of the reaction: lactose + H2O = D-glucose + D-galactose 
	L-glutamate anabolism 
	is: 0.6240302920341492
Similarity between: 
	Lactose galactohydrolase activity 
	The chemical reactions and pathways resulting in the formation of L-glutamate, the L enantiomer anion of 2-aminopentanedioic acid 
	is: 0.2663598656654358
Similarity between: 
	Lactose galactohydrolase activity 
	L-glutamate anabolism 
	is: 0.7775448560714722
Similarity between: 
	The chemical reac

Bad pipe message: %s [b'\x16\x9c\rN\xf5`\x18f\xf4Ji\x15l`\xc2]\xc4\xf2 K\xd6\xd4\x88\xfeb\x03\x81\xc6D\xa6\xbf_)}d\xbaR|J\xdc\xf5 T\xeb\xcd\x11y\x1b\x11k\xc5\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0']
Bad pipe message: %s [b'.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04']
Bad pipe message: %s [b'\x03\x06', b'\x07\x08']
Bad pipe message: %s [b'\t\x08\n\x08\x0b\x08\x04']
Bad pipe message: %s [b'\x08\x06\x04\x01\x05\x01\x06', b'']
Bad pipe message: %s [b"\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 d\xbe'n\xdd\xef\xbc\xb9Z\x1b\xb9A\x81\xde\xae\xf9\x9f\xe7\xe4i\xfd\xb1"]
Bad pipe message: %s [b'\xff\x9ae!h\x90J+\x9f\x83\x15W\xbd\xb6L\xacR=\x00\x00', b",\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\x

In [60]:
from transformers import AutoTokenizer, BioGptModel
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptModel.from_pretrained("microsoft/biogpt")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

In [74]:
last_hidden_states.shape
# Compute the average embedding
average_embedding = last_hidden_states.mean(dim=1)
average_embedding.sum()

tensor(-22.7061, grad_fn=<SumBackward0>)

In [72]:
average_embedding.sum()

tensor(-22.7061, grad_fn=<SumBackward0>)

# New approach

In [3]:
import os
import datetime
import sys
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

from src.utils.data import read_pickle

from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

model = AutoModel.from_pretrained("microsoft/biogpt")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

In [15]:

annotations = read_pickle('/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl')

In [16]:
annotations.head()

Unnamed: 0_level_0,label
numeric_id,Unnamed: 1_level_1
1,"The distribution of mitochondria, including th..."
2,The maintenance of the structure and integrity...
3,The production of new individuals that contain...
5,OBSOLETE. Assists in the correct assembly of r...
6,Enables the transfer of zinc ions (Zn2+) from ...


In [7]:
# Get a list of the first 1000 labels and their corresponding tokenized outputs
labels = annotations['label'].tolist()[:100]
tokenized_outputs = tokenizer(labels, return_tensors="pt", truncation=True, padding=True, max_length=512)

In [14]:
from torch.cuda.amp import autocast

# Now compute the embeddings for the tokenized outputs
with torch.set_grad_enabled(True), autocast():
    outputs = model(**tokenized_outputs)

In [6]:
import torch

# Define the label_multihots tensor
label_multihots = torch.tensor([[0, 1, 0],
                                [1, 1, 1],
                                [0, 0, 1]])

label_indices_list = [torch.nonzero(row).squeeze() for row in label_multihots]
print(label_indices_list)

[tensor(1), tensor([0, 1, 2]), tensor(2)]


In [11]:
print(torch.nonzero(label_multihots[2]))
print(torch.nonzero(label_multihots[2]).squeeze())

tensor([[2]])
tensor(2)
