In [None]:
import os
import datetime
import sys
from pathlib import Path

# Put the parent of the working directory on sys.path so project-local
# packages (e.g. `src`) can be imported from this notebook.
curdir = Path(os.getcwd())
_parent_dir = str(curdir.parent.absolute())
# Guard the append: re-running the cell must not pile up duplicate entries.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import json
import logging
from src.utils.data import read_pickle
from src.utils.losses import contrastive_loss
from src.data.collators import collate_variable_sequence_length
from src.data.datasets import ProteinDataset
from src.models.ProTCL import ProTCL
import torch
import wandb


# Data paths
TRAIN_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/train_GO.fasta'
VAL_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/dev_GO.fasta'
TEST_DATA_PATH = '/home/ncorley/protein/ProteinFunctions/data/swissprot/proteinfer_splits/random/test_GO.fasta'
AMINO_ACID_VOCAB_PATH = '/home/ncorley/protein/ProteinFunctions/data/vocabularies/amino_acid_vocab.json'
GO_LABEL_VOCAB_PATH = '/home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json'

# Embedding paths
# NOTE(review): the ".pk1" extension (digit one) looks like a typo for ".pkl" — confirm against the files on disk.
LABEL_EMBEDDING_PATH = "/home/ncorley/protein/ProteinFunctions/data/embeddings/label_embeddings.pk1"
SEQUENCE_EMBEDDING_PATH = "/home/ncorley/protein/ProteinFunctions/data/embeddings/sequence_embeddings.pk1"

# Load the train / validation / test datasets with a shared sequence vocabulary
train_dataset, val_dataset, test_dataset = ProteinDataset.create_multiple_datasets(
    data_paths=[TRAIN_DATA_PATH, VAL_DATA_PATH, TEST_DATA_PATH],
    sequence_vocabulary_path=AMINO_ACID_VOCAB_PATH,
)

# Create the master label vocabulary as the union of the three datasets' label vocabularies.
# Sorting gives a deterministic order: a bare list(set(...)) depends on string hashing and
# can change between runs, which would silently change any label -> index mapping built from it.
master_label_vocabulary = sorted(
    set(train_dataset.label_vocabulary)
    | set(val_dataset.label_vocabulary)
    | set(test_dataset.label_vocabulary)
)

# Save master_label_vocabulary to the JSON file named by GO_LABEL_VOCAB_PATH
with open(GO_LABEL_VOCAB_PATH, 'w') as f:
    json.dump(master_label_vocabulary, f)

In [11]:
import os
import datetime
import sys
from pathlib import Path
import torch

# Put the parent of the working directory on sys.path so project-local
# packages (e.g. `src`) can be imported from this notebook.
curdir = Path(os.getcwd())
_parent_dir = str(curdir.parent.absolute())
# Guard the append: re-running the cell must not pile up duplicate entries.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Import read_pickle
from src.utils.data import read_pickle

# Read the pickled sequence-ID map
# NOTE(review): read_pickle presumably unpickles the file; only load pickles from a trusted source.
sequence_id_map = read_pickle('/home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer_sequence_id_map.pkl')
sequence_ids = list(sequence_id_map.keys())

# Number of entries in the map
print(len(sequence_id_map))

# First 10 keys
print(sequence_ids[:10])

# Are the keys unique?
# NOTE(review): if sequence_id_map is a plain dict this is always True — confirm the intended check.
print(len(sequence_id_map) == len(set(sequence_ids)))

# Sequence embeddings stored next to the ID map
sequence_embeddings = read_pickle('/home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer_sequence_embeddings.pkl')
print(len(sequence_embeddings))

# Older embeddings, read back with torch.load
old_sequence_embeddings = torch.load('/home/ncorley/protein/ProteinFunctions/data/embeddings/sequence_embeddings.pth')

# Number of old embeddings, for comparison with the count above
print(len(old_sequence_embeddings))

522607
['A0A023GPI8', 'A0A023IWD9', 'A0A023IWE0', 'A0A023IWE1', 'A0A023IWE2', 'A0A023IWE3', 'A0A023IWG1', 'A0A023IWG2', 'A0A023IWG3', 'A0A023IWG4']
True
522607
522607
439540


# Create new map from label ID to tokens

In [32]:
from pathlib import Path
import os
import sys
import gzip
import wget

# Put the parent of the working directory on sys.path so project-local
# packages (e.g. `src`) can be imported from this notebook.
curdir = Path(os.getcwd())
_parent_dir = str(curdir.parent.absolute())
# Guard the append: re-running the cell must not pile up duplicate entries.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)


In [33]:
# Read the GO label vocabulary back from its JSON file
import json

with open('/home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json', 'r') as vocab_file:
    label_vocab = json.load(vocab_file)

# Peek at the first 10 labels
print(label_vocab[:10])

['GO:0000001', 'GO:0000002', 'GO:0000003', 'GO:0000006', 'GO:0000007', 'GO:0000009', 'GO:0000010', 'GO:0000011', 'GO:0000012', 'GO:0000014']


In [57]:
# Load the GO annotations (2019-07-01 file) from their pickle.
# NOTE(review): later cells treat the result as a DataFrame indexed by GO id with a
# 'label' text column — confirm that is what read_pickle returns here.
from src.utils.data import read_pickle
annotations = read_pickle('/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl')

In [72]:
# Keep only the annotations whose GO id (the DataFrame index) appears in label_vocab.
# .copy() makes the result an independent frame, so columns added to it later are not
# written into a slice of the unfiltered frame (pandas' SettingWithCopyWarning case).
annotations = annotations[annotations.index.isin(label_vocab)].copy()

In [73]:
# Load PubMedBERT through the project helper.
# The helper's return value is unpacked in (tokenizer, model) order.
from src.utils.models import load_PubMedBERT
tokenizer, model = load_PubMedBERT()

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
# Smoke-test the tokenizer on two strings of very different length,
# then show the returned batch and its type
sample_texts = ["Test text This is what it is like to live! I will run forever!", "BLAH"]
inputs = tokenizer(
    sample_texts,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)
print(inputs)
print(type(inputs))

{'input_ids': tensor([[    2,  2648,  5316,  2052,  1977,  4663,  2176,  1977,  3189,  1942,
          6922,     5,    51,  2832,  4747,  9023,  2004,     5,     3],
        [    2, 18208,  1023,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [62]:
def _tokenize_label(text):
    """Tokenize one label description on its own and return the tokenizer's output."""
    return tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)


# One tokenizer call per row; the result of each call is stored in a new column
annotations.loc[:, 'tokenized_label_text'] = annotations['label'].apply(_tokenize_label)

In [81]:
# Show which columns the annotations DataFrame currently has
annotations.columns

Index(['label', 'tokenized_label_text', 'input_ids', 'attention_mask',
       'token_type_ids'],
      dtype='object')

In [74]:
# Tokenize every label description in a single batched call
label_texts = annotations['label'].tolist()
tokenized_outputs = tokenizer(
    label_texts,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)


275

In [None]:
# Assign the tokenized outputs to the dataframe, one tensor per row.
# The batched tokenizer call returns each field as a single 2-D tensor (rows x padded length),
# which cannot be stored in one DataFrame column as-is. list(...) iterates over the first
# dimension and yields one 1-D tensor per label, giving exactly one value per row.
annotations['input_ids'] = list(tokenized_outputs['input_ids'])
annotations['attention_mask'] = list(tokenized_outputs['attention_mask'])
# token_type_ids is only assigned when the tokenizer returned it
if 'token_type_ids' in tokenized_outputs:
    annotations['token_type_ids'] = list(tokenized_outputs['token_type_ids'])

In [66]:
import torch


def _detached_copy(tensor):
    """Return a copy of `tensor` that is detached from any autograd graph."""
    return tensor.clone().detach()


# Split each row's tokenizer output into one column per field
encodings = annotations['tokenized_label_text']
annotations['input_ids'] = encodings.apply(lambda enc: _detached_copy(enc['input_ids']))
annotations['attention_mask'] = encodings.apply(lambda enc: _detached_copy(enc['attention_mask']))
# Fall back to zeros shaped like input_ids when a row has no token_type_ids
annotations['token_type_ids'] = encodings.apply(
    lambda enc: _detached_copy(enc.get('token_type_ids', torch.zeros_like(enc['input_ids'])))
)

# Keep only the extracted columns: drop the raw text and the tokenizer output objects
tokenized_annotations = annotations.drop(columns=['label', 'tokenized_label_text'])

# Persist the result next to the source annotations
tokenized_annotations.to_pickle('/home/ncorley/protein/ProteinFunctions/data/annotations/tokenized_go_annotations_2019_07_01.pkl')


In [68]:
import pandas as pd

# Read the tokenized annotations back from disk
df = pd.read_pickle('/home/ncorley/protein/ProteinFunctions/data/annotations/tokenized_go_annotations_2019_07_01.pkl')

# Build one {index value -> column value} dictionary per token field
input_ids_dict, attention_mask_dict, token_type_ids_dict = (
    df[column].to_dict()
    for column in ('input_ids', 'attention_mask', 'token_type_ids')
)

32102


In [71]:
# Number of entries in the token_type_ids dictionary
print(len(token_type_ids_dict))

32102


# Transformers

In [18]:
# Name of the pretrained PubMedBERT checkpoint passed to from_pretrained() in the cells below
checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

In [25]:
# Step 1: turn raw text into a padded batch of token ids
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Three sentences of very different lengths, so the padding is visible in the output
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life. What a relief. Wow. Crazy.",
    "Meh",
    "I wonder how many I can use before it breaks."
]

inputs = tokenizer(
    raw_inputs,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)
print(inputs)
type(inputs)

{'input_ids': tensor([[    2,    51,    11,  3428,  2252, 15215,  1958,    43,  5897,  3312,
          1941,  6016,  1972,  4914,  3057,  4292,  3308,    18,  4663,    43,
         13104,    18, 26334,  1018,    18, 24422,  1011,  1012,    18,     3],
        [    2,  2142,  1023,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    2,    51, 24184,  2307,  3086,    51,  2112,  2485,  3119,  2176,
         13065,    18,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0

transformers.tokenization_utils_base.BatchEncoding

In [26]:
# Step 2: Go through the model
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(checkpoint)

# Inference only: no_grad() stops autograd from recording the forward pass,
# so no graph is kept alive alongside the outputs.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([3, 30, 768])


In [32]:
# Embeddings: take the hidden state of the first token of every sequence
last_hidden_state = outputs.last_hidden_state
embeddings = last_hidden_state[:, 0]
embeddings.shape

torch.Size([3, 768])

# Example

In [3]:
from pathlib import Path
import os
import sys
# Put the parent of the working directory on sys.path so project-local
# packages (e.g. `src`) can be imported from this notebook.
curdir = Path(os.getcwd())
_parent_dir = str(curdir.parent.absolute())
# Guard the append: re-running the cell must not pile up duplicate entries.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Initialize a simple ProteinDataset
from src.data.datasets import ProteinDataset
from src.utils.data import read_pickle, get_vocab_mappings, read_json

# Load the GO label vocabulary and build the label <-> integer mappings
label_vocab = read_json('/home/ncorley/protein/ProteinFunctions/data/vocabularies/GO_label_vocab.json')

label2int, int2label = get_vocab_mappings(label_vocab)

# The integer ids must be exactly 0, 1, ..., n-1 with no gaps or repeats.
# Comparing the sorted values against range(n) checks all of that at once; the previous
# len == max + 1 test also passed for duplicated ids (e.g. values 1, 1) and raised an
# unrelated ValueError on an empty mapping.
assert sorted(label2int.values()) == list(range(len(label2int))), "label2int values must be sequential and start at 0"

In [4]:
from src.utils.data import read_pickle, read_json

# Read the free-text GO annotations
annotations = read_pickle(
    '/home/ncorley/protein/ProteinFunctions/data/annotations/go_annotations_2019_07_01.pkl')

# Restrict to GO ids (the DataFrame index) present in label_vocab; copy so the
# result can be modified independently of `annotations`
is_in_vocab = annotations.index.isin(label_vocab)
filtered_annotations = annotations[is_in_vocab].copy()

# Attach each label's integer id, looked up from label2int by GO id
filtered_annotations['numeric_id'] = filtered_annotations.index.map(label2int)

In [9]:
# Order the annotations by integer id so that list position follows numeric_id order
# NOTE(review): position equals numeric_id only if every vocabulary label has an annotation — confirm.
annotations_sorted = filtered_annotations.sort_values('numeric_id')

# Label texts, in that order
sorted_labels = annotations_sorted['label'].to_list()

checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

In [10]:
from src.utils.models import load_model_and_tokenizer, tokenize_inputs, get_embeddings_from_tokens
from torch.utils.data import DataLoader, TensorDataset

# Load the model (freeze_weights=True) with its tokenizer, then tokenize all label texts
tokenizer, model = load_model_and_tokenizer(checkpoint, freeze_weights=True)
model_inputs = tokenize_inputs(tokenizer, sorted_labels)

# Labels per batch; raise it if the GPU has memory to spare
batch_size = 200

# Wrap the tokenizer's tensors in a dataset and serve it in fixed-size batches
token_dataset = TensorDataset(*model_inputs.values())
dataloader = DataLoader(token_dataset, batch_size=batch_size)

embeddings = get_embeddings_from_tokens(model, dataloader, train_model=False)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Enable IPython's autoreload extension (mode 2) so edited modules are re-imported automatically
%load_ext autoreload
%autoreload 2

# Name of the pretrained PubMedBERT checkpoint
checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

def test_protein_embeddings():
    """Print pairwise cosine similarities between embeddings of four protein descriptions.

    The descriptions form two paraphrase pairs (cell signaling and muscle
    contraction). Nothing is asserted: the threshold checks at the end are
    disabled, so the function only prints every pair with its similarity.
    """
    protein_descriptions = [
        "This protein is involved in the process of cell signaling.",
        "A protein that plays a role in cell signaling.",
        "A protein responsible for muscle contraction.",
        "This protein aids in muscle contraction."
    ]

    # Alternative, more technical descriptions (disabled):
    # protein_descriptions = [
    #     "This gene product is crucial for intracellular communication mediated by signaling molecules.",
    #     "The protein encoded by this gene plays a pivotal role in the transmission of signals within cells.",
    #     "The protein expressed from this locus is involved in the correction of mismatches in DNA sequences.",
    #     "This gene's product is essential for repairing inaccuracies in the DNA helix."
    # ]

    # Model and tokenizer for the PubMedBERT checkpoint
    checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
    tokenizer, model = load_model_and_tokenizer(checkpoint)

    # Text -> tokens -> one embedding per description
    tokens = tokenize_inputs(tokenizer, protein_descriptions)
    embeddings = get_embeddings_from_tokens(model, tokens)

    # Move to CPU and convert to numpy before handing the matrix to scikit-learn
    cosine_sim = cosine_similarity(embeddings.cpu().numpy())

    # Report every unordered pair (i < j) exactly once
    report_template = "Similarity between: \n\t{} \n\t{} \n\tis: {}"
    n_descriptions = len(protein_descriptions)
    index_pairs = [
        (first, second)
        for first in range(n_descriptions)
        for second in range(first + 1, n_descriptions)
    ]
    for first, second in index_pairs:
        print(report_template.format(protein_descriptions[first],
                                     protein_descriptions[second],
                                     cosine_sim[first][second]))

    # Expected relationships between the descriptions (checks currently disabled):
    # assert cosine_sim[0][1] > 0.8  # Similar descriptions about cell signaling
    # assert cosine_sim[2][3] > 0.8  # Similar descriptions about muscle contraction
    # assert cosine_sim[0][2] < 0.5  # Different descriptions
    # assert cosine_sim[1][3] < 0.5  # Different descriptions

# Run the check; it prints one similarity score per pair of descriptions
test_protein_embeddings()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Similarity between: 
	This protein is involved in the process of cell signaling. 
	A protein that plays a role in cell signaling. 
	is: 0.9465829730033875
Similarity between: 
	This protein is involved in the process of cell signaling. 
	A protein responsible for muscle contraction. 
	is: 0.9293946027755737
Similarity between: 
	This protein is involved in the process of cell signaling. 
	This protein aids in muscle contraction. 
	is: 0.973195493221283
Similarity between: 
	A protein that plays a role in cell signaling. 
	A protein responsible for muscle contraction. 
	is: 0.9717705249786377
Similarity between: 
	A protein that plays a role in cell signaling. 
	This protein aids in muscle contraction. 
	is: 0.9410324692726135
Similarity between: 
	A protein responsible for muscle contraction. 
	This protein aids in muscle contraction. 
	is: 0.9525930285453796


: 

In [None]:
# 