In [1]:
import os
import datetime
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably so that the `src` package imported below can be resolved).
curdir = Path.cwd()
parent_dir = str(curdir.parent.absolute())
sys.path.append(parent_dir)

import logging
from src.utils.data import (
    load_model_weights,
    seed_everything,
    read_pickle,
)
from src.data.datasets import ProteinDataset, calculate_pos_weight, create_multiple_loaders
from src.models.ProTCLTrainer import ProTCLTrainer
from src.models.ProTCL import ProTCL
from src.models.protein_encoders import ProteInfer
from src.utils.evaluation import EvalMetrics, save_evaluation_results
from src.utils.models import count_parameters_by_layer, get_label_embeddings
from src.utils.configs import get_setup
import torch
import wandb
import os
import argparse
import json
from transformers import AutoTokenizer, AutoModel, BatchEncoding
from src.utils.main_utils import get_or_generate_vocabularies,  get_or_generate_label_embeddings, get_or_generate_sequence_embeddings, validate_arguments
from tqdm import tqdm
import os

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project root. A ROOT_PATH that is already exported in the environment is
# honoured; otherwise the original checkout location is used and exported, so
# the notebook can run on another machine without editing this cell.
ROOT_PATH = os.environ.setdefault(
    "ROOT_PATH", "/home/ncorley/protein/ProteinFunctions")

# Unpack and process the config file. The config path is derived from
# ROOT_PATH instead of repeating the absolute path in a second literal.
config = get_setup(
    config_path=os.path.join(ROOT_PATH, "configs", "base_config.yaml"),
    run_name="Test",
    overrides=None,
    train_path_name="TRAIN_DATA_PATH",
    val_path_name="VAL_DATA_PATH",
    test_paths_names=["TEST_DATA_PATH"],
    amlt=False,
    is_master=True,
)

# Pull out the pieces of the setup that the following cells use directly.
params = config["params"]
paths = config["paths"]
timestamp = config["timestamp"]
logger = config["logger"]


Logging to /home/ncorley/protein/ProteinFunctions/outputs/logs/2023-10-04_15-02-22_Test.log and console...


In [3]:
# Load the vocabularies, or generate them when they are not available yet
vocabularies = get_or_generate_vocabularies(
    paths["FULL_DATA_PATH"],
    paths["VOCABULARIES_DIR"],
    logger,
)

# Tokenizer belonging to the configured label-encoder checkpoint
label_checkpoint = params['LABEL_ENCODER_CHECKPOINT']
label_tokenizer = AutoTokenizer.from_pretrained(label_checkpoint)

# Per-split subset fractions, read from the run parameters
subset_fraction_params = {
    "train": "TRAIN_SUBSET_FRACTION",
    "validation": "VALIDATION_SUBSET_FRACTION",
    "test": "TEST_SUBSET_FRACTION",
}
subset_fractions = {
    split: params[param_name]
    for split, param_name in subset_fraction_params.items()
}

# Build the datasets for every path listed in the config
datasets = ProteinDataset.create_multiple_datasets(
    config['dataset_paths_list'],
    label_tokenizer=label_tokenizer,
    vocabularies=vocabularies,
    subset_fractions=subset_fractions,
)

2023-10-04 15:02:26 PDT INFO Loaded amino_acid_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/amino_acid_vocab.json
2023-10-04 15:02:26 PDT INFO Loaded GO_label_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/GO_label_vocab.json
2023-10-04 15:02:26 PDT INFO Loaded sequence_id_vocab vocabulary from /home/ncorley/protein/ProteinFunctions/data/vocabularies/proteinfer/sequence_id_vocab.json


2023-10-04 15:02:33 PDT INFO Subsetting 1.0% of the train set...
2023-10-04 15:02:47 PDT INFO Subsetting 10.0% of the validation set...
2023-10-04 15:03:00 PDT INFO Subsetting 10.0% of the test set...


In [4]:
# Label sample sizes per split; the test split uses None (no sampling)
label_sample_sizes = dict(
    train=params["TRAIN_LABEL_SAMPLE_SIZE"],
    validation=params["VALIDATION_LABEL_SAMPLE_SIZE"],
    test=None,
)

# Keyword settings for the loaders: worker count from params, and
# world_size=1 / rank=0 as fixed values for this notebook
loader_options = dict(
    label_sample_sizes=label_sample_sizes,
    num_workers=params["NUM_WORKERS"],
    world_size=1,
    rank=0,
)

# Build the data loaders for all datasets
loaders = create_multiple_loaders(datasets, params, **loader_options)

In [5]:
# The label mappings are taken from the first dataset stored under the first
# key of `datasets`.
reference_dataset = datasets[list(datasets.keys())[0]][0]
label2int = reference_dataset.label2int
int2label = reference_dataset.int2label
label_annotation_map = reference_dataset.label_annotation_map

# Use the GPU when one is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Label encoder loaded from the configured checkpoint
label_encoder = AutoModel.from_pretrained(params['LABEL_ENCODER_CHECKPOINT'])

# When the label encoder is not being trained, obtain every label embedding
# once up front; otherwise the matrix stays None.
label_embedding_matrix = None
if not params["TRAIN_LABEL_ENCODER"]:
    # GO labels ordered by their integer index, then mapped to annotation text
    sorted_labels = sorted(
        vocabularies["GO_label_vocab"], key=label2int.__getitem__)
    label_annotations = list(
        map(label_annotation_map.__getitem__, sorted_labels))

    # The encoder is placed on `device` only for the duration of this call
    label_encoder = label_encoder.to(device)
    label_embedding_matrix = get_or_generate_label_embeddings(
        paths,
        device,
        label_annotations,
        label_tokenizer,
        label_encoder,
        logger,
        label_batch_size_limit=params["LABEL_BATCH_SIZE_LIMIT"],
    )
    # Move the label encoder back to the CPU
    label_encoder = label_encoder.cpu()

2023-10-04 15:03:22 PDT INFO Loaded label embeddings from /home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer/frozen_BioGPT_label_embeddings.pkl


In [27]:
# Scratch cell: read the pickled sequence embeddings stored in the
# "...sequence_embeddings2.pkl" file with the project's read_pickle helper.
# NOTE(review): absolute, user-specific path; consider deriving it from `paths`.
save_path = '/home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer/frozen_proteinfer_sequence_embeddings2.pkl'
sequence_embedding_dict = read_pickle(save_path)


In [36]:
# Select the entry labelled 'Q87BZ2' and convert its values to a torch tensor.
# NOTE(review): the use of `.loc` / `.values` means `sequence_embedding_dict`
# must be a pandas object at this point, despite its name.
torch.tensor(sequence_embedding_dict.loc['Q87BZ2'].values)

tensor([ 1.0928, -0.0995, -2.6018,  ...,  3.2864,  0.9259,  0.6427])

In [11]:
import pandas as pd
# Build a DataFrame from `sequence_embedding_dict`, using its keys as the row
# index (orient='index').
# NOTE(review): `df` is not referenced by any other visible cell.
df = pd.DataFrame.from_dict(sequence_embedding_dict, orient='index')

In [37]:
# Display the embeddings object held in `sequence_embedding_dict`.
# NOTE(review): this renders the whole object; a small slice would keep the
# saved notebook output short.
sequence_embedding_dict

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099
Q87BZ2,1.092764,-0.099463,-2.601835,2.458945,-0.339574,2.064416,-0.848034,-1.263663,-0.666355,-0.584040,...,2.465072,2.808122,-0.712308,2.776648,0.593045,0.510704,-2.817227,3.286357,0.925872,0.642687
B2I618,1.092764,-0.099463,-2.601835,2.458945,-0.339574,2.064416,-0.848034,-1.263663,-0.666355,-0.584040,...,2.465072,2.808122,-0.712308,2.776648,0.593045,0.510704,-2.817227,3.286357,0.925872,0.642687
O00743,-0.188256,-0.415908,-1.294522,-0.278369,0.395180,-0.067900,0.195210,0.293406,1.203846,-0.466863,...,0.379824,1.035259,-0.067635,-0.234045,0.263829,0.450876,0.246535,-0.981389,0.572166,0.408862
Q96CS3,-0.847522,1.361721,0.534706,-0.530007,-0.516629,0.764049,-0.267206,-0.561586,0.207192,0.955521,...,1.399858,0.312532,-1.479529,0.621946,0.767535,-0.652033,-0.860084,2.540756,-0.455240,-0.558114
A5EV13,-0.565193,-0.461003,-0.247423,-0.892777,-0.562745,0.474514,0.460123,0.699957,-1.729306,-0.371923,...,-0.451353,-1.819441,0.079773,0.206323,-0.611991,0.214988,-0.271421,0.492002,0.541757,0.434492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B7MSY5,-0.591338,0.748619,-1.509524,-0.807719,0.436589,1.188093,0.823268,-0.398200,-0.611090,-0.207004,...,0.799393,3.384445,0.479437,0.078235,1.387250,-1.308722,0.590093,2.364542,1.312730,0.180884
Q6D1G2,0.193848,0.205633,0.584557,-0.960804,-1.104567,0.661462,-1.718542,-0.513420,0.261798,0.486313,...,0.064423,4.810921,-1.706972,-0.102495,-0.941660,-0.922620,-2.007272,2.247572,1.968454,-0.573427
Q5DX17,-0.353879,-1.079458,-0.193481,-0.066858,-0.182472,1.215882,0.510298,1.840419,1.065090,-0.590107,...,-0.004379,-0.635328,0.077098,0.123644,0.588070,1.816708,0.062458,4.811610,0.100297,0.281657
A6URT3,-0.111527,-0.000087,-1.765674,2.011913,-0.966421,1.384929,0.655796,0.688660,-1.952361,-1.194839,...,0.824637,1.588302,2.583304,0.525150,-1.032218,-3.185447,-1.837594,4.994675,-2.293851,-0.554146


In [38]:
# Target file for the sequence embeddings.
# NOTE(review): absolute, user-specific path; consider deriving it from `paths`.
save_path = '/home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer/frozen_proteinfer_sequence_embeddings.pkl'
# sequence_embedding_dict = read_pickle(save_path)

# Pickle the object currently held in `sequence_embedding_dict` to save_path.
# Note that it is this object, not the `df` variable, that gets written; the
# `.to_pickle` call requires it to be a pandas object at this point.
sequence_embedding_dict.to_pickle(save_path)

In [6]:
# Build the ProteInfer sequence encoder from the pretrained weights; its
# architecture settings all live in the "embed_sequences_params" config section.
embed_cfg = config["embed_sequences_params"]
sequence_encoder = ProteInfer.from_pretrained(
    weights_path=paths["PROTEINFER_WEIGHTS_PATH"],
    num_labels=embed_cfg["PROTEINFER_NUM_LABELS"],
    input_channels=embed_cfg["INPUT_CHANNELS"],
    output_channels=embed_cfg["OUTPUT_CHANNELS"],
    kernel_size=embed_cfg["KERNEL_SIZE"],
    activation=torch.nn.ReLU,
    dilation_base=embed_cfg["DILATION_BASE"],
    num_resnet_blocks=embed_cfg["NUM_RESNET_BLOCKS"],
    bottleneck_factor=embed_cfg["BOTTLENECK_FACTOR"],
)

# When the sequence encoder is not being trained, obtain all sequence
# embeddings once up front; otherwise the variable stays None.
sequence_embedding_dict = None
train_sequence_encoder = params["TRAIN_SEQUENCE_ENCODER"]
if not train_sequence_encoder:
    embedding_inputs = (
        paths,
        device,
        sequence_encoder,
        datasets,
        params,
        logger,
    )
    sequence_embedding_dict = get_or_generate_sequence_embeddings(
        *embedding_inputs)

2023-10-04 15:03:37 PDT INFO Loaded sequence embeddings from /home/ncorley/protein/ProteinFunctions/data/embeddings/proteinfer/frozen_proteinfer_sequence_embeddings.pkl


In [19]:
# Check the dimensions of the label embedding matrix.
label_embedding_matrix.shape

torch.Size([32102, 1024])

In [20]:
# Display the sequence embeddings.
# NOTE(review): this renders the whole object; showing only a few entries
# would keep the saved notebook output short.
sequence_embedding_dict

{'Q87BZ2': tensor([ 1.0928, -0.0995, -2.6018,  ...,  3.2864,  0.9259,  0.6427]),
 'B2I618': tensor([ 1.0928, -0.0995, -2.6018,  ...,  3.2864,  0.9259,  0.6427]),
 'O00743': tensor([-0.1883, -0.4159, -1.2945,  ..., -0.9814,  0.5722,  0.4089]),
 'Q96CS3': tensor([-0.8475,  1.3617,  0.5347,  ...,  2.5408, -0.4552, -0.5581]),
 'A5EV13': tensor([-0.5652, -0.4610, -0.2474,  ...,  0.4920,  0.5418,  0.4345]),
 'A2SPK8': tensor([-1.2468,  1.7673, -0.0867,  ...,  0.6570, -0.7937,  1.2841]),
 'P92671': tensor([-0.2117, -0.6820, -0.4195,  ...,  3.8961,  0.2996,  0.0754]),
 'A4G0R3': tensor([-0.5658, -1.4708,  1.6804,  ...,  2.7701,  0.4024, -0.9928]),
 'Q9JH45': tensor([ 0.7996,  2.2551,  1.8127,  ...,  6.6851, -0.8541,  0.7263]),
 'Q55BC0': tensor([ 0.5047,  0.7145,  0.4801,  ...,  0.1280, -0.2950,  0.4498]),
 'Q3YVX4': tensor([-0.6308, -0.1415,  0.3238,  ...,  3.5421,  0.8719,  1.2313]),
 'A1XGN9': tensor([-0.4027,  0.0550,  0.4111,  ...,  1.6935, -0.0390,  1.5880]),
 'Q3SLN6': tensor([-0.0379, 

In [26]:
# sequence_embedding_dict = read_pickle(paths["SEQUENCE_EMBEDDING_PATH"])
# Reload the label embeddings straight from paths["LABEL_EMBEDDING_PATH"] with
# torch.load, replacing whatever `label_embedding_matrix` held before.
label_embedding_matrix = torch.load(paths["LABEL_EMBEDDING_PATH"])

In [31]:
from src.utils.data import hash_alphanumeric_sequence_id

# Spot-check the value the helper returns for a single sequence id.
hash_alphanumeric_sequence_id("P68956")

249015376130836471476860981285364416514

Bad pipe message: %s [b'x\x06\xc7\xd1\xcd%@\xd2\xee\x07;O\xcf\xee\xd7\x9e#m \xca\x1c\xf5\xdbF;Ti\xc5\x9b!\x17yO\x01\xe7a\xa2\x80\xef\x87\x85Q\x93\x04\xf1a\xd6\xcdp\xf5\x05\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127', b'.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00']
Bad pipe message: %s [b'\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01']
Bad pipe message: %s [b'\t\x7f\xa0c~P\xbd\x08\xf1q\xc9.\xb5\xc4M\xdcWD h\xb8w\xc2v\\\x0b\x88\xd99\xc3~\\\x8a\xd0\x0e\x02\x1d&_\xbc\x01Bn\xab\t\xc6\x11\xec\xbd\t\xcd\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03

In [29]:
# Check the dimensions of the label embedding matrix again.
label_embedding_matrix.shape

torch.Size([32102, 1024])

In [None]:

# Initialize label tokenizer
label_tokenizer = AutoTokenizer.from_pretrained(
    params['LABEL_ENCODER_CHECKPOINT'])

# Create datasets.
# The previous call passed `paths_list`, which is not defined anywhere in the
# visible notebook and raises NameError on a fresh kernel. The call now uses
# the dataset paths from the config and the same vocabularies / subset
# fractions as the dataset-creation cell near the top of the notebook.
datasets = ProteinDataset.create_multiple_datasets(
    config['dataset_paths_list'],
    label_tokenizer=label_tokenizer,
    vocabularies=vocabularies,
    subset_fractions={
        "train": params["TRAIN_SUBSET_FRACTION"],
        "validation": params["VALIDATION_SUBSET_FRACTION"],
        "test": params["TEST_SUBSET_FRACTION"]}
)

# Initialize new run
logger.info(
    f"################## {timestamp} RUNNING train.py ##################")

# Label sample sizes for the train, validation, and test loaders.
# These are fixed values for this cell; they are NOT read from `params`.
label_sample_sizes = {
    "train": 2000,
    "validation": 100,
    "test": None  # No sampling for the test set
}

# Define data loaders. world_size / rank are passed explicitly with the same
# values as the earlier loader cell; num_workers stays at 0 for this cell.
loaders = create_multiple_loaders(
    datasets,
    params,
    label_sample_sizes=label_sample_sizes,
    num_workers=0,
    world_size=1,
    rank=0,
)