In [None]:
import os

import torch
import pandas as pd

from procyon.data.inference_utils import (
    create_caption_input_simple,
    create_qa_input_simple,
    uniprot_id_to_index,
    ProCyonQAInference,
)
from procyon.model.model_unified import UnifiedProCyon

This notebook contains a minimal example of using a pre-trained ProCyon model for phenotype generation with QA filtering, as shown in Figure 4A of the main manuscript.

# Load pre-trained model

In [3]:
# Replace with the path where you downloaded a pre-trained ProCyon model (e.g. ProCyon-Full).
# A leading `~` is allowed; it is expanded to the current user's home directory below.
checkpoint_path = "~/storage/Lab/PLM/huggingface_repos/ProCyon-Full"
# CKPT_NAME is the expanded checkpoint directory path used by the cells that follow.
CKPT_NAME = os.path.expanduser(checkpoint_path)

In [4]:
# Load the data arguments stored alongside the checkpoint; they are passed to the
# input-construction helpers later in this notebook.
# `weights_only=False` is set explicitly: since PyTorch 2.6, `torch.load` defaults to
# `weights_only=True`, which refuses to unpickle arbitrary Python objects (presumably
# what `data_args.pt` holds -- TODO confirm). Full unpickling can execute arbitrary
# code, so only load checkpoints obtained from a trusted source.
data_args = torch.load(os.path.join(CKPT_NAME, "data_args.pt"), weights_only=False)

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# `from_pretrained` returns a pair; only the first element (the model) is used here.
model, _ = UnifiedProCyon.from_pretrained(checkpoint_dir=CKPT_NAME)
model.to(device)
model.bfloat16()  # Cast the model to bfloat16 (reduced precision) to lower memory use
model.eval()

# Generate model input

In this example, we'll generate phenotypes for the protein AKNAD1, as shown in Figure 4A in the main text. First, we generate the input to the ProCyon model.

The inputs to a ProCyon model are structured as a dict. At a high level, these dicts support large training batches while minimizing
data duplication within the dict. To support this, the dicts have the following structure:
- `data` - The unique elements of ProCyon-Instruct used in this data batch
  - `seq` - Integer IDs of the associated proteins, as generated by `uniprot_id_to_index()`
  - `text` - The text descriptions of phenotypes in this batch
- `input` - The elements within `data` used in each individual input in this batch.
  - `seq` - List of lists, where the `i`'th list is the proteins used in the `i`'th input in the batch.
  - `text` - List of lists, where the `i`'th list is the phenotypes used in the `i`'th input in the batch.
- `target` - Used in training only. The target output for each input in the batch.
- `instructions` - The natural language prompt for each input, which will be populated with the appropriate texts and protein embeddings.

In [6]:
# ProCyon refers to proteins by the integer IDs assigned to UniProt proteins in
# ProCyon-Instruct, so each UniProt accession is translated to its integer ID first.
want_proteins = ["Q5T1N1"]
protein_ids = list(map(uniprot_id_to_index, want_proteins))

In [7]:
# Build the caption-task input dict for the query protein(s).
input_simple = create_caption_input_simple(
    # What to caption, plus the data arguments loaded from the checkpoint directory.
    input_aaseq_ids=protein_ids,
    data_args=data_args,
    aaseq_type="protein",
    task_type="caption",
    # `instruction_source_dataset` and `instruction_source_relation` select the style of
    # pre-templated instruction used for the query; this combination asks for
    # UniProt-style functional descriptions.
    instruction_source_dataset="uniprot",
    instruction_source_relation="all",
    # Number of in-context examples requested for the instruction.
    icl_example_number=1,
    device=device,
)

In [8]:
# Display the assembled model input dict for inspection.
input_simple

{'data': {'seq': tensor([  5, 530], device='cuda:0'),
  'seq_idx': tensor([  5, 530], device='cuda:0'),
  'text': ['Adapter protein implicated in the regulation of a large spectrum of both general and specialized signaling pathways. Binds to a large number of partners, usually by recognition of a phosphoserine or phosphothreonine motif. Binding generally results in the modulation of the activity of the binding partner. Negatively regulates the kinase activity of PDPK1.'],
  'drug': None},
 'input': {'seq': [[0, 1]], 'text': [[0]], 'drug': None},
 'target': {'seq': None, 'text': None, 'drug': None},
 'instructions': ["Definition: You will be shown a protein. Your job is to describe function(s) of a protein, and this protein should be involved in the function. This response provides any useful information about the protein, mostly biological knowledge. When a protein is a precursor of different chains/peptides having different functions, specify the role of each peptide in separate parag

In [9]:
# Show the instruction text for the first (and only) input in the batch.
# The special `<|protein|>` tokens are placeholders that will be replaced with the
# corresponding protein's embedding.
first_instruction = input_simple["instructions"][0]
print(first_instruction)

Definition: You will be shown a protein. Your job is to describe function(s) of a protein, and this protein should be involved in the function. This response provides any useful information about the protein, mostly biological knowledge. When a protein is a precursor of different chains/peptides having different functions, specify the role of each peptide in separate paragraphs. The same holds true for isoform-specific functions. Multifunctional enzymes should have only one 'Function' section for the different activities. When a protein displays some very different or contradictory functions depending on the context, describe them in different sentences. This case is rare.
Positive example 1:
Protein: <|protein|>
Output: [ANSWER] [EXT]
Now, complete the following instance:
Protein: <|protein|>
Output: [ANSWER] 


# Generate phenotypes

Now we'll use the input to generate phenotypes using the pre-trained model. We'll use diverse beam search (DBS) as the text generation method, as this allows generating multiple, distinct phenotypes for a single input. Note that we also support greedy decoding, temperature sampling, and nucleus sampling, but we find that diverse beam search produces the highest quality outputs.

In [10]:
# Decoding configuration for diverse beam search (DBS).
text_gen_args = {
    "method": "beam",
    # Maximum length of generated text.
    "max_len": 200,
    # Total number of beams maintained per input. `beam_size` / `beam_group_size` is the
    # number of beam groups, i.e. the number of phenotypes kept per input once one
    # output is taken from each group.
    "beam_size": 20,
    # Size of the individual beam groups in DBS.
    "beam_group_size": 2,
    # Penalty applied to repetition within a beam group.
    # NOTE(review): in standard DBS the diversity penalty discourages overlap *between*
    # groups -- confirm which behaviour this argument controls.
    "diversity_penalty": 0.8,
}

# Generation is inference-only, so disable autograd tracking to avoid building a
# computation graph, as is done for the QA forward pass later in this notebook.
with torch.no_grad():
    out_tokens, log_probs, output_logits, out_text = model.generate(
        inputs=input_simple,
        aaseq_type="protein",
        **text_gen_args
    )

In [11]:
# Display the raw generated texts: one inner list per input, holding every beam's output.
out_text

[['Subunit of the V1 complex of vacuolar(H+)-ATPase (V-ATPase) which is required for the transport of proteins into the endoplasmic reticulum (ER) to the outer membrane.',
  'Subunit of the V1 complex of vacuolar(H+)-ATPase (V-ATPase) which is required for the transport of proteins into the endoplasmic reticulum (ER) to the outer membrane.',
  'May be involved in transcriptional regulation',
  'May be involved in transcriptional regulation',
  'Component of the cytosolic machinery in which it is required for the formation of the actin cytoskeleton. May play a role in dendrite formation. May play a role in dendrite formation.',
  'Component of the cytosolic machinery in which it is required for the formation of the actin cytoskeleton. May play a role in dendrite formation. May play a role in dendrite formation.',
  'Plays a role in spermatogenesis. May play a role in spermatogenesis.',
  'Plays a role in spermatogenesis. May play a role in spermatogenesis.',
  'Putative regulator of cAM

In [12]:
# Outputs within a DBS group tend to be near-duplicates, so the final set of generated
# phenotypes keeps only the first output of each group: every `beam_group_size`-th
# entry of the first (and only) input's outputs, starting at index 0.
group_size = text_gen_args["beam_group_size"]
output_phenotypes = out_text[0][::group_size]
output_phenotypes

['Subunit of the V1 complex of vacuolar(H+)-ATPase (V-ATPase) which is required for the transport of proteins into the endoplasmic reticulum (ER) to the outer membrane.',
 'May be involved in transcriptional regulation',
 'Component of the cytosolic machinery in which it is required for the formation of the actin cytoskeleton. May play a role in dendrite formation. May play a role in dendrite formation.',
 'Plays a role in spermatogenesis. May play a role in spermatogenesis.',
 'Putative regulator of cAMP-dependent protein kinase activity.',
 'Functions as an adapter protein that is implicated in the regulation of cell cycle progression.',
 'Transcription factor that binds to and repress transcription of target genes.',
 'Guanine nucleotide exchange factor (GEF) for RAB9A, RAB9B and RAB9B, RAB9B, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9B, RAB9B, RAB9B, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB9, RAB

# QA filtering of generated phenotypes

Next, we'll leverage ProCyon's question-answering capabilities to further filter down the generated phenotypes to a set of high-confidence outputs.

In [13]:
# Wrap the pre-trained model with a helper class specifically for QA.
# The wrapper is called on a QA input dict below and exposes the `yes_token` /
# `no_token` indices used to read the answer scores from its output.
qa_model = ProCyonQAInference(model, device=device)

In [15]:
# Score every generated phenotype with the QA model and record its "yes" probability.
qa_records = []
for i, query_text in enumerate(output_phenotypes):
    # Build a QA prompt pairing the query protein(s) with this candidate description.
    input_qa_simple = create_qa_input_simple(
        input_aaseq_ids=protein_ids,
        data_args=data_args,
        input_description=query_text,
        instruction_source_dataset="uniprot",
        instruction_source_relation="all",
        aaseq_type="protein",
        icl_example_number=1,
        device=device,
    )

    # Inference only, so autograd tracking is switched off for the forward pass.
    with torch.no_grad():
        model_qa_out = qa_model(input_qa_simple)

    # Index 0 presumably selects the single input in this batch; the yes/no token
    # columns hold the scores reported as probabilities below.
    qa_pred = model_qa_out["pred"]
    yes_prob = qa_pred[0, qa_model.yes_token].item()
    no_prob = qa_pred[0, qa_model.no_token].item()

    print(
        f"TEXT {i} --------------------------------------------",
        query_text,
        f"Yes: {yes_prob:0.3f}",
        f"No: {no_prob:0.3f}",
        sep="\n",
    )

    qa_records.append({"phenotype": query_text, "yes_prob": yes_prob})

# One row per phenotype, with columns `phenotype` and `yes_prob`.
results = pd.DataFrame(qa_records)

TEXT 0 --------------------------------------------
Subunit of the V1 complex of vacuolar(H+)-ATPase (V-ATPase) which is required for the transport of proteins into the endoplasmic reticulum (ER) to the outer membrane.
Yes: 0.173
No: 0.827
TEXT 1 --------------------------------------------
May be involved in transcriptional regulation
Yes: 0.015
No: 0.985
TEXT 2 --------------------------------------------
Component of the cytosolic machinery in which it is required for the formation of the actin cytoskeleton. May play a role in dendrite formation. May play a role in dendrite formation.
Yes: 0.860
No: 0.140
TEXT 3 --------------------------------------------
Plays a role in spermatogenesis. May play a role in spermatogenesis.
Yes: 0.867
No: 0.133
TEXT 4 --------------------------------------------
Putative regulator of cAMP-dependent protein kinase activity.
Yes: 0.827
No: 0.173
TEXT 5 --------------------------------------------
Functions as an adapter protein that is implicated in t

In [16]:
# Keep only the phenotypes whose "yes" probability meets the confidence threshold.
YES_PROB_THRESHOLD = 0.8
results.loc[results["yes_prob"] >= YES_PROB_THRESHOLD]

Unnamed: 0,phenotype,yes_prob
2,Component of the cytosolic machinery in which ...,0.859625
3,Plays a role in spermatogenesis. May play a ro...,0.866983
4,Putative regulator of cAMP-dependent protein k...,0.826651
