In [6]:
import pandas as pd
import numpy as np
import torch, os
from Bio.PDB import PDBParser
import warnings
from tqdm import tqdm, trange
from time import sleep
from pathlib import Path
import wandb

# Silence every Python warning for the rest of the session. This also hides
# deprecation and dtype warnings raised by later cells.
warnings.filterwarnings("ignore")
# Install the third-party dependencies first, e.g.: pip3 install biopython torch

In [8]:
# Exploratory check: display what `wandb.sdk.wandb_run.Run` resolves to in the
# installed wandb package.
wandb.sdk.wandb_run.Run

<property at 0x107aeaea0>

In [5]:
from asep.asep.utils.utils import time_stamp

# Preview a checkpoint directory path of the form /tmp/ckpts/<time_stamp()>.
# Only the path object is built here; nothing is created on disk.
Path("/tmp/ckpts") / time_stamp()

PosixPath('/tmp/ckpts/20241230-124311')

In [21]:
# Root of the local project checkout; every path below is derived from it.
proj_dir = "/Users/mansoor/Documents/GSU/Projects/Antibody-Design/epitope-prediction/"
dataset_dir = os.path.join(proj_dir, "data/")

# AsEP dataset layout: interim graph files (.pt), structure files (.pdb), and
# the AbDb ID list.
asep_data_dir = os.path.join(dataset_dir, "asep/")
graphs_asep_path = os.path.join(asep_data_dir, "asepv1_interim_graphs/")
structures_asep_path = os.path.join(asep_data_dir, "structures/")
ids_asep = os.path.join(asep_data_dir, "asepv1-AbDb-IDs.txt")

# IEDB CSV exports (epitope and antigen tables).
iedb_epi_file_path = os.path.join(dataset_dir, "iedb/epitope_full_v3.csv")
iedb_ag_file_path = os.path.join(dataset_dir, "iedb/antigen_full_v3.csv")


In [None]:
from asep.data.asepv1_dataset import AsEPv1Dataset, EmbeddingConfig

# Usage example for the AsEP dataset class. The `root` values below are
# placeholders and must be replaced with a real path before this cell can run.

# Option 1: one-hot encoded node features
config = EmbeddingConfig(node_feat_type="one-hot")
asepv1_dataset = AsEPv1Dataset(
    root="/path/to/asep/download/folder",  # replace with the path to the parent folder of the downloaded AsEP data
    name="AsEP",
    embedding_config=config,
)

# Option 2: pre-calculated embeddings with AntiBERTy (via igfold) and ESM2.
# NOTE: this rebinds `config` and `asepv1_dataset`, so only this second
# configuration is used by the statements that follow.
config = EmbeddingConfig(
    node_feat_type='pre_cal',
    ab={"embedding_model": "igfold"},  # change this to "esm2" for ESM2 embeddings
    ag={"embedding_model": "esm2"},
)
asepv1_dataset = AsEPv1Dataset(
    root="/path/to/asep/download/folder",   # replace with the path to the parent folder of the downloaded AsEP data
    name="AsEP",
    embedding_config=config,
)

# Get the i-th graph pair and its node labels
i = 0
graph_pair = asepv1_dataset[i]
node_labels_b = graph_pair.y_b  # antibody graph node labels (1 => interface nodes)
node_labels_g = graph_pair.y_g  # antigen graph node labels (1 => interface nodes)

# Bipartite graph edges
edge_index_bg = graph_pair.edge_index_bg  # bipartite edge indices between the antibody and antigen graphs, shape (2, E)
# The first entry holds antibody node indices and the second holds antigen node indices.
# NOTE(review): the original note called these "columns"; with shape (2, E) they would be rows — confirm.

In [22]:
# Read the interim graph file for complex 1a14_0P and list its top-level keys.
# NOTE(review): torch.load unpickles the file, so only open .pt files that
# come from a trusted source.
data = torch.load(os.path.join(graphs_asep_path, "1a14_0P.pt"))

print(data.keys())

dict_keys(['abdbid', 'seqres', 'mapping', 'embedding', 'edges', 'stats', 'Nb', 'Ng'])


In [19]:
# Summarize the loaded record: complex ID, the "stats" entry, and the residue
# counts stored under "Nb" (antibody) and "Ng" (antigen).
print(
    data["abdbid"],
    data["stats"],
    "\nNum of Ab residues: ",
    data["Nb"],
    "\nNum of Ag residues: ",
    data["Ng"],
)

1a14_0P {'cdr': 60, 'surf': 322, 'epitope': 20, 'epitope2surf_ratio': 0.062112} 
Num of Ab residues:  60 
Num of Ag residues:  322


In [11]:
# Show the sequences stored under "seqres". In the recorded output, "ab" maps
# chains 'H' and 'L' to their sequences and "ag" maps chain 'N' to its sequence.
data["seqres"]

{'ab': OrderedDict([('H',
               'QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLEWIGIFYPGNGDTSYNQKFKDKATLTADKSSNTAYMQLSSLTSEDSAVYYCARSGGSYRYDGGFDYWGQGTTVTV'),
              ('L',
               'DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKLLIYYTSNLHSEVPSRFSGSGSGTDYSLTISNLEQEDIATYFCQQDFTLPFTFGGGTAA')]),
 'ag': {'N': 'RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPDECRFYALSQGTTIRGKHSNGTIHDRSQYRALISWPLSSPPTVYNSRVECIGWSSTSCHDGKTRMSICISGPNNNASAVIWYNRRPVTEINTWARNILRTQESECVCHNGVCPVVFTDGSATGPAETRIYYFKEGKILKWEPLAGTAKHIEECSCYGERAEITCTCRDNWQGSNRPVIRIDPVAMTHTSQYICSPVLTDNPRPNDPTVGKCNDPYPGNNNNGVKGFSYLDGVNTWLGRTISIASRSGYEMLKVPNALTDDKSKPTQGQTIVLNTDWSGYSGSFMDYWAEGECYRACFYVELIRGRPKEDKVWWTSNSIVSMCSSTEFLGQWDWPDGAKIEYFL'}}

In [14]:
# Show the "edges" entry. In the recorded output, the "ab" value is a tensor
# printed with an `indices=` field (a sparse layout, presumably COO — confirm).
data["edges"]

{'ab': tensor(indices=tensor([[ 0,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,  3,
                          3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,
                          5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,
                          6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,
                          7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,
                          9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10,
                         10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
                         11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
                         12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
                         14, 14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
                         16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18,
                         18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
                      

In [13]:
# Show the "mapping" entry. The recorded output has 0/1 arrays under
# "ab" -> "seqres2cdr" and "ag" -> "seqres2surf"; presumably these are
# per-residue masks for CDR and surface residues — confirm.
data["mapping"]

{'ab': {'seqres2cdr': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0])},
 'ag': {'seqres2surf': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0,

In [None]:

# Parse the PDB file of complex 1a14_0P into a Biopython structure object,
# then print it as a quick check that parsing succeeded.
parser = PDBParser()
structure = parser.get_structure(
    "1a14_0P",
    os.path.join(structures_asep_path, "1a14_0P.pdb"),
)

print(structure)

<Structure id=1a14_0P>


In [None]:
import nglview as nv

# Visualize the structure: build an nglview widget from the Biopython
# `structure` object. Leaving `view` as the cell's last expression makes the
# notebook render the widget.
view = nv.show_biopython(structure)
view

NGLWidget()

In [7]:
# Load the IEDB epitope export and preview its first rows.
# NOTE(review): in the recorded preview, row 0 holds field labels ("IEDB IRI",
# "Object Type", "Name", ...) rather than data, and the first column is
# "Unnamed: 0". The CSV appears to have a two-row header plus a saved index.
# Consider header=[0, 1] / index_col=0 after checking how later cells use the
# current column names.
epitope = pd.read_csv(iedb_epi_file_path)
epitope.head()

  epitope = pd.read_csv(iedb_epi_file_path)


Unnamed: 0,Epitope ID,Epitope,Epitope.1,Epitope.2,Epitope.3,Epitope.4,Epitope.5,Epitope.6,Epitope.7,Epitope.8,...,Related Object.5,Related Object.6,Related Object.7,Related Object.8,Related Object.9,Related Object.10,Related Object.11,Related Object.12,Related Object.13,Related Object.14
0,IEDB IRI,Object Type,Name,Modified Residue(s),Modifications,Starting Position,Ending Position,IRI,Synonyms,Source Molecule,...,IRI,Synonyms,Source Molecule,Source Molecule IRI,Molecule Parent,Molecule Parent IRI,Source Organism,Source Organism IRI,Species,Species IRI
1,http://www.iedb.org/epitope/1,Linear peptide,"AA + MCM(A1,A2)","A1,A2",Main chain modification,200,201,,,"streptokinase, SKase",...,,,,,,,,,,
2,http://www.iedb.org/epitope/2,Linear peptide,AAAAAAAAAAAAA,,,489,501,,,RNA-binding protein 47,...,,,,,,,,,,
3,http://www.iedb.org/epitope/3,Linear peptide,AAAAAAAAAAAANANIAAAA,,,,,,,,...,,lpqH,Lipoprotein lpqH precursor,http://www.ncbi.nlm.nih.gov/protein/P0A5J0.1,Lipoprotein LpqH,http://www.uniprot.org/uniprot/P9WK61,Mycobacterium tuberculosis,http://purl.obolibrary.org/obo/NCBITaxon_1773,Mycobacterium tuberculosis,http://purl.obolibrary.org/obo/NCBITaxon_1773
4,http://www.iedb.org/epitope/4,Linear peptide,AAAAAAAAAAAGNVNIAAAA,,,,,,,,...,,lpqH,Lipoprotein lpqH precursor,http://www.ncbi.nlm.nih.gov/protein/P0A5J0.1,Lipoprotein LpqH,http://www.uniprot.org/uniprot/P9WK61,Mycobacterium tuberculosis,http://purl.obolibrary.org/obo/NCBITaxon_1773,Mycobacterium tuberculosis,http://purl.obolibrary.org/obo/NCBITaxon_1773


In [13]:
# (rows, columns) of the epitope table as loaded.
# NOTE(review): the row count presumably includes the label row shown at
# index 0 in the preview — confirm.
epitope.shape

(2236698, 32)

In [18]:
# Load the IEDB antigen export and preview its first rows.
# NOTE(review): in the recorded preview, row 0 holds field labels
# ("Antigen Name", "Antigen IRI", "Organism Name", ...) rather than data, and
# the first column is "Unnamed: 0". The CSV appears to have a two-row header
# plus a saved index — confirm before relying on column names or dtypes.
antigens = pd.read_csv(iedb_ag_file_path)
antigens.head()

Unnamed: 0,Antigen,Antigen.1,Antigen.2,Antigen.3,Antigen.4,Antigen.5,Antigen.6
0,Antigen Name,Antigen IRI,Organism Name,Organism IRI,# Epitopes,# Assays,# References
1,Isoform 2 of Glutamine-rich protein 2 (UniProt...,http://www.uniprot.org/uniprot/Q9H0J4-2,Homo sapiens (human),http://purl.obolibrary.org/obo/NCBITaxon_9606,26,45,7
2,Exported protein (UniProt:Q0WFC5),http://www.uniprot.org/uniprot/Q0WFC5,Yersinia pestis,http://purl.obolibrary.org/obo/NCBITaxon_632,1,1,1
3,Isoform 3 of Glutamine-rich protein 2 (UniProt...,http://www.uniprot.org/uniprot/Q9H0J4-3,Homo sapiens (human),http://purl.obolibrary.org/obo/NCBITaxon_9606,4,6,2
4,18 kDa oncosphere antigen (UniProt:E3VWN2),http://www.uniprot.org/uniprot/E3VWN2,Taenia saginata (beef tapeworm),http://purl.obolibrary.org/obo/NCBITaxon_6206,1,3,1
