In [88]:
import pandas as pd
import requests
import json

In [None]:
def get_uniprot_from_pdb_entity(pdb_id, entity_id = 1):
    """
    Fetches the UniProt accession and name for a given PDB ID and entity ID using the PDBe API.
    Where multiple mappings exist, the mapping with the highest coverage is returned.
    """
    url = f"https://www.ebi.ac.uk/pdbe/api/v2/pdb/entry/uniprot_mapping/{pdb_id}/{entity_id}"
    response = requests.get(url)
    response.raise_for_status()
    if response.status_code == 200:
        data = response.json()
        if pdb_id in data:
            mapping = [{"accession": item['accession'], "name": item['name'], "coverage": item['residues'][0]['endIndex'] - item['residues'][0]['startIndex']} for item in data[pdb_id]['data']]
            max_mapping = max(mapping, key=lambda x: x['coverage'])
            if max_mapping:
                accession = max_mapping['accession']
                name = max_mapping['name']
                return accession, name
        raise ValueError(f"Unexpected response structure: {data}")
    # return None

def get_ec_from_uniprot(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}?fields=ec"
    headers = {"Accept": "application/json"}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    if response.status_code == 200:
        data = response.json()
        protein_description = data.get('proteinDescription', {}).get('recommendedName', {})
        if 'ecNumbers' in protein_description:
            ec_values = [ec['value'] for ec in protein_description['ecNumbers']]
            return ec_values
        else:
            return None
    raise ValueError(f"Unexpected response structure: {data}")

def get_alphafold_structure(uniprot_id):
    #TODO: Return none on 404
    """
    Returns the URL of the AlphaFold structure for a given UniProt ID, or None if not available.
    """
    url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}"
    response = requests.get(url)
    response.raise_for_status()
    if response.status_code == 200:
        data = response.json()
        for item in data:
            if item.get("uniprotAccession", None) == uniprot_id:
                cif_url = item.get("cifUrl", None)
                if cif_url:
                    return cif_url
    elif response.status_code == 404:
        return None
    raise ValueError(f"Unexpected response structure: {data}")

In [None]:
#First write a function to get the uniprot ID from kahraman dataset (assume that no _1 suffix means entity ID 1)

kahraman_table_1 = pd.read_csv("kahraman_dataset_table1_updated.tsv", sep = "\t")

kahraman_table_1[["uniprot_accession", "uniprot_name"]] = kahraman_table_1.apply(lambda row: get_uniprot_from_pdb_entity(row["updated_pdb_id"], row["entity_id"]), axis=1, result_type="expand")

kahraman_table_1["uniprot_ec"] = kahraman_table_1["uniprot_accession"].apply(lambda x: get_ec_from_uniprot(x) if pd.notna(x) else None)

In [221]:
kahraman_table_1["af_cif_url"] = kahraman_table_1["uniprot_accession"].apply(lambda x: get_alphafold_structure(x) if pd.notna(x) else None)

HTTPError: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/P00969

In [None]:
kahraman_table_1

Unnamed: 0,No,Ligand set,pdb_id,updated_pdb_id,entity_id,Chain Id,Protein,EC code,CATH code,Ligand,Ligand chain Id,Ligand residue number,Ligand altern loc,UniProt_manual,Notes,Unnamed: 15,uniprot_accession,uniprot_name,uniprot_ec
0,1,AMP,12as,12as,1,A,Asparagine synthetase,6.3.1.1,3.30.930.10,AMP,X,2,-,P00963,,,P00963,ASNA_ECOLI,[6.3.1.1]
1,2,AMP,1amu,1amu,1,A,Gramicidin synthetase,5.1.1.11,2.30.38.10;3.40.50.980,AMP,A,551,-,P0C061,,,P0C061,GRSA_ANEMI,
2,3,AMP,1c0a,1c0a,2,A,Aspartyl t-RNA synthetase,6.1.1.12,3.30.1360.30,AMP,E,800,-,P21889,,,P21889,SYD_ECOLI,[6.1.1.12]
3,4,AMP,1ct9,1ct9,1,A,Asparagine synthetase,6.3.5.4,3.40.50.620,AMP,A,1100,-,P22106,,,P22106,ASNB_ECOLI,[6.3.5.4]
4,5,AMP,1jp4,1jp4,1,A,Bisphosphate nucleotidase,3.1.3.7,3.40.190.80,AMP,B,601,-,Q9Z1N4,,,Q9Z1N4,BPNT1_RAT,[3.1.3.7]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,Steroid,1e3r,1e3r,1,B,Isomerase,5.3.3.1,3.10.450.50,AND,B,801,-,P07445,,,P07445,SDIS_PSEPU,[5.3.3.1]
96,97,Steroid,1fds,1fds,1,A,Hydroxysteroid-dehydrogenase,1.1.1.62,3.40.50.720,EST,A,350,-,P14061,,,P14061,DHB1_HUMAN,[1.1.1.51]
97,98,Steroid,1j99,1j99,1,A,Alcohol sulfotransferase,2.8.2.2,3.40.50.300,AND,B,401,A,Q06520,,,Q06520,ST2A1_HUMAN,[2.8.2.2]
98,99,Steroid,1lhu,1lhu,1,A,Sex hormone-binding globulin,?.?.?.?,2.60.120.200,EST,G,301,-,P04278,,,P04278,SHBG_HUMAN,


In [118]:
accession, name

('P00963', 'ASNA_ECOLI')

In [85]:
kahraman_table_1

Unnamed: 0,No,Ligand set,PQS Id,Chain Id,Protein,EC code,CATH code,Ligand,Ligand chain Id,Ligand residue number,Ligand altern loc,UniProt_manual,Notes,Unnamed: 13
0,1,AMP,12as,A,Asparagine synthetase,6.3.1.1,3.30.930.10,AMP,X,2,-,P00963,,
1,2,AMP,1amu_1,A,Gramicidin synthetase,5.1.1.11,2.30.38.10;3.40.50.980,AMP,A,551,-,P0C061,,
2,3,AMP,1c0a,A,Aspartyl t-RNA synthetase,6.1.1.12,3.30.1360.30,AMP,E,800,-,P21889,,
3,4,AMP,1ct9_1,A,Asparagine synthetase,6.3.5.4,3.40.50.620,AMP,A,1100,-,P22106,,
4,5,AMP,1jp4,A,Bisphosphate nucleotidase,3.1.3.7,3.40.190.80,AMP,B,601,-,Q9Z1N4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,Steroid,1e3r,B,Isomerase,5.3.3.1,3.10.450.50,AND,B,801,-,P07445,,
96,97,Steroid,1fds,A,Hydroxysteroid-dehydrogenase,1.1.1.62,3.40.50.720,EST,A,350,-,P14061,,
97,98,Steroid,1j99,A,Alcohol sulfotransferase,2.8.2.2,3.40.50.300,AND,B,401,A,Q06520,,
98,99,Steroid,1lhu,A,Sex hormone-binding globulin,?.?.?.?,2.60.120.200,EST,G,301,-,P04278,,


In [74]:
input_uniprots = pd.read_csv("kahraman_uniprot_ids.tsv", sep = "\t", header = None)
input_uniprots["AF"] = "AF-" + input_uniprots[0] + "-F1-model_4"
input_uniprots["AF"].nunique()

103

In [75]:
input_uniprots.loc[input_uniprots["AF"].isin(all_structures["accession"]) == False]

Unnamed: 0,0,AF
9,P00969,AF-P00969-F1-model_4
74,P00588,AF-P00588-F1-model_4


These two structures are a bacterial protein and diptheria protein, both with no alphafold DB structures, hence they are missing from the outputs of the analysis.

In [76]:
total_input_proteins = input_uniprots["AF"].nunique() - 2

In [77]:
print(f"Total number of input proteins: {total_input_proteins}")

Total number of input proteins: 101


In [78]:
all_structures = pd.read_csv("kahraman_alphacognate_cathaf/combined_structure_summaries.tsv.gz", sep="\t")

no_transplants = all_structures.loc[all_structures.num_transplants == 0]
print(f"Total number of structures with no transplants: {no_transplants.accession.nunique()}")

success = all_structures.loc[all_structures.num_transplants > 0]
print(f"Total number of structures with transplants (any): {success.accession.nunique()}")


Total number of structures with no transplants: 60
Total number of structures with transplants (any): 41


So first of all, we were only able to transplant any ligands to 40% of the dataset. Why is that? The ProCogGraph dataset of structures/ligands is limited to enzyme structures in the PDB, and many of the proteins in the Kahraman dataset are non-enzymes.

In [None]:
all_transplants = pd.read_csv("kahraman_alphacognate_cathaf/combined_transplants.tsv.gz", sep="\t")

In [40]:
all_transplants.accession.nunique() #.loc[all_transplants.top_ranked == True]

40

In [20]:
all_transplants.loc[(all_transplants.accession == "AF-P00963-F1-model_4") & (all_transplants.top_ranked == True)]

Unnamed: 0,accession,transplant_structure,foldseek_rmsd,global_rmsd,local_rmsd,ligand,ligand_het_code,ligand_name,ligand_chain,ligand_residues,...,cognate_mapping_smiles,cognate_mapping_xref,cluster,cluster_center,Score,Type,nrgrank_runtime,top_ranked,transplanted_chain_id,nrgrank_tcs
4116,AF-P00963-F1-model_4,6chd_bio-h_A,6.868,6.867851,1.334752,6chd_bm1_C,KAA,5'-O-[(L-LYSYLAMINO)SULFONYL]ADENOSINE,T,601,...,Cc1ccc(C(=O)OP(=O)(O)OCC2OC(n3cnc4c(N)ncnc43)C...,Pubchem:102515309|KEGG:C21460|CHEBI:91232,2.0,"-0.8359554015638798,1.3352133429718456,-3.0370...",-1921856,ligand,191.055988,1,CH,0.871002
4136,AF-P00963-F1-model_4,6ilh_bio-h_A,6.764,6.764052,2.901763,6ilh_bm1_C,KAA,5'-O-[(L-LYSYLAMINO)SULFONYL]ADENOSINE,V,601,...,Cc1ccc(C(=O)OP(=O)(O)OCC2OC(n3cnc4c(N)ncnc43)C...,Pubchem:102515309|KEGG:C21460|CHEBI:91232,2.0,"-0.8359554015638798,1.3352133429718456,-3.0370...",-1921856,ligand,191.055988,1,CH,0.871002
