In [1]:
import asyncio
import aiohttp
import requests 
import json
import pandas as pd
import warnings
import time
from bioster_search_23_06_24 import *

warnings.filterwarnings("ignore")

### Replace with the proper API URL and parameters for your desired search
API_URL = "https://search.rcsb.org/rcsbsearch/v2/query"
### add adress from your local repository
PDB_ligands_file = r"C:\Users\moise\Desktop\projects\lab44\PDB_parser\PDB_parser\sources_files\Components-pub.sdf"
bioisosteres_file = r"C:\Users\moise\Desktop\projects\lab44\PDB_parser\PDB_parser\created_files\F_H.sdf"

In [2]:
bioisosters = search_bioisosters_morgan(PDB_ligands_file, bioisosteres_file, deep=10)
#test_ligands = ["ALA", "G6C", "L0I"]

[19:46:03] Explicit valence for atom # 15 N, 4, is greater than permitted
[19:46:03] ERROR: Could not sanitize molecule ending on line 30363
[19:46:03] ERROR: Explicit valence for atom # 15 N, 4, is greater than permitted
[19:46:05] Explicit valence for atom # 15 C, 5, is greater than permitted
[19:46:05] ERROR: Could not sanitize molecule ending on line 128301
[19:46:05] ERROR: Explicit valence for atom # 15 C, 5, is greater than permitted
[19:46:05] Explicit valence for atom # 7 O, 3, is greater than permitted
[19:46:05] ERROR: Could not sanitize molecule ending on line 130723
[19:46:05] ERROR: Explicit valence for atom # 7 O, 3, is greater than permitted
[19:46:05] Explicit valence for atom # 17 N, 4, is greater than permitted
[19:46:05] ERROR: Could not sanitize molecule ending on line 138835
[19:46:05] ERROR: Explicit valence for atom # 17 N, 4, is greater than permitted
[19:46:05] Explicit valence for atom # 27 O, 3, is greater than permitted
[19:46:05] ERROR: Could not sanitize 

In [3]:
ligands = [] # ligand's list
for cont in bioisosters:
    for lig in cont:
        if lig not in ligands:
            ligands.append(lig)
            
#test_ligands = ligands[:50]

In [13]:
file = open("Bioisosters.txt", "a")
for i in bioisosters:
    h = i[0] + "    " + i[1] + "\n"
    file.write(h)
file.close()

In [5]:
async def search_protein_id(session, url, semaphore, ligand_name):
    
    search_payload = {
  "query": {
    "type": "group",
    "logical_operator": "and",
    "nodes": [
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "rcsb_nonpolymer_entity_container_identifiers.nonpolymer_comp_id",
          "operator": "exact_match",
          "negation": False,
          "value": ligand_name
        }
      },
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "rcsb_nonpolymer_entity_container_identifiers.nonpolymer_comp_id",
          "operator": "exact_match",
          "negation": True,
          "value": "HEM"
        }
      }
    ],
    "label": "text"
  },
  "return_type": "entry",
  "request_options": {
    "paginate": {
      "start": 0,
      "rows": 10000
    },
    "results_content_type": [
      "experimental"
    ],
    "sort": [
      {
        "sort_by": "score",
        "direction": "desc"
      }
    ],
    "scoring_strategy": "combined"
  }
}
    
    async with semaphore:  # Guard block with semaphore to limit concurrent requests
        async with session.post(url, json=search_payload) as response:
            #print(response.status)
            
            data = await response.json()
            res = [ligand_name, []]
            try:
                for i in data["result_set"]:
                    name = i['identifier']
                    score = i['score']
                    #if score == 1:
                    res[1].append(name)
                return res
            except:
                return res


async def main_search_protein_id(ligand_cont):
    
    ''' This function takes as input an array with the id
    of the ligands, and returns a dictionary with complements
    that contain the specified lignades '''
    
    semaphore = asyncio.Semaphore(10)  
    async with aiohttp.ClientSession() as session:
        tasks = [search_protein_id(session, API_URL, semaphore, ligand_name) for ligand_name in ligand_cont]  
        results = await asyncio.gather(*tasks)
        
        tmp = {}
        for i in results:
            lig = i[0]
            prot_cont = i[1]
            tmp[lig] = prot_cont
            
        
        return tmp
        

In [6]:
async def search_protein_info(session, url, semaphore, protein_id):
    
    url = "https://data.rcsb.org/graphql"
    
    query = """
    query structure ($id: String!) {
      entry(entry_id:$id){
        rcsb_id
        entry {
          id
        }
        pubmed {
        rcsb_pubmed_container_identifiers {
          pubmed_id
          }
        }
        struct_keywords {
          pdbx_keywords
        }
        rcsb_primary_citation {
        pdbx_database_id_DOI
        }
        refine {
          ls_d_res_high
        }
        exptl {
          method
        }
        exptl_crystal_grow{
            pH
        } 
        rcsb_accession_info {
          deposit_date
          initial_release_date
        }
        refine {
          pdbx_refine_id
        }
        polymer_entities {
          
          rcsb_polymer_entity_container_identifiers {
            uniprot_ids
          }
        }
        rcsb_binding_affinity {
          comp_id
          type
          value
          unit
        }
      }
    }
    """

    # Define the variables for our query
    variables = {
      "id": protein_id
    }

    
    async with semaphore: 
        async with session.post(url, json={'query': query, 'variables': variables}) as response:
            #print(response.status)
            
            data = await response.json()
            
            return data


async def main_search_protein_info(protein_cont):
    
    ''' This function takes as input an array with the id
    of the ligands, and returns a dictionary with complements
    that contain the specified lignades '''
    
    semaphore = asyncio.Semaphore(10)  
    async with aiohttp.ClientSession() as session:
        tasks = [search_protein_info(session, API_URL, semaphore, protein_name) for protein_name in protein_cont]  
        results = await asyncio.gather(*tasks)
        
        
        # after receiving a response from the server, it is necessary 
        # to present the information in a convenient form, for this we will use a dictionary  
        
        RES = {}
        for i in results:

            tmp = {}

            protein_id = i["data"]["entry"]["rcsb_id"] # extract protein's ID
            tmp["prot_id"] = protein_id # add ID to dicitonary

            try: 
                tmp["pubmed"] = i["data"]["entry"]["pubmed"]["rcsb_pubmed_container_identifiers"]["pubmed_id"]
            except: tmp["pubmed"]= None
                
            try:    
                 tmp["doi"] = i["data"]["entry"]["rcsb_primary_citation"]["pdbx_database_id_DOI"]
            except: tmp["doi"] = None
                
            try:
                tmp["pH"] = i["data"]["entry"]["exptl_crystal_grow"][0]["pH"]
            except: tmp["pH"] = None
                
            try:
                tmp["rentgen"] = i["data"]["entry"]["refine"][0]["ls_d_res_high"]
            except:
                tmp["rentgen"] = None
                
            tmp["method"] = i["data"]["entry"]["exptl"][0]["method"]
            tmp["chtype"] =i["data"]["entry"]["struct_keywords"]["pdbx_keywords"]
    
            tmp["structure"] = []
            try:
                for j in i["data"]["entry"]["polymer_entities"]:
                    tmp["structure"].append(j["rcsb_polymer_entity_container_identifiers"]["uniprot_ids"][0])
            except: None
            
            afinity = i["data"]["entry"]["rcsb_binding_affinity"] # information about chemical constants; next, we will parse it

            tmp["afinity"] = {} # create dictionary, wich contains afinity information for all ligands 
            tmp["ligands_have_afinity"] = [] # create list of ligands, wich have afinity constant

            if afinity != None:
                
                for i in afinity:
                    
                    lig_id = i["comp_id"]
                    value = str(i["value"])
                    value_type = i["type"]
                    value_unit = i["unit"]
                    
                    if lig_id not in tmp["ligands_have_afinity"]:
                        tmp["ligands_have_afinity"].append(lig_id)
                        tmp["afinity"][lig_id] = []

                    tmp["afinity"][lig_id].append(value_type + "/" + value + "/" + value_unit)


            RES[protein_id] = tmp
                
        return RES
        

In [7]:
async def search_experiment(session, url, semaphore, protein_id):
    
    url = "https://data.rcsb.org/graphql"
    
    query = """
    query XRAY ($id: String!) {
  entry(entry_id:$id){
    rcsb_id
    entry {
      id
    }
    exptl {
      method
    }
    exptl_crystal_grow {
      method
      pH
      temp
      pdbx_details
        }
      }
    }
    """

    # Define the variables for our query
    variables = {
      "id": protein_id
    }


    
    async with semaphore: 
        async with session.post(url, json={'query': query, 'variables': variables}) as response:
            #print(response.status)
            
            data = await response.json()
            
            return data


async def main_search_experiment(protein_cont):
    
    ''' This function takes as input an array with the id
    of the ligands, and returns a dictionary with ibformatioin about experiment from PDB '''
    
    semaphore = asyncio.Semaphore(10)  
    async with aiohttp.ClientSession() as session:
        tasks = [search_experiment(session, API_URL, semaphore, protein_name) for protein_name in protein_cont]  
        results = await asyncio.gather(*tasks)
        
        # after receiving a response from the server, it is necessary 
        # to present the information in a convenient form, for this we will use a dictionary  
        
        RES = {}
        for i in results:
            
            tmp = {}
            
            prot_id = i["data"]["entry"]["entry"]["id"]
                
            try:
                tmp["details"] = i["data"]["entry"]["exptl_crystal_grow"][0]["pdbx_details"]
            except: tmp["details"] = None
                
            RES[prot_id] = tmp        
                
        return RES

In [8]:
async def search_ligand_info(session, url, semaphore, lig_id):
    
    url = "https://data.rcsb.org/graphql"
    
    query = """
    query molecule ($id: String!) {
    chem_comp(comp_id:$id){
        chem_comp {
            id
            name
            pdbx_formal_charge
            formula_weight
            type
        }
        rcsb_chem_comp_descriptor {
            InChI
            InChIKey
            SMILES
            SMILES_stereo
            }
        }
    }

    """

    # Define the variables for our query
    variables = {
      "id": lig_id
    }


    
    async with semaphore: 
        async with session.post(url, json={'query': query, 'variables': variables}) as response:
            #print(response.status)
            
            data = await response.json()
            
            return data


async def main_search_ligand_info(ligand_cont):
    
    ''' This function takes as input an array with the id
    of the ligands, and returns a dictionary with ibformatioin about experiment from PDB '''
    
    semaphore = asyncio.Semaphore(10)  
    async with aiohttp.ClientSession() as session:
        tasks = [search_ligand_info(session, API_URL, semaphore, ligand_name) for ligand_name in ligand_cont]  
        results = await asyncio.gather(*tasks)
        
        # after receiving a response from the server, it is necessary 
        # to present the information in a convenient form, for this we will use a dictionary  
        
        RES = {}
        
        for i in results:
                    
            tmp = {}
            
            try:
                lig_id = i["data"]["chem_comp"]["chem_comp"]["id"]
            except:
                continue
                
            try:
                tmp["SMILES"] = i["data"]["chem_comp"]["rcsb_chem_comp_descriptor"]["SMILES"]
            except: tmp["SMILES"] = None
                
            RES[lig_id] = tmp        
                
        return RES

In [9]:
if __name__ == "__main__":
    
    
    ligands_in_proteins = await main_search_protein_id(ligands) # dictionary of ligand's proteins
        
    proteins = []
    for lig, prot_cont in ligands_in_proteins.items():

        for prot in prot_cont:
            proteins.append(prot)

In [10]:
proteins_data = await main_search_protein_info(proteins)

In [11]:
experiment = await main_search_experiment(proteins)

In [12]:
ligands_data = await main_search_ligand_info(ligands)

In [14]:
results = []

for ligs in bioisosters:
    
    lig1, lig2 = ligs[0], ligs[1]
    prots1, prots2 = ligands_in_proteins[lig1], ligands_in_proteins[lig2]
    ###
    
    for prot1 in prots1:
        
        for prot2 in prots2:

            prd1, prd2 = proteins_data[prot1], proteins_data[prot2] # proten's data for prot1 and prot2 

            doi1, doi2 = prd1["doi"], prd2["doi"]
            pubmed1, pubmed2 = prd1["pubmed"], prd2["pubmed"]
            method1, method2 = prd1["method"], prd2["method"]
            pH1, pH2 = prd1["pH"], prd2["pH"]
            struct1, struct2 = prd1["structure"], prd2["structure"]

            lha1, lha2 = prd1["ligands_have_afinity"], prd2["ligands_have_afinity"]

            if (lig1 in lha1) and (lig2 in lha2):
    
                if (doi1 == doi2) or (pubmed1 == pubmed2):
    
                    if (set(struct1) == set(struct2)) and (method1 == method2):
    
                        if [lig1, lig2, prot1, prot2] not in results:
                            results.append([lig1, lig2, prot1, prot2])


In [15]:
def parse_const(typ, cont):
    const = "|"
    
    try:
        for af in cont:
            i = af.split("/")
            if i[0] == typ:
                const = const + i[1] + "|"
    except: return None
        
    if const != "|":
        return const
    else:
        return None
    
def parse_uniprot(cont):
    struct = " "
    try:
        for i in cont:
            struct = struct + i + " "
        return struct
    except: return None
    

table_data = {"NUM": [i for i in range(1, len(results)+1)],
              "DOI": [proteins_data[i[2]]["doi"] for i in results],
              "PUBMED": [proteins_data[i[2]]["pubmed"] for i in results],
              "LIG_ID": [i[0] for i in results],
              "SMILES": [ligands_data[i[0]]["SMILES"] for i in results],
              "LIG_ID_REPL": [i[1] for i in results],
              "SMILES_REPL": [ligands_data[i[1]]["SMILES"] for i in results],
              "PROT_ID": [i[2] for i in results],
              "PROT_ID_REPL": [i[3] for i in results], 
              "UNIPROT_ID": [parse_uniprot(proteins_data[i[2]]["structure"]) for i in results], 
              "TYPE": [proteins_data[i[2]]["chtype"] for i in results], 
              "TYPE_REPL": [proteins_data[i[3]]["chtype"] for i in results],
              "DETAILS":  [experiment[i[2]]["details"] for i in results],
              "DETAILS_repl":  [experiment[i[3]]["details"] for i in results],
              "METHOD": [proteins_data[i[2]]["method"] for i in results],
              "RENTGEN": [proteins_data[i[2]]["rentgen"] for i in results],
              "RENTGEN_REPL": [proteins_data[i[3]]["rentgen"] for i in results],
              "pH": [proteins_data[i[2]]["pH"] for i in results],
              "pH_REPL": [proteins_data[i[3]]["pH"] for i in results],
              "Ki": [parse_const("Ki", proteins_data[i[2]]["afinity"][i[0]]) for i in results],
              "Ki_REPL": [parse_const("Ki", proteins_data[i[3]]["afinity"][i[1]]) for i in results], 
              "Kd": [parse_const("Kd", proteins_data[i[2]]["afinity"][i[0]]) for i in results], 
              "Kd_REPL": [parse_const("Kd", proteins_data[i[3]]["afinity"][i[1]]) for i in results],
              "Ka": [parse_const("Ka", proteins_data[i[2]]["afinity"][i[0]]) for i in results], 
              "Ka_REPL": [parse_const("Ka", proteins_data[i[3]]["afinity"][i[1]]) for i in results],
              "IC50": [parse_const("IC50", proteins_data[i[2]]["afinity"][i[0]]) for i in results],
              "IC50_REPL": [parse_const("IC50", proteins_data[i[3]]["afinity"][i[1]]) for i in results],
              "EC50": [parse_const("EC50", proteins_data[i[2]]["afinity"][i[0]]) for i in results],
              "EC50": [parse_const("EC50", proteins_data[i[3]]["afinity"][i[1]]) for i in results]}

df = pd.DataFrame.from_dict(table_data)
df



Unnamed: 0,NUM,DOI,PUBMED,LIG_ID,SMILES,LIG_ID_REPL,SMILES_REPL,PROT_ID,PROT_ID_REPL,UNIPROT_ID,...,pH_REPL,Ki,Ki_REPL,Kd,Kd_REPL,Ka,Ka_REPL,IC50,IC50_REPL,EC50
0,1,10.1021/acs.jmedchem.9b00518,31099559.0,AK0,COc1cc2c(cc1OC)C(N(CC2)C=O)CCc3c[nH]c4c3ccc(c4)F,AKU,COc1cc2c(cc1OC)C(N(CC2)C=O)CCc3c[nH]c4c3cccc4,6IMT,6INK,Q08499,...,,,,,,,,|250.0|,|270.0|,
1,2,10.1021/acs.jmedchem.9b00518,31099559.0,AK0,COc1cc2c(cc1OC)C(N(CC2)C=O)CCc3c[nH]c4c3ccc(c4)F,AKU,COc1cc2c(cc1OC)C(N(CC2)C=O)CCc3c[nH]c4c3cccc4,6IMT,6INM,Q08499,...,7.0,,,,,,,|250.0|,|510.0|,
2,3,10.1042/BJ20100651,20642456.0,ET2,c1cc2c(cc1F)[nH]c(n2)N,AX7,c1ccc2c(c1)[nH]c(n2)N,3KR2,3KQS,P11086,...,5.8,,,|7200.0|7200.0|,|6300.0|6300.0|,,,,,
3,4,10.1002/cmdc.201800158,29575754.0,EWJ,c1ccc2cc(ccc2c1)Oc3ccc(cc3C(=O)O)c4c[nH]nc4F,EV7,c1ccc2cc(ccc2c1)Oc3ccc(cc3C(=O)O)c4c[nH]nc4,6CBH,6CB5,P14174,...,7.0,|510.0|510.0|,|4300.0|,,,,,,,
4,5,10.1021/ja011034p,11572683.0,IOC,c1cc(ccc1C(=O)NCc2ccc(c(c2F)F)F)S(=O)(=O)N,INW,c1cc(ccc1C(=O)NCc2ccc(cc2F)F)S(=O)(=O)N,1I9O,1I9M,P00918,...,8.0,,,|3.799999952316284|2.299999952316284|3.7999999...,|3.299999952316284|3.299999952316284|3.2999999...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,111,10.1016/j.ejmech.2020.113035,33303238.0,VZD,c1cc(ccc1NC(=O)NCCS(=O)(=O)Nc2ccc(cc2)S(=O)(=O...,VYV,c1ccc(cc1)NC(=O)NCCS(=O)(=O)Nc2ccc(cc2)S(=O)(=O)N,7K6L,7K6I,P00918,...,6.4,|458.29998779296875|,|444.5|,,,,,,,
111,112,10.1021/jm301234k,23600925.0,Z71,c1cc(ccc1Nc2nc(c(s2)C(=O)Nc3cc(cc(c3)F)F)N)S(=...,X6B,c1cc(cc(c1)F)NC(=O)c2c(nc(s2)Nc3ccc(cc3)S(=O)(...,3R9O,3R9D,P24941,...,7.5,,,,,,,|100000.0|,|71000.0|,
112,113,10.1021/CB2001846,21732689.0,ZA5,[B-](CNC(=O)c1c(cccc1F)F)(O)(O)O,ZA4,[B-](CNC(=O)c1ccccc1F)(O)(O)O,2Y2K,2Y2J,Q7CRA4,...,7.2,,,,,,,|6900.0|,|16000.0|,
113,114,10.1021/acschembio.6b00382,27359042.0,388,c1cc(c(cc1Cl)OCC(=O)O)C(=O)NCc2ccc(cc2F)Br,W8X,c1cc(ccc1CNC(=O)c2ccc(cc2OCC(=O)O)Cl)Br,5LIU,5LIK,O60218,...,9.0,,,,,,,|2700.0|4400.0|4400.0|,|4500.0|3700.0|10200.0|,


In [16]:
#df.to_excel("F_OH.xlsx", index=False)