# Recherche de molécules similaires à celles sélectionnées.
Objectif : interroger la base de données PubChem pour trouver des composés similaires à ceux sélectionnés.

En cas de doublons, nous conserverons le couple molécule-candidat présentant la similarité maximale.
## Pré-requis

In [1]:
import time
from urllib.parse import quote

from pickle import load, dump
from tqdm import trange

from IPython.display import Markdown, Image
import requests
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolsToGridImage

In [2]:
# Similarity search against PubChem; the similarity threshold defaults to 75%.
def query_pubchem_for_similar_compounds(smiles, threshold=75, n_records=10, timeout=30):
    """
    Submit a similarity search to PubChem and return its job key.

    Parameters
    ----------
    smiles : str
        The SMILES string of the query compound.
    threshold : int
        The threshold of similarity, default 75%. In PubChem, the default threshold is 90%.
    n_records : int
        The maximum number of records PubChem should return.
    timeout : float
        Seconds to wait for the HTTP response. Without a timeout a stalled
        connection would block the caller indefinitely.

    Returns
    -------
    str
        The job key ("ListKey") from the PubChem web service.

    Raises
    ------
    ValueError
        If ``smiles`` is an empty string.
    requests.HTTPError
        If PubChem answers with an HTTP error status.
    KeyError
        If the JSON answer has no ``Waiting``/``ListKey`` entry.
    """
    # Fail before any network traffic: an empty query yields a malformed URL.
    if smiles == "":
        raise ValueError("smiles must be a non-empty SMILES string")

    # quote() leaves "/" untouched, so it is replaced to keep the URL path intact.
    # NOTE(review): in SMILES "/" marks bond direction while "." separates
    # fragments, so this presumably changes stereo-annotated queries - confirm
    # whether the SMILES should instead be sent as a request argument.
    escaped_smiles = quote(smiles).replace("/", ".")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/{escaped_smiles}/JSON?Threshold={threshold}&MaxRecords={n_records}"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.json()["Waiting"]["ListKey"]

# Poll a PubChem job and download its results once it has finished.
def check_and_download(key, attempts=30, wait_seconds=10, timeout=30):
    """
    Check job status and download PubChem CIDs when the job finished.

    Parameters
    ----------
    key : str
        The job key of the PubChem service.
    attempts : int
        The maximum number of times the job is polled before giving up.
    wait_seconds : float
        Pause between two consecutive polls, in seconds.
    timeout : float
        Seconds to wait for each HTTP response. Without a timeout a stalled
        connection would block the caller indefinitely.

    Returns
    -------
    list
        The PubChem CIDs of similar compounds.

    Raises
    ------
    ValueError
        If no result is available after ``attempts`` polls (including when
        ``attempts`` is zero or negative).
    requests.HTTPError
        If PubChem answers with an HTTP error status.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{key}/cids/JSON"
    # Counting down with range() also makes a negative `attempts` terminate.
    for remaining in range(attempts, 0, -1):
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        response = r.json()
        if "IdentifierList" in response:
            return response["IdentifierList"]["CID"]
        # No point in sleeping after the final poll: we are about to give up.
        if remaining > 1:
            time.sleep(wait_seconds)
    raise ValueError(f"Could not find matches for job key: {key}")

# Retrieve SMILES strings from PubChem CIDs.
def smiles_from_pubchem_cids(cids, timeout=30):
    """
    Get the canonical SMILES string from the PubChem CIDs.

    Parameters
    ----------
    cids : list
        A list of PubChem CIDs.
    timeout : float
        Seconds to wait for the HTTP response. Without a timeout a stalled
        connection would block the caller indefinitely.

    Returns
    -------
    list
        The canonical SMILES strings of the PubChem CIDs; an empty list when
        ``cids`` is empty.

    Raises
    ------
    requests.HTTPError
        If PubChem answers with an HTTP error status.
    KeyError
        If the answer lacks the property table or a SMILES entry.
    """
    cids = list(cids)
    # An empty CID list would produce a malformed URL; there is nothing to fetch.
    if not cids:
        return []

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{','.join(map(str, cids))}/property/CanonicalSMILES/JSON"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    smiles = []
    for item in r.json()["PropertyTable"]["Properties"]:
        if "CanonicalSMILES" in item:
            smiles.append(item["CanonicalSMILES"])
        else:
            # NOTE(review): newer PubChem answers reportedly label this
            # property "ConnectivitySMILES"; accept it as a fallback -
            # confirm against the current PUG REST property table.
            smiles.append(item["ConnectivitySMILES"])
    return smiles

## Récupération des molécules filtrées

In [3]:
# Unpickle the UniProt identifier from the 01_compoundDataAcquisition output folder.
uniprot_id_file = "../output/01_compoundDataAcquisition/uniprot_id.txt"
with open(uniprot_id_file, "rb") as file:
    uniprot_id = load(file)

# Load the CSV named after that identifier from the
# 03_molecularSubstructureFiltering output folder.
molecules_csv = "../output/03_molecularSubstructureFiltering//" + uniprot_id + ".csv"
molecules = pd.read_csv(molecules_csv)

## Recherche de molécules similaires

In [109]:
# Save the IDs that still have to be processed.
# The bare string below is a reminder for the reader; in English:
# "RUN THIS LINE ONLY ONCE, WHEN THE PIPELINE IS INITIALISED!"
# The commented-out statement would write the `molecule_chembl_id` and `smiles`
# columns of `molecules` to the to-process CSV read by the query loop.
"""
LIGNE A LANCER QU'UNE SEULE FOIS LORS DE L'INITIALISATION DU PIPELINE!
"""
#molecules.loc[:,["molecule_chembl_id","smiles"]].to_csv("../input/05_findSimilarCompounfPubChem/idToProcess_" + uniprot_id + ".csv", index=False)

In [27]:
# Query PubChem for every molecule still listed in the to-process file.
# The loop is resumable: after each successful query the result row is appended
# to the processed file and the molecule is removed from the to-process file,
# so a rerun after an interruption only handles what is left.
# Any error from a query still stops the loop (fail fast); the files stay consistent.
to_process_path = "../input/05_findSimilarCompounfPubChem/idToProcess_" + uniprot_id + ".csv"
processed_path = "../output/05_findSimilarCompounfPubChem/processedIds_" + uniprot_id + ".csv"

idToProc = pd.read_csv(to_process_path)
# Snapshot the work list up front: idToProc itself shrinks as rows complete,
# so positional lookups into it would drift during the loop.
listQueries = idToProc.iloc[:, 1].to_list()
molecule_ids = idToProc.iloc[:, 0].to_list()
row_labels = idToProc.index.to_list()

for i in trange(len(listQueries)):
    query = listQueries[i]
    job_key = query_pubchem_for_similar_compounds(query, n_records=400)
    similar_cids = check_and_download(job_key)
    similar_smiles = smiles_from_pubchem_cids(similar_cids)

    # One CSV row per query molecule: its ID followed by the similar SMILES.
    # Building the row directly also works when no similar compound was found.
    dataframe = pd.DataFrame([similar_smiles], index=pd.Index([molecule_ids[i]]))
    dataframe.to_csv(processed_path, mode="a", header=False)

    # Reassign so removals accumulate; the original dropped from the unchanged
    # frame each time, leaving all but the latest molecule in the file.
    idToProc = idToProc.drop(index=row_labels[i])
    idToProc.to_csv(to_process_path, index=False)
    


0it [00:00, ?it/s]
