# Recherche de molécules similaires à celles sélectionnées.
Objectif : interroger la base de données PubChem pour trouver des composés similaires à ceux sélectionnés.

En cas de doublons, nous conserverons le couple molécule-candidat présentant la similarité maximale.
## Pré-requis

In [1]:
!pip install rdkit &> /dev/null

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import time
from urllib.parse import quote

import seaborn as sns
sns.set(rc = {'figure.figsize':(15,8)})
import matplotlib.pyplot as plt

from pickle import load, dump
from tqdm import trange

from IPython.display import Markdown, Image
import requests
import pandas as pd

from os import chdir
chdir("/content/drive/MyDrive/Colab Notebooks/these_exercice/src")


from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolsToGridImage

In [4]:
# Fonction de recherche de molécules similaires, seuil fixé par défaut à 75% 
def query_pubchem_for_similar_compounds(smiles, threshold=75, n_records=10):
    """
    Submit a similarity search to the PubChem PUG REST service.

    Parameters
    ----------
    smiles : str
        SMILES string of the query compound.
    threshold : int
        Similarity threshold sent as the ``Threshold`` URL argument
        (default 75; the original author notes PubChem's own default is 90).
    n_records : int
        Upper bound on returned records, sent as ``MaxRecords``.

    Returns
    -------
    str
        The ``ListKey`` job key found under ``"Waiting"`` in the JSON reply.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    KeyError
        If the JSON reply has no ``["Waiting"]["ListKey"]`` entry.
    """
    # quote() keeps "/" unescaped by default, so it is swapped for "." here.
    # NOTE(review): this alters the SMILES text sent to PubChem - confirm
    # that this substitution is intended.
    encoded = quote(smiles).replace("/", ".")
    endpoint = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/"
        f"{encoded}/JSON?Threshold={threshold}&MaxRecords={n_records}"
    )
    reply = requests.get(endpoint)
    reply.raise_for_status()
    return reply.json()["Waiting"]["ListKey"]

# Fonction de téléchargement des résultats
def check_and_download(key, attempts=30, wait_seconds=10):
    """
    Poll a PubChem job until it finishes and return the resulting CIDs.

    Parameters
    ----------
    key : str
        The job key (ListKey) of the PubChem service.
    attempts : int
        Maximum number of polling requests. A value of zero or less means
        no request is sent and the function fails immediately.
    wait_seconds : int or float
        Pause between two consecutive polling requests (default 10).

    Returns
    -------
    list
        The value stored under ``["IdentifierList"]["CID"]`` in the reply,
        i.e. the PubChem CIDs of the similar compounds.

    Raises
    ------
    ValueError
        If no reply contained an ``IdentifierList`` within ``attempts`` polls.
    requests.HTTPError
        If a polling request answers with an HTTP error status.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{key}/cids/JSON"
    # Count down so a non-positive ``attempts`` yields an empty range instead
    # of an endless loop.
    for remaining in range(attempts, 0, -1):
        r = requests.get(url)
        r.raise_for_status()
        response = r.json()
        if "IdentifierList" in response:
            return response["IdentifierList"]["CID"]
        # Only wait when another poll will follow; sleeping after the last
        # attempt would just delay the error.
        if remaining > 1:
            time.sleep(wait_seconds)
    raise ValueError(f"Could not find matches for job key: {key}")

# Récupération de l'ID SMILES à partir du CID PubChem
def smiles_from_pubchem_cids(cids):
    """
    Get the canonical SMILES strings for a collection of PubChem CIDs.

    Parameters
    ----------
    cids : iterable
        PubChem CIDs (anything convertible with ``str``).

    Returns
    -------
    list
        The ``CanonicalSMILES`` value of every entry in the reply's
        ``PropertyTable``; an empty list when ``cids`` is empty.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    KeyError
        If the reply lacks the expected ``PropertyTable`` structure or an
        entry has no ``CanonicalSMILES`` field.
    """
    cid_strings = [str(cid) for cid in cids]
    # Nothing to look up: avoid sending a request with an empty CID segment.
    if not cid_strings:
        return []
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{','.join(cid_strings)}/property/CanonicalSMILES/JSON"
    r = requests.get(url)
    r.raise_for_status()
    return [item["CanonicalSMILES"] for item in r.json()["PropertyTable"]["Properties"]]

## Récupération des molécules filtrées

In [5]:
# Récupération de l'identifiant uniprot
# Read the UniProt ID back from the 01_compoundDataAcquisition output folder
# (the file is unpickled, despite its .txt suffix).
uniprot_id_path = "../output/01_compoundDataAcquisition/uniprot_id.txt"
with open(uniprot_id_path, "rb") as file:
    uniprot_id = load(file)

# Load the molecule table written under 03_molecularSubstructureFiltering
# for this UniProt ID.
molecules_path = "../output/03_molecularSubstructureFiltering//" + uniprot_id + ".csv"
molecules = pd.read_csv(molecules_path)

# Quick sanity check: target ID, then (rows, columns) of the table.
print(uniprot_id, molecules.shape, sep="\n")

P08581
(1308, 10)


## Recherche de molécules similaires

In [None]:
# Save the IDs that remain to be processed.
# The string below reads: "run this line only once, when initialising the pipeline".
"""
LIGNE A LANCER QU'UNE SEULE FOIS LORS DE L'INITIALISATION DU PIPELINE!
"""
# Kept commented out on purpose: it would write the molecule_chembl_id and smiles
# columns of `molecules` to the idToProcess file read by the search loop.
#molecules.loc[:,["molecule_chembl_id","smiles"]].to_csv("../input/05_findSimilarCompounfPubChem/idToProcess_" + uniprot_id + ".csv", index=False)

In [None]:
# Print the second column of idToProc as a plain list.
# idToProc is only assigned in the search-loop cell further down, so that cell
# must have been executed before this one.
print(idToProc.iloc[:,1].to_list())

['CNC(=O)c1ccc(-c2cnc3nnc(Cc4ccc5ncccc5c4)n3n2)cc1F', 'CNC(=O)c1ccc(-c2cnc3ncc(Cc4ccc5ncccc5c4)n3n2)cc1F', 'COc1ccc2c(OCc3nnc4ncc(-c5ccccc5)nn34)ccnc2c1', 'Cc1c(O)ccc2nc(-c3cc(-c4cnn(C5CCNCC5)c4)cnc3N)oc12.O=C(O)C(F)(F)F', 'CC(c1c(F)cc2ncc(-c3cnn(C)c3)cc2c1F)n1nnc2ncc(-c3ccc(C(N)=O)c(F)c3)nc21', 'CC(c1ccc2ncc(-c3cnn(C)c3)cc2c1)n1nnc2ncc(-c3ccncc3)nc21', 'CC(N=O)c1ccc2nnc(Cc3c(F)cc4ncccc4c3F)n2n1', 'CC(c1c(F)cc2ncc(-c3cnn(C)c3)cc2c1F)n1nnc2ncc(-c3cnn(C)c3)nc21', 'Oc1cccc(-c2cnc3nnc(Cc4ccc5ncccc5c4)n3n2)c1', 'Cn1cc(-c2ccc(=O)n(CCOc3ccnc4cc(OCc5ccc(C(=O)Nc6ccccc6N)cc5)ccc34)n2)cn1', 'Cn1cc(-c2cnc3nnc(Cc4ccc5ncccc5c4)n3n2)cn1', 'Fc1ccc(-c2cnc3nnc(C(F)(F)c4ccc5ncccc5c4)n3n2)cc1', 'CC(c1cc2cc(-c3cnn(C)c3)cnc2cc1F)n1nnc2ncc(-c3cnn(CCO)c3)nc21', 'c1ccc(-c2cnc3nnc(Cc4ccc5ncccc5c4)n3n2)cc1', 'CC(c1ccc2ncc(-c3cnn(C)c3)cc2c1)n1nnc2ncc(-c3cnn(CCO)c3)nc21', 'CNC(=O)c1ccc(-c2cnc3nnn(C(C)c4ccc5ncc(-c6cnn(C)c6)cc5c4)c3n2)cc1F', 'CC(c1c(F)cc2ncc(-c3cnn(C)c3)cc2c1F)n1nnc2ncc(-c3ccncc3)nc21', 'C[C@@H](c1c

In [None]:
# Preview the first rows of the pending-ID table (requires idToProc, which is
# assigned in the search-loop cell further down).
idToProc.head()

Unnamed: 0,molecule_chembl_id,smiles
0,CHEMBL3582305,CNC(=O)c1ccc(-c2cnc3nnc(Cc4ccc5ncccc5c4)n3n2)cc1F
1,CHEMBL3188267,CNC(=O)c1ccc(-c2cnc3ncc(Cc4ccc5ncccc5c4)n3n2)cc1F
2,CHEMBL3797911,COc1ccc2c(OCc3nnc4ncc(-c5ccccc5)nn34)ccnc2c1
3,CHEMBL2032280,Cc1c(O)ccc2nc(-c3cc(-c4cnn(C5CCNCC5)c4)cnc3N)o...
4,CHEMBL4104884,CC(c1c(F)cc2ncc(-c3cnn(C)c3)cc2c1F)n1nnc2ncc(-...


In [None]:
# Resumable PubChem similarity search over the molecules still to be processed.
# After each molecule, its hits are appended to the results file and the
# pending-ID file is rewritten, so an interrupted run can be resumed by
# re-executing this cell.
# TODO: add safeguards / checks that the loop is progressing correctly.
pending_path = "../input/05_findSimilarCompounfPubChem/idToProcess_" + uniprot_id + ".csv"
results_path = "../output/05_findSimilarCompounfPubChem/processedIds_" + uniprot_id + ".csv"

idToProc = pd.read_csv(pending_path)
listQueries = idToProc.iloc[:, 1].to_list()  # second column: query SMILES
for i in trange(len(listQueries)):
    query = listQueries[i]

    # Submit the search, wait for the job, then turn the CIDs into SMILES.
    job_key = query_pubchem_for_similar_compounds(query, n_records=400)
    similar_cids = check_and_download(job_key)
    similar_smiles = smiles_from_pubchem_cids(similar_cids)

    # One CSV row per query: first-column identifier as index, followed by
    # the similar SMILES (row length varies with the number of hits).
    dataframe = pd.DataFrame(similar_smiles).T
    dataframe = dataframe.set_index(pd.Index([idToProc.iloc[i, 0]]))
    dataframe.to_csv(results_path, mode="a", header=False)

    # Checkpoint by position: only the rows after the current one remain.
    # Dropping by SMILES equality would also remove not-yet-processed rows
    # that share the same SMILES under a different identifier.
    tmpIdToProc = idToProc.iloc[i + 1:]
    tmpIdToProc.to_csv(pending_path, index=False)
    


100%|██████████| 353/353 [1:34:13<00:00, 16.02s/it]
