# Raffinage des composés selon leur structure moléculaire
Objectif : Filtrer les composés selon la présence de sous-structures non désirées.
## Prérequis

In [3]:
from os import listdir
from pickle import load

import pandas as pd
from tqdm.auto import tqdm
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

## Chargement des données filtrées selon la règle de Lipinski

In [5]:
# Read back the UniProt identifier stored by an earlier step of the pipeline.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# this is only safe as long as the file is produced by our own notebooks.
with open("../output/01_compoundDataAcquisition/uniprot_id.txt", "rb") as id_file:
    uniprot_id = load(id_file)

# Load the rule-of-five compliant compounds for this target and discard the
# descriptor columns that the substructure filters below do not use.
ro5_csv_path = "../output/02_molecularFiltering/" + uniprot_id + "_ro5Compliant.csv"
moleculesData = pd.read_csv(ro5_csv_path).drop(
    columns=["molecular_weight", "n_hbd", "n_hba", "logp"]
)
moleculesData.head()

Unnamed: 0,molecule_chembl_id,IC50,units,smiles,pIC50,lipinski
0,CHEMBL63786,0.003,nM,Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1,11.522879,True
1,CHEMBL35820,0.006,nM,CCOc1cc2ncnc(Nc3cccc(Br)c3)c2cc1OCC,11.221849,True
2,CHEMBL53711,0.006,nM,CN(C)c1cc2c(Nc3cccc(Br)c3)ncnc2cn1,11.221849,True
3,CHEMBL53753,0.008,nM,CNc1cc2c(Nc3cccc(Br)c3)ncnc2cn1,11.09691,True
4,CHEMBL66031,0.008,nM,Brc1cccc(Nc2ncnc3cc4[nH]cnc4cc23)c1,11.09691,True


## Filtre sur la présence de motifs sur-interagissants, générateurs de bruit

In [6]:
# Build an RDKit filter catalog that contains only the PAINS definitions.
params = FilterCatalogParams()
pains_set = FilterCatalogParams.FilterCatalogs.PAINS
params.AddCatalog(pains_set)
catalog = FilterCatalog(params)

In [7]:
""" 
On ne garde que les molécules sans motifs
intéragissant avec toute sorte de protéines
""" 

# Split the compounds in two groups: those hitting a PAINS pattern (described
# in `matches`) and those that do not (their index labels go into `clean`).
matches = []
clean = []
for row in tqdm(moleculesData.itertuples(), total=len(moleculesData)):
    molecule = Chem.MolFromSmiles(row.smiles)
    # Only the first matching PAINS entry is retrieved; None means no match.
    entry = catalog.GetFirstMatch(molecule)
    if entry is None:
        # No PAINS pattern found: remember the row label so the compound is kept.
        clean.append(row.Index)
        continue
    # Record which compound was flagged and by which PAINS pattern.
    matches.append(
        {
            "chembl_id": row.molecule_chembl_id,
            "rdkit_molecule": molecule,
            "pains": entry.GetDescription().capitalize(),
        }
    )

matches = pd.DataFrame(matches)
# Keep only the compounds whose label was collected as PAINS-free.
moleculesData = moleculesData.loc[clean]


  0%|          | 0/5171 [00:00<?, ?it/s]

In [8]:
# NBVAL_CHECK_OUTPUT
# `matches` holds one row per PAINS-flagged compound, while `moleculesData`
# now contains only the compounds that passed the PAINS filter.
n_with_pains = len(matches)
n_without_pains = len(moleculesData)
print(f"Number of compounds with PAINS: {n_with_pains}")
print(f"Number of compounds without PAINS: {n_without_pains}")

Number of compounds with PAINS: 466
Number of compounds without PAINS: 4705


## Filtre sur la présence de motifs indésirables

In [14]:
# Load the collection of unwanted substructures (semicolon-separated file).
substructures = pd.read_csv("../input/unwantedCompounds.csv", sep=";")
# Turn every SMARTS pattern into an RDKit query molecule, stored alongside it.
substructures["rdkit_molecule"] = [
    Chem.MolFromSmarts(smarts) for smarts in substructures["smarts"]
]
print("Number of unwanted substructures in collection:", len(substructures))
# NBVAL_CHECK_OUTPUT

Number of unwanted substructures in collection: 101


In [18]:
# Search every remaining compound for unwanted substructures.
#
# The (name, pattern) pairs are materialised once, before the loop: iterating
# substructures.iterrows() again for each compound rebuilt one Series per
# substructure on every pass, although the collection never changes.
unwanted_patterns = list(
    zip(substructures["name"], substructures["rdkit_molecule"])
)

matches = []
clean = []
for index, row in tqdm(moleculesData.iterrows(), total=moleculesData.shape[0]):
    molecule = Chem.MolFromSmiles(row.smiles)
    # One record per (compound, substructure) hit: a compound matching several
    # patterns therefore contributes several rows to `matches`.
    hits = [
        {
            "chembl_id": row.molecule_chembl_id,
            "rdkit_molecule": molecule,
            "substructure": pattern,
            "substructure_name": name,
        }
        for name, pattern in unwanted_patterns
        if molecule.HasSubstructMatch(pattern)
    ]
    if hits:
        matches.extend(hits)
    else:
        # No unwanted substructure found: keep this compound.
        clean.append(index)

matches = pd.DataFrame(matches)
# Keep only the compounds free of any unwanted substructure.
moleculesData = moleculesData.loc[clean]

  0%|          | 0/4705 [00:00<?, ?it/s]

In [20]:
# Report the outcome of the unwanted-substructure filter.
# `matches` holds one row per (compound, substructure) hit, so a compound that
# matches several patterns is counted several times in the first figure.
n_hits = len(matches)
n_kept = len(moleculesData)
print(f"Number of found unwanted substructure: {n_hits}")
print(f"Number of compounds without unwanted substructure: {n_kept}")

Number of found unwanted substructure: 3006
Number of compounds without unwanted substructure: 2569


## Enregistrement des molécules

In [22]:
# Save the compounds that remain after the filters above, one CSV per target.
# The stray double slash of the original path is removed (same location).
# index=False keeps the DataFrame index out of the written file.
moleculesData.to_csv(
    "../output/03_molecularSubstructureFiltering/" + uniprot_id + ".csv",
    index=False,
)