# Raffinage des composés selon leur structure moléculaire
Objectif : Filtrer les composés selon la présence de sous-structures non désirées.
## Prérequis

In [1]:
!pip install rdkit &> /dev/null

In [2]:
# Mount Google Drive in the Colab runtime at the path given below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
from os import chdir, listdir, makedirs
from pickle import load

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from tqdm.auto import tqdm

sns.set(rc = {'figure.figsize':(15,8)})

## Chargement des données filtrées selon la règle de Lipinski

In [4]:
# Switch the working directory to the project's src folder; the relative
# "../output" and "../input" paths used in this notebook are resolved from it.
PROJECT_SRC_DIR = "/content/drive/MyDrive/Colab Notebooks/these_exercice/src"
chdir(PROJECT_SRC_DIR)

In [12]:
# Read the UniProt identifier. The file has a .txt extension but is opened in
# binary mode and decoded with pickle.load, so it must be a pickle payload.
uniprot_id_path = "../output/01_compoundDataAcquisition/uniprot_id.txt"
with open(uniprot_id_path, "rb") as handle:
    uniprot_id = load(handle)

# Load the molecule table from the "<uniprot_id>_ro5Compliant.csv" file.
ro5_csv_path = "../output/02_molecularFiltering/" + uniprot_id + "_ro5Compliant.csv"
moleculesData = pd.read_csv(ro5_csv_path)

print(uniprot_id)

P08581


## Filtre sur la présence de motifs sur-interagissants (PAINS), générateurs de bruit

In [13]:
# Build a filter catalog that contains only the PAINS pattern collection.
pains_collection = FilterCatalogParams.FilterCatalogs.PAINS
params = FilterCatalogParams()
params.AddCatalog(pains_collection)
catalog = FilterCatalog(params)

In [14]:
""" 
On ne garde que les molécules sans motifs
interagissant avec toutes sortes de protéines
""" 

# Split the molecules into PAINS hits and PAINS-free compounds.
matches = []     # one record per molecule flagged by the PAINS catalog
clean = []       # index labels of molecules without any PAINS hit
unparsable = []  # index labels whose SMILES RDKit could not parse

for index, row in tqdm(moleculesData.iterrows(), total=moleculesData.shape[0]):
    molecule = Chem.MolFromSmiles(row.smiles)
    if molecule is None:
        # MolFromSmiles returns None for an invalid SMILES, and passing None to
        # GetFirstMatch raises. Such a row cannot be checked, so it is left out
        # of both lists and reported after the loop.
        unparsable.append(index)
        continue

    entry = catalog.GetFirstMatch(molecule)  # first matching PAINS entry, or None
    if entry is None:
        clean.append(index)
        continue

    # Store the PAINS information for the flagged molecule.
    matches.append(
        {
            "chembl_id": row.molecule_chembl_id,
            "rdkit_molecule": molecule,
            "pains": entry.GetDescription().capitalize(),
        }
    )

# Explicit columns keep the frame's schema stable even when nothing matched.
matches = pd.DataFrame(matches, columns=["chembl_id", "rdkit_molecule", "pains"])
moleculesData = moleculesData.loc[clean]

if unparsable:
    print(f"Skipped {len(unparsable)} compounds with unparsable SMILES: {unparsable}")


  0%|          | 0/2111 [00:00<?, ?it/s]

In [16]:
# NBVAL_CHECK_OUTPUT
# Report how many compounds were flagged and how many remain.
n_with_pains = len(matches)
n_without_pains = len(moleculesData)
print(f"Number of compounds with PAINS: {n_with_pains}")
print(f"Number of compounds without PAINS: {n_without_pains}")

Number of compounds with PAINS: 188
Number of compounds without PAINS: 1923


## Filtre sur la présence de motifs indésirables



In [17]:
# Load the unwanted substructures from a semicolon-separated file.
substructures = pd.read_csv("../input/unwantedCompounds.csv", sep=";")
# Turn each SMARTS string into an RDKit query molecule.
substructures["rdkit_molecule"] = substructures["smarts"].map(Chem.MolFromSmarts)
n_substructures = len(substructures)
print("Number of unwanted substructures in collection:", n_substructures)
# NBVAL_CHECK_OUTPUT

Number of unwanted substructures in collection: 101


In [18]:
# Show the (rows, columns) shape of the molecule table at this point.
table_shape = moleculesData.shape
print(table_shape)

(1923, 10)


In [19]:
# Search every molecule for unwanted substructures.
# The (name, pattern) pairs are extracted once up front, so the substructure
# table is not re-iterated with iterrows() for every molecule.
unwanted_patterns = list(zip(substructures["name"], substructures["rdkit_molecule"]))

matches = []  # one record per (molecule, matched substructure) pair
clean = []    # index labels of molecules that match no unwanted substructure
for index, row in tqdm(moleculesData.iterrows(), total=moleculesData.shape[0]):
    molecule = Chem.MolFromSmiles(row.smiles)
    hits = [
        (name, pattern)
        for name, pattern in unwanted_patterns
        if molecule.HasSubstructMatch(pattern)
    ]
    if not hits:
        clean.append(index)
        continue
    # A molecule can match several patterns, and every hit is recorded.
    matches.extend(
        {
            "chembl_id": row.molecule_chembl_id,
            "rdkit_molecule": molecule,
            "substructure": pattern,
            "substructure_name": name,
        }
        for name, pattern in hits
    )



  0%|          | 0/1923 [00:00<?, ?it/s]

In [22]:
# Number of index labels collected in `clean`.
n_clean = len(clean)
print(n_clean)

1308


In [23]:
# Keep only the rows whose index labels are listed in `clean`, and turn the
# collected match records into a DataFrame.
moleculesData = moleculesData.loc[clean, :]
matches = pd.DataFrame(data=matches)

In [11]:
# Check the number of recorded substructure hits and of compounds kept.
n_hits = len(matches)
n_kept = len(moleculesData)
print(f"Number of found unwanted substructure: {n_hits}")
print(f"Number of compounds without unwanted substructure: {n_kept}")

Number of found unwanted substructure: 923
Number of compounds without unwanted substructure: 1308


## Enregistrement des molécules

In [None]:
# Save the filtered molecule table as "<uniprot_id>.csv" in this step's output
# folder. The folder is created first because DataFrame.to_csv does not create
# missing parent directories and raises OSError when the folder is absent.
output_dir = "../output/03_molecularSubstructureFiltering"
makedirs(output_dir, exist_ok=True)
moleculesData.to_csv(output_dir + "/" + uniprot_id + ".csv", index=False)