In [1]:
#|default_exp tools.filtering

In [2]:
#|echo: false
%load_ext autoreload
%autoreload 2

In [3]:
#|export
import json
from collections import defaultdict
import multiprocessing as mp
import pandas as pd
import numpy as np
from rdkit import Chem
from chemtools.tools.sanitizer import convert_smiles, normalize_mol, MolCleaner
from rdkit.Chem import AllChem,rdMolDescriptors
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from typing import List, Collection, Tuple

from fastprogress.fastprogress import master_bar, progress_bar
from time import sleep

In [4]:
#|echo: false
from nbdev.showdoc import show_doc

# Filtering

In [5]:
#|export
class MolFiltering:
    
    """Filter a molecular dataset from unwanted structures
    
        Use  factory methods `MolFiltering.from_list`, `MolFiltering.from_df` or `MolFiltering.from_csv` instead of accessing the class directly.

        Every method is a classmethod; the class holds no instance state.
       
    """

    # Column layout shared by every alerts table produced by this class.
    _ALERT_COLUMNS = ['_smiles','Alert_SMARTS','Alert_description','Alert_rule_set','Alert_num_hits']
    
    @classmethod
    def get_mol_alerts(cls, smi, alerts_dict:dict) -> pd.DataFrame:

        """
        Find structural alerts for a single SMILES.

        Arguments:
        
            smi : str
                A SMILES representing a molecule.

            alerts_dict : dict
                A dict with alerts definitions. Each key is a SMARTS pattern and
                each value unpacks to `(rule_set, description, max_value)`.
                An alert is reported when the number of substructure matches is
                strictly greater than `max_value`.

        Returns:
        
            rule_sets : pandas.DataFrame or None
                A `pandas.DataFrame` with one row per triggered alert for `smi`
                (empty when nothing is triggered), or `None` when the molecule
                could not be processed (any exception raised while normalizing
                or matching is swallowed on purpose so that one bad entry does
                not abort a whole batch).

        """
        try:
            mol = normalize_mol(smi)
            rule_sets = []
            for alert, (rule_set, description, max_value) in alerts_dict.items():
                hits = len(mol.GetSubstructMatches(Chem.MolFromSmarts(alert)))
                if hits > max_value:
                    rule_sets.append((smi, alert, description, rule_set, hits))
            return pd.DataFrame(rule_sets, columns=cls._ALERT_COLUMNS)
        except Exception:
            # Best-effort: report "unprocessable" instead of raising, but let
            # KeyboardInterrupt / SystemExit propagate (a bare `except:` would not).
            return None

    @classmethod
    def get_alerts(cls, smiles_list, alerts_dict:dict, n_jobs:int=None) -> pd.DataFrame:
        
        """
        Find structural alerts for a list of SMILES.

        Arguments:
        
            smiles_list : Collection
                A sized collection of SMILES (`len()` must be supported).

            alerts_dict : dict
                A dict with alerts definitions (see `MolFiltering.get_mol_alerts`).

            n_jobs : int
                The number of jobs to run in parallel. `None` uses every CPU
                reported by `multiprocessing.cpu_count()`; `1` runs in the
                current process without creating a pool.

        Returns:
        
            alerts_df : `pandas.DataFrame`

                A `pandas.DataFrame` with flagged molecules. It is empty (but
                still carries the alert columns) when the input is empty or no
                molecule could be processed.

        """

        from functools import partial

        n_mols = len(smiles_list)
        if n_mols == 0:
            # pd.concat raises on an empty list, so short-circuit here.
            print('No compounds were flagged.')
            return pd.DataFrame(columns=cls._ALERT_COLUMNS)

        filtering_func = partial(cls.get_mol_alerts, alerts_dict=alerts_dict)

        if n_jobs is None: n_jobs = mp.cpu_count()

        if n_jobs == 1:
            # A single job gains nothing from a worker pool; run in-process.
            results = [filtering_func(smi) for smi in progress_bar(smiles_list, total=n_mols)]
        else:
            with mp.Pool(n_jobs) as mp_pool:
                results = list(progress_bar(mp_pool.imap(filtering_func, smiles_list), total=n_mols))

        # `get_mol_alerts` yields None for molecules it could not process;
        # pd.concat raises when *every* entry is None, so drop them up front.
        frames = [frame for frame in results if frame is not None]
        if frames:
            all_alerts = pd.concat(frames)
        else:
            all_alerts = pd.DataFrame(columns=cls._ALERT_COLUMNS)

        if all_alerts.empty: 
            print('No compounds were flagged.')
        
        return all_alerts

    @classmethod
    def from_list(cls, smiles_list,alerts_dict:dict=None,n_jobs:int=1, **kwargs) -> pd.DataFrame:
        
        """Factory method to process a list of SMILES.

        Arguments:

            smiles_list : A List, Array, or any sized Iterable (except strings)
                SMILES to screen for structural alerts.

            alerts_dict : dict
                A dict with alerts definitions (see `MolFiltering.get_mol_alerts`).
                When `None`, `MolFiltering.from_df` loads its default alerts file.

            n_jobs : int
                The number of jobs to run in parallel.

            **kwargs
                Accepted for backward compatibility and ignored.

        Returns:
        
            alerts_df : `pandas.DataFrame`

                A `pandas.DataFrame` with flagged molecules.

        Raises:

            TypeError
                If `smiles_list` is a single string.

        """

        if isinstance(smiles_list, str):
            # pandas would broadcast a bare string into len(string) identical
            # rows instead of treating it as one molecule.
            raise TypeError('Please provide a collection of SMILES, not a single string')
        
        id_col = 'ID'
        smiles_col = 'smiles'

        df = pd.DataFrame({smiles_col:smiles_list, id_col:[f'mol{idx}' for idx in range(len(smiles_list))]})

        return cls.from_df(df, smiles_col=smiles_col, alerts_dict=alerts_dict, n_jobs=n_jobs)

    @classmethod
    def from_df(cls,
                df: pd.DataFrame, 
                smiles_col:str,
                alerts_dict:dict=None,
                n_jobs:int=1) -> pd.DataFrame:
        
        """Factory method to process a `pandas.DataFrame`

        Arguments:
        
            df : pd.DataFrame
                A pandas Dataframe with molecular data. It is only read, never modified.

            smiles_col : str
                The name of the column with SMILES for each molecule.

            alerts_dict : dict
                A dict with alerts definitions (see `MolFiltering.get_mol_alerts`).
                When `None`, the `structural_alerts` entry of
                `../data/libraries/Glaxo_alerts.json` is used.

            n_jobs : int
                The number of jobs to run in parallel.

        Returns:
        
            alerts_df : `pandas.DataFrame`

                A `pandas.DataFrame` with flagged molecules.

        Raises:

            TypeError
                If `alerts_dict` is neither `None` nor a dict.

        """     

        if not isinstance(alerts_dict, dict) and alerts_dict is not None:
            raise TypeError('Please provide a valid dictionary of structural alerts')
        
        if alerts_dict is None:
            # NOTE(review): this path is relative to the current working
            # directory, so the default only resolves when the process runs
            # from a sibling of `data/` -- confirm how the exported module is used.
            with open('../data/libraries/Glaxo_alerts.json') as f:
                alerts_dict = json.load(f)['structural_alerts']

        # Only the SMILES values are needed, so the frame is read directly
        # instead of being copied and re-indexed first.
        return cls.get_alerts(smiles_list=df[smiles_col].values, alerts_dict=alerts_dict, n_jobs=n_jobs)

    @classmethod
    def from_csv(cls,
                 data_path: str,
                 smiles_col: str,
                 alerts_dict:dict=None,
                 n_jobs:int=1,
                 sep: str = ',') -> pd.DataFrame:
        
        """Factory method to process a CSV file.

        Arguments:

            data_path : str
                Path to CSV file

            smiles_col : str
                The name of the column with SMILES for each molecule.

            alerts_dict : dict
                A dict with alerts definitions (see `MolFiltering.get_mol_alerts`).
                When `None`, `MolFiltering.from_df` loads its default alerts file.

            n_jobs : int
                The number of jobs to run in parallel.

            sep : str
                Field delimiter passed to `pandas.read_csv`.

        Returns:
        
            alerts_df : `pandas.DataFrame`

                A `pandas.DataFrame` with flagged molecules.
            
        """
        
        # `n_jobs` must be forwarded explicitly; otherwise from_df falls back to
        # its own default and the caller's value is silently ignored.
        return cls.from_df(pd.read_csv(data_path, sep=sep), 
                           smiles_col=smiles_col,
                           alerts_dict=alerts_dict,
                           n_jobs=n_jobs)


In [6]:
#|skip
# Load the example dataset from a path relative to the working directory.
data = pd.read_csv('../data/example_data.csv')
# Print (rows, columns) so the size of the dataset is visible in the output.
print(data.shape)
# Preview the first two rows as the cell's displayed result.
data.head(2)

(500, 11)


Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,IC50,units,smiles,pIC50,molecular_weight,n_hba,n_hbd,logp,ro5_fulfilled
0,5347,CHEMBL1641996,55600.0,nM,N#Cc1cnc(Nc2cccc(Br)c2)c2cc(NC(=O)c3ccco3)ccc12,4.254925,432.022188,5,2,5.45788,True
1,2985,CHEMBL424375,300.0,nM,COc1cccc(-c2cn(-c3ccc(CNCCO)cc3)c3ncnc(N)c23)c1,6.522879,389.185175,7,3,2.7602,True


In [7]:
#|skip
# Load the PAINS definitions; the alerts live under the top-level
# 'structural_alerts' key of the JSON file.
with open('../data/libraries/PAINS_alerts.json') as f:
    alerts_dict = json.load(f)['structural_alerts']

In [8]:
#|skip
# Screen the 'smiles' column of `data` against the alerts in `alerts_dict`.
ab=MolFiltering.from_df(data,'smiles', alerts_dict)
# Display the table of flagged molecules as the cell's result.
ab

Unnamed: 0,_smiles,Alert_SMARTS,Alert_description,Alert_rule_set,Alert_num_hits
0,N#CC(C#N)=C(C#N)c1cc(O)cc(O)c1,[#6](-[#6]#[#7])(-[#6]#[#7])=[#6]-c:1:c:c:c:c:c:1,ene_cyano_A(19),PAINS,1
0,COc1cc(C=C(C#N)C#N)cc(CSCC(=O)O)c1O,[#6](-[#6]#[#7])(-[#6]#[#7])=[#6]-c:1:c:c:c:c:c:1,ene_cyano_A(19),PAINS,1
0,O=C(Nc1ccccc1)N(Cc1ccc(F)cc1)Cc1cc(Cl)cc(Cl)c1O,[#7]-[C;X4]-c1ccccc1-[O;H1],mannich_A(296),PAINS,1
0,COc1ccc(/C=C2\CCC/C(=C\c3cc(OC)cc(OC)c3)C2=O)cc1,[#6]=!@[#6](-[!#1])-@[#6](=!@[!#6&!#1])-@[#6](...,ene_one_ene_A(57),PAINS,1
0,C=COC(=O)N(CCN(C)C)/N=N/c1ccc2ncnc(Nc3cccc(Cl)...,[#7;!R]=[#7],azo_A(324),PAINS,1
0,N#CC(C#N)=Cc1cc2c(cc1[N+](=O)[O-])OCO2,[#6](-[#6]#[#7])(-[#6]#[#7])=[#6]-c:1:c:c:c:c:c:1,ene_cyano_A(19),PAINS,1
0,COc1cc(C=C(C#N)C#N)cc(CSc2ccccc2)c1O,[#6](-[#6]#[#7])(-[#6]#[#7])=[#6]-c:1:c:c:c:c:c:1,ene_cyano_A(19),PAINS,1
0,C/N=N/Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1,[#7;!R]=[#7],azo_A(324),PAINS,1
0,COc1cc(C=C(C#N)C#N)cc(OC)c1O,[#6](-[#6]#[#7])(-[#6]#[#7])=[#6]-c:1:c:c:c:c:c:1,ene_cyano_A(19),PAINS,1
0,Oc1cc2cc(-c3cccnc3)cnc2cc1O,c:1:c:c(:c(:c:c:1)-[#8;H1])-[#8;H1],catechol_A(92),PAINS,1


In [9]:
show_doc(MolFiltering)

---

### MolFiltering

>      MolFiltering ()

Filter a molecular dataset from unwanted structures

Use  factory methods `MolFiltering.from_list`, `MolFiltering.from_df` or `MolFiltering.from_csv` instead of accessing the class directly.

In [10]:
show_doc(MolFiltering.from_df)

---

### MolFiltering.from_df

>      MolFiltering.from_df (df:pandas.core.frame.DataFrame, smiles_col:str,
>                            alerts_dict:dict=None, n_jobs:int=1)

Factory method to process a `pandas.DataFrame`

Arguments:

    df : pd.DataFrame
        A pandas Dataframe with molecular data for sanitization.

    smiles_col : str
        The name of the column with SMILES for each molecule.

Returns:

    alerts_df : `pandas.DataFrame`

        A `pandas.DataFrames` with flagged molecules.

In [11]:
show_doc(MolFiltering.from_csv)

---

### MolFiltering.from_csv

>      MolFiltering.from_csv (data_path:str, smiles_col:str,
>                             alerts_dict:dict=None, n_jobs:int=1, sep:str=',')

Factory method to process a CSV file.

Arguments:

    data_path : str
        Path to CSV file

    smiles_col : str
        The name of the column with SMILES for each molecule.

Returns:

    alerts_df : `pandas.DataFrame`

        A `pandas.DataFrames` with flagged molecules.

In [12]:
show_doc(MolFiltering.from_list)

---

### MolFiltering.from_list

>      MolFiltering.from_list (smiles_list, alerts_dict:dict=None, n_jobs:int=1,
>                              **kwargs)

Factory method to process a list of SMILES.

Arguments:

    smiles : A List, Array, or any Iterable (except strings)
        SMILES ready for sanitization

Returns:

    alerts_df : `pandas.DataFrame`

        A `pandas.DataFrames` with flagged molecules.

In [13]:
#|echo: false
from nbdev import nbdev_export
nbdev_export()