In [None]:
#|default_exp tools.featurizer

In [None]:
#|echo: false
%load_ext autoreload
%autoreload 2

In [None]:
#|export
import pandas as pd
import numpy as np
from rdkit import Chem
from chemtools.utils import convert_smiles
from rdkit.Chem import MACCSkeys,AllChem,rdMolDescriptors, rdFingerprintGenerator, Descriptors
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from functools import partial
from typing import List, Collection
import multiprocessing as mp
from fastprogress.fastprogress import master_bar, progress_bar
from time import sleep

In [None]:
#|echo: false
from nbdev.showdoc import show_doc

# Featurizer

## Utilities

In [None]:
#|export
# NOTE: this cell must be exported. `get_rdkit2d_descriptors` (exported below)
# reads `descriptor_dict` at call time, so leaving these definitions out of the
# generated module makes the function fail with a NameError outside the notebook.
# Both definitions live in the same cell because `descriptor_dict` is built from
# `RDKIT_PROPERTIES` and the two must always be exported together.

# Names of the RDKit descriptors (entries of `Descriptors.descList`) that make up
# the 'rdkit2d' feature vector. The list holds 200 names.
RDKIT_PROPERTIES = ['BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n',
                    'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v',
                    'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2',
                    'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6',
                    'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'ExactMolWt',
                    'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3',
                    'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'HeavyAtomMolWt',
                    'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex',
                    'MaxAbsPartialCharge', 'MaxEStateIndex', 'MaxPartialCharge',
                    'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex',
                    'MinPartialCharge', 'MolLogP', 'MolMR', 'MolWt', 'NHOHCount',
                    'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
                    'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
                    'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
                    'NumRadicalElectrons', 'NumRotatableBonds', 'NumSaturatedCarbocycles',
                    'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumValenceElectrons',
                    'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13',
                    'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5',
                    'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount',
                    'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5',
                    'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10',
                    'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4',
                    'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9',
                    'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3',
                    'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
                    'VSA_EState9', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN',
                    'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2',
                    'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0',
                    'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2',
                    'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide',
                    'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl',
                    'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine',
                    'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester',
                    'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone',
                    'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone',
                    'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine',
                    'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho',
                    'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol',
                    'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine',
                    'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN',
                    'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole',
                    'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'qed']

# Map descriptor name -> RDKit function, restricted to RDKIT_PROPERTIES.
# Iteration order follows `Descriptors.descList`, not RDKIT_PROPERTIES, and any
# listed name that the installed RDKit does not provide is silently left out.
descriptor_dict = {name:func for name,func in Descriptors.descList if name in RDKIT_PROPERTIES}

In [None]:
#|export
def get_rdkit2d_descriptors(mol:Chem.rdchem.Mol):

    """
    Compute the RDKit 2D descriptor vector of a `mol` object.

    Every function stored in the module-level `descriptor_dict` is applied to
    `mol`, in the dictionary's iteration order, and the results are packed
    into a single row.

    Arguments:
        mol : Chem.rdchem.Mol
            A RDKit Mol object.

    Returns:
        descs : numpy.array
            An array of shape (1, n_descriptors) with the calculated
            descriptors, where n_descriptors is `len(descriptor_dict)`.

    """
    values = []
    for compute in descriptor_dict.values():
        values.append(compute(mol))

    # Single-row 2-D layout: one molecule, one column per descriptor.
    return np.array(values).reshape(1, -1)

## MolFeaturizer

In [None]:
#|export
class MolFeaturizer:

    """Creates a fingerprinter to perform molecular featurization.

    Attributes:

        params : dict
            Keyword arguments passed to the RDKit generator factory. This is a
            private copy, so it is never shared with other instances or with
            the dictionary supplied by the caller.

        descriptor_type : str
            The selected descriptor; one of the keys of `DESCS`.

        DESCS : dict
            Maps every supported descriptor name ('morgan', 'atom_pairs',
            'rdkit', 'rdkit2d', 'torsion', 'maccs') to the callable behind it.

        generator :
            For 'morgan', 'atom_pairs', 'rdkit' and 'torsion': the generator
            object returned by the matching `rdFingerprintGenerator` factory.
            For 'rdkit2d' and 'maccs': a function that is called directly on a
            molecule.


    Arguments:

        descriptor_type : str
            A string naming one of the supported descriptors (see `DESCS`).

        params : dict, optional
            A dictionary of parameters for an RDKit generator. Ignored for
            'rdkit2d' and 'maccs', which take no parameters.

    Raises:

        KeyError
            If `descriptor_type` is not a supported descriptor.

    """

    def __init__(self, descriptor_type : str, params:dict=None):

        # Copy instead of storing the argument: a shared mutable default (or the
        # caller's own dict) would leak changes from one featurizer to another.
        self.params = dict(params) if params else {}
        self.descriptor_type = descriptor_type

        self.DESCS = {'morgan': rdFingerprintGenerator.GetMorganGenerator,
                      'atom_pairs':rdFingerprintGenerator.GetAtomPairGenerator,
                      'rdkit':rdFingerprintGenerator.GetRDKitFPGenerator,
                      'rdkit2d':get_rdkit2d_descriptors,
                      'torsion':rdFingerprintGenerator.GetTopologicalTorsionGenerator,
                      'maccs': MACCSkeys.GenMACCSKeys}

        if descriptor_type not in self.DESCS:
            # Still a KeyError (as a plain dict lookup would raise), but with
            # a message that tells the user what is accepted.
            raise KeyError(f'Unknown descriptor_type {descriptor_type!r}; '
                           f'expected one of {sorted(self.DESCS)}')

        if descriptor_type in ['morgan','atom_pairs','rdkit','torsion']:
            # These entries are factories: call them to obtain a generator.
            self.generator = self.set_params(self.DESCS[descriptor_type], self.params)
        else:
            # 'rdkit2d' and 'maccs' are plain functions applied to a molecule.
            self.generator = self.DESCS[descriptor_type]


    def set_params(self, generator, params:dict):

        """
        Build a generator by calling the factory `generator` with `params`.

        If the factory rejects `params`, a message is printed and the factory
        is called again without arguments.

        Arguments:

            generator : callable
                A generator factory, e.g. `rdFingerprintGenerator.GetMorganGenerator`.

            params : dict
                Keyword arguments for the factory.

        Returns:

            The object returned by the factory.

        """

        try:
            generator = generator(**params)

        except Exception:
            # Only ordinary errors trigger the fallback; KeyboardInterrupt and
            # SystemExit must propagate instead of being retried.
            print(f'The parameters {params} are not valid for generator {self.DESCS[self.descriptor_type].__name__}.\nSee RDKit: https://www.rdkit.org/docs/source/rdkit.Chem.rdFingerprintGenerator.html')
            print('Returning the generator with default parameters.')
            generator = generator()

        return generator


    def process_smiles(self, smi, use_counts:bool=False):

        """
        Generate features for one SMILES.

        Arguments:

            smi : str
                A SMILES representing a molecular structure

            use_counts : bool (default=False)
                Whether to consider feature's counts for fingerprint generation.
                Has no effect for the 'maccs' and 'rdkit2d' descriptors.

        Returns:

            A numpy array of shape (1, n_features), or None when
            `convert_smiles` does not return a usable molecule.

        """
        mol = convert_smiles(smi,sanitize=True)

        if not mol:
            return None

        if self.descriptor_type == 'maccs':
            # ConvertToNumpyArray fills the destination array in place.
            fps = np.array([])
            ConvertToNumpyArray(self.generator(mol), fps)
            return fps.reshape(1, -1)

        if self.descriptor_type == 'rdkit2d':
            return self.generator(mol)

        if use_counts:
            fps = self.generator.GetCountFingerprintAsNumPy(mol)
        else:
            fps = self.generator.GetFingerprintAsNumPy(mol)
        return fps.reshape(1, -1)

    def process_smiles_list(self, smiles_list : List[str], **kwargs):

        """
        Generate features for a list of SMILES.

        Arguments:

            smiles_list : List[str]
                A list (or any other iterable) of SMILES.

        Keyword arguments:

            use_counts : bool (default=False)
                Whether to consider feature's counts for fingerprint generation.

        Returns:

            A numpy array with one row per SMILES, in input order. An empty
            input yields an empty array of shape (0, 0). For a single SMILES
            the result of `process_smiles` is returned as is, which is None
            when that SMILES is invalid.

        Raises:

            ValueError
                If more than one SMILES is given and at least one of them
                cannot be featurized. Rows are never dropped silently, so the
                output always stays aligned with the input.

        """
        # Materialise once so generators work and the length is known.
        smiles = list(smiles_list)
        if not smiles:
            return np.empty((0, 0))

        func = partial(self.process_smiles, **kwargs)
        fps = list(progress_bar(map(func, smiles), total=len(smiles)))

        if len(fps) == 1:
            return fps[0]

        failed = [idx for idx, fp in enumerate(fps) if fp is None]
        if failed:
            # Without this check np.vstack fails with an unrelated shape error
            # (or returns an object array of None when every entry is invalid).
            raise ValueError(f'{len(failed)} SMILES could not be featurized '
                             f'(first positions: {failed[:10]}); '
                             'fix or remove them before calling process_smiles_list.')

        return np.vstack(fps)
            

In [None]:
show_doc(MolFeaturizer)

In [None]:
show_doc(MolFeaturizer.process_smiles)

In [None]:
show_doc(MolFeaturizer.process_smiles_list)

In [None]:
#|echo: false
from nbdev import nbdev_export
nbdev_export()