In [None]:
# default_exp fingerprinter

# Fingerprinter
> Compute molecular fingerprints (MACCS keys, ECFP, FCFP, atom pairs) for a list of SMILES strings.

In [1]:
#hide
%reload_ext autoreload
%autoreload 2
%load_ext watermark
%watermark -a 'Marcos Santana' -d -p numpy,pandas,rdkit,descriptastorus -v



Author: Marcos Santana

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.22.0

numpy          : 1.20.1
pandas         : 1.2.4
rdkit          : 2021.03.5
descriptastorus: 2.3.0.2



In [19]:
#export
import pandas as pd
import numpy as np
from typing import List
from rdkit import Chem

from rdkit.Chem.AllChem import GetMACCSKeysFingerprint
from descriptastorus.descriptors import DescriptorGenerator, MorganCounts, Morgan, RDKit2D, RDKitFPBits, FeatureMorganCounts, FeatureMorgan, AtomPair, AtomPairCounts

# Fingerprinter

In [13]:

    print('yes')

yes


In [34]:
#export
class MACCSGenerator(DescriptorGenerator):
    """Descriptor generator that emits the MACCS key bit vector of a molecule."""
    NAME = "MACCS"

    def __init__(self):
        super().__init__()
        # Declare one uint8 column per emitted bit, named "maccs-0" .. "maccs-165".
        self.columns += [("maccs-%d" % idx, np.uint8) for idx in range(166)]

    def calculateMol(self, m, smiles, internalParsing=False):
        """Return the MACCS fingerprint of `m` as a list, without its first entry.

        `smiles` and `internalParsing` are accepted for interface compatibility
        with the base class but are not used here.
        """
        fingerprint_bits = list(GetMACCSKeysFingerprint(m))
        # Skip index 0 so the remaining values match the 166 declared columns.
        return fingerprint_bits[1:]
    

class Fingerprinter():
    '''Calculate fingerprints for a collection of molecules.

    Attributes:

    smiles : list
        The SMILES strings to featurize, in input order.

    '''

    # Generator names understood by `_define_generators` / `generate_fingerprint`.
    _GENERATOR_NAMES = ('maccs', 'ecfp', 'fcfp', 'atom_pairs')

    def __init__(self, smiles : List):
        '''Store the SMILES collection.

        Arguments:
        smiles : list or tuple
            SMILES strings. A tuple is converted to a list; a list is stored as-is.

        Raises:
        TypeError
            If `smiles` is neither a list nor a tuple (a bare string is rejected).
        '''
        # The error message has always advertised tuples, so accept them too.
        if isinstance(smiles, tuple):
            smiles = list(smiles)

        if not isinstance(smiles, list):
            raise TypeError('Your SMILES is not a list. Try passing a list or tuple')

        self.smiles = smiles


    def _define_generators(self, gen, **kwargs):
        '''Define feature generator.

        Arguments:
        gen : str
            Name of the generator to use. Options:

            maccs : MACCS keys (built with `MACCSGenerator`; ignores `nbits` and `radius`)
            ecfp : Morgan fingerprints (extended connectivity fingerprint)
            fcfp : Feature-based Morgan fingerprints (functional connectivity fingerprint)
            atom_pairs : Atom pairs as defined in https://pubs.acs.org/doi/10.1021/ci00046a002
                (built with `AtomPairCounts`; ignores `radius`)

        nbits : int
            Final number of bits in the fingerprint (default 2048)

        radius : int
            Radius around central atom to calculate Morgan fingerprints (default 3)

        Returns:
            The instantiated feature generator.

        Raises:
        ValueError
            If `gen` is not one of the supported generator names.
        '''

        radius = kwargs.get('radius', 3)
        nbits = kwargs.get('nbits', 2048)

        if gen == 'ecfp':
            return Morgan(radius=radius, nbits=nbits)

        if gen == 'atom_pairs':
            return AtomPairCounts(nbits=nbits)

        if gen == 'fcfp':
            return FeatureMorgan(radius=radius, nbits=nbits)

        if gen == 'maccs':
            return MACCSGenerator()

        # Fail loudly instead of hitting an unbound local variable.
        raise ValueError("Please provide a valid generator. Viable options are: 'maccs','ecfp', 'fcfp', 'atom_pairs'")


    def generate_fingerprint(self, gen, **kwargs):
        '''Calculate features for a list of SMILES using a feature generator.

        Arguments:
        gen : str
            Name of the generator to use. Options:

            maccs : MACCS keys (ignores `nbits` and `radius`)
            ecfp : Morgan fingerprints (extended connectivity fingerprint)
            fcfp : Feature-based Morgan fingerprints (functional connectivity fingerprint)
            atom_pairs : Atom pairs as defined in https://pubs.acs.org/doi/10.1021/ci00046a002

        nbits : int
            Final number of bits in the fingerprint

        radius : int
            Radius around central atom to calculate Morgan fingerprints

        Returns:
            A numpy array with one row per SMILES. The first column of every
            row returned by the generator is dropped (presumably the
            "calculated" flag descriptastorus prepends -- confirm against the
            installed version).

        Raises:
        TypeError
            If `gen` is not a string.
        ValueError
            If `gen` is not a supported generator name, or if the generator
            returned no result for one or more SMILES.
        '''

        if not isinstance(gen, str):
            raise TypeError("Please provide a string that represents a valid generator name\nViable options are: 'maccs','ecfp', 'fcfp', 'atom_pairs'.")

        if gen not in self._GENERATOR_NAMES:
            raise ValueError("Please provide a valid generator. Viable options are: 'maccs','ecfp', 'fcfp', 'atom_pairs'")

        feature_generator = self._define_generators(gen=gen, **kwargs)
        rows = feature_generator.processSmiles(self.smiles)[1]

        # NOTE(review): descriptastorus is expected to report SMILES it could
        # not process as None rows. Such rows would make the array ragged and
        # the column slice below fail with an opaque IndexError, so surface
        # the offending inputs explicitly instead.
        failed = [smi for smi, row in zip(self.smiles, rows) if row is None]
        if failed:
            raise ValueError('Could not compute fingerprints for the following SMILES: {}'.format(failed))

        return np.array(rows)[:, 1:]


    def __len__(self):
        '''Number of SMILES held by this instance.'''
        return len(self.smiles)

    def __getitem__(self, i):
        '''Return the SMILES at index (or slice) `i`.'''
        return self.smiles[i]

    def __str__(self):
        '''Short human-readable summary; safe to call on an empty collection.'''
        first_smiles = self.smiles[0] if self.smiles else None
        return 'Size of SMILES dataset: {}\nFirst SMILES: {}'.format(len(self.smiles), first_smiles)

In [35]:
smiles = ['COc1ccc(NC(=O)c2ccc(C(=N)N(C)C)cc2)c(C(=O)Nc2ccc(Cl)cn2)c1',
 'Cc1ccc(Oc2nc(Oc3cccc(C(=N)N)c3)c(F)c(NC(C)CCc3ccccc3)c2F)c(C(=O)O)c1',
 'N=C(N)c1ccc(CNC(=O)CN2C(=O)[C@H](NS(=O)(=O)Cc3ccccc3)CSc3ccc(N)cc32)cc1',
 'CC(C)CNC(=O)c1ccc(-c2ccc(-c3nccs3)cc2C(=O)Nc2ccc(C(=N)N)cc2)c(C(=O)O)c1',
 'O=C(Nc1ccc(N2CCOCC2=O)cc1)[C@H]1CCCCN1C(=O)Cc1ccc(Cl)cc1',
 'Cn1cc(NS(=O)(=O)Cc2ccccc2)c(=O)n(CC(=O)N[C@H]2CCCN(C(=N)N)C2O)c1=O',
 'CN(Cc1ccc(C(=O)Nc2ccc(Cl)cc2C(=O)Nc2ccc(Cl)cn2)cc1)C1=NCCS1',
 'NC(N)=NCCC[C@@H](C=O)NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)c1ccccc1',
 'CC(C)(C)C[C@H](NC(=O)[C@@H](Cc1ccccc1)NS(=O)(=O)Cc1ccccc1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)c1nccs1',
 'CC(C)[C@H](NC(=O)[C@H](CC1CCCCC1)NC(=O)N[C@H](C)c1ccc(Br)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)c1nccs1',
 'CN(C)C(=N)c1ccc(C(=O)Nc2ccc(Cl)cc2C(=O)Nc2ccc(Cl)cn2)c(N2CCCC2)c1',
 'Cc1cc(C(=O)Nc2ccc(N3CCN(C)CC3)cc2)n(-c2ccc3cc(Cl)ccc3c2)n1',
 'N=C(N)C1CCC[C@H](NC(=O)CN2CCC[C@H](NS(=O)(=O)CCc3ccccc3)C2=O)C1O',
 'CCCC(CCC)C(=O)N[C@@H](CC(=O)OC)C(=O)N1CCC[C@@H]1C(=O)N[C@H]1CCCC(C(=N)N)C1O',
 'O=C(CN1CCC[C@H](NS(=O)(=O)c2cc3cc(Cl)ccc3s2)C1=O)N1CCCC1',
 'CN1CCc2nc(C(=O)NC3CCN(S(C)(=O)=O)CC3NC(=O)c3cc4cc(Cl)ccc4[nH]3)sc2C1',
 'N=C(c1ccc(CN2CCN(S(=O)(=O)c3cc4ccc(Cl)cc4s3)CC2=O)cc1)N1CCCCC1',
 'CN(C(=O)[C@@H]1Cc2ccccc2CN1C(=O)Cc1ccc(Cl)cc1Cl)c1ccc(N2CCCCC2=O)cc1',
 'CN1CCc2nc(C(=O)N[C@@H]3C[C@@H](C(=O)N(C)C)CC[C@@H]3NC(=O)c3cc4ccc(Cl)cc4[nH]c3=O)sc2C1',
 'O=C(NC[C@@H]1OC(=O)N2c3ccc(N4CCOCC4=O)cc3OC[C@@H]12)c1cc(Cl)c(Cl)s1']

In [29]:
gen_fp = Fingerprinter([smiles[0]])

In [30]:
print(gen_fp)

Size of SMILES dataset: 1
First SMILES: COc1ccc(NC(=O)c2ccc(C(=N)N(C)C)cc2)c(C(=O)Nc2ccc(Cl)cn2)c1


In [33]:
X = gen_fp.generate_fingerprint(gen='maccs')

In [26]:
X.shape

(1, 166)