In [18]:
# Reload the autoreload extension and have edited modules re-imported
# automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Record provenance: author plus the Python and package versions used for this run.
%load_ext watermark
%watermark -a 'Marcos Santana' -d -p numpy,pandas,rdkit,descriptastorus -v

Author: Marcos Santana

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.22.0

numpy          : 1.20.1
pandas         : 1.2.4
rdkit          : 2021.03.5
descriptastorus: 2.3.0.2



In [19]:
#export
import pandas as pd
import numpy as np
from rdkit import Chem

from rdkit.Chem.AllChem import GetMACCSKeysFingerprint
from descriptastorus.descriptors import DescriptorGenerator, MorganCounts, Morgan, RDKit2D, RDKitFPBits, FeatureMorganCounts, FeatureMorgan, AtomPair, AtomPairCounts

# Load data

In [23]:
# Read the semicolon-separated processed dataset and preview its first rows.
data = pd.read_csv('../data/fxa_ic50_processed.csv',sep=';')
data.head()

Unnamed: 0,doc_id,standard_value,standard_type,standard_relation,pchembl,molregno,canonical_smiles,chembl_id,target_dictionary,target_chembl_id,l1,l2,l3,confidence_score,act,processed_smiles,is_valid
0,47181,1.5,IC50,=,8.82,459679,COc1ccc(NC(=O)c2ccc(C(=N)N(C)C)cc2)c(C(=O)Nc2c...,CHEMBL512351,194,CHEMBL244,Enzyme,Protease,Serine protease,8,Active,COc1ccc(NC(=O)c2ccc(C(=N)N(C)C)cc2)c(C(=O)Nc2c...,False
1,30088,29000.0,IC50,=,4.54,655811,Cc1ccc(Oc2nc(Oc3cccc(C(=N)N)c3)c(F)c(NC(C)CCc3...,CHEMBL193933,194,CHEMBL244,Enzyme,Protease,Serine protease,9,Inactive,Cc1ccc(Oc2nc(Oc3cccc(C(=N)N)c3)c(F)c(NC(C)CCc3...,False
2,47295,1520.0,IC50,=,5.82,668808,N=C(N)c1ccc(CNC(=O)CN2C(=O)[C@H](NS(=O)(=O)Cc3...,CHEMBL553408,194,CHEMBL244,Enzyme,Protease,Serine protease,9,Inactive,N=C(N)c1ccc(CNC(=O)CN2C(=O)[C@H](NS(=O)(=O)Cc3...,False
3,46966,0.23,IC50,=,9.64,522778,CC(C)CNC(=O)c1ccc(-c2ccc(-c3nccs3)cc2C(=O)Nc2c...,CHEMBL524548,194,CHEMBL244,Enzyme,Protease,Serine protease,8,Active,CC(C)CNC(=O)c1ccc(-c2ccc(-c3nccs3)cc2C(=O)Nc2c...,False
4,62387,23000.0,IC50,=,4.64,1347407,O=C(Nc1ccc(N2CCOCC2=O)cc1)[C@H]1CCCCN1C(=O)Cc1...,CHEMBL2040990,194,CHEMBL244,Enzyme,Protease,Serine protease,9,Inactive,O=C(Nc1ccc(N2CCOCC2=O)cc1)[C@H]1CCCCN1C(=O)Cc1...,False


In [24]:
# Plain Python list of the `processed_smiles` column; this is the input to Fingerprinter.
smiles = data.processed_smiles.tolist()

In [30]:
# Inspect the first 20 SMILES strings.
smiles[0:20]

['COc1ccc(NC(=O)c2ccc(C(=N)N(C)C)cc2)c(C(=O)Nc2ccc(Cl)cn2)c1',
 'Cc1ccc(Oc2nc(Oc3cccc(C(=N)N)c3)c(F)c(NC(C)CCc3ccccc3)c2F)c(C(=O)O)c1',
 'N=C(N)c1ccc(CNC(=O)CN2C(=O)[C@H](NS(=O)(=O)Cc3ccccc3)CSc3ccc(N)cc32)cc1',
 'CC(C)CNC(=O)c1ccc(-c2ccc(-c3nccs3)cc2C(=O)Nc2ccc(C(=N)N)cc2)c(C(=O)O)c1',
 'O=C(Nc1ccc(N2CCOCC2=O)cc1)[C@H]1CCCCN1C(=O)Cc1ccc(Cl)cc1',
 'Cn1cc(NS(=O)(=O)Cc2ccccc2)c(=O)n(CC(=O)N[C@H]2CCCN(C(=N)N)C2O)c1=O',
 'CN(Cc1ccc(C(=O)Nc2ccc(Cl)cc2C(=O)Nc2ccc(Cl)cn2)cc1)C1=NCCS1',
 'NC(N)=NCCC[C@@H](C=O)NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)c1ccccc1',
 'CC(C)(C)C[C@H](NC(=O)[C@@H](Cc1ccccc1)NS(=O)(=O)Cc1ccccc1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)c1nccs1',
 'CC(C)[C@H](NC(=O)[C@H](CC1CCCCC1)NC(=O)N[C@H](C)c1ccc(Br)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)c1nccs1',
 'CN(C)C(=N)c1ccc(C(=O)Nc2ccc(Cl)cc2C(=O)Nc2ccc(Cl)cn2)c(N2CCCC2)c1',
 'Cc1cc(C(=O)Nc2ccc(N3CCN(C)CC3)cc2)n(-c2ccc3cc(Cl)ccc3c2)n1',
 'N=C(N)C1CCC[C@H](NC(=O)CN2CCC[C@H](NS(=O)(=O)CCc3ccccc3)C2=O)C1O',
 'CCCC(CCC)C(=O)N[C@@H](CC(=O)OC)C(=O)N1CCC[C@@H

# Fingerprinter

In [25]:
#export
class MACCSGenerator(DescriptorGenerator):
    """Descriptor generator that emits the MACCS keys bit vector of a molecule.

    Registers 166 ``uint8`` columns named ``maccs-0`` ... ``maccs-165``.
    """
    NAME = "MACCS"

    def __init__(self):
        super().__init__()
        # Declare one (name, numpy dtype) pair per emitted bit.
        self.columns += [(f"maccs-{idx}", np.uint8) for idx in range(166)]

    def calculateMol(self, m, smiles, internalParsing=False):
        """Return the MACCS bits of molecule ``m`` as a list.

        ``smiles`` and ``internalParsing`` are accepted for interface
        compatibility with the base class and are not used here.
        """
        fingerprint = GetMACCSKeysFingerprint(m)
        # Position 0 of the fingerprint is discarded, so element i of the
        # returned list is fingerprint bit i + 1.
        return list(fingerprint)[1:]
    

class Fingerprinter():
    """Compute fingerprint feature matrices for a collection of SMILES strings.

    The collection is stored as given in ``self.smiles`` and is also exposed
    through ``len()`` and integer indexing.

    Arguments:
    smiles : sequence of str
        SMILES strings to featurize.
    """

    # Names accepted by `generate_fingerprint` and `_define_generators`.
    _GENERATORS = ('maccs', 'ecfp', 'fcfp', 'atom_pairs')

    def __init__(self, smiles):
        self.smiles = smiles

    def _define_generators(self, gen, **kwargs):
        '''Define feature generator.

        Arguments:
        gen : str
            Name of the generator to use. Options:

            maccs : MACCS keys (fixed length; ignores `nbits` and `radius`)
            ecfp : Morgan fingerprints (extended connectivity fingerprint)
            fcfp : Feature-based Morgan fingerprints (functional connectivity fingerprint)
            atom_pairs : Atom pair counts, as defined in https://pubs.acs.org/doi/10.1021/ci00046a002

        nbits : int
            Final number of bits in the fingerprint (default 2048)

        radius : int
            Radius around central atom to calculate Morgan fingerprints (default 3)

        Returns:
            A descriptastorus generator instance for `gen`.

        Raises:
            ValueError : if `gen` is not one of the supported names.
        '''
        radius = kwargs.get('radius', 3)
        nbits = kwargs.get('nbits', 2048)

        if gen == 'ecfp':
            return Morgan(radius=radius, nbits=nbits)
        if gen == 'atom_pairs':
            # Note: this is the *counts* variant, not the bit-vector AtomPair generator.
            return AtomPairCounts(nbits=nbits)
        if gen == 'fcfp':
            return FeatureMorgan(radius=radius, nbits=nbits)
        if gen == 'maccs':
            return MACCSGenerator()

        # Previously an unknown name fell through and crashed with an
        # UnboundLocalError on the return statement.
        raise ValueError("Please provide a valid generator. Viable options are: 'maccs','ecfp', 'fcfp', 'atom_pairs'")

    def generate_fingerprint(self, gen, **kwargs):
        '''Calculate features for a list of SMILES using a feature generator.

        Arguments:
        gen : str
            Name of the generator to use. Options:

            maccs : MACCS keys (fixed length; ignores `nbits` and `radius`)
            ecfp : Morgan fingerprints (extended connectivity fingerprint)
            fcfp : Feature-based Morgan fingerprints (functional connectivity fingerprint)
            atom_pairs : Atom pair counts, as defined in https://pubs.acs.org/doi/10.1021/ci00046a002

        nbits : int
            Final number of bits in the fingerprint (default 2048)

        radius : int
            Radius around central atom to calculate Morgan fingerprints (default 3)

        Returns:
            numpy array with one row per SMILES. The first column of every
            row returned by the generator is dropped before returning.

        Raises:
            TypeError : if `gen` is not a string.
            ValueError : if `gen` is not a supported generator name, or if the
                generator returned no result for one or more SMILES.
        '''
        if not isinstance(gen, str):
            raise TypeError('Please provide a string that represents a valid generator name.')

        if gen not in self._GENERATORS:
            raise ValueError("Please provide a valid generator. Viable options are: 'maccs','ecfp', 'fcfp', 'atom_pairs'")

        feature_generator = self._define_generators(gen=gen, **kwargs)
        rows = feature_generator.processSmiles(self.smiles)[1]

        # NOTE(review): descriptastorus is understood to put None in place of
        # the result for SMILES that RDKit cannot parse - confirm against the
        # installed version. Such rows would otherwise break the 2-D slice
        # below with an opaque indexing error, so fail with a clear message.
        failed = [position for position, row in enumerate(rows) if row is None]
        if failed:
            raise ValueError(
                'Could not compute fingerprints for the SMILES at positions: {}'.format(failed))

        # Column 0 is presumably the per-molecule success flag that
        # descriptastorus prepends to each row (TODO confirm); only the
        # feature columns are returned.
        return np.array(rows)[:, 1:]

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, i):
        return self.smiles[i]

    def __str__(self):
        # Guard the empty case: indexing [0] on an empty collection raised IndexError.
        first = self.smiles[0] if len(self.smiles) > 0 else None
        return 'Size of SMILES dataset: {}\nFirst SMILES: {}'.format(len(self.smiles), first)

In [26]:
# Wrap the SMILES list; nothing is computed until generate_fingerprint is called.
fp_class = Fingerprinter(smiles)

In [27]:
# Uses Fingerprinter.__str__: reports the dataset size and the first SMILES.
print(fp_class)

Size of SMILES dataset: 2129
First SMILES: COc1ccc(NC(=O)c2ccc(C(=N)N(C)C)cc2)c(C(=O)Nc2ccc(Cl)cn2)c1


In [28]:
# Atom-pair features with the default nbits=2048 (no keyword overrides passed).
X = fp_class.generate_fingerprint(gen='atom_pairs')

In [29]:
# Sanity check: expect one row per SMILES and one column per fingerprint position.
X.shape

(2129, 2048)

In [31]:
# Peek at the feature rows of the first five molecules.
X[0:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])