In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm

# Fingerprint molecules
The whole set of fingerprints won't fit in memory (even sparse) so we have to save them as chunks. This iterates over the SMILES codes, generating fingerprint_matrices and score arrays, saving them as chunks of 10,000,000

In [2]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy import sparse

def makeChunk(fileobj, chunkSize, outFileName, fingerprint_function, fpSize=8192):    
    #store bit indices in these:
    row_idx = list()
    col_idx = list()
    count=0
    
    scores = list()
    
    #iterate through file, 
    for line in tqdm.tqdm_notebook(fileobj, total=chunkSize-1, smoothing=0):
        if line=='': #end of file
            break
        
        words = line[17:-1].split(',') #removes the zinc ID and trailing newline   
        if len(words[1])<1:
            continue
        if words[1]=='no_score':
            break
        else:
            try:
                mol = Chem.MolFromSmiles(words[0])
                fp = fingerprint_function(mol, **pars)
                onbits = list(fp.GetOnBits())
                #these bits all have the same row:
                row_idx += [count]*len(onbits)
                count+=1
                #and the column indices of those bits:
                col_idx+=onbits
            
                scores.append(float(words[1]))

            except KeyboardInterrupt:
                raise
            except:
                print('failed molecule')
                
        if count>chunkSize:
            break
                
    #if we reach the chunkSize:
    #generate a sparse matrix out of the row,col indices:
    unfolded_size = 8192
    fingerprint_matrix = sparse.coo_matrix((np.ones(len(row_idx)).astype(bool), (row_idx, col_idx)), 
                          shape=(max(row_idx)+1, unfolded_size))
    #convert to csr matrix, it is better:
    fingerprint_matrix =  sparse.csr_matrix(fingerprint_matrix)
    
    sparse.save_npz(outFileName+'.npz', fingerprint_matrix)
    np.save(outFileName+'.npy', np.array(scores) )
    
    
    

# Count number of valid molecules:

In [3]:
fname = '../data/AmpC_screen_table.csv'
fileobj = open(fname)
fileobj.readline()


count = 0
for line in fileobj:
    words = line[:-1].split(',')
    if len(words[2])<1:
        continue
    if words[2]=='no_score':
        break
    count+=1
fileobj.close()

In [4]:
count

96214206

In [5]:
chunksize = 10_000_000
fingerprint_function = rdMolDescriptors.GetMorganFingerprintAsBitVect
pars = { "radius": 2,
                     "nBits": 8192,
                     "invariants": [],
                     "fromAtoms": [],
                     "useChirality": False,
                     "useBondTypes": True,
                     "useFeatures": True,
            }


fname = '../data/AmpC_screen_table.csv'
fileobj = open(fname)
fileobj.readline()


for i in range( np.ceil(count / chunksize).astype(int) ):
    makeChunk(fileobj, chunksize, '../processed_data/AmpC_all'+str(i), fingerprint_function)




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm.tqdm_notebook(fileobj, total=chunkSize-1, smoothing=0):


HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))

RDKit ERROR: [16:58:42] SMILES Parse Error: extra close parentheses while parsing: CNc1cc(-c2ccccc2)nc2ccnn21)c1ccncc1
RDKit ERROR: [16:58:42] SMILES Parse Error: Failed parsing SMILES 'CNc1cc(-c2ccccc2)nc2ccnn21)c1ccncc1' for input: 'CNc1cc(-c2ccccc2)nc2ccnn21)c1ccncc1'


failed molecule



HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9999999.0), HTML(value='')))


