In [1]:
#rdkit imports
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem import Descriptors
from rdkit.Chem import rdFMCS
from rdkit.Chem.rdmolops import RDKFingerprint
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
from rdkit.Avalon.pyAvalonTools import GetAvalonFP

#housekeeping imports
import pandas as pd
import matplotlib
import numpy as np
import scipy as sp

Maximum common substructure (not currently in use)

In [2]:
# finds maximum common substructure among list of molecule objects
def get_mcs(input_df):      # gets a df containing SMILES strings of products
    
    mol_list = list(input_df['SMILES']) # converts product column to list

    #for i in range(len(mol_list)): # converts SMILES strings to molecule objects
        #if mol_list[i] = 'none'
        #mol_list[i] = Chem.MolFromSmiles(mol_list[i])

    #maximum_common_substructure = rdFMCS.FindMCS(mol_list)
    
    return mol_list #maximum_common_substructure

#Draw.MolToImage(maximum_common_substructure, size=(300, 300), kekulize=False, wedgeBonds=True)

Data cleaning

input_df = pd.read_csv('~/OneDrive/Documents/Python/CLASSES/DIRECT/metamoles/datasets/playground_df_cleaned_kegg_with_smiles.csv')

In [3]:
loaded = pd.read_csv('../../../big-datasets/master_df_featurized.csv')

In [4]:
loaded.tail()

Unnamed: 0,enzyme,product,reacts,PubChemID,SMILES,n_C,n_H,n_O,n_N,n_P,n_S,n_X,n_DoU,MW
17006,EC-2.7.7.5,CPD-9650,0.0,25246060,CC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=...,60,97,7,0,2,0,0,13,992.377
17007,EC-1.1.1.286,PENTACHLOROPHENOL,0.0,4269236,C1(=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)[O-],6,0,1,0,0,0,5,4,265.33
17008,EC-1.14.11.9,CPD-288,0.0,176951,C1=CC(C(C=C1)O)O,6,8,2,0,0,0,0,3,112.128
17009,EC-1.9.3.1,2-HEXAPRENYL-3-METHYL-6-METHOXY-14-BENZ,0.0,50986193,CC1=C(C(=C(C=C1O)OC)O)CC=C(C)CCC=C(C)CCC=C(C)C...,53,82,3,0,0,0,0,13,767.236
17010,EC-2.7.1.71,CHORISMATE,0.0,5460312,C=C(C(=O)[O-])OC1C=C(C=CC1O)C(=O)[O-],10,8,6,0,0,0,0,7,224.168


In [5]:
pos = loaded[loaded['reacts'] == 1.0]

In [6]:
neg = loaded[loaded['reacts'] == 0.0]

In [7]:
def input_data(input_df): #cleans input df and returns neccessary elements
    '''DocString'''
    
    for index, row in input_df.iterrows():
        
        smiles = row['SMILES'] 
        if len(smiles) <= 2:
            
            input_df.drop(index, inplace=True)            
        
    return input_df

In [8]:
#input_df = input_data(input_df)

In [9]:
def fingerprint_products(input_df): #fingerprints all products in a given df
    '''DocString'''
    
    mol_list = []
    fp_list = []
    
    for index, row in input_df.iterrows():
        mol_list.append(Chem.rdmolfiles.MolFromSmiles(row['SMILES'])) #get mols from SMILES and add mols to list
        fp_list.append(FingerprintMols.FingerprintMol(Chem.rdmolfiles.MolFromSmiles(row['SMILES']))) #get fingerprints from mols and and fingerprints to list
        
    input_df.insert(6, column='Mol', value=mol_list)
    input_df.insert(7, column='Fingerprint', value= fp_list)
            
    return input_df

In [10]:
#input_df2 = fingerprint_products(input_data(input_df)) #build df containing new Mol and Fingerprint columns

split by enzyme

In [11]:
pos = pos.drop(columns=['Mol', 'Fingerprint'])

KeyError: "['Mol' 'Fingerprint'] not found in axis"

In [12]:
def split_by_enzyme(input_df):
    '''DocString'''
    
    unique_enzymes = set(input_df['enzyme'].unique())
    
    enzyme_df_list = []
    
    for entry in unique_enzymes: #for each unique enzyme in the input dataframe...
        
        enzyme_df = pd.DataFrame(columns=input_df.columns) #...initialize a new dataframe with the same columns as the input dataframe...
        
        for index, row in input_df.iterrows(): #...iterate through the input dataframe...
            
            if row['enzyme'] == entry: #... and add product rows that correspond to the unique enzyme entry...
                enzyme_df.loc[index] = row
                
        enzyme_df_list.append(enzyme_df) #...then add the completed dataframe of unique enzyme products to a list
           
    return enzyme_df_list #return list of dataframes

In [13]:
def main(input_df):
    '''DocString'''
        
    input_df = fingerprint_products(input_df)    #expand input df: generate mols from SMILES then generate fingerprints from mols, adding columns for each
    
    #input_df.drop(columns=['Mol'])
    
    enzyme_df_list = split_by_enzyme(input_df)    #split expanded df by rows, grouped by enzyme entry (1.1.1.110 etc), into a list of dataframes
    
    #outer = []
    
    for enzyme_df in enzyme_df_list:    #loop through list of enzyme dataframes
        
        enzyme_df['Dist'] = '' #initialize distance column
        
        metric = sim_metric(enzyme_df) #get similarity matrix dataframe
        
        vals = metric.values #use np array of similarity matrix
        
        start_at = 1 #skip autocorrelation
        
        dist_list =[] #initialize list
        
        if len(vals) == 1:
                
            dist_list.append(vals) #add distance value to list
                    
        elif len(vals) > 1:
            for i in range(len(vals)-1): #row of matrix except for last row
            
                for j in range(start_at, len(vals)): #col of matrix skipping first column
                
                    dist_list.append(vals[i][j]) #add distance value to list
            
                start_at += 1 #start at higher index to skip redundancy
        
        #outer.append(dist_list)
        avg_dist = sum(dist_list)/len(dist_list) #compute average distance
        #outer.append(avg_dist)
        for index, row in enzyme_df.iterrows():    #loop through enzyme dataframe 
            enzyme_df['Dist'].loc[index] = avg_dist #add averaged distance to each product row of enzyme dataframe
    
    master_df = pd.concat(enzyme_df_list) #concatenate enzyme dataframes into master_df
    
    return master_df

In [None]:
pos.head()

In [22]:
sm = pos.iloc[:20,:].copy()
sma = loaded.iloc[:100,:].copy()

In [29]:
negdist = main(neg)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [32]:
negdist.to_csv('../../../big-datasets/neg-with-dist.csv', index=None)
negdist.tail()

Unnamed: 0,enzyme,product,reacts,PubChemID,SMILES,n_C,n_H,Fingerprint,n_O,n_N,n_P,n_S,n_X,n_DoU,MW,Dist
11817,EC-2.4.1.325,TAUROLITHOCHOLATE,0.0,9548567,CC(CCC(=O)NCCS(=O)(=O)[O-])C1CCC2C1(CCC3C2CCC4...,26,44,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",5,1,0,1,0,5,482.707,0.422909
8338,EC-1.1.1.44,TEREPHTHALATE,0.0,154269,C1=CC(=CC=C1C(=O)[O-])C(=O)[O-],8,4,"[0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, ...",4,0,0,0,0,7,164.116,0.306122
9936,EC-1.1.1.44,CPD-4261,0.0,2758,CC1(C2CCC(O1)(CC2)C)C,10,18,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, ...",1,0,0,0,0,2,154.253,0.306122
13689,EC-1.1.1.79,CPD-14926,0.0,9900764,CC(C)CCCC(C)CCCC(C)CCCC(=CC=O)C,20,38,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...",1,0,0,0,0,2,294.523,1.0
14328,EC-5.1.3.3,CPD-18475,0.0,102515041,CC(C(=O)[O-])C1(C=C(C(=O)O1)Cl)Cl,7,5,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, ...",4,0,0,0,2,4,224.019,1.0


In [34]:
multiple = negdist[negdist['Dist']< 1]

In [35]:
print(multiple.shape,negdist.shape)

(7690, 16) (9616, 16)


In [47]:
multiple['enzyme'].nunique()

2700

In [60]:
multiple['Dist'].mean()

0.3704669167422681

In [64]:
multiple_pos = dist[dist['Dist'] <1]

In [65]:
multiple_pos['Dist'].mean()

0.5223235849743525

In [66]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
multiple_pos['Dist'].hist()

In [33]:
# this is the positive data
dist.to_csv('../../../big-datasets/positive-with-dist.csv', index=None)
dist.tail()

Unnamed: 0,enzyme,product,reacts,PubChemID,SMILES,n_C,n_H,Fingerprint,n_O,n_N,n_P,n_S,n_X,n_DoU,MW,Dist
3588,EC-2.5.1.134,CPD-18788,1.0,118796934,C1C(C(C(C(C1[NH3+])O)O)O)COP(=O)([O-])[O-],7,15,"[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, ...",7,1,1,0,0,1,256.171,0.359658
3589,EC-2.5.1.134,CPD-18787,1.0,118796917,C1=C(C(C(C(C1OP(=O)([O-])OP(=O)([O-])OCC2C(C(C...,17,23,"[1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",15,5,2,0,0,10,599.339,0.359658
1301,EC-1.14.14.71,CPD-12875,1.0,11241545,CC(=CCCC(=CCCC(C)(C=C)O)C)C,15,26,"[0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, ...",1,0,0,0,0,3,222.372,1.0
1591,EC-1.14.17.3,ASCORBATE,1.0,54679076,C(C(C1C(=C(C(=O)O1)[O-])O)O)O,6,7,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, ...",6,0,0,0,0,3,175.116,0.324675
1592,EC-1.14.17.3,CPD-68,1.0,6971063,C1CC1(C(=O)[O-])[NH3+],4,7,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",2,1,0,0,0,2,101.105,0.324675


In [37]:
### about to make a huge assumption here: if dist = 1 it is because the enzyme is not promiscuous

def add_prom_tag(df, cut_off):
    """Cut-off is inclusive"""
    tag = []
    for index, row in df.iterrows():
        if row['Dist'] >= cut_off:
            tag.append(0.0)
        elif row['Dist'] < cut_off:
            tag.append(1.0)
            
    df['Promiscuous'] = tag
    
    return df

In [42]:
add_prom_tag(dist, 1)

Unnamed: 0,enzyme,product,reacts,PubChemID,SMILES,n_C,n_H,Fingerprint,n_O,n_N,n_P,n_S,n_X,n_DoU,MW,Dist,Promiscuous
1697,EC-1.14.14.77,2-METHYL-3-PHYTYL-14-NAPHTHOQUINONE,1.0,5280483,CC1=C(C(=O)C2=CC=CC=C2C1=O)CC=C(C)CCCC(C)CCCC(...,31,46,"[1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...",2,0,0,0,0,9,450.707,1,0.0
1950,EC-1.14.14.80,CPD-10515,1.0,25201835,CCCCCCCCC(C(CCCCCCCC(=O)[O-])O)O,18,35,"[1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...",4,0,0,0,0,1,315.474,0.714718,1.0
1951,EC-1.14.14.80,PALMITATE,1.0,504166,CCCCCCCCCCCCCCCC(=O)[O-],16,31,"[0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",2,0,0,0,0,1,255.422,0.714718,1.0
1952,EC-1.14.14.80,OLEATE-CPD,1.0,5460221,CCCCCCCCC=CCCCCCCCC(=O)[O-],18,33,"[0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",2,0,0,0,0,2,281.460,0.714718,1.0
1953,EC-1.14.14.80,STEARIC_ACID,1.0,3033836,CCCCCCCCCCCCCCCCCC(=O)[O-],18,35,"[0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",2,0,0,0,0,1,283.476,0.714718,1.0
1954,EC-1.14.14.80,CPD-10514,1.0,19746553,CCCCCCCCC1C(O1)CCCCCCCC(=O)[O-],18,33,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",3,0,0,0,0,2,297.459,0.714718,1.0
4705,EC-2.4.1.241,CPD-14553,1.0,9549262,C1=CN(C(=O)NC1=O)C2C(C(C(O2)COP(=O)([O-])OP(=O...,15,22,"[1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, ...",17,2,2,0,0,7,564.286,1,0.0
695,EC-1.8.1.20,HYPOTAURINE,1.0,25244088,C(CS(=O)[O-])[NH3+],2,7,"[0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, ...",2,1,0,1,0,0,109.150,1,0.0
6425,EC-4.3.2.10,P-RIBOSYL-4-SUCCCARB-AMINOIMIDAZOLE,1.0,45266647,C1=NC(=C(N1C2C(C(C(O2)COP(=O)([O-])[O-])O)O)N)...,13,15,"[1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, ...",12,4,1,0,0,9,450.253,1,0.0
360,EC-1.14.15.M57,CPD-15505,1.0,23724770,CC(CCCC(C)(C)O)C1CCC2(C1(CCC3C2=CC(=O)C4C3(CCC...,27,42,"[1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, ...",4,0,0,0,0,7,430.629,1,0.0


In [43]:
dist.to_csv('../../../big-datasets/pos-with-dist-promlabel.csv')

similarity functions

In [15]:
def sim_i_j(row_i, row_j):
    """DocString"""
    return DataStructs.FingerprintSimilarity(row_i['Fingerprint'], row_j['Fingerprint'], metric=DataStructs.TanimotoSimilarity)

In [16]:
def sim_i_all(input_df, index_i, row_i, metric):
    """DocString"""
    for index_j, row_j in input_df.iterrows():
        if index_j < index_i: #skip redundant rows
            continue
        elif index_i == index_j: #autocorrelate rows
            metric.loc[index_i, index_j] = 1
        else:
            metric.loc[index_i, index_j] = sim_i_j(row_i, row_j) #fill matrix with calculated similarity at two positions at once 
            metric.loc[index_j, index_i] = metric.loc[index_i, index_j]
    return 

In [17]:
def sim_metric(input_df):
    """DocString"""
    metric = pd.DataFrame()
    for index_i, row_i in input_df.iterrows():
        sim_i_all(input_df, index_i, row_i, metric)
    return metric

In [None]:
sim_metric(test)

In [None]:
def main(input_df):
    '''DocString'''
        
    input_df = fingerprint_products(input_data(input_df))    #expand input df: generate mols from SMILES then generate fingerprints from mols, adding columns for each
    
    enzyme_df_list = split_by_enzyme(input_df)    #split expanded df by rows, grouped by enzyme entry (1.1.1.110 etc), into a list of dataframes
    
    for enzyme_df in enzyme_df_list:    #loop through list of enzyme dataframes
        
        enzyme_df['Dist'] = '' #initialize distance column
        
        metric = sim_metric(enzyme_df) #get similarity matrix dataframe
        
        vals = metric.values #use np array of similarity matrix
        
        start_at = 1 #skip autocorrelation
        
        dist_list =[] #initialize list
        
        for i in range(len(vals)-1): #row of matrix except for last row
            
            for j in range(start_at, len(vals)): #col of matrix skipping first column
                
                dist_list.append(vals[i][j]) #add distance value to list
            
            start_at += 1 #start at higher index to skip redundancy
        
        avg_dist = sum(dist_list)/len(dist_list) #compute average distance
        
        for index, row in enzyme_df.iterrows():    #loop through enzyme dataframe 
            enzyme_df['Dist'].loc[index] = avg_dist #add averaged distance to each product row of enzyme dataframe
    
    master_df = pd.concat(enzyme_df_list) #concatenate enzyme dataframes into master_df
    
    return master_df