In [2]:
import numpy as np
import pandas as pd
import pubchempy as pc

#rdkit imports
import rdkit
from rdkit import Chem
from rdkit.Chem.Fingerprints import FingerprintMols

## What do I want this model function to do?

I want it to...

1) check to see if the compound is a known enzymatic product
    * if so: return the enzyme/reaction/compound/starting compound
2) if not: pair the input or found compound with all the promiscuous products

3) calculate the distance between the query compound and each product of each promiscuous enzyme (NOT the average distance, just the distance)

4) return the n most similar reactions/substrates/whatever (number n is selected by user, default is hard-coded)

This code needs to do steps 2-4. Step 1 is handled by external code, but that code needs to be able to talk to this code.

inputs: master_df, enzyme_col, pubchem_col, smiles_col, pubchem_cid
need to drop the fingerprint and mol columns if they are present
need to make sure this is the promiscuous master_df (positive only)
these are almost exactly the same inputs as pair_query_compound

____

preparing development dataframe

In [3]:
# load all of the known enzymatic reactions in metacyc
prom_pos = pd.read_csv(
    '../../../big-datasets/positive-with-dist.csv'
)

FileNotFoundError: [Errno 2] File b'../../../big-datasets/positive-with-dist.csv' does not exist: b'../../../big-datasets/positive-with-dist.csv'

In [3]:
# Drop the stored 'Fingerprint' column if it is present. Rebinding instead of
# dropping in place (and ignoring a missing column) keeps this cell safe to
# re-run: the in-place version raises KeyError on the second execution.
prom_pos = prom_pos.drop(columns=['Fingerprint'], errors='ignore')
prom_pos.head()

Unnamed: 0,enzyme,product,reacts,PubChemID,SMILES,n_C,n_H,n_O,n_N,n_P,n_S,n_X,n_DoU,MW,Dist
0,EC-1.14.14.77,2-METHYL-3-PHYTYL-14-NAPHTHOQUINONE,1.0,5280483,CC1=C(C(=O)C2=CC=CC=C2C1=O)CC=C(C)CCCC(C)CCCC(...,31,46,2,0,0,0,0,9,450.707,1.0
1,EC-1.14.14.80,CPD-10515,1.0,25201835,CCCCCCCCC(C(CCCCCCCC(=O)[O-])O)O,18,35,4,0,0,0,0,1,315.474,0.714718
2,EC-1.14.14.80,PALMITATE,1.0,504166,CCCCCCCCCCCCCCCC(=O)[O-],16,31,2,0,0,0,0,1,255.422,0.714718
3,EC-1.14.14.80,OLEATE-CPD,1.0,5460221,CCCCCCCCC=CCCCCCCCC(=O)[O-],18,33,2,0,0,0,0,2,281.46,0.714718
4,EC-1.14.14.80,STEARIC_ACID,1.0,3033836,CCCCCCCCCCCCCCCCCC(=O)[O-],18,35,2,0,0,0,0,1,283.476,0.714718


In [6]:
# sanity check: (number of rows, number of columns) of prom_pos
prom_pos.shape

(7395, 15)

In [4]:
def fingerprint_products(input_df):
    """
    Add RDKit molecule and fingerprint columns to a dataframe of products.

    Args:
        input_df : pandas dataframe with smiles in column named 'SMILES'

    Returns:
        input_df : the same dataframe object (modified in place) with a 'Mol'
                    column inserted at position 1 and a 'Fingerprint' column
                    inserted at position 2
    """
    # DataFrame.insert raises if the column already exists, so clear columns
    # left over from a previous run to keep this function re-runnable
    stale_cols = [col for col in ('Mol', 'Fingerprint')
                  if col in input_df.columns]
    if stale_cols:
        input_df.drop(columns=stale_cols, inplace=True)

    # parse every SMILES string once and reuse the molecule for its fingerprint
    mol_list = [Chem.rdmolfiles.MolFromSmiles(smiles)
                for smiles in input_df['SMILES']]
    fp_list = [FingerprintMols.FingerprintMol(mol) for mol in mol_list]

    input_df.insert(1, column='Mol', value=mol_list)
    input_df.insert(2, column='Fingerprint', value=fp_list)

    return input_df

In [5]:
# independent copy of the first 100 rows, used as a small development frame
small = prom_pos.head(100).copy()

______

In [53]:
%%writefile expansion_models.py
import pubchempy as pc

# rdkit imports
from rdkit import Chem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs


def check_for_known(all_enz_df, mol_fingerprint, threshold):
    """
    Find known enzymatic products that match the query molecule.

    Note: all_enz_df is modified in place. Leftover 'Known', 'Mol' and
    'Fingerprint' columns are dropped, then 'Mol' and 'Fingerprint' are
    re-added by fingerprint_products and a fresh 'Known' column is written.

    Args:
        all_enz_df : pandas dataframe with all known enzymatic reactions
        mol_fingerprint : RDKit fingerprint of query molecule
        threshold (float, 0-1.0) : cut-off value for similarity.
                                    Recommend 0.95 or above for truly similar compounds.

    Returns:
        known_df : df with known enzymatic transformations of this molecule
                    ordered from most similar molecule to least similar
        or
        None : when no product reaches the threshold; a message indicating
                that the promiscuous search should begin is printed first

    """
    # drop any columns leftover from the last iteration
    stale_cols = [col for col in ('Known', 'Mol', 'Fingerprint')
                  if col in all_enz_df.columns]
    if stale_cols:
        all_enz_df.drop(columns=stale_cols, inplace=True)

    # fingerprint the input dataframe and return it
    input_df = fingerprint_products(all_enz_df)

    # similarity between the query compound and each molecule in the df.
    # The column is assigned in one step: the previous row-by-row
    # input_df['Known'].loc[index] = ... was a chained assignment that pandas
    # does not guarantee to write back to the frame.
    input_df['Known'] = [
        DataStructs.FingerprintSimilarity(
            mol_fingerprint, product_fp, metric=DataStructs.TanimotoSimilarity)
        for product_fp in input_df['Fingerprint']
    ]

    # select only reactions that have product similarity to query
    # greater than or equal to the threshold
    known_df = input_df[input_df['Known'] >= threshold]

    if len(known_df) == 0:
        # call to promiscuous search code here
        print('No known enzymes. Begin promiscuous search.')
        return None

    # sort into a new frame rather than in place on a filtered slice
    return known_df.sort_values(by='Known', ascending=False)


def cid_to_smiles(cid):
    """
    Look up the canonical SMILES string for a PubChem CID.

    The lookup is best-effort: if it fails, a warning is issued and an
    empty string is returned so the caller can detect the failure.

    Args:
        cid : cid to query for smiles

    Returns:
        smiles (str) : smiles string retrieved from pubchem,
                        or '' if it could not be retrieved
        cid : as input

    """
    import warnings

    # default result, so a failed lookup no longer leaves `smiles` unbound
    smiles = ''

    try:
        # query pubchem for information about this cid
        compound = pc.get_compounds(cid)[0]
        smiles = compound.canonical_smiles or ''
    except Exception as err:
        # Exception (not BaseException) so KeyboardInterrupt/SystemExit
        # still propagate
        warnings.warn(
            'could not retrieve SMILES for CID {}: {!r}'.format(cid, err))

    return smiles, cid


def dist_for_expansion(prom_df, pubchem_cid, num_similar=None):
    """
    Rank promiscuous enzyme products by similarity to a query compound.

    Note: prom_df is modified in place. Any leftover 'ExpanDist' column is
    dropped, a fresh 'ExpanDist' column is written and the frame is sorted
    by it in descending order.

    Args:
        prom_df : pandas dataframe with all promiscuous enzymatic reactions;
                    must contain a 'Fingerprint' column
        pubchem_cid : cid of query molecule
        num_similar (int) : number of similar compounds to retrieve for tree search.
                            If input is None (or the string 'None'), default is 20 molecules.
                            A higher number will make search time longer.

    Returns:
        selected (df) : df with selected promiscuous enzyme pairs to use in tree search

    Raises:
        ValueError : if the SMILES string of the query compound could not be
                        retrieved or could not be parsed
    """
    default_num_similar = 20

    # drop column leftover from previous iteration
    if 'ExpanDist' in prom_df.columns:
        prom_df.drop(columns='ExpanDist', inplace=True)

    smiles, _ = cid_to_smiles(pubchem_cid)

    if not smiles:
        # raising a plain string is itself a TypeError in Python 3
        raise ValueError('query compound SMILES string could not be retrieved')

    # get molecule and fingerprint for query compound
    # using rdkit
    mol = Chem.rdmolfiles.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(
            'query compound SMILES string could not be parsed: {!r}'.format(smiles))
    fingerprint = FingerprintMols.FingerprintMol(mol)

    # similarity between the query compound and each molecule in the df,
    # assigned as a whole column instead of through chained
    # prom_df['ExpanDist'].loc[index] = ... assignment
    prom_df['ExpanDist'] = [
        DataStructs.FingerprintSimilarity(
            product_fp, fingerprint, metric=DataStructs.TanimotoSimilarity)
        for product_fp in prom_df['Fingerprint']
    ]

    prom_df.sort_values(by='ExpanDist', ascending=False, inplace=True)

    if num_similar is None or num_similar == 'None':
        n = default_num_similar
    else:
        n = num_similar

    selected = prom_df.iloc[:n, :].copy()

    return selected


def fingerprint_products(input_df):  # fingerprints all products in a given df
    """
    Add RDKit molecule and fingerprint columns to a dataframe of products.

    Args:
        input_df : pandas dataframe with smiles in column named 'SMILES'

    Returns:
        input_df : the same dataframe object (modified in place) with a 'Mol'
                    column inserted at position 1 and a 'Fingerprint' column
                    inserted at position 2
    """
    # DataFrame.insert raises if the column already exists, so clear columns
    # left over from a previous run to keep this function re-runnable
    stale_cols = [col for col in ('Mol', 'Fingerprint')
                  if col in input_df.columns]
    if stale_cols:
        input_df.drop(columns=stale_cols, inplace=True)

    # get mols from SMILES; each string is parsed once and the molecule is
    # reused for its fingerprint
    mol_list = [Chem.rdmolfiles.MolFromSmiles(smiles)
                for smiles in input_df['SMILES']]
    # get fingerprints from mols
    fp_list = [FingerprintMols.FingerprintMol(mol) for mol in mol_list]

    input_df.insert(1, column='Mol', value=mol_list)
    input_df.insert(2, column='Fingerprint', value=fp_list)

    return input_df

Overwriting expansion_models.py


In [1]:
%%writefile test_expansion_models.py
import pandas as pd

# rdkit imports
from rdkit import Chem
from rdkit.Chem.Fingerprints import FingerprintMols

from pandas.util.testing import assert_frame_equal

import expansion_models


def test_check_for_known():
    """Testing check_for_known function with CIDs = [243, 985]"""

    expected_df = pd.DataFrame([['EC-1.14.14.80',
                                 'PALMITATE',
                                 504166,
                                 'CCCCCCCCCCCCCCCC(=O)[O-]',
                                 0.714717543728323,
                                 1.0],
                                ['EC-1.14.14.80',
                                 'STEARIC_ACID',
                                 3033836,
                                 'CCCCCCCCCCCCCCCCCC(=O)[O-]',
                                 0.714717543728323,
                                 1.0]],
                               columns=['enzyme',
                                        'product',
                                        'PubChemID',
                                        'SMILES',
                                        'Dist',
                                        'Known'])

    test_df = pd.DataFrame([['EC-1.14.14.77',
                             '2-METHYL-3-PHYTYL-14-NAPHTHOQUINONE',
                             5280483,
                             'CC1=C(C(=O)C2=CC=CC=C2C1=O)CC=C(C)CCCC(C)CCCC(C)CCCC(C)C',
                             1.0],
                            ['EC-1.14.14.80',
                             'CPD-10515',
                             25201835,
                             'CCCCCCCCC(C(CCCCCCCC(=O)[O-])O)O',
                             0.714717543728323],
                            ['EC-1.14.14.80',
                             'PALMITATE',
                             504166,
                             'CCCCCCCCCCCCCCCC(=O)[O-]',
                             0.714717543728323],
                            ['EC-1.14.14.80',
                             'OLEATE-CPD',
                             5460221,
                             'CCCCCCCCC=CCCCCCCCC(=O)[O-]',
                             0.714717543728323],
                            ['EC-1.14.14.80',
                             'STEARIC_ACID',
                             3033836,
                             'CCCCCCCCCCCCCCCCCC(=O)[O-]',
                             0.714717543728323]],
                           columns=['enzyme',
                                    'product',
                                    'PubChemID',
                                    'SMILES',
                                    'Dist'])

    test_smiles = ['C1=CC=C(C=C1)C(=O)O', 'CCCCCCCCCCCCCCCC(=O)O']

    fingerprint_list = []
    for smile in test_smiles:
        mol = Chem.rdmolfiles.MolFromSmiles(smile)
        fingerprint = FingerprintMols.FingerprintMol(mol)
        fingerprint_list.append(fingerprint)

    for mol_fingerprint in fingerprint_list:
        test_result = expansion_models.check_for_known(
            test_df, mol_fingerprint, 0.95)

        if test_result is not type(pd.core.frame.DataFrame):
            continue
        else:
            test_result.drop(columns=['Fingerprint', 'Mol'], inplace=True)
            assert test_result == expected_df

    return


def test_dist_for_expansion():
    test_df = pd.DataFrame([['EC-1.14.14.77',
                             '2-METHYL-3-PHYTYL-14-NAPHTHOQUINONE',
                             5280483,
                             'CC1=C(C(=O)C2=CC=CC=C2C1=O)CC=C(C)CCCC(C)CCCC(C)CCCC(C)C',
                             1.0],
                            ['EC-1.14.14.80',
                             'CPD-10515',
                             25201835,
                             'CCCCCCCCC(C(CCCCCCCC(=O)[O-])O)O',
                             0.714717543728323],
                            ['EC-1.14.14.80',
                             'PALMITATE',
                             504166,
                             'CCCCCCCCCCCCCCCC(=O)[O-]',
                             0.714717543728323],
                            ['EC-1.14.14.80',
                             'OLEATE-CPD',
                             5460221,
                             'CCCCCCCCC=CCCCCCCCC(=O)[O-]',
                             0.714717543728323]],
                           columns=['enzyme',
                                    'product',
                                    'PubChemID',
                                    'SMILES',
                                    'Dist'])

    expected = pd.DataFrame([['EC-1.14.14.80',
                              'PALMITATE',
                              504166,
                              'CCCCCCCCCCCCCCCC(=O)[O-]',
                              0.714717543728323,
                              1.0],
                             ['EC-1.14.14.80',
                              'OLEATE-CPD',
                              5460221,
                              'CCCCCCCCC=CCCCCCCCC(=O)[O-]',
                              0.714717543728323,
                              0.75],
                             ['EC-1.14.14.80',
                              'CPD-10515',
                              25201835,
                              'CCCCCCCCC(C(CCCCCCCC(=O)[O-])O)O',
                              0.714717543728323,
                              0.7377049180327869],
                             ['EC-1.14.14.77',
                              '2-METHYL-3-PHYTYL-14-NAPHTHOQUINONE',
                              5280483,
                              'CC1=C(C(=O)C2=CC=CC=C2C1=O)CC=C(C)CCCC(C)CCCC(C)CCCC(C)C',
                              1.0,
                              0.3515625]],
                            columns=['enzyme',
                                     'product',
                                     'PubChemID',
                                     'SMILES',
                                     'Dist',
                                     'ExpanDist'])

    fingers = expansion_models.fingerprint_products(test_df)
    actual = expansion_models.dist_for_expansion(fingers, 985, 4)
    actual.drop(columns=['Mol', 'Fingerprint'], inplace=True)
    assert_frame_equal(actual.reset_index(drop=True),
                       expected.reset_index(drop=True), check_dtype=False)

    return

Writing test_expansion_models.py
