## This notebook has mixed contents.

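1) This notebook builds an early version of the MetaCyc master DataFrame: the positive (true) reactions plus a random sample of other rows, each annotated with a SMILES string.
2) This notebook also contains the first draft of a function that takes a PubChem compound ID (CID) and fetches the associated SMILES string from PubChem.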

_____________

## Master DataFrame operations performed:

In [1]:
import numpy as np
import pandas as pd
import pubchempy as pc

In [2]:
master_df = pd.read_csv('../../../big-datasets/EC_master_fix_cofactor.csv') # read in dataframe from metacyc after removal of cofactors

In [4]:
# NOTE(review): assumes every positive (true) reaction sits in the first 7570 rows of the CSV -- confirm
N_POSITIVE_ROWS = 7570
pos_df = master_df.iloc[:N_POSITIVE_ROWS]  # select only the positive (true) reactions

In [9]:
pos_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,enzyme,product,reacts,PubChemID
0,1,1,EC-1.1.1.321,CPD-685,1.0,5363397
1,2,2,EC-1.1.1.111,1-INDANOL,1.0,22819
2,3,3,EC-1.21.99.M2,4-HYDROXYPHENYLACETATE,1.0,4693933
3,4,4,EC-1.21.99.M2,CL-,1.0,312
4,7,7,EC-1.3.1.97,UDP-N-ACETYLMURAMATE,1.0,24772978


In [11]:
pos_df = pos_df[['enzyme', 'product', 'reacts', 'PubChemID']] 

In [3]:
# Randomly select 10,000 rows from master_df (fixed seed, so the draw is repeatable).
# TODO: the draw is taken from the whole frame, so some of the selected rows
# could be positive points -- this is a problem to solve later.
neg_df = master_df.sample(n=10000, random_state=12)

In [7]:
neg_df = neg_df[['enzyme', 'product', 'reacts', 'PubChemID']]

In [8]:
neg_df.head()

Unnamed: 0,enzyme,product,reacts,PubChemID
1541444,EC-1.11.1.8,CPD-9240,0.0,10895555
6802427,EC-4.3.1.24,GERANYL-PP,0.0,5280650
713426,EC-1.3.1.76,CPD-20682,0.0,131841603
3676086,EC-2.3.1.162,CPD-12724,0.0,5281605
4566352,EC-2.3.1.89,R-2-HYDROXYGLUTARATE,0.0,5460200


In [12]:
# Stack the positive rows on top of the sampled rows; original row labels are kept.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
use_me = pd.concat([pos_df, neg_df])

In [13]:
use_me.shape

(17570, 4)

In [14]:
use_me.to_csv('../../../big-datasets/selected_pos_neg_rxns.csv') # saving selected reactions

In [16]:
# Look up a SMILES string for every PubChemID. The helper adds the SMILES column
# to use_me itself and returns that same frame, plus the CIDs whose lookup failed.
use_df_smiles, unsuccessful = cid_df_to_smiles(df=use_me, cid_colname='PubChemID')

In [17]:
use_df_smiles.head()

Unnamed: 0,enzyme,product,reacts,PubChemID,SMILES
0,EC-1.1.1.321,CPD-685,1.0,5363397,CC(=CCO)CCC=C(C)CO
1,EC-1.1.1.111,1-INDANOL,1.0,22819,C1CC2=CC=CC=C2C1O
2,EC-1.21.99.M2,4-HYDROXYPHENYLACETATE,1.0,4693933,C1=CC(=CC=C1CC(=O)[O-])O
3,EC-1.21.99.M2,CL-,1.0,312,[Cl-]
4,EC-1.3.1.97,UDP-N-ACETYLMURAMATE,1.0,24772978,CC(C(=O)[O-])OC1C(C(OC(C1O)CO)OP(=O)([O-])OP(=...


In [18]:
len(unsuccessful)

559

In [19]:
# drop rows without SMILES: keep only rows whose PubChemID is not among the failed lookups
failed_lookup = use_df_smiles['PubChemID'].isin(unsuccessful)
use_df = use_df_smiles[~failed_lookup]

In [20]:
use_df.shape

(17011, 5)

In [21]:
use_df.to_csv('../../../big-datasets/selected_with_smiles.csv') # save this version of the master

### Get SMILES from CID

__________________

### Add a SMILES column to a DataFrame that contains CIDs

In [3]:
%%writefile pubchem_client.py
import pubchempy as pc


def cid_df_to_smiles(df, cid_colname):
    """
    Add a SMILES column to a dataframe of PubChem compound IDs (CIDs).

    Every row's CID is looked up through pubchempy and the canonical SMILES of
    the first compound returned is stored. Rows whose lookup fails get the
    placeholder string 'none'.

    Args:
        df : pandas dataframe with CID numbers; the SMILES column is added to
            this object in place
        cid_colname (str) : name of column that contains PubChem CID numbers

    Returns:
        df : the same dataframe object, now with the column SMILES
        unsuccessful_list : list of CIDs for which no SMILES were found, one
            entry per failing row (so a CID can appear more than once)

    """

    res = []
    unsuccessful_list = []
    # SMILES already fetched in this call, keyed by CID, so a CID that occurs
    # in many rows is only requested once. Failed lookups are not remembered
    # and are therefore attempted again on the next row with that CID.
    smiles_by_cid = {}

    for _, row in df.iterrows():
        cid = row[cid_colname]
        try:
            if cid not in smiles_by_cid:
                # pubchempy calls to get compound info
                compound = pc.get_compounds(cid)[0]
                smiles_by_cid[cid] = compound.canonical_smiles
            res.append(smiles_by_cid[cid])

        # Exception rather than BaseException: a failed lookup is recorded and
        # skipped, but KeyboardInterrupt / SystemExit must still stop the loop.
        except Exception:
            res.append('none')
            unsuccessful_list.append(cid)

    df['SMILES'] = res
    # df.to_csv(r'../datasets/df_cleaned_kegg_with_smiles.csv')

    return df, unsuccessful_list

Overwriting pubchem_client.py


In [45]:
compound = pc.get_compounds(5363397)[0]
smiles = compound.canonical_smiles
print(smiles)

CC(=CCO)CCC=C(C)CO


In [52]:
cid_df_to_smiles(test_df, 'CID')[0]

Unnamed: 0,EC,Compound,Reacts,CID,SMILES
0,EC-1.1.1.321,CPD-685,1,5363397,CC(=CCO)CCC=C(C)CO
1,EC-1.1.1.111,1-INDANOL,1,22819,C1CC2=CC=CC=C2C1O
2,EC-1.21.99.M2,4-HYDROXYPHENYLACETATE,1,4693933,C1=CC(=CC=C1CC(=O)[O-])O
3,EC-1.21.99.M2,Cl-,1,312,[Cl-]


In [42]:
test_df = pd.DataFrame([['EC-1.1.1.321', 'CPD-685', 1, 5363397], ['EC-1.1.1.111', '1-INDANOL', 1, 22819], 
                        ['EC-1.21.99.M2', '4-HYDROXYPHENYLACETATE', 1, 4693933],['EC-1.21.99.M2', 'Cl-', 1, 312]], columns=['EC', 'Compound', 'Reacts', 'CID'])

In [5]:
%%writefile test_pubchem_client.py
import pandas as pd

from pandas.util.testing import assert_frame_equal

import pubchem_client


def test_cid_df_to_smiles():
    """Unit test for pubchem_client.py."""

    test_frame = pd.DataFrame([['EC-1.1.1.321',
                                'CPD-685',
                                1,
                                5363397],
                               ['EC-1.1.1.111',
                                '1-INDANOL',
                                1,
                                22819],
                               ['EC-1.21.99.M2',
                                '4-HYDROXYPHENYLACETATE',
                                1,
                                4693933],
                               ['EC-1.21.99.M2',
                                'Cl-',
                                1,
                                312]],
                              columns=['EC',
                                       'Compound',
                                       'Reacts',
                                       'CID'])

    expected_frame = pd.DataFrame([['EC-1.1.1.321',
                                    'CPD-685',
                                    1,
                                    5363397,
                                    'CC(=CCO)CCC=C(C)CO'],
                                   ['EC-1.1.1.111',
                                    '1-INDANOL',
                                    1,
                                    22819,
                                    'C1CC2=CC=CC=C2C1O'],
                                   ['EC-1.21.99.M2',
                                    '4-HYDROXYPHENYLACETATE',
                                    1,
                                    4693933,
                                    'C1=CC(=CC=C1CC(=O)[O-])O'],
                                   ['EC-1.21.99.M2',
                                    'Cl-',
                                    1,
                                    312,
                                    '[Cl-]']],
                                  columns=['EC',
                                           'Compound',
                                           'Reacts',
                                           'CID',
                                           'SMILES',
                                           ])

    cid_colname = 'CID'
    result_frame = pubchem_client.cid_df_to_smiles(test_frame, cid_colname)

    assert_frame_equal(
        result_frame[0], expected_frame), 'Did not generate expected df.'

    return

Overwriting test_pubchem_client.py
