In [1]:
import Bio
from Bio.KEGG import REST
from Bio.KEGG import Enzyme

import gzip
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pubchempy as pc

In [2]:
# Public (non-underscore) attribute names of a KEGG enzyme record; they become the dataframe columns.
enzyme_fields = [name for name in dir(Enzyme.Record()) if not name.startswith('_')]

# One row of attribute values per record parsed from the gzipped KEGG flat file.
with gzip.open('../datasets/KEGG_enzymes_all_data.gz', 'rt') as file:
    data_matrix = [
        [getattr(record, field) for field in enzyme_fields]
        for record in Enzyme.parse(file)
    ]

In [3]:
enzyme_df = pd.DataFrame(data_matrix, columns=enzyme_fields)

In [4]:
enzyme_df.head()

Unnamed: 0,classname,cofactor,comment,dblinks,disease,effector,entry,genes,inhibitor,name,pathway,product,reaction,structures,substrate,sysname
0,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Acts on primary or secondary ...,"[(ExplorEnz - The Enzyme Database, [1.1.1.1]),...",[],[],1.1.1.1,"[(HSA, [124, 125, 126, 127, 128, 130, 131]), (...",[],"[alcohol dehydrogenase, aldehyde reductase, AD...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ...",[(1) a primary alcohol + NAD+ = an aldehyde + ...,[],"[primary alcohol [CPD:C00226], NAD+ [CPD:C0000...",[alcohol:NAD+ oxidoreductase]
1,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Some members of this group ox...,"[(ExplorEnz - The Enzyme Database, [1.1.1.2]),...",[],[],1.1.1.2,"[(HSA, [10327]), (PTR, [741418]), (PPS, [10099...",[],"[alcohol dehydrogenase (NADP+), aldehyde reduc...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADPH [CPD:C00005], H+...",[an alcohol + NADP+ = an aldehyde + NADPH + H+...,[],"[alcohol [CPD:C00069], NADP+ [CPD:C00006]]",[alcohol:NADP+ oxidoreductase]
2,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[The yeast enzyme acts most rapidly with NAD+;...,"[(ExplorEnz - The Enzyme Database, [1.1.1.3]),...",[],[],1.1.1.3,"[(NVE, [NEMVE_v1g225948]), (ATH, [AT1G31230, A...",[],"[homoserine dehydrogenase, HSDH, HSD]","[(PATH, ec00260, Glycine, serine and threonine...","[L-aspartate 4-semialdehyde [CPD:C00441], NADH...",[L-homoserine + NAD(P)+ = L-aspartate 4-semial...,[],"[L-homoserine [CPD:C00263], NAD+ [CPD:C00003],...",[L-homoserine:NAD(P)+ oxidoreductase]
3,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Also converts diacetyl into acetoin with NADH...,"[(ExplorEnz - The Enzyme Database, [1.1.1.4]),...",[],[],1.1.1.4,"[(SCE, [YAL060W, YAL061W]), (KLA, [KLLA0_F0050...",[],"[(R,R)-butanediol dehydrogenase, butyleneglyco...","[(PATH, ec00650, Butanoate metabolism)]","[(R)-acetoin [CPD:C00810], NADH [CPD:C00004], ...","[(R,R)-butane-2,3-diol + NAD+ = (R)-acetoin + ...",[],"[(R,R)-butane-2,3-diol [CPD:C03044], NAD+ [CPD...","[(R,R)-butane-2,3-diol:NAD+ oxidoreductase]"
4,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Transferred entry: acetoin dehydrogenase. Now...,[],[],[],1.1.1.5,[],[],[Transferred to 1.1.1.303 and 1.1.1.304],[],[],[],[],[],[]


In [5]:
# example lookup: the reaction list of a single enzyme, selected by its entry (EC number)
enzyme_df.loc[enzyme_df['entry'] == '1.1.1.153', 'reaction']

152    [(1) L-erythro-7,8-dihydrobiopterin + NADP+ = ...
Name: reaction, dtype: object

In [6]:
enzyme_df['reaction'][153]

['(S)-ureidoglycolate + NAD(P)+ = oxalureate + NAD(P)H + H+ [RN:R02935 R02936]']

In [7]:
# "promiscuous" enzymes are the records that list more than one reaction;
# keep those rows, then narrow the frame to the columns used downstream
promiscuous_df = enzyme_df[enzyme_df['reaction'].map(len) > 1]
compact_promiscuous_df = promiscuous_df[['entry','reaction','product','substrate']]

#### check for reversible reactions 

In [8]:
def get_reaction_list(df_with_reaction_column):
    """Collect the KEGG reaction IDs annotated in a dataframe's 'reaction' column.

    Each cell of the 'reaction' column is expected to be a list of reaction
    strings such as
    '(S)-ureidoglycolate + NAD(P)+ = oxalureate + NAD(P)H + H+ [RN:R02935 R02936]'.
    The IDs are read from the last "[RN:...]" annotation of each string.
    Strings without such an annotation contribute nothing.

    Parameters
    ----------
    df_with_reaction_column : pandas.DataFrame
        Dataframe with a 'reaction' column holding lists of strings.

    Returns
    -------
    list of str
        Reaction IDs in row order; an ID that is annotated more than once
        appears more than once.
    """
    marker = "[RN:"
    reaction_list = []
    for reactions in df_with_reaction_column['reaction']:
        for reaction in reactions:
            start = reaction.rfind(marker)
            if start == -1:
                # No annotation: skip, rather than splitting the equation text
                # itself into "IDs" when it happens to start with "R".
                continue
            id_start = start + len(marker)
            end = reaction.find("]", id_start)
            # Tolerate a missing closing bracket by reading to the end of the string.
            annotation = reaction[id_start:end] if end != -1 else reaction[id_start:]
            reaction_list.extend(
                token for token in annotation.split() if token.startswith("R")
            )
    return reaction_list

In [9]:
promiscuous_reaction_list = get_reaction_list(compact_promiscuous_df)

In [10]:
len(promiscuous_reaction_list)

1302

In [11]:
def query_reversible_reaction(list_with_reaction):
    """Return the reaction IDs whose KEGG record has a reversible equation.

    Every ID is fetched with ``REST.kegg_get`` (one network request per ID,
    so long lists are slow). A reaction counts as reversible when a line of
    its record starts with "EQUATION" and contains "<=>".

    Parameters
    ----------
    list_with_reaction : iterable of str
        KEGG reaction IDs, e.g. "R00709".

    Returns
    -------
    list of str
        The IDs from the input that were found to be reversible, in input
        order; each input element is reported at most once.
    """
    reversible_reaction = []
    # Iterate over the argument; the previous version looped over an
    # undefined name (`reaction_list`) and raised NameError when called.
    for reaction in list_with_reaction:
        reaction_file = REST.kegg_get(reaction).read()
        is_reversible = any(
            line.startswith("EQUATION") and "<=>" in line
            for line in reaction_file.rstrip().split("\n")
        )
        if is_reversible:
            reversible_reaction.append(reaction)
    return reversible_reaction

In [12]:
# manual check, on one known reaction, of the test used inside query_reversible_reaction
reaction_file = REST.kegg_get("R00709").read()
for line in reaction_file.rstrip().split("\n"):
    if not (line.startswith("EQUATION") and "<=>" in line):
        continue
    print("R00709")
    print(line)

R00709
EQUATION    C00311 + C00003 <=> C00026 + C00011 + C00004 + C00080


In [13]:
#will take forever to run 


#reversible_reaction = query_reversible_reaction(promiscuous_reaction_list)

In [14]:
# it seems like all the reactions are reversible
#len(reversible_reaction)

### append substrate molecules to product column

In [15]:
# difficult to use iterrows because of inconsistent index 
compact_promiscuous_df.head(10)

Unnamed: 0,entry,reaction,product,substrate
0,1.1.1.1,[(1) a primary alcohol + NAD+ = an aldehyde + ...,"[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ...","[primary alcohol [CPD:C00226], NAD+ [CPD:C0000..."
37,1.1.1.38,[(1) (S)-malate + NAD+ = pyruvate + CO2 + NADH...,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADH...","[(S)-malate [CPD:C00149], NAD+ [CPD:C00003], o..."
39,1.1.1.40,[(1) (S)-malate + NADP+ = pyruvate + CO2 + NAD...,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADP...","[(S)-malate [CPD:C00149], NADP+ [CPD:C00006], ..."
41,1.1.1.42,[isocitrate + NADP+ = 2-oxoglutarate + CO2 + N...,"[2-oxoglutarate [CPD:C00026], CO2 [CPD:C00011]...","[isocitrate [CPD:C00311], NADP+ [CPD:C00006], ..."
84,1.1.1.85,"[(2R,3S)-3-isopropylmalate + NAD+ = 4-methyl-2...","[4-methyl-2-oxopentanoate [CPD:C00233], CO2 [C...","[(2R,3S)-3-isopropylmalate [CPD:C04411], NAD+ ..."
109,1.1.1.110,[(1) (R)-3-(phenyl)lactate + NAD+ = 3-phenylpy...,"[3-phenylpyruvate, NADH [CPD:C00004], H+ [CPD:...","[(R)-3-(phenyl)lactate [CPD:C05607], NAD+ [CPD..."
152,1.1.1.153,"[(1) L-erythro-7,8-dihydrobiopterin + NADP+ = ...","[sepiapterin [CPD:C00835], NADPH [CPD:C00005],...","[L-erythro-7,8-dihydrobiopterin [CPD:C02953], ..."
186,1.1.1.187,[(1) GDP-alpha-D-rhamnose + NAD(P)+ = GDP-4-de...,"[GDP-4-dehydro-alpha-D-rhamnose [CPD:C01222], ...","[GDP-alpha-D-rhamnose [CPD:C03117], NAD+ [CPD:..."
202,1.1.1.203,[(1) beta-D-galacturonate + NAD+ = D-galactaro...,"[D-galactaro-1,5-lactone [CPD:C20889], NADH [C...","[beta-D-galacturonate, NAD+ [CPD:C00003], beta..."
236,1.1.1.237,[(1) (R)-3-(4-hydroxyphenyl)lactate + NAD(P)+ ...,"[3-(4-hydroxyphenyl)pyruvate [CPD:C01179], NAD...","[(R)-3-(4-hydroxyphenyl)lactate [CPD:C03964], ..."


In [16]:
# re-number the rows 0..n-1 so that labels and positions coincide
rowindex = np.arange(len(compact_promiscuous_df))
compact_promiscuous_df_index = compact_promiscuous_df.set_index(rowindex)

In [17]:
def combine_substrate_product(df_with_ordered_index):
    """Return a copy of the dataframe whose 'product' column also lists the substrates.

    For every row the new 'product' value is the row's product list followed
    by its substrate list. The input dataframe and the lists stored in it are
    left unchanged, so calling the function again on the same input gives the
    same result instead of appending the substrates a second time.

    Parameters
    ----------
    df_with_ordered_index : pandas.DataFrame
        Dataframe with 'product' and 'substrate' columns holding lists.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same index and columns as the input.
    """
    # Copy first: a plain assignment would alias the caller's frame and
    # modify it in place.
    newdf = df_with_ordered_index.copy()
    # Address the column by name rather than by a fixed position, and build
    # fresh lists so the originals are never extended.
    newdf['product'] = [
        list(productlist) + list(substratelist)
        for productlist, substratelist in zip(
            df_with_ordered_index['product'], df_with_ordered_index['substrate']
        )
    ]
    return newdf

In [18]:
# do not run this multiple times! 
combined_df = combine_substrate_product(compact_promiscuous_df_index)

In [19]:
# check whether it is added multiple times
# if appended multiple times, need to rerun cells from the very beginning 
combined_df.iloc[0,2]

['aldehyde [CPD:C00071]',
 'NADH [CPD:C00004]',
 'H+ [CPD:C00080]',
 'ketone [CPD:C01450]',
 'primary alcohol [CPD:C00226]',
 'NAD+ [CPD:C00003]',
 'secondary alcohol [CPD:C01612]']

In [20]:
compact_combined_df = combined_df[['entry','product']]

In [21]:
compact_combined_df.head(10)

Unnamed: 0,entry,product
0,1.1.1.1,"[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ..."
1,1.1.1.38,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADH..."
2,1.1.1.40,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADP..."
3,1.1.1.42,"[2-oxoglutarate [CPD:C00026], CO2 [CPD:C00011]..."
4,1.1.1.85,"[4-methyl-2-oxopentanoate [CPD:C00233], CO2 [C..."
5,1.1.1.110,"[3-phenylpyruvate, NADH [CPD:C00004], H+ [CPD:..."
6,1.1.1.153,"[sepiapterin [CPD:C00835], NADPH [CPD:C00005],..."
7,1.1.1.187,"[GDP-4-dehydro-alpha-D-rhamnose [CPD:C01222], ..."
8,1.1.1.203,"[D-galactaro-1,5-lactone [CPD:C20889], NADH [C..."
9,1.1.1.237,"[3-(4-hydroxyphenyl)pyruvate [CPD:C01179], NAD..."


In [22]:
# save substrate and product combined dataframe to csv 
# might remove this dataframe from the git repo soon 
# substrate_to_product_promiscuous_df.to_csv("../datasets/substrate_product_combined_promiscuous.csv")

### cofactor removal 

In [23]:
len(compact_combined_df)

549

In [24]:
# test text slicing 
test='aldehyde [CPD:C00071]'
test[-7:-1]

'C00071'

In [25]:
def get_cofactor_list(cofactor_df,CPDcolumn):
    """Return characters 4-9 of every value in column `CPDcolumn` as a list.

    For values shaped like 'cpd:C00001' this is the bare compound ID 'C00001'.
    """
    cofactor_list = []
    for cofactor in cofactor_df[CPDcolumn]:
        cofactor_list.append(cofactor[4:10])
    return cofactor_list

In [26]:
cofactor_df=pd.read_csv("../datasets/cofactor_list.csv")
cofactor_df.head(10)

Unnamed: 0.1,Unnamed: 0,CPD,description
0,1,cpd:C00001,H2O; Water
1,2,cpd:C00002,ATP; Adenosine 5'-triphosphate
2,3,cpd:C00003,NAD+; NAD; Nicotinamide adenine dinucleotide; ...
3,4,cpd:C00004,NADH; DPNH; Reduced nicotinamide adenine dinuc...
4,5,cpd:C00005,NADPH; TPNH; Reduced nicotinamide adenine dinu...
5,6,cpd:C00006,NADP+; NADP; Nicotinamide adenine dinucleotide...
6,7,cpd:C00007,Oxygen; O2
7,8,cpd:C00008,ADP; Adenosine 5'-diphosphate
8,9,cpd:C00009,Orthophosphate; Phosphate; Phosphoric acid; Or...
9,10,cpd:C00010,CoA; Coenzyme A; CoA-SH


In [27]:
cofactor_list = get_cofactor_list(cofactor_df,"CPD")
cofactor_list

['C00001',
 'C00002',
 'C00003',
 'C00004',
 'C00005',
 'C00006',
 'C00007',
 'C00008',
 'C00009',
 'C00010',
 'C00011',
 'C00012',
 'C00013',
 'C00014',
 'C00015',
 'C00016',
 'C00017',
 'C00018',
 'C00019',
 'C00020',
 'C00021',
 'C00023',
 'C00027',
 'C00028',
 'C00030',
 'C00032',
 'C00034',
 'C00050',
 'C00061',
 'C00070',
 'C00080',
 'C00255',
 'C01007',
 'C01352',
 'C01382',
 'C02745',
 'C02869']

In [28]:
def get_cpd(compound_full):
    """Return the six characters preceding the last character of `compound_full`.

    For a string ending in '[CPD:C00071]' this is the compound ID 'C00071'.
    """
    return compound_full[-7:-1]

In [29]:
def rm_cofactor_only_cpd(df,compound_columnname,cofactor_list):
    """Replace the 'product' column with lists of non-cofactor compound IDs.

    For each row, the compounds in `compound_columnname` whose text contains
    "CPD" are reduced to their ID with `get_cpd`; IDs found in
    `cofactor_list` are discarded. Compounds without "CPD" in their text are
    skipped. A row with no remaining ID gets the string "NA".

    Returns a new dataframe: `df` without its original 'product' column, plus
    a 'product' column holding the cleaned values.
    """
    newdf = df.drop(["product"],axis=1)
    cleaned_compound_column = []
    for compounds in df[compound_columnname]:
        annotated_ids = (get_cpd(compound) for compound in compounds if "CPD" in compound)
        kept_ids = [cpd_id for cpd_id in annotated_ids if cpd_id not in cofactor_list]
        cleaned_compound_column.append(kept_ids if kept_ids else "NA")
    newdf['product'] = cleaned_compound_column
    return newdf

In [30]:
cleaned_df_productinList = rm_cofactor_only_cpd(compact_combined_df,'product',cofactor_list)

In [31]:
#cleaned_promiscuous_enzyme_df.to_csv("../datasets/cleaned_promiscous_enzyme_df.csv", header=['entry','product'])

In [32]:
#remove enzymes with no products 
noNAenzyme = cleaned_df_productinList.loc[cleaned_df_productinList['product']!='NA']
len(noNAenzyme)

521

### format the dataframe to be easily applicable for pubchem ID search and SMILES string search 

In [33]:
# rename() returns a new frame; rebinding the name avoids modifying a filtered
# selection in place, which is what raised SettingWithCopyWarning here
noNAenzyme = noNAenzyme.rename(columns={'product':'products'})
noNAenzyme

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,entry,products
0,1.1.1.1,"[C00071, C01450, C00226, C01612]"
1,1.1.1.38,"[C00022, C00149, C00036]"
2,1.1.1.40,"[C00022, C00149, C00036]"
3,1.1.1.42,"[C00026, C05379, C00311, C05379]"
4,1.1.1.85,"[C00233, C04236, C04411, C04236]"
5,1.1.1.110,"[C01179, C00331, C05607, C03964, C22006]"
6,1.1.1.153,"[C00835, C03684, C02953, C00272]"
7,1.1.1.187,"[C01222, C03117, C02977]"
8,1.1.1.203,"[C20889, C20890]"
9,1.1.1.237,"[C01179, C04045, C03964, C22038]"


In [34]:
def itemlist_eachrow(df,oldcolumnname,newcolumnname,enzymecolumn):
    """Turn a column of lists into one row per list item.

    `oldcolumnname` holds the lists, `enzymecolumn` identifies the row, and
    `newcolumnname` is the name given to the column of individual items.
    The result is sorted by `enzymecolumn`, and rows containing missing
    values are dropped.
    """
    # spread each list over numbered columns, one per list position
    spread = df[oldcolumnname].apply(pd.Series)
    # re-attach the remaining columns by index and discard the list column
    wide = spread.merge(df, left_index=True, right_index=True)
    wide = wide.drop([oldcolumnname],axis=1)
    # wide -> long: every column other than the identifier becomes a row
    long_form = wide.melt(id_vars=[enzymecolumn],value_name=newcolumnname)
    long_form = long_form.sort_values(by=[enzymecolumn]).dropna()
    newdf = long_form.drop(columns=["variable"])
    return newdf

In [35]:
expanded_noNAenzyme = itemlist_eachrow(noNAenzyme,"products","product","entry")

In [36]:
# drop repeated (entry, product) pairs, keeping the first occurrence of each

expanded_noNAenzyme = expanded_noNAenzyme.drop_duplicates(['entry','product'],keep='first')

In [37]:
expanded_noNAenzyme

Unnamed: 0,entry,product
0,1.1.1.1,C00071
1042,1.1.1.1,C00226
521,1.1.1.1,C01450
1563,1.1.1.1,C01612
1568,1.1.1.110,C03964
526,1.1.1.110,C00331
5,1.1.1.110,C01179
2089,1.1.1.110,C22006
1047,1.1.1.110,C05607
527,1.1.1.153,C03684


In [38]:
len(expanded_noNAenzyme)

2144

### pubchemID search 

In [39]:
import re
from Bio.KEGG import Compound

In [40]:
def compound_records_to_df(file_path):
    """
    Parse a gzipped text file of KEGG compound records into a pandas dataframe.

    `file_path` is the path to the gzipped file. Records are read with
    Bio.KEGG.Compound.parse; the dataframe has one row per record and one
    column per public (non-underscore) attribute of a Compound record.
    """
    compound_fields = [name for name in dir(Compound.Record()) if not name.startswith('_')]

    with gzip.open(file_path, 'rt') as file:
        data_matrix = [
            [getattr(record, field) for field in compound_fields]
            for record in Compound.parse(file)
        ]

    return pd.DataFrame(data_matrix, columns=compound_fields)

In [41]:
compound_df = compound_records_to_df('../datasets/KEGG_compound_db_entries.gz')

In [42]:
def extract_PubChem_id(field):
    """
    Extract the PubChem ID from a record's database-links field.

    The field is converted to text with str() and searched for the pattern
    "'PubChem', ['<digits>'])", i.e. a PubChem link holding exactly one
    numeric ID.

    Returns the digits of the first match as a string, or '' when the field
    contains no such link.
    """
    # Raw string so that \[ \d \] \) reach the regex engine untouched; in a
    # normal string literal they are invalid escape sequences.
    regex = r"'PubChem', \['(\d+)'\]\)"
    ids = re.findall(regex, str(field), re.IGNORECASE)
    return ids[0] if ids else ''

In [43]:
# pull the PubChem ID out of every compound's dblinks and store it as a new column
PubChemID_list = [extract_PubChem_id(dblinks) for dblinks in compound_df['dblinks']]

compound_df['PubChem'] = PubChemID_list
compound_df.head(10)

Unnamed: 0,dblinks,entry,enzyme,formula,mass,name,pathway,structures,PubChem
0,"[(CAS, [7732-18-5]), (PubChem, [3303]), (ChEBI...",C00001,"[1.1.1.1, 1.1.1.22, 1.1.1.23, 1.1.1.115, 1.1.1...",H2O,,"[H2O, Water]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3303
1,"[(CAS, [56-65-5]), (PubChem, [3304]), (ChEBI, ...",C00002,"[1.1.98.6, 1.2.1.30, 1.2.1.95, 1.2.1.101, 1.3....",C10H16N5O13P3,,"[ATP, Adenosine 5'-triphosphate]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3304
2,"[(CAS, [53-84-9]), (PubChem, [3305]), (ChEBI, ...",C00003,"[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",C21H28N7O14P2,,"[NAD+, NAD, Nicotinamide adenine dinucleotide,...","[(PATH, map00190, Oxidative phosphorylation), ...",[],3305
3,"[(CAS, [58-68-4]), (PubChem, [3306]), (ChEBI, ...",C00004,"[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",C21H29N7O14P2,,"[NADH, DPNH, Reduced nicotinamide adenine dinu...","[(PATH, map00190, Oxidative phosphorylation), ...",[],3306
4,"[(CAS, [2646-71-1]), (PubChem, [3307]), (ChEBI...",C00005,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",C21H30N7O17P3,,"[NADPH, TPNH, Reduced nicotinamide adenine din...","[(PATH, map00195, Photosynthesis), (PATH, map0...",[],3307
5,"[(CAS, [53-59-8]), (PubChem, [3308]), (ChEBI, ...",C00006,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",C21H29N7O17P3,,"[NADP+, NADP, Nicotinamide adenine dinucleotid...","[(PATH, map00195, Photosynthesis), (PATH, map0...",[],3308
6,"[(CAS, [7782-44-7]), (PubChem, [3309]), (ChEBI...",C00007,"[1.1.1.170, 1.1.1.270, 1.1.3.2, 1.1.3.4, 1.1.3...",O2,,"[Oxygen, O2]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3309
7,"[(CAS, [58-64-0]), (PubChem, [3310]), (ChEBI, ...",C00008,"[1.3.7.7, 1.3.7.8, 1.3.7.14, 1.3.7.15, 1.17.4....",C10H15N5O10P2,,"[ADP, Adenosine 5'-diphosphate]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3310
8,"[(CAS, [7664-38-2]), (PubChem, [3311]), (ChEBI...",C00009,"[1.2.1.11, 1.2.1.12, 1.2.1.13, 1.2.1.38, 1.2.1...",H3PO4,,"[Orthophosphate, Phosphate, Phosphoric acid, O...","[(PATH, map00190, Oxidative phosphorylation), ...",[],3311
9,"[(CAS, [85-61-0]), (PubChem, [3312]), (ChEBI, ...",C00010,"[1.1.1.34, 1.1.1.88, 1.1.1.-, 1.2.1.10, 1.2.1....",C21H36N7O16P3S,,"[CoA, Coenzyme A, CoA-SH]","[(PATH, map00071, Fatty acid degradation), (PA...",[],3312


In [44]:
joint_enzyme_compound_df = expanded_noNAenzyme.merge(compound_df, left_on='product', right_on='entry')
joint_enzyme_compound_df.head(10)

Unnamed: 0,entry_x,product,dblinks,entry_y,enzyme,formula,mass,name,pathway,structures,PubChem
0,1.1.1.1,C00071,"[(PubChem, [3371]), (ChEBI, [17478])]",C00071,"[1.1.1.1, 1.1.1.2, 1.1.1.71, 1.1.2.7, 1.1.3.13...",CHOR,,"[Aldehyde, RCHO]","[(PATH, map00071, Fatty acid degradation)]",[],3371
1,3.3.2.2,C00071,"[(PubChem, [3371]), (ChEBI, [17478])]",C00071,"[1.1.1.1, 1.1.1.2, 1.1.1.71, 1.1.2.7, 1.1.3.13...",CHOR,,"[Aldehyde, RCHO]","[(PATH, map00071, Fatty acid degradation)]",[],3371
2,1.1.1.1,C00226,"[(PubChem, [3526]), (ChEBI, [15734])]",C00226,"[1.1.1.1, 1.1.1.2, 1.1.1.71, 1.1.2.7, 1.1.3.13...",CH3OR,,"[Primary alcohol, 1-Alcohol]","[(PATH, map00071, Fatty acid degradation), (PA...",[],3526
3,1.1.1.1,C01450,"[(PubChem, [4627]), (ChEBI, [17087])]",C01450,"[1.1.1.1, 1.1.1.184, 1.1.3.18, 1.1.98.5, 1.7.3...",COR2,,"[Ketone, R-CO-R']","[(PATH, map00073, Cutin, suberine and wax bios...",[],4627
4,1.1.1.1,C01612,"[(PubChem, [4764]), (ChEBI, [35681])]",C01612,"[1.1.1.1, 1.1.1.184, 1.1.3.18, 1.1.98.5]",CH2OR2,,"[Secondary alcohol, R-CHOH-R']","[(PATH, map00073, Cutin, suberine and wax bios...",[],4764
5,1.1.1.110,C03964,"[(PubChem, [6685]), (ChEBI, [10980, 16003]), (...",C03964,"[1.1.1.110, 1.1.1.237, 4.2.1.-]",C9H10O4,,[(R)-3-(4-Hydroxyphenyl)lactate],"[(PATH, map00130, Ubiquinone and other terpeno...",[],6685
6,1.1.1.237,C03964,"[(PubChem, [6685]), (ChEBI, [10980, 16003]), (...",C03964,"[1.1.1.110, 1.1.1.237, 4.2.1.-]",C9H10O4,,[(R)-3-(4-Hydroxyphenyl)lactate],"[(PATH, map00130, Ubiquinone and other terpeno...",[],6685
7,1.1.1.110,C00331,"[(CAS, [392-12-1]), (PubChem, [3625]), (ChEBI,...",C00331,"[1.1.1.110, 1.2.7.8, 1.4.1.19, 1.4.3.2, 1.14.1...",C11H9NO3,,"[Indolepyruvate, Indolepyruvic acid, (Indol-3-...","[(PATH, map00380, Tryptophan metabolism), (PAT...",[],3625
8,1.1.1.110,C01179,"[(CAS, [156-39-8]), (PubChem, [4406]), (ChEBI,...",C01179,"[1.1.1.110, 1.1.1.237, 1.2.3.13, 1.3.1.12, 1.3...",C9H8O4,,"[3-(4-Hydroxyphenyl)pyruvate, 4-Hydroxyphenylp...","[(PATH, map00130, Ubiquinone and other terpeno...",[],4406
9,1.1.1.237,C01179,"[(CAS, [156-39-8]), (PubChem, [4406]), (ChEBI,...",C01179,"[1.1.1.110, 1.1.1.237, 1.2.3.13, 1.3.1.12, 1.3...",C9H8O4,,"[3-(4-Hydroxyphenyl)pyruvate, 4-Hydroxyphenylp...","[(PATH, map00130, Ubiquinone and other terpeno...",[],4406


In [45]:
# keep only the enzyme entry, the KEGG compound ID and the PubChem ID, ordered by enzyme
compact_joint_enzyme_compound_df = (
    joint_enzyme_compound_df[['entry_x','product','PubChem']]
    .sort_values(by=['entry_x'])
)
compact_joint_enzyme_compound_df.head(10)

Unnamed: 0,entry_x,product,PubChem
0,1.1.1.1,C00071,3371.0
2,1.1.1.1,C00226,3526.0
3,1.1.1.1,C01450,4627.0
4,1.1.1.1,C01612,4764.0
5,1.1.1.110,C03964,6685.0
7,1.1.1.110,C00331,3625.0
8,1.1.1.110,C01179,4406.0
10,1.1.1.110,C22006,
11,1.1.1.110,C05607,7930.0
16,1.1.1.153,C02953,5871.0


In [46]:
print (len(compact_joint_enzyme_compound_df))

2144


In [47]:
# give the columns their final names, then drop the rows whose PubChem ID is empty
compact_joint_enzyme_compound_df = compact_joint_enzyme_compound_df.rename(
    columns={'entry_x':'entry','product':'KEGG'}
)
compact_joint_enzyme_compound_df = compact_joint_enzyme_compound_df.loc[
    compact_joint_enzyme_compound_df['PubChem']!=''
]
len(compact_joint_enzyme_compound_df)

2068

In [48]:
compact_joint_enzyme_compound_df.columns

Index(['entry', 'KEGG', 'PubChem'], dtype='object')

In [54]:
# Work on a copy so the frame built in the previous cells is left untouched.
shortened_df = compact_joint_enzyme_compound_df.copy()
# First 50 rows only: a small sample passed to kegg_df_to_smiles further down.
short_50 = shortened_df.head(50)

In [50]:
def sid_to_smiles(sid):
    """Return the isomeric SMILES string for a PubChem substance ID (SID).

    The substance is looked up with pubchempy, its standardized compound ID
    (CID) is read, and the first compound returned for that CID supplies the
    SMILES. Any exception raised by the lookups propagates to the caller.
    """
    cid = pc.Substance.from_sid(sid).standardized_cid
    first_match = pc.get_compounds(cid)[0]
    return first_match.isomeric_smiles


def kegg_df_to_smiles(kegg_df):
    """Add a 'SMILES' column to a dataframe of PubChem SIDs and save it to disk.

    The SID of every row is read from column position 2 and converted with
    `sid_to_smiles`. Rows whose lookup fails get the string 'none' and their
    SID is recorded.

    Side effects: `kegg_df` is modified in place (a 'SMILES' column is
    inserted at position 3) and written to
    '../datasets/cleaned_kegg_with_smiles'.

    Returns
    -------
    tuple
        (kegg_df, unsuccessful_list), where unsuccessful_list holds the SIDs
        whose lookup raised an exception.
    """
    res = []
    unsuccessful_list = []

    for i in range(len(kegg_df)):
        sid = kegg_df.iloc[i, 2]  # position 2 must be the SID column; adjust for other layouts
        try:
            res.append(sid_to_smiles(sid))
        except Exception:
            # Best effort: record the failure and carry on. `Exception` rather
            # than a bare except, so Ctrl-C still interrupts the loop.
            res.append('none')
            unsuccessful_list.append(sid)

    kegg_df.insert(3, column='SMILES', value=res)  # position of the new SMILES column
    kegg_df.to_csv(r'../datasets/cleaned_kegg_with_smiles')

    return kegg_df, unsuccessful_list

In [55]:
def sid_to_smiles(sid):
    """Return (isomeric SMILES, CID) for a PubChem substance ID (SID).

    The substance is looked up with pubchempy, its standardized compound ID
    (CID) is read, and the first compound returned for that CID supplies the
    SMILES. Any exception raised by the lookups propagates to the caller.
    """
    cid = pc.Substance.from_sid(sid).standardized_cid
    first_match = pc.get_compounds(cid)[0]
    return first_match.isomeric_smiles, cid


def kegg_df_to_smiles(kegg_df):
    """Add 'CID' and 'SMILES' columns to a dataframe of PubChem SIDs and save it to disk.

    The SID of every row is read from column position 2 and converted with
    `sid_to_smiles`, which is expected to return (SMILES, CID). Rows whose
    lookup fails get the string 'none' in both new columns and their SID is
    recorded.

    Side effects: `kegg_df` is modified in place ('CID' inserted at position
    3, 'SMILES' at position 4) and written to
    '../datasets/playground_df_cleaned_kegg_with_smiles.csv'.

    Returns
    -------
    tuple
        (kegg_df, unsuccessful_list), where unsuccessful_list holds the SIDs
        whose lookup raised an exception.
    """
    res = []
    cid_list = []
    unsuccessful_list = []

    for i in range(len(kegg_df)):
        sid = kegg_df.iloc[i, 2]  # position 2 must be the SID column; adjust for other layouts
        try:
            # One lookup per row, unpacked before anything is appended, so
            # both lists always grow by exactly one entry per row.
            smile_result, cid_result = sid_to_smiles(sid)
        except Exception:
            # Best effort: record the failure and carry on. `Exception` rather
            # than a bare except, so Ctrl-C still interrupts the loop.
            smile_result, cid_result = 'none', 'none'
            unsuccessful_list.append(sid)
        res.append(smile_result)
        cid_list.append(cid_result)

    kegg_df.insert(3, column='CID', value=cid_list)
    kegg_df.insert(4, column='SMILES', value=res)  # position of the new SMILES column
    kegg_df.to_csv(r'../datasets/playground_df_cleaned_kegg_with_smiles.csv')

    return kegg_df, unsuccessful_list

In [56]:
kegg_df_to_smiles(short_50)

(        entry    KEGG    PubChem        CID  \
 0     1.1.1.1  C00071       3371       none   
 2     1.1.1.1  C00226       3526       none   
 3     1.1.1.1  C01450       4627       none   
 4     1.1.1.1  C01612       4764       none   
 5   1.1.1.110  C03964       6685     440177   
 7   1.1.1.110  C00331       3625        803   
 8   1.1.1.110  C01179       4406        979   
 11  1.1.1.110  C05607       7930     643327   
 16  1.1.1.153  C02953       5871  135398687   
 14  1.1.1.153  C00835       4093  135398579   
 12  1.1.1.153  C03684       6459  135398695   
 17  1.1.1.153  C00272       3570  135398654   
 18  1.1.1.187  C02977       5888  135398688   
 19  1.1.1.187  C03117       6011  135398578   
 20  1.1.1.187  C01222       4444  135398621   
 23  1.1.1.203  C20890  254741353   46926211   
 22  1.1.1.203  C20889  254741352   53321658   
 6   1.1.1.237  C03964       6685     440177   
 9   1.1.1.237  C01179       4406        979   
 24  1.1.1.237  C04045       6746     16

In [57]:
pd.read_csv('../datasets/playground_df_cleaned_kegg_with_smiles.csv')

Unnamed: 0.1,Unnamed: 0,entry,KEGG,PubChem,CID,SMILES
0,0,1.1.1.1,C00071,3371,none,none
1,2,1.1.1.1,C00226,3526,none,none
2,3,1.1.1.1,C01450,4627,none,none
3,4,1.1.1.1,C01612,4764,none,none
4,5,1.1.1.110,C03964,6685,440177,C1=CC(=CC=C1C[C@H](C(=O)O)O)O
5,7,1.1.1.110,C00331,3625,803,C1=CC=C2C(=C1)C(=CN2)CC(=O)C(=O)O
6,8,1.1.1.110,C01179,4406,979,C1=CC(=CC=C1CC(=O)C(=O)O)O
7,11,1.1.1.110,C05607,7930,643327,C1=CC=C(C=C1)C[C@H](C(=O)O)O
8,16,1.1.1.153,C02953,5871,135398687,C[C@@H]([C@@H](C1=NC2=C(NC1)N=C(NC2=O)N)O)O
9,14,1.1.1.153,C00835,4093,135398579,C[C@@H](C(=O)C1=NC2=C(NC1)N=C(NC2=O)N)O
