In [35]:
from Bio.KEGG import REST
from Bio.KEGG import Enzyme

import gzip
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# The public attribute names of a blank Enzyme.Record become the table's columns.
blank_record = Enzyme.Record()
enzyme_fields = [attr for attr in dir(blank_record) if not attr.startswith('_')]

# Parse the gzipped KEGG enzyme dump: one list of field values per record.
with gzip.open('../datasets/KEGG_enzymes_all_data.gz', 'rt') as file:
    data_matrix = [
        [getattr(record, field) for field in enzyme_fields]
        for record in Enzyme.parse(file)
    ]

In [6]:
# One row per parsed enzyme record, one column per public Record attribute.
enzyme_df = pd.DataFrame(data_matrix, columns=enzyme_fields)

In [7]:
# Preview the first rows of the parsed enzyme table.
enzyme_df.head()

Unnamed: 0,classname,cofactor,comment,dblinks,disease,effector,entry,genes,inhibitor,name,pathway,product,reaction,structures,substrate,sysname
0,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Acts on primary or secondary ...,"[(ExplorEnz - The Enzyme Database, [1.1.1.1]),...",[],[],1.1.1.1,"[(HSA, [124, 125, 126, 127, 128, 130, 131]), (...",[],"[alcohol dehydrogenase, aldehyde reductase, AD...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ...",[(1) a primary alcohol + NAD+ = an aldehyde + ...,[],"[primary alcohol [CPD:C00226], NAD+ [CPD:C0000...",[alcohol:NAD+ oxidoreductase]
1,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[A zinc protein. Some members of this group ox...,"[(ExplorEnz - The Enzyme Database, [1.1.1.2]),...",[],[],1.1.1.2,"[(HSA, [10327]), (PTR, [741418]), (PPS, [10099...",[],"[alcohol dehydrogenase (NADP+), aldehyde reduc...","[(PATH, ec00010, Glycolysis / Gluconeogenesis)...","[aldehyde [CPD:C00071], NADPH [CPD:C00005], H+...",[an alcohol + NADP+ = an aldehyde + NADPH + H+...,[],"[alcohol [CPD:C00069], NADP+ [CPD:C00006]]",[alcohol:NADP+ oxidoreductase]
2,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[The yeast enzyme acts most rapidly with NAD+;...,"[(ExplorEnz - The Enzyme Database, [1.1.1.3]),...",[],[],1.1.1.3,"[(NVE, [NEMVE_v1g225948]), (ATH, [AT1G31230, A...",[],"[homoserine dehydrogenase, HSDH, HSD]","[(PATH, ec00260, Glycine, serine and threonine...","[L-aspartate 4-semialdehyde [CPD:C00441], NADH...",[L-homoserine + NAD(P)+ = L-aspartate 4-semial...,[],"[L-homoserine [CPD:C00263], NAD+ [CPD:C00003],...",[L-homoserine:NAD(P)+ oxidoreductase]
3,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Also converts diacetyl into acetoin with NADH...,"[(ExplorEnz - The Enzyme Database, [1.1.1.4]),...",[],[],1.1.1.4,"[(SCE, [YAL060W, YAL061W]), (KLA, [KLLA0_F0050...",[],"[(R,R)-butanediol dehydrogenase, butyleneglyco...","[(PATH, ec00650, Butanoate metabolism)]","[(R)-acetoin [CPD:C00810], NADH [CPD:C00004], ...","[(R,R)-butane-2,3-diol + NAD+ = (R)-acetoin + ...",[],"[(R,R)-butane-2,3-diol [CPD:C03044], NAD+ [CPD...","[(R,R)-butane-2,3-diol:NAD+ oxidoreductase]"
4,"[Oxidoreductases;, Acting on the CH-OH group o...",[],[Transferred entry: acetoin dehydrogenase. Now...,[],[],[],1.1.1.5,[],[],[Transferred to 1.1.1.303 and 1.1.1.304],[],[],[],[],[],[]


In [8]:
# Look up the reaction list of one enzyme by its EC number (single .loc selection).
enzyme_df.loc[enzyme_df['entry'] == '1.1.1.153', 'reaction']

152    [(1) L-erythro-7,8-dihydrobiopterin + NADP+ = ...
Name: reaction, dtype: object

In [9]:
# Reaction list of the row labelled 153.
enzyme_df.loc[153, 'reaction']

['(S)-ureidoglycolate + NAD(P)+ = oxalureate + NAD(P)H + H+ [RN:R02935 R02936]']

In [10]:
# An enzyme counts as promiscuous here when its record lists more than one reaction.
has_multiple_reactions = enzyme_df['reaction'].map(len) > 1
promiscuous_df = enzyme_df[has_multiple_reactions]

# Keep only the columns used downstream, then spot-check one 'product' cell.
wanted_columns = ['entry', 'reaction', 'product', 'substrate']
compact_promiscuous_df = promiscuous_df[wanted_columns]
compact_promiscuous_df.iloc[1, 2]

['pyruvate [CPD:C00022]', 'CO2 [CPD:C00011]', 'NADH [CPD:C00004]']

In [None]:

# Collect the KEGG reaction IDs (e.g. 'R00623') annotated on the promiscuous enzymes.
# A reaction string may end with a tag such as '[RN:R00623 R00754]'. Only strings
# that actually contain that tag are parsed; a reaction without one is skipped, so
# free text that merely begins with "R" is no longer split into words and
# mistaken for reaction IDs.
reaction_list = []
for reactions in compact_promiscuous_df['reaction']:  # select by label, not by position
    for reaction in reactions:
        _, tag, tail = reaction.rpartition("[RN:")
        if not tag:
            continue  # no reaction-number annotation on this string
        ids = tail.split("]", 1)[0].split()  # text between '[RN:' and the closing ']'
        reaction_list.extend(rid for rid in ids if rid.startswith("R"))

reaction_list


In [None]:
# Needs a working internet connection: one KEGG REST request is made per reaction ID.
# Record every reaction whose EQUATION line contains the "<=>" arrow.
reversible_reaction = []
for reaction in reaction_list:
    record_text = REST.kegg_get(reaction).read()
    equation_lines = (
        line for line in record_text.rstrip().split("\n")
        if line.startswith("EQUATION")
    )
    for line in equation_lines:
        if "<=>" in line:
            reversible_reaction.append(reaction)
            print(reaction)

In [127]:
# It seems like all the reactions are reversible.
# NOTE(review): the check above only looks for "<=>" on the EQUATION line; if KEGG
# writes every equation with "<=>", this count says nothing about reversibility — confirm.
len(reversible_reaction)

1302

In [11]:
# Consecutive labels 0..n-1, one per row of the promiscuous-enzyme table.
n_promiscuous = len(compact_promiscuous_df)
rowindex = np.arange(n_promiscuous)
rowindex

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [12]:
# Relabel the rows with rowindex (0..n-1) so that index labels and positions coincide.
compact_promiscuous_df_index=compact_promiscuous_df.set_index(rowindex)
# Spot-check: the 'product' list (third column) of the second row.
compact_promiscuous_df_index.iloc[1,2]


['pyruvate [CPD:C00022]', 'CO2 [CPD:C00011]', 'NADH [CPD:C00004]']

### Append substrate molecules to the product column

In [13]:
# Combine each row's compounds into a single list: product followed by substrate.
# New list objects are built instead of appending to the existing ones. The lists
# stored in this frame are the same Python objects referenced by the frames it was
# derived from, so in-place appends would also change those frames and would grow
# again on every re-run. Reading from compact_promiscuous_df (same rows, same order,
# never written to here) makes this cell safe to run any number of times.
combined_compounds = [
    list(products) + list(substrates)
    for products, substrates in zip(compact_promiscuous_df['product'],
                                    compact_promiscuous_df['substrate'])
]
# assign() returns a new frame, so nothing is written through .iloc into a derived frame.
compact_promiscuous_df_index = compact_promiscuous_df_index.assign(
    product=pd.Series(combined_compounds,
                      index=compact_promiscuous_df_index.index,
                      dtype=object)
)

In [14]:
# Alias only: both names refer to the same DataFrame object (no copy is made).
substrate_to_product_promiscuous_df = compact_promiscuous_df_index
# Length of the second row's 'product' list.
len(substrate_to_product_promiscuous_df.iloc[1,2])

6

In [15]:
# Keep only 'entry', 'reaction' and 'product'; the 'substrate' column is not selected.
substrate_to_product_promiscuous_df = substrate_to_product_promiscuous_df[['entry','reaction','product']]

In [153]:
# save substrate and product combined dataframe to csv 
# might remove this dataframe from the git repo soon 
# substrate_to_product_promiscuous_df.to_csv("../datasets/substrate_product_combined_promiscuous.csv")

### Cofactor removal

In [16]:
# Display the entry / reaction / product table.
substrate_to_product_promiscuous_df 

Unnamed: 0,entry,reaction,product
0,1.1.1.1,[(1) a primary alcohol + NAD+ = an aldehyde + ...,"[aldehyde [CPD:C00071], NADH [CPD:C00004], H+ ..."
1,1.1.1.38,[(1) (S)-malate + NAD+ = pyruvate + CO2 + NADH...,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADH..."
2,1.1.1.40,[(1) (S)-malate + NADP+ = pyruvate + CO2 + NAD...,"[pyruvate [CPD:C00022], CO2 [CPD:C00011], NADP..."
3,1.1.1.42,[isocitrate + NADP+ = 2-oxoglutarate + CO2 + N...,"[2-oxoglutarate [CPD:C00026], CO2 [CPD:C00011]..."
4,1.1.1.85,"[(2R,3S)-3-isopropylmalate + NAD+ = 4-methyl-2...","[4-methyl-2-oxopentanoate [CPD:C00233], CO2 [C..."
5,1.1.1.110,[(1) (R)-3-(phenyl)lactate + NAD+ = 3-phenylpy...,"[3-phenylpyruvate, NADH [CPD:C00004], H+ [CPD:..."
6,1.1.1.153,"[(1) L-erythro-7,8-dihydrobiopterin + NADP+ = ...","[sepiapterin [CPD:C00835], NADPH [CPD:C00005],..."
7,1.1.1.187,[(1) GDP-alpha-D-rhamnose + NAD(P)+ = GDP-4-de...,"[GDP-4-dehydro-alpha-D-rhamnose [CPD:C01222], ..."
8,1.1.1.203,[(1) beta-D-galacturonate + NAD+ = D-galactaro...,"[D-galactaro-1,5-lactone [CPD:C20889], NADH [C..."
9,1.1.1.237,[(1) (R)-3-(4-hydroxyphenyl)lactate + NAD(P)+ ...,"[3-(4-hydroxyphenyl)pyruvate [CPD:C01179], NAD..."


In [17]:
# Two-column view (entry, product) that the cofactor-removal loop iterates over.
compact_substrate_to_product_promiscuous_df = substrate_to_product_promiscuous_df[['entry','product']]

In [18]:
# Number of rows in the two-column (entry, product) table.
len(compact_substrate_to_product_promiscuous_df)

549

In [19]:
# Sanity check of the ID slicing used in the cofactor-removal loop:
# drop the closing bracket, then keep the last six characters.
test = 'aldehyde [CPD:C00071]'
test[:-1][-6:]

'C00071'

In [20]:
# cofactor removal
# Load the cofactor table and keep characters 4-9 of every CPD value.
# NOTE(review): assumes CPD values look like 'CPD:C00001' — confirm against cofactor_list.csv.
cofactor_df = pd.read_csv("cofactor_list.csv")
cofactor_list = []
for cpd_value in cofactor_df['CPD']:
    cofactor_list.append(cpd_value[4:10])
cofactor_list
    

['C00001',
 'C00002',
 'C00003',
 'C00004',
 'C00005',
 'C00006',
 'C00007',
 'C00008',
 'C00009',
 'C00010',
 'C00011',
 'C00012',
 'C00013',
 'C00014',
 'C00015',
 'C00016',
 'C00017',
 'C00018',
 'C00019',
 'C00020',
 'C00021',
 'C00023',
 'C00027',
 'C00028',
 'C00030',
 'C00032',
 'C00034',
 'C00050',
 'C00061',
 'C00070',
 'C00080',
 'C00255',
 'C01007',
 'C01352',
 'C01382',
 'C02745',
 'C02869']

In [21]:
# NOTE(review): newcompoundcolumn is not referenced again in the visible cells — confirm before removing.
newcompoundcolumn=[]
# Single-column frame (entry only); the cleaned 'product' column is attached in a later cell.
newdf = compact_substrate_to_product_promiscuous_df[['entry']]
newdf

            

Unnamed: 0,entry
0,1.1.1.1
1,1.1.1.38
2,1.1.1.40
3,1.1.1.42
4,1.1.1.85
5,1.1.1.110
6,1.1.1.153
7,1.1.1.187
8,1.1.1.203
9,1.1.1.237


In [22]:
# Strip cofactors from every enzyme's combined compound list and keep only the
# bare KEGG compound IDs (e.g. 'C00071').
# Columns are read by label rather than by position inside each row, and the
# result is built with DataFrame.assign on a fresh selection, so nothing is
# written into a slice of another frame and re-running the cell gives the
# same result.
cofactor_ids = set(cofactor_list)  # set lookup instead of scanning the list for every compound
no_noncofactorcompound = []  # entries left with no compound once cofactors are removed
compoundcolumn = []
for entry, compounds in zip(compact_substrate_to_product_promiscuous_df['entry'],
                            compact_substrate_to_product_promiscuous_df['product']):
    newcompoundlist = []
    for compound in compounds:
        if "CPD" in compound:  # names without a CPD tag are ignored
            onlycpd = compound[-7:-1]  # 'aldehyde [CPD:C00071]' -> 'C00071'
            if onlycpd not in cofactor_ids:
                newcompoundlist.append(onlycpd)
    if newcompoundlist:
        compoundcolumn.append(newcompoundlist)
    else:
        no_noncofactorcompound.append(entry)
        compoundcolumn.append("NA")  # string placeholder; a later cell filters on != 'NA'
newdf = compact_substrate_to_product_promiscuous_df[['entry']].assign(product=compoundcolumn)

        

In [23]:
# Renamed: a second name for the same DataFrame object as newdf (no copy is made).
cleaned_promiscuous_enzyme_df=newdf

In [24]:
# Cleaned version: cofactors removed, compounds reduced to bare KEGG compound IDs.
cleaned_promiscuous_enzyme_df

Unnamed: 0,entry,product
0,1.1.1.1,"[C00071, C01450, C00226, C01612]"
1,1.1.1.38,"[C00022, C00149, C00036]"
2,1.1.1.40,"[C00022, C00149, C00036]"
3,1.1.1.42,"[C00026, C05379, C00311, C05379]"
4,1.1.1.85,"[C00233, C04236, C04411, C04236]"
5,1.1.1.110,"[C01179, C00331, C05607, C03964, C22006]"
6,1.1.1.153,"[C00835, C03684, C02953, C00272]"
7,1.1.1.187,"[C01222, C03117, C02977]"
8,1.1.1.203,"[C20889, C20890]"
9,1.1.1.237,"[C01179, C04045, C03964, C22038]"


In [25]:
# Enzyme entries that have no compound left once cofactors are removed
# (these rows carry the "NA" placeholder in the 'product' column).
no_noncofactorcompound

['1.3.1.88',
 '1.3.1.90',
 '1.11.1.21',
 '1.14.13.107',
 '1.18.1.3',
 '2.1.1.200',
 '2.1.1.202',
 '2.1.1.207',
 '2.1.1.213',
 '2.1.1.225',
 '2.1.1.244',
 '2.1.1.268',
 '2.3.2.24',
 '2.3.2.25',
 '2.3.2.26',
 '2.3.2.31',
 '3.1.11.7',
 '4.1.1.68',
 '4.2.1.161',
 '4.2.1.166',
 '4.2.1.170',
 '4.4.1.29',
 '4.4.1.30',
 '4.6.1.18',
 '4.6.1.20',
 '4.6.1.21',
 '5.3.3.8',
 '6.5.1.4']

In [26]:
#cleaned_promiscuous_enzyme_df.to_csv("../datasets/cleaned_promiscous_enzyme_df.csv", header=['entry','product'])

In [66]:
# Drop the rows whose compound list was replaced by the "NA" placeholder string.
noNAenzyme = cleaned_promiscuous_enzyme_df.loc[cleaned_promiscuous_enzyme_df['product']!='NA']

In [68]:
# Rename 'product' to 'products': each cell holds a list of compound IDs, and the
# expansion step below reads the column as noNAenzyme.products.
noNAenzyme = noNAenzyme.rename(columns={'product':'products'})
noNAenzyme

Unnamed: 0,entry,products
0,1.1.1.1,"[C00071, C01450, C00226, C01612]"
1,1.1.1.38,"[C00022, C00149, C00036]"
2,1.1.1.40,"[C00022, C00149, C00036]"
3,1.1.1.42,"[C00026, C05379, C00311, C05379]"
4,1.1.1.85,"[C00233, C04236, C04411, C04236]"
5,1.1.1.110,"[C01179, C00331, C05607, C03964, C22006]"
6,1.1.1.153,"[C00835, C03684, C02953, C00272]"
7,1.1.1.187,"[C01222, C03117, C02977]"
8,1.1.1.203,"[C20889, C20890]"
9,1.1.1.237,"[C01179, C04045, C03964, C22038]"


In [80]:
# Reshape to one row per (entry, compound):
# 1. spread each products list across numbered columns,
# 2. re-attach the original columns by index,
# 3. drop the list column and melt the numbered columns into long form.
products_wide = noNAenzyme.products.apply(pd.Series)
wide_with_entry = products_wide.merge(noNAenzyme, left_index=True, right_index=True)
wide_without_lists = wide_with_entry.drop(["products"], axis=1)
noNAenzyme_expand = wide_without_lists.melt(id_vars=['entry'], value_name="product")


In [117]:
# Order the long table by enzyme entry (rebinding the name instead of sorting in place).
noNAenzyme_expand = noNAenzyme_expand.sort_values(by=['entry'])

In [118]:
# Keep one row per distinct (entry, compound) pair.
expanded_without_gaps = noNAenzyme_expand.dropna()  # rows with a missing value are discarded
entry_compound_pairs = expanded_without_gaps.drop(columns=['variable'])
new = entry_compound_pairs.drop_duplicates(['entry', 'product'], keep='first')

# Flat list of the compound IDs, in table order.
compoundList = new['product'].tolist()

In [119]:
# Compound IDs of the de-duplicated (entry, compound) table; an ID can still repeat across entries.
compoundList




['C00071',
 'C01612',
 'C01450',
 'C00226',
 'C05607',
 'C22006',
 'C01179',
 'C00331',
 'C03964',
 'C02953',
 'C00272',
 'C03684',
 'C00835',
 'C01222',
 'C03117',
 'C02977',
 'C20890',
 'C20889',
 'C01179',
 'C03964',
 'C22038',
 'C04045',
 'C00065',
 'C06735',
 'C11822',
 'C00944',
 'C02637',
 'C00493',
 'C00296',
 'C00311',
 'C05662',
 'C00322',
 'C00026',
 'C00310',
 'C00309',
 'C01904',
 'C07276',
 'C18020',
 'C11937',
 'C18019',
 'C19082',
 'C20223',
 'C20222',
 'C17621',
 'C17622',
 'C20263',
 'C03684',
 'C20264',
 'C00835',
 'C20501',
 'C20502',
 'C20500',
 'C20506',
 'C20503',
 'C20507',
 'C20508',
 'C20505',
 'C20504',
 'C02669',
 'C00198',
 'C00124',
 'C00031',
 'C00116',
 'C00577',
 'C02426',
 'C00149',
 'C00036',
 'C00022',
 'C05519',
 'C03508',
 'C01888',
 'C06735',
 'C11822',
 'C00065',
 'C21140',
 'C18155',
 'C18154',
 'C18153',
 'C21374',
 'C00149',
 'C00022',
 'C00036',
 'C21610',
 'C00111',
 'C21585',
 'C03393',
 'C00111',
 'C00311',
 'C05379',
 'C00026',
 'C04236',

In [120]:
# Long table: one row per distinct (entry, compound) pair.
new

Unnamed: 0,entry,product
0,1.1.1.1,C00071
1563,1.1.1.1,C01612
521,1.1.1.1,C01450
1042,1.1.1.1,C00226
1047,1.1.1.110,C05607
2089,1.1.1.110,C22006
5,1.1.1.110,C01179
526,1.1.1.110,C00331
1568,1.1.1.110,C03964
1048,1.1.1.153,C02953


In [121]:
import re
from Bio.KEGG import Compound

In [122]:
def compound_records_to_df(file_path):
    """
    Read a gzipped text file of KEGG compound records into a pandas DataFrame.

    file_path: path string of the gzipped file; it is opened in text mode and
    handed to Bio.KEGG.Compound.parse.

    Returns a DataFrame with one row per parsed record and one column per
    public (non-underscore) attribute of Compound.Record.
    """
    template = Compound.Record()
    compound_fields = [attr for attr in dir(template) if not attr.startswith('_')]

    with gzip.open(file_path, 'rt') as handle:
        rows = [
            [getattr(record, field) for field in compound_fields]
            for record in Compound.parse(handle)
        ]

    return pd.DataFrame(rows, columns=compound_fields)

In [123]:
# Parse the gzipped KEGG compound dump into a DataFrame.
compound_df = compound_records_to_df('../datasets/KEGG_compound_db_entries.gz')

In [124]:
def extract_PubChem_id(field):
    """
    Extract the PubChem ID from the dblinks field of a record.

    field: normally a list of (database_name, [ids]) pairs; any other value is
    converted with str() and searched as text.

    Returns the first PubChem ID as a string, or '' when the field holds no
    PubChem link. The database name is matched case-insensitively.
    """
    # Structured input: read the pairs directly instead of their text representation,
    # which also covers a PubChem link that lists more than one ID.
    if isinstance(field, (list, tuple)):
        for link in field:
            if isinstance(link, (list, tuple)) and len(link) == 2:
                database, ids = link
                if (str(database).lower() == 'pubchem'
                        and isinstance(ids, (list, tuple)) and ids):
                    return str(ids[0])

    # Text fallback. Raw string so that \[ and \d reach the regex engine unchanged:
    # matches "'PubChem', ['" literally, then captures the digits of the first ID.
    regex = r"'PubChem', \['(\d+)'"
    ids = re.findall(regex, str(field), re.IGNORECASE)
    return ids[0] if ids else ''

In [125]:
# PubChem ID of every compound, taken from its dblinks field ('' when none is found).
PubChemID_list = [extract_PubChem_id(dblinks) for dblinks in compound_df['dblinks']]

compound_df['PubChem'] = PubChemID_list
compound_df.head(10)

Unnamed: 0,dblinks,entry,enzyme,formula,mass,name,pathway,structures,PubChem
0,"[(CAS, [7732-18-5]), (PubChem, [3303]), (ChEBI...",C00001,"[1.1.1.1, 1.1.1.22, 1.1.1.23, 1.1.1.115, 1.1.1...",H2O,,"[H2O, Water]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3303
1,"[(CAS, [56-65-5]), (PubChem, [3304]), (ChEBI, ...",C00002,"[1.1.98.6, 1.2.1.30, 1.2.1.95, 1.2.1.101, 1.3....",C10H16N5O13P3,,"[ATP, Adenosine 5'-triphosphate]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3304
2,"[(CAS, [53-84-9]), (PubChem, [3305]), (ChEBI, ...",C00003,"[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",C21H28N7O14P2,,"[NAD+, NAD, Nicotinamide adenine dinucleotide,...","[(PATH, map00190, Oxidative phosphorylation), ...",[],3305
3,"[(CAS, [58-68-4]), (PubChem, [3306]), (ChEBI, ...",C00004,"[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",C21H29N7O14P2,,"[NADH, DPNH, Reduced nicotinamide adenine dinu...","[(PATH, map00190, Oxidative phosphorylation), ...",[],3306
4,"[(CAS, [2646-71-1]), (PubChem, [3307]), (ChEBI...",C00005,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",C21H30N7O17P3,,"[NADPH, TPNH, Reduced nicotinamide adenine din...","[(PATH, map00195, Photosynthesis), (PATH, map0...",[],3307
5,"[(CAS, [53-59-8]), (PubChem, [3308]), (ChEBI, ...",C00006,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",C21H29N7O17P3,,"[NADP+, NADP, Nicotinamide adenine dinucleotid...","[(PATH, map00195, Photosynthesis), (PATH, map0...",[],3308
6,"[(CAS, [7782-44-7]), (PubChem, [3309]), (ChEBI...",C00007,"[1.1.1.170, 1.1.1.270, 1.1.3.2, 1.1.3.4, 1.1.3...",O2,,"[Oxygen, O2]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3309
7,"[(CAS, [58-64-0]), (PubChem, [3310]), (ChEBI, ...",C00008,"[1.3.7.7, 1.3.7.8, 1.3.7.14, 1.3.7.15, 1.17.4....",C10H15N5O10P2,,"[ADP, Adenosine 5'-diphosphate]","[(PATH, map00190, Oxidative phosphorylation), ...",[],3310
8,"[(CAS, [7664-38-2]), (PubChem, [3311]), (ChEBI...",C00009,"[1.2.1.11, 1.2.1.12, 1.2.1.13, 1.2.1.38, 1.2.1...",H3PO4,,"[Orthophosphate, Phosphate, Phosphoric acid, O...","[(PATH, map00190, Oxidative phosphorylation), ...",[],3311
9,"[(CAS, [85-61-0]), (PubChem, [3312]), (ChEBI, ...",C00010,"[1.1.1.34, 1.1.1.88, 1.1.1.-, 1.2.1.10, 1.2.1....",C21H36N7O16P3S,,"[CoA, Coenzyme A, CoA-SH]","[(PATH, map00071, Fatty acid degradation), (PA...",[],3312


In [126]:
# Attach compound details to every (enzyme entry, compound) pair.
# merge() defaults to an inner join, so pairs whose compound ID is absent from
# compound_df are dropped; both frames have an 'entry' column, which is why the
# result carries entry_x (enzyme) and entry_y (compound).
joint_enzyme_compound_df = new.merge(compound_df, left_on='product', right_on='entry')

In [127]:
# Keep the enzyme entry, compound ID, formula and PubChem ID, ordered by enzyme entry.
joint_enzyme_compound_df=joint_enzyme_compound_df[['entry_x','product','formula','PubChem']].sort_values(by=['entry_x'])

In [128]:
# Final column names: 'entry' for the enzyme entry, 'KEGG' for the compound ID.
joint_enzyme_compound_df.rename(columns={'entry_x':'entry','product':'KEGG'},inplace=True)

In [129]:
# Enzyme entry / KEGG compound ID / formula / PubChem ID table.
joint_enzyme_compound_df

Unnamed: 0,entry,KEGG,formula,PubChem
0,1.1.1.1,C00071,CHOR,3371
2,1.1.1.1,C01612,CH2OR2,4764
3,1.1.1.1,C01450,COR2,4627
4,1.1.1.1,C00226,CH3OR,3526
5,1.1.1.110,C05607,C9H10O3,7930
6,1.1.1.110,C22006,C11H11NO3,
7,1.1.1.110,C01179,C9H8O4,4406
9,1.1.1.110,C00331,C11H9NO3,3625
10,1.1.1.110,C03964,C9H10O4,6685
16,1.1.1.153,C00835,C9H11N5O3,4093
