In [1]:
import numpy as np
import pandas as pd
import math
import ast

# Additional task
Find the reactions of a pathway in central metabolism, e.g. the TCA cycle.

In [3]:
# Read the pathway CSV into a frame whose single named column is 'Col'.
TCA_raw = pd.read_csv('../datasets/TCA_pathway.csv', names = ['Col'])

In [4]:
# Preview the last 50 rows of the raw table.
TCA_raw.tail(50)

Unnamed: 0,Col
64,CREDITS - caspi
65,"DBLINKS - (ECOCYC ""TCA"" NIL |paley| 3392397157..."
66,"DBLINKS - (ARACYC ""TCA"" NIL |green| 3381011399..."
67,ENZYMES-NOT-USED - CPLX0-251
68,IN-PATHWAY - TCA-GLYOX-BYPASS
69,KEY-REACTIONS - ISOCITDEH-RXN
70,KEY-REACTIONS - MALATE-DEHYDROGENASE-ACCEPTOR-RXN
71,KEY-REACTIONS - SUCCCOASYN-RXN
72,PATHWAY-LINKS - (ACETYL-COA (|Fatty-Acid-Degra...
73,"PREDECESSORS - (""FUMHYDR-RXN"" ""RXN-14971"")"


In [5]:
# Positional row offset at which the scan starts; rows 0-89 are never examined.
# NOTE(review): presumably the record of interest begins near row 90 of this
# file -- confirm against the CSV before reusing this cell with another export.
TCA_RECORD_START = 90
# Attribute key whose values are collected.
rxn_str = 'REACTION-LIST'
# Rows read "<KEY> - <value>"; the extra 3 characters skip the ' - ' separator.
rxn_value_offset = len(rxn_str) + 3

# Keep the value part of every row whose text starts with the key. The
# isinstance guard skips non-string cells (e.g. NaN), which cannot be sliced.
TCA_rxn = [
    cell[rxn_value_offset:]
    for cell in TCA_raw['Col'].iloc[TCA_RECORD_START:]
    if isinstance(cell, str) and cell.startswith(rxn_str)
]

In [6]:
# Show the reaction IDs collected from the REACTION-LIST rows.
TCA_rxn

['RXN-14971',
 'MALATE-DEH-RXN',
 'ISOCITDEH-RXN',
 'MALATE-DEHYDROGENASE-ACCEPTOR-RXN',
 'ACONITATEDEHYDR-RXN',
 'CITSYN-RXN',
 'ACONITATEHYDR-RXN',
 '2OXOGLUTARATEDEH-RXN',
 'SUCCCOASYN-RXN',
 'FUMHYDR-RXN']

In [8]:
def recover_list(df, column):
    """Convert a column of list-formatted strings (as read from .csv) into lists.

    The frame is modified in place: ``df[column]`` is replaced by a column in
    which every string cell has been parsed with ``ast.literal_eval`` and every
    non-string cell (e.g. NaN from an empty CSV field) has become ``[]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to convert; mutated in place.
    column : hashable
        Label of the column to convert.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the first cell is already a list, i.e. the column was already
        converted.
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string cell is not a valid
        Python literal.
    """
    cells = df[column]
    # Inspect the first cell by position. df[column][0] is a label lookup, which
    # is deprecated for non-integer indexes and fails outright on empty frames.
    if len(cells) > 0:
        assert not isinstance(cells.iloc[0], list), "TypeError: The data type is already a list, it should not be converted again"
    # Parse string cells; anything else (missing values) becomes an empty list.
    df[column] = [
        ast.literal_eval(cell) if isinstance(cell, str) else []
        for cell in cells
    ]
    return

In [10]:
# --- Compounds: index by UNIQUE-ID and store PubChemID as integers ---
df_cpd = pd.read_csv('df_cpd.csv', index_col=0).set_index('UNIQUE-ID')
# Missing PubChem IDs are filled with 0 so the column can be cast to int.
PubChemID_int = df_cpd['PubChemID'].fillna(0).astype(int)
df_cpd['PubChemID'] = PubChemID_int

# --- Reactions: index by UNIQUE-ID and turn list-formatted strings into lists ---
df_rxn = pd.read_csv('parsed_rxns.csv', index_col=0).set_index('UNIQUE-ID')
rxn_list_fix = ['EC-NUMBER', 'ERXN-NUMBER', 'SUBSTRATES', 'PRODUCTS']
for col in rxn_list_fix:
    recover_list(df_rxn, col)


In [20]:
# Columns of the TCA summary table, in display order.
tca_columns = ['ERXN-NUMBER', 'EC-NUMBER', 'SUBSTRATES', 'PRODUCTS', 'GIBBS']

# Print each reaction record for inspection.
for item in TCA_rxn:
    print(item, '\n', df_rxn.loc[item], '\n')

# Select all TCA reactions with one label-based lookup instead of growing an
# empty frame one row at a time (row order follows TCA_rxn; a missing reaction
# ID raises KeyError).
df_TCA = df_rxn.loc[TCA_rxn, tca_columns]
# Leave the index unnamed, as it was when rows were appended to an empty frame,
# so the header of the CSV written from df_TCA does not change.
df_TCA.index.name = None

RXN-14971 
 ERXN-NUMBER    [ENZRXN-24875, ENZRXN-24874, ENZRXN-12464]
EC-NUMBER                                    [EC-1.3.5.1]
SUBSTRATES                            [ETR-Quinones, SUC]
PRODUCTS                               [ETR-Quinols, FUM]
GIBBS                                            -5.04225
Name: RXN-14971, dtype: object 

MALATE-DEH-RXN 
 ERXN-NUMBER    [MALATE-DEH-ENZRXN, ENZRXN-22204, ENZRXN66-196...
EC-NUMBER                                          [EC-1.1.1.37]
SUBSTRATES                                            [MAL, NAD]
PRODUCTS                         [OXALACETIC_ACID, NADH, PROTON]
GIBBS                                                    4.46176
Name: MALATE-DEH-RXN, dtype: object 

ISOCITDEH-RXN 
 ERXN-NUMBER    [ISOCITDEH-ENZRXN, ENZRXN3O-1406, ENZRXN-12349...
EC-NUMBER                                          [EC-1.1.1.42]
SUBSTRATES                          [THREO-DS-ISO-CITRATE, NADP]
PRODUCTS                [2-KETOGLUTARATE, CARBON-DIOXIDE, NADPH]
GIBBS    

In [22]:
# Write the TCA table to 'df_TCA.csv', then display it as the cell output.
df_TCA.to_csv('df_TCA.csv')
df_TCA

Unnamed: 0,ERXN-NUMBER,EC-NUMBER,SUBSTRATES,PRODUCTS,GIBBS
RXN-14971,"[ENZRXN-24875, ENZRXN-24874, ENZRXN-12464]",[EC-1.3.5.1],"[ETR-Quinones, SUC]","[ETR-Quinols, FUM]",-5.042252
MALATE-DEH-RXN,"[MALATE-DEH-ENZRXN, ENZRXN-22204, ENZRXN66-196...",[EC-1.1.1.37],"[MAL, NAD]","[OXALACETIC_ACID, NADH, PROTON]",4.461761
ISOCITDEH-RXN,"[ISOCITDEH-ENZRXN, ENZRXN3O-1406, ENZRXN-12349...",[EC-1.1.1.42],"[THREO-DS-ISO-CITRATE, NADP]","[2-KETOGLUTARATE, CARBON-DIOXIDE, NADPH]",0.375854
MALATE-DEHYDROGENASE-ACCEPTOR-RXN,[ENZRXN0-282],[EC-1.1.5.4],"[ETR-Quinones, MAL]","[ETR-Quinols, OXALACETIC_ACID]",-14.94226
ACONITATEDEHYDR-RXN,"[ACONITATEDEHYDR-ENZRXN, ACONITATEDEHYDRB-ENZR...",[],[CIT],"[CIS-ACONITATE, WATER]",3.352997
CITSYN-RXN,"[CITSYN-ENZRXN, ENZRXN0-7594, ENZRXN3O-1609, E...","[EC-2.3.3.16, EC-2.3.3.1]","[ACETYL-COA, WATER, OXALACETIC_ACID]","[CIT, CO-A, PROTON]",-10.515259
ACONITATEHYDR-RXN,"[ACONITATEHYDR-ENZRXN, ACONITATEHYDRB-ENZRXN, ...",[],"[CIS-ACONITATE, WATER]",[THREO-DS-ISO-CITRATE],-3.132996
2OXOGLUTARATEDEH-RXN,"[2OXOGLUTARATEDEH-ENZRXN, ENZRXN-23162, ENZRXN...",[EC-1.2.1.M9],"[2-KETOGLUTARATE, CO-A, NAD]","[SUC-COA, CARBON-DIOXIDE, NADH]",-11.366394
SUCCCOASYN-RXN,"[SUCCCOASYN-ENZRXN, ENZRXN-22640, ENZRXN-22263...",[EC-6.2.1.5],"[SUC, CO-A, ATP]","[SUC-COA, ADP, Pi]",-1.054077
FUMHYDR-RXN,"[FUMARC-ENZRXN, ENZRXN0-8343, ENZRXN0-8344, FU...",[EC-4.2.1.2],[MAL],"[FUM, WATER]",0.613007
