In [2]:
import pandas as pd
import numpy as np

In [3]:
reac = pd.read_csv('reac_prop.tsv', sep='\t', header=351) # skip 351 lines of documentation
reac

Unnamed: 0,#ID,mnx_equation,reference,classifs,is_balanced,is_transport
0,EMPTY,=,mnx:EMPTY,,B,
1,MNXR01,1 MNXM01@MNXD1 = 1 MNXM1@MNXD1,mnx:MNXR01,,B,
2,MNXR02,1 MNXM1@MNXD1 = 1 MNXM1@MNXD2,mnx:MNXR02,,B,T
3,MNXR03,1 MNXM01@MNXD1 = 1 MNXM01@MNXD2,mnx:MNXR03,,B,T
4,MNXR100000,1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,biggR:GALNACT5g,,,
...,...,...,...,...,...,...
74608,MNXR99995,1 MNXM1100890@MNXD1 + 1 MNXM147451@MNXD1 = 1 M...,biggR:GALNACT1g,,,
74609,MNXR99996,1 MNXM1100890@MNXD1 + 1 MNXM163780@MNXD1 = 1 M...,biggR:GALNACT1g_cho,,,
74610,MNXR99997,1 MNXM1102128@MNXD1 + 1 MNXM147449@MNXD1 = 1 M...,biggR:GALNACT2g,,,
74611,MNXR99998,1 MNXM10945@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,biggR:GALNACT3g,,,


In [4]:
def preprocess_rex(rex):
    """
    rex: one reaction, such as '1 MNXM01@MNXD1 = 1 MNXM1@MNXD1'

    output: 
        - reaction formula without compartment (MNXM01 instead of MNXM01@MNXD1)
        - list of metabolites involved in the reaction
        - list of substrates
        - list of products
    """
    # compartment can only be @MNXD1 or @MNXD2
    rex_clean = rex.replace('@MNXD1', '').replace('@MNXD2', '')

    metabolites = take_MNXM(rex_clean.split(' '))
    # drop duplicates
    metabolites = list(set(metabolites))

    substrates, products = rex_clean.split('=')
    substrates = take_MNXM(substrates.split(' '))
    products = take_MNXM(products.split(' '))

    return rex_clean, metabolites, substrates, products


def take_MNXM(str_list):
    """
    Helper function for preprocessing.
    """
    return [mol for mol in str_list if mol.startswith('MNXM')]

In [5]:
reac_prep = []
for rex in reac['mnx_equation']:
    reac_prep.append(preprocess_rex(rex))
reac_prep = np.array(reac_prep, dtype=object)

reac = reac[['#ID', 'mnx_equation']]
reac['equation'] = reac_prep[:, 0]
reac['metabolites'] = reac_prep[:, 1]
reac['substrates'] = reac_prep[:, 2]
reac['products'] = reac_prep[:, 3]

reac

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reac['equation'] = reac_prep[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reac['metabolites'] = reac_prep[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reac['substrates'] = reac_prep[:, 2]


Unnamed: 0,#ID,mnx_equation,equation,metabolites,substrates,products
0,EMPTY,=,=,[],[],[]
1,MNXR01,1 MNXM01@MNXD1 = 1 MNXM1@MNXD1,1 MNXM01 = 1 MNXM1,"[MNXM1, MNXM01]",[MNXM01],[MNXM1]
2,MNXR02,1 MNXM1@MNXD1 = 1 MNXM1@MNXD2,1 MNXM1 = 1 MNXM1,[MNXM1],[MNXM1],[MNXM1]
3,MNXR03,1 MNXM01@MNXD1 = 1 MNXM01@MNXD2,1 MNXM01 = 1 MNXM01,[MNXM01],[MNXM01],[MNXM01]
4,MNXR100000,1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,1 MNXM10958 + 1 MNXM1104529 = 1 MNXM1102128 + ...,"[MNXM1102128, MNXM1104529, MNXM8415, MNXM10958]","[MNXM10958, MNXM1104529]","[MNXM1102128, MNXM8415]"
...,...,...,...,...,...,...
74608,MNXR99995,1 MNXM1100890@MNXD1 + 1 MNXM147451@MNXD1 = 1 M...,1 MNXM1100890 + 1 MNXM147451 = 1 MNXM1102128 +...,"[MNXM1100890, MNXM147451, MNXM8416, MNXM1102128]","[MNXM1100890, MNXM147451]","[MNXM1102128, MNXM8416]"
74609,MNXR99996,1 MNXM1100890@MNXD1 + 1 MNXM163780@MNXD1 = 1 M...,1 MNXM1100890 + 1 MNXM163780 = 1 MNXM1102128 +...,"[MNXM1100890, MNXM1102128, MNXM163780, MNXM8416]","[MNXM1100890, MNXM163780]","[MNXM1102128, MNXM8416]"
74610,MNXR99997,1 MNXM1102128@MNXD1 + 1 MNXM147449@MNXD1 = 1 M...,1 MNXM1102128 + 1 MNXM147449 = 1 MNXM1104529 +...,"[MNXM1102128, MNXM1104529, MNXM147449, MNXM148...","[MNXM1102128, MNXM147449]","[MNXM1104529, MNXM148157]"
74611,MNXR99998,1 MNXM10945@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,1 MNXM10945 + 1 MNXM1104529 = 1 MNXM10946 + 1 ...,"[MNXM1102128, MNXM1104529, MNXM10945, MNXM10946]","[MNXM10945, MNXM1104529]","[MNXM10946, MNXM1102128]"


In [6]:
chem = pd.read_csv('chem_prop.tsv', sep='\t', header=351) # skip 351 lines of documentation
chem

Unnamed: 0,#ID,name,reference,formula,charge,mass,InChI,InChIKey,SMILES
0,BIOMASS,BIOMASS,mnx:BIOMASS,,,,,,
1,MNXM01,PMF,mnx:PMF,H,1.0,1.00794,InChI=1S/p+1,InChIKey=GPRLSGONYQIRFK-UHFFFAOYSA-N,[H+]
2,MNXM02,OH(-),mnx:HYDROXYDE,OH,-1.0,17.00734,InChI=1S/H2O/h1H2/p-1,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-M,[O-][H]
3,MNXM03,H3O(+),mnx:OXONIUM,H3O,1.0,19.02322,InChI=1S/H2O/h1H2/p+1,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-O,[OH3+]
4,MNXM1,H(+),mnx:PROTON,H,1.0,1.00794,InChI=1S/p+1,InChIKey=GPRLSGONYQIRFK-UHFFFAOYSA-N,[H+]
...,...,...,...,...,...,...,...,...,...
1292149,MNXM999996,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692384,C73H121NO9P,-1.0,1186.87844,InChI=1S/C73H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=VNZHXXLXDVSBLA-IZNAGHOASA-M,CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)NC...
1292150,MNXM999997,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692385,C69H121NO9P,-1.0,1138.87844,InChI=1S/C69H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=FTBDAPNXHPOOLH-RUXWUTLCSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...
1292151,MNXM999998,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692386,C69H119NO9P,-1.0,1136.86279,InChI=1S/C69H120NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=UTTKGJJHRRHZRR-BNJOEXAFSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...
1292152,MNXM999999,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692387,C71H121NO9P,-1.0,1162.87844,InChI=1S/C71H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=BSOMIPWIPDKFRW-FMCFKDERSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...
