# MetaNetX data preprocessing

Correlate `reac_prop.tsv` and `chem_prop.tsv` to convert the dataset into a useful format (SMILES, InChI or InChIKey).

Two methods, one simpler and one more comprehensive.

1. Discard all reaction information and keep only the list of metabolites involved in reactions; these are treated as "bioreachable".
2. Retain all reaction information but convert the MNX IDs into useful chemical representations.

In [1]:
import ast

import numpy as np
import pandas as pd
from tqdm import tqdm, trange

In [2]:
# Load the cleaned reaction table.
reac = pd.read_csv('reac_clean.csv')

# The list-valued columns come back from the CSV as strings, so they have to be
# parsed into Python objects again. ast.literal_eval only accepts literals
# (lists, strings, numbers, ...), whereas eval() would execute any expression
# that happens to be stored in the file.
# NOTE(review): assumes every cell in these columns is a plain list literal,
# as the displayed output suggests -- confirm against reac_clean.csv.
for col in ['metabolites', 'substrates', 'products']:
    reac[col] = reac[col].apply(ast.literal_eval)
reac

Unnamed: 0,#ID,mnx_equation,equation,metabolites,substrates,products
0,EMPTY,=,=,[],[],[]
1,MNXR01,1 MNXM01@MNXD1 = 1 MNXM1@MNXD1,1 MNXM01 = 1 MNXM1,"[MNXM1, MNXM01]",[MNXM01],[MNXM1]
2,MNXR02,1 MNXM1@MNXD1 = 1 MNXM1@MNXD2,1 MNXM1 = 1 MNXM1,[MNXM1],[MNXM1],[MNXM1]
3,MNXR03,1 MNXM01@MNXD1 = 1 MNXM01@MNXD2,1 MNXM01 = 1 MNXM01,[MNXM01],[MNXM01],[MNXM01]
4,MNXR100000,1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,1 MNXM10958 + 1 MNXM1104529 = 1 MNXM1102128 + ...,"[MNXM1104529, MNXM8415, MNXM10958, MNXM1102128]","[MNXM10958, MNXM1104529]","[MNXM1102128, MNXM8415]"
...,...,...,...,...,...,...
74608,MNXR99995,1 MNXM1100890@MNXD1 + 1 MNXM147451@MNXD1 = 1 M...,1 MNXM1100890 + 1 MNXM147451 = 1 MNXM1102128 +...,"[MNXM8416, MNXM147451, MNXM1102128, MNXM1100890]","[MNXM1100890, MNXM147451]","[MNXM1102128, MNXM8416]"
74609,MNXR99996,1 MNXM1100890@MNXD1 + 1 MNXM163780@MNXD1 = 1 M...,1 MNXM1100890 + 1 MNXM163780 = 1 MNXM1102128 +...,"[MNXM8416, MNXM163780, MNXM1102128, MNXM1100890]","[MNXM1100890, MNXM163780]","[MNXM1102128, MNXM8416]"
74610,MNXR99997,1 MNXM1102128@MNXD1 + 1 MNXM147449@MNXD1 = 1 M...,1 MNXM1102128 + 1 MNXM147449 = 1 MNXM1104529 +...,"[MNXM148157, MNXM147449, MNXM1104529, MNXM1102...","[MNXM1102128, MNXM147449]","[MNXM1104529, MNXM148157]"
74611,MNXR99998,1 MNXM10945@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,1 MNXM10945 + 1 MNXM1104529 = 1 MNXM10946 + 1 ...,"[MNXM10945, MNXM10946, MNXM1104529, MNXM1102128]","[MNXM10945, MNXM1104529]","[MNXM10946, MNXM1102128]"


In [3]:
# Compound table: one row per '#ID', with the SMILES / InChI / InChIKey columns
# that findIdentifier() reads from further down.
chem = pd.read_csv(filepath_or_buffer='mnx_chem_bioreachable.csv')
chem

Unnamed: 0,#ID,name,reference,formula,charge,mass,InChI,InChIKey,SMILES
0,MNXM01,PMF,mnx:PMF,H,1.0,1.00794,1S/p+1,GPRLSGONYQIRFK-UHFFFAOYSA-N,[H+]
1,MNXM1,H(+),mnx:PROTON,H,1.0,1.00794,1S/p+1,GPRLSGONYQIRFK-UHFFFAOYSA-N,[H+]
2,MNXM10,NADH,chebi:57945,C21H27N7O14P2,-2.0,663.11022,1S/C21H29N7O14P2/c22-17-12-19(25-7-24-17)28(8-...,BOPGDPNILDQYTO-NNYOXOHSSA-L,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...
3,MNXM100,(2E)-geranyl diphosphate,chebi:58057,C10H17O7P2,-3.0,311.04660,"1S/C10H20O7P2/c1-9(2)5-4-6-10(3)7-8-16-19(14,1...",GVVPGTZRZFNKDS-JXMROGBWSA-K,CC(C)=CCC/C(C)=C/COP(=O)([O-])OP(=O)([O-])[O-]
4,MNXM10002,3-deoxycapsidiol,chebi:72642,C15H24O,0.0,220.18272,1S/C15H24O/c1-10(2)12-6-7-13-14(16)8-5-11(3)15...,NJWPLFBOSCSZFA-QHSBEEBCSA-N,C=C(C)[C@@H]1CC=C2[C@H](O)CC[C@@H](C)[C@@]2(C)C1
...,...,...,...,...,...,...,...,...,...
42547,MNXM9994,"5-chlorobenzoate-cis-3,4-diol",metacycM:CPD-11220,C7H6ClO4,-1.0,188.99601,1S/C7H7ClO4/c8-4-1-3(7(11)12)2-5(9)6(4)10/h1-2...,GNYUNLRRAAQENB-NTSWFWBYSA-M,O=C([O-])C1=C[C@H](O)[C@H](O)C(Cl)=C1
42548,MNXM9995,3-chlorotoluene,metacycM:CPD-10654,C7H7Cl,0.0,126.02363,"1S/C7H7Cl/c1-6-3-2-4-7(8)5-6/h2-5H,1H3",OSOUNOBYRMOXQQ-UHFFFAOYSA-N,Cc1cccc(Cl)c1
42549,MNXM99969,compound 0043171,envipathM:650babc9-9d68-4b73-9332-11972ca26f7b...,C35H72,0.0,492.56340,1S/C35H72/c1-3-5-7-9-11-13-15-17-19-21-23-25-2...,VHQQPFLOGSTQPC-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
42550,MNXM9999,3-demethylubiquinol-7,chebi:84431,C43H66O4,0.0,646.49611,1S/C43H66O4/c1-31(2)17-11-18-32(3)19-12-20-33(...,OHBHBMXNJCUMCR-DKCCAHEHSA-N,COc1c(O)c(O)c(C)c(C/C=C(\C)CC/C=C(\C)CC/C=C(\C...


## Retaining RXN info

In [4]:
def findIdentifier(mnxid_list, identifier='SMILES'):
    """Translate MNX compound IDs into the requested chemical identifier.

    Looks the IDs up in the module-level ``chem`` DataFrame (column ``'#ID'``).

    Parameters
    ----------
    mnxid_list : list-like of str
        MNX compound IDs to translate.
    identifier : str, default 'SMILES'
        Name of the ``chem`` column to return.

    Returns
    -------
    list
        One value per ID found in ``chem``, in the same order as
        ``mnxid_list``. IDs that are absent from ``chem`` are skipped, so the
        result may be shorter than the input. If ``chem`` lists an ID more
        than once, the value from its last row is used.
    """
    ids = list(mnxid_list)

    # Narrow chem down to the requested IDs with a single boolean mask
    # (.loc avoids chained indexing).
    matches = chem.loc[chem['#ID'].isin(ids), ['#ID', identifier]]

    # The mask yields rows in chem's order, not the caller's; re-key by ID so
    # the output lines up positionally with mnxid_list.
    value_by_id = dict(zip(matches['#ID'].tolist(), matches[identifier].tolist()))
    return [value_by_id[mnx_id] for mnx_id in ids if mnx_id in value_by_id]

In [5]:
# Translate each reaction's substrate and product ID lists with
# findIdentifier() (default identifier: SMILES) and store them as new columns.
substrates_column = reac['substrates']
products_column = reac['products']

substrates_identifiers, products_identifiers = [], []
# Rows are addressed by label 0..len(reac)-1, i.e. the default integer index.
for row_label in trange(len(reac)):
    substrates_identifiers += [findIdentifier(substrates_column.loc[row_label])]
    products_identifiers += [findIdentifier(products_column.loc[row_label])]

reac['substrates_SMILES'] = substrates_identifiers
reac['products_SMILES'] = products_identifiers

100%|██████████| 74613/74613 [06:37<00:00, 187.83it/s]


In [6]:
# Same translation as for SMILES, but asking findIdentifier() for the InChI
# column instead.
substrates_column = reac['substrates']
products_column = reac['products']

substrates_identifiers, products_identifiers = [], []
# Rows are addressed by label 0..len(reac)-1, i.e. the default integer index.
for row_label in trange(len(reac)):
    substrates_identifiers += [
        findIdentifier(substrates_column.loc[row_label], identifier='InChI')
    ]
    products_identifiers += [
        findIdentifier(products_column.loc[row_label], identifier='InChI')
    ]

reac['substrates_InChI'] = substrates_identifiers
reac['products_InChI'] = products_identifiers

100%|██████████| 74613/74613 [06:20<00:00, 195.93it/s]


In [7]:
# Same translation again, this time asking findIdentifier() for the InChIKey
# column.
substrates_column = reac['substrates']
products_column = reac['products']

substrates_identifiers, products_identifiers = [], []
# Rows are addressed by label 0..len(reac)-1, i.e. the default integer index.
for row_label in trange(len(reac)):
    substrates_identifiers += [
        findIdentifier(substrates_column.loc[row_label], identifier='InChIKey')
    ]
    products_identifiers += [
        findIdentifier(products_column.loc[row_label], identifier='InChIKey')
    ]

reac['substrates_InChIKey'] = substrates_identifiers
reac['products_InChIKey'] = products_identifiers

100%|██████████| 74613/74613 [05:55<00:00, 210.13it/s]


In [8]:
# Persist the annotated reaction table; index=False keeps the row index out of
# the file.
reac.to_csv(path_or_buf='mnx_reac_bioreachable.csv', index=False)