In [65]:
from __future__ import print_function
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pubchempy
from pandas import Series, DataFrame
from rdkit import Chem
from IPython.core.pylabtools import figsize

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Set the default figure size for subsequent plots to 16 x 8.
figsize(16, 8)

# Load Data and compute InChI Keys

In [66]:
# Read the training table, the test table and the gzipped HMDB dump.
# keep_default_na=False switches off pandas' default NaN markers, so cells
# such as '' or 'NA' are kept as plain strings.
csv_options = dict(keep_default_na=False)
df_train = pd.read_csv('../../data/TableS2_training.csv', **csv_options)
df_test = pd.read_csv('../../data/TableS3_training.csv', **csv_options)
df_hmdb = pd.read_csv('../../data/hmdb.csv.gz', low_memory=False, **csv_options)


def _to_prefixed_inchi_key(inchi):
    """Return the RDKit InChI key for `inchi`, prefixed with 'InChIKey='.

    The prefix is presumably there to match the format of HMDB's INCHI_KEY
    column, against which these keys are compared -- TODO confirm.
    """
    return 'InChIKey=' + Chem.inchi.InchiToInchiKey(inchi)


# Calculate InChI Key
df_train['InChI_Key'] = df_train['InChI'].map(_to_prefixed_inchi_key)
df_test['InChI_Key'] = df_test['InChI'].map(_to_prefixed_inchi_key)

In [68]:
# Count how many HMDB entries carry each InChI key. A single value_counts()
# pass over HMDB replaces a full boolean scan of the table per compound.
hmdb_key_counts = df_hmdb['INCHI_KEY'].value_counts()

for df in [df_train, df_test]:
    # Keys that do not occur in HMDB map to NaN; record them as 0 matches.
    df['Match'] = df['InChI_Key'].map(hmdb_key_counts).fillna(0).astype(int)

# Report every compound that is about to be dropped for lack of an HMDB entry.
for df in [df_train, df_test]:
    for name in df.loc[df['Match'] == 0, 'Name']:
        print('Skipping', name)

# Keep only compounds with at least one HMDB match. The explicit copy makes
# the result an independent frame, so re-running this cell (which assigns the
# 'Match' column again) does not write into a slice of the previous frame.
df_train = df_train[df_train['Match'] > 0].copy()
df_test = df_test[df_test['Match'] > 0].copy()

Skipping L-Methionine sulfone
Skipping o-L-Tyrosine


In [69]:
# Collect exactly one HMDB record -- the first occurrence -- per distinct
# InChI key found in the training and test tables.
#
# Collecting one record per *row* would put the same HMDB entry into the list
# twice whenever a key occurs twice (e.g. a compound present in both tables);
# a lookup table with repeated INCHI_KEY values then multiplies rows in any
# later left merge on that key.

# First HMDB row for every key, and a key -> row-label lookup built in one pass
# (instead of scanning all of df_hmdb once per compound).
hmdb_first = df_hmdb.drop_duplicates(subset='INCHI_KEY')
first_label_by_key = dict(zip(hmdb_first['INCHI_KEY'], hmdb_first.index))

# Distinct keys, in order of first appearance (training table first).
wanted_keys = pd.concat([df_train['InChI_Key'], df_test['InChI_Key']]).drop_duplicates()

# A key without an HMDB entry raises KeyError here; rows with Match == 0 are
# expected to have been filtered out of both tables before this point.
hmdb_data = [hmdb_first.loc[first_label_by_key[key]] for key in wanted_keys]

In [70]:
# Stack the collected HMDB records (one Series per row) into a single table.
df_hmdb_match = pd.DataFrame(data=hmdb_data)

In [71]:
# Attach the HMDB columns to each table. A left join keeps every compound row
# of the left-hand table and pairs 'InChI_Key' with HMDB's 'INCHI_KEY'.
merge_options = dict(how='left', left_on='InChI_Key', right_on='INCHI_KEY')
df_train_extended = df_train.merge(df_hmdb_match, **merge_options)
df_test_extended = df_test.merge(df_hmdb_match, **merge_options)

In [72]:
# Persist the HMDB-annotated training table (the file keeps every column).
df_train_extended.to_csv('../../data/TableS2_training_hmdb.csv', encoding='utf-8', index=False)
# DataFrame.drop returns a new frame rather than modifying in place, so a bare
# drop() call has no effect. Chain it into the preview instead: the duplicate
# key column is hidden from the display, while df_train_extended and the saved
# file are left untouched.
df_train_extended.drop(['INCHI_KEY'], axis=1).head(3)

Unnamed: 0,Name,Annotation,Short Name,InChI,MH+ Fragment,MH+ Isotope,fragment/isotope,MV,logD,abs_mob,...,JCHEM_DONOR_COUNT,JCHEM_LOGP,JCHEM_PHYSIOLOGICAL_CHARGE,JCHEM_POLARIZABILITY,JCHEM_POLAR_SURFACE_AREA,JCHEM_REFRACTIVITY,JCHEM_ROTATABLE_BOND_COUNT,IUPAC_NAME,TRADITIONAL_IUPAC_NAME,SYNONYMS
0,L-Alanine,a,Ala,"InChI=1S/C3H7NO2/c1-2(4)3(5)6/h2H,4H2,1H3,(H,5...",90,,,70.3,-2.79,3.14,...,2,-2.8,0,8.54,63.32,20.5,1,(2S)-2-aminopropanoic acid,L-alanine,(2S)-2-Aminopropanoate;(2S)-2-Aminopropanoic a...
1,β-Alanine,a,β-Ala,"InChI=1S/C3H7NO2/c4-2-1-3(5)6/h1-2,4H2,(H,5,6)",90,,,70.4,-3.01,4.14,...,2,-3.2,0,8.62,63.32,20.7,2,3-aminopropanoic acid,β-alanine,2-Carboxyethylamine;3-Amino-Propanoate;3-Amino...
2,γ-Aminobutyric acid,a,GABA,"InChI=1S/C4H9NO2/c5-3-1-2-4(6)7/h1-3,5H2,(H,6,7)",104,,,89.2,-3.1,0.000409,...,2,-2.9,0,10.62,63.32,25.46,3,4-aminobutanoic acid,gamma(amino)-butyric acid,3-Carboxypropylamine;4-Aminobutanoate;4-Aminob...


In [73]:
# Persist the HMDB-annotated test table (the file keeps every column).
df_test_extended.to_csv('../../data/TableS3_training_hmdb.csv', encoding='utf-8', index=False)
# DataFrame.drop returns a new frame rather than modifying in place, so a bare
# drop() call has no effect. Chain it into the preview instead: the duplicate
# key column is hidden from the display, while df_test_extended and the saved
# file are left untouched.
df_test_extended.drop(['INCHI_KEY'], axis=1).head(3)

Unnamed: 0,Name,Short Name,InChI,MH+,MV,logD,abs_mob,zeff,RRF_Measured,RRF_Predicted,...,JCHEM_DONOR_COUNT,JCHEM_LOGP,JCHEM_PHYSIOLOGICAL_CHARGE,JCHEM_POLARIZABILITY,JCHEM_POLAR_SURFACE_AREA,JCHEM_REFRACTIVITY,JCHEM_ROTATABLE_BOND_COUNT,IUPAC_NAME,TRADITIONAL_IUPAC_NAME,SYNONYMS
0,L-Ornithine,Orn,"InChI=1S/C5H12N2O2/c6-3-1-2-4(7)5(8)9/h4H,1-3,...",133,113,-4.22,0.000686,1.82,0.281,0.631,...,3,-3.7,1,13.94,89.34,33.21,4,"(2S)-2,5-diaminopentanoic acid",L-ornithine,"(+)-S-Ornithine;(S)-2,5-Diaminopentanoate;(S)-..."
1,N-Methyl-Aspartic acid,MeAsp,"InChI=1S/C5H9NO4/c1-6-3(5(9)10)2-4(7)8/h3,6H,2...",148,114,-2.58,0.000348,0.71,0.842,0.667,...,3,-3.3,-1,13.18,86.63,31.31,4,(2R)-2-(methylamino)butanedioic acid,N-methyl-D-aspartate,N-Me-D-Asp-OH;N-Methyl D-aspartate;N-Methyl D-...
2,3-Methyl-L-Histidine,MeHis,InChI=1S/C7H11N3O2/c1-10-4-9-3-5(10)2-6(8)7(11...,170,144,-2.94,0.00062,1.49,1.4,2.28,...,2,-3.4,0,16.58,81.14,42.96,3,(2S)-2-amino-3-(1-methyl-1H-imidazol-5-yl)prop...,3-methylhistidine,(2S)-2-amino-3-(1-methyl-1H-imidazol-5-yl)prop...
