In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pubchempy
from pandas import Series, DataFrame
from rdkit import Chem
from IPython.core.pylabtools import figsize

%matplotlib inline
figsize(16, 8)

# Load Data and compute InChI Keys

In [2]:
df_std1 = pd.read_csv('../../data/std1_mh_11_pubchem.csv', keep_default_na=False)
df_std2 = pd.read_csv('../../data/std2_mh_11_pubchem.csv', keep_default_na=False)
df_hmdb = pd.read_csv('../../data/hmdb.csv.gz', low_memory=False, keep_default_na=False)

# Correct InChI Key
df_std1['InChI_Key'] = df_std1['InChI_Key'].apply(lambda x: 'InChIKey=' + x)
df_std2['InChI_Key'] = df_std2['InChI_Key'].apply(lambda x: 'InChIKey=' + x)

In [3]:
df_std1.head(3)

Unnamed: 0,Name,Formula,Adduct,Dil_1_1,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,molecular_formula,rotatable_bond_count,tpsa,xlogp
0,Spermidine,C7H19N3,M+H,24278942,"InChI=1S/C7H19N3/c8-4-1-2-6-10-7-3-5-9/h10H,1-9H2",InChIKey=ATHGHQPFGPMSJY-UHFFFAOYSA-N,1102,56.8,145.157898,3,3,10,C7H19N3,7,64.1,-1.0
1,Glycerol,C3H8O3,M+H,2168900,"InChI=1S/C3H8O3/c4-1-3(6)2-5/h3-6H,1-2H2",InChIKey=PEDCQBHIVMGVHV-UHFFFAOYSA-N,753,25.2,92.047344,3,3,6,C3H8O3,2,60.7,-1.8
2,Inosine,C10H12N4O5,M+H,70263872,InChI=1S/C10H12N4O5/c15-1-4-6(16)7(17)10(19-4)...,InChIKey=UGQMRVRMYYASKQ-KQYNXXCUSA-N,6021,405.0,268.08077,6,4,19,C10H12N4O5,2,129.0,-2.1


In [4]:
for df in [df_std1, df_std2]:
    df['Match'] = df['InChI_Key'].apply(lambda x: len(df_hmdb[df_hmdb['INCHI_KEY'] == x]))

for df in [df_std1, df_std2]:
    for i, row in df.iterrows():
        if row['Match'] == 0:
            print('Skipping', row['Name'])

print("std1 Compunds before:", len(df_std1))
df_std1 = df_std1[df_std1['Match'] > 0]
print("std1 Compunds after:", len(df_std1))

print("std2 Compunds before:", len(df_std1))
df_std2 = df_std2[df_std2['Match'] > 0]
print("std2 Compunds after:", len(df_std1))

Skipping N2-Acetyl-L-lysine
Skipping sn-glycero-3-Phosphocholine
Skipping Phenolsulfonphthalein
Skipping Phenylhydrazine
Skipping L-Rhamnose
Skipping MOPS
Skipping L-Cysteate
Skipping CMP
Skipping IDP
Skipping Deoxyribose
Skipping Deoxyadenosine
Skipping L-2-Aminoadipate
Skipping HEPES
Skipping Eflornithine
Skipping Taurocholate
Skipping Pantothenate
Skipping N-Acetylneuraminate
Skipping D-Glucuronolactone
std1 Compunds before: 63
std1 Compunds after: 53
std2 Compunds before: 53
std2 Compunds after: 53


In [5]:
hmdb_data = []

for df in [df_std1, df_std2]:
    for i, row in df.iterrows():
        df_h = df_hmdb[df_hmdb['INCHI_KEY'] == row['InChI_Key']]
        hmdb_data.append(df_h.iloc[0])

In [6]:
df_hmdb_match = DataFrame(hmdb_data)

In [7]:
df_std1_extended = pd.merge(df_std1, df_hmdb_match, how='left', left_on='InChI_Key', right_on='INCHI_KEY')
df_std2_extended = pd.merge(df_std2, df_hmdb_match, how='left', left_on='InChI_Key', right_on='INCHI_KEY')

In [8]:
df_std1_extended.to_csv('../../data/std1_mh_11_pubchem_hmdb.csv', encoding='utf-8', index=False)
df_std1_extended.drop(['INCHI_KEY', 'Match'], axis=1)
df_std1_extended.head(3)

Unnamed: 0,Name,Formula,Adduct,Dil_1_1,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,...,JCHEM_DONOR_COUNT,JCHEM_LOGP,JCHEM_PHYSIOLOGICAL_CHARGE,JCHEM_POLARIZABILITY,JCHEM_POLAR_SURFACE_AREA,JCHEM_REFRACTIVITY,JCHEM_ROTATABLE_BOND_COUNT,IUPAC_NAME,TRADITIONAL_IUPAC_NAME,SYNONYMS
0,Spermidine,C7H19N3,M+H,24278942,"InChI=1S/C7H19N3/c8-4-1-2-6-10-7-3-5-9/h10H,1-9H2",InChIKey=ATHGHQPFGPMSJY-UHFFFAOYSA-N,1102,56.8,145.157898,3,...,3,-1.1,3,18.8,64.07,44.97,7,(4-aminobutyl)(3-aminopropyl)amine,spermidine,"1,5,10-Triazadecane;1,8-Diamino-4-azaoctane;4-..."
1,Glycerol,C3H8O3,M+H,2168900,"InChI=1S/C3H8O3/c4-1-3(6)2-5/h3-6H,1-2H2",InChIKey=PEDCQBHIVMGVHV-UHFFFAOYSA-N,753,25.2,92.047344,3,...,3,-1.8,0,8.93,60.69,20.52,2,"propane-1,2,3-triol",glycerol,"1,2,3-Trihydroxypropane;Bulbold;Cristal;E 422;..."
2,Inosine,C10H12N4O5,M+H,70263872,InChI=1S/C10H12N4O5/c15-1-4-6(16)7(17)10(19-4)...,InChIKey=UGQMRVRMYYASKQ-KQYNXXCUSA-N,6021,405.0,268.08077,6,...,4,-2.5,0,24.48,129.2,61.33,2,"9-[(2R,3R,4S,5R)-3,4-dihydroxy-5-(hydroxymethy...","9-[(2R,3R,4S,5R)-3,4-dihydroxy-5-(hydroxymethy...","(-)-Inosine;1,9-Dihydro-9-b-D-ribofuranosyl-6H..."


In [9]:
df_std2_extended.to_csv('../../data/std2_mh_11_pubchem_hmdb.csv', encoding='utf-8', index=False)
df_std2_extended.drop(['INCHI_KEY', 'Match'], axis=1)
df_std2_extended.head(3)

Unnamed: 0,Name,Formula,Adduct,Dil_1_1,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,...,JCHEM_DONOR_COUNT,JCHEM_LOGP,JCHEM_PHYSIOLOGICAL_CHARGE,JCHEM_POLARIZABILITY,JCHEM_POLAR_SURFACE_AREA,JCHEM_REFRACTIVITY,JCHEM_ROTATABLE_BOND_COUNT,IUPAC_NAME,TRADITIONAL_IUPAC_NAME,SYNONYMS
0,Hypoxanthine,C5H4N4O,M+H,64196896,InChI=1S/C5H4N4O/c10-5-3-4(7-1-6-3)8-2-9-5/h1-...,InChIKey=FDGQSTZJBFJUBT-UHFFFAOYSA-N,790,190,136.038511,2,...,2,-0.048,0,11.82,74.69,35.5,0,7H-purin-6-ol,7H-purin-6-ol,"1,7-Dihydro-6H-purin-6-one;1,7-Dihydro-6H-puri..."
1,Nicotinamide,C6H6N2O,M+H,949647488,"InChI=1S/C6H6N2O/c7-6(9)5-2-1-3-8-4-5/h1-4H,(H...",InChIKey=DFPAKSUCGFBDDF-UHFFFAOYSA-N,936,114,122.048013,2,...,1,-0.39,0,11.71,55.98,32.98,1,pyridine-3-carboxamide,nicotinamide,3-Carbamoylpyridine;3-Pyridinecarboxamide;3-Py...
2,5-Oxoproline,C5H7NO3,M+H,50486336,"InChI=1S/C5H7NO3/c7-4-2-1-3(6-4)5(8)9/h3H,1-2H...",InChIKey=ODHCTXKNWHHXJC-UHFFFAOYSA-N,499,154,129.042593,3,...,2,-0.89,-1,11.6,66.4,28.09,1,5-oxopyrrolidine-2-carboxylic acid,Glp,(+)-2-Pyrrolidone-5-carboxylate;(+)-2-Pyrrolid...
