# Extract data from HMDB (The Human Metabolome Database)

Data source: the HMDB downloads page, <http://www.hmdb.ca/downloads> (this notebook uses the `structures.zip` archive).

In [19]:
from __future__ import print_function
import urllib
import gzip
import io
import os
import pandas as pd
from rdkit import Chem
from pandas import DataFrame
from zipfile import ZipFile

# When True, the structures archive is downloaded again even if a local copy exists.
FORCE_DOWNLOAD = False
# When True, the final table is written as a gzip-compressed CSV; otherwise as plain CSV.
COMPRESS = True

In [20]:
# Remote location of the zipped HMDB structures archive fetched by the download cell.
structures_url = 'http://www.hmdb.ca/downloads/structures.zip'

In [21]:
# Local path where the downloaded archive is cached.
structures_file = '../../data/dl/hmdb_structures.zip'

# Fetch the archive only when it is missing locally or a refresh is forced.
# The destination reuses `structures_file` so the existence check and the
# download target can never point at different paths.
# NOTE: `urllib.urlretrieve` is the Python 2 location of this function; under
# Python 3 it lives in `urllib.request`.
if FORCE_DOWNLOAD or not os.path.exists(structures_file):
    urllib.urlretrieve(structures_url, structures_file)

In [22]:
# SD properties to pull from each molecule; this list also fixes the column
# order of the resulting table.
columns = [
    # General descriptors
    'GENERIC_NAME',
    'CHEMICAL_FORMULA',
    'INCHI_KEY',
    'EXACT_MASS',
    # ALOGPS_* properties, molecular weight and SMILES
    'ALOGPS_LOGP',
    'ALOGPS_LOGS',
    'ALOGPS_SOLUBILITY',
    'MOLECULAR_WEIGHT',
    'SMILES',
    # Identifiers
    'HMDB_ID',
    'INCHI_IDENTIFIER',
    # JCHEM_* properties
    'JCHEM_ACCEPTOR_COUNT',
    'JCHEM_ACIDIC_PKA',
    'JCHEM_BASIC_PKA',
    'JCHEM_DONOR_COUNT',
    'JCHEM_LOGP',
    'JCHEM_PHYSIOLOGICAL_CHARGE',
    'JCHEM_POLARIZABILITY',
    'JCHEM_POLAR_SURFACE_AREA',
    'JCHEM_REFRACTIVITY',
    'JCHEM_ROTATABLE_BOND_COUNT',
    # Names and synonyms
    'IUPAC_NAME',
    'TRADITIONAL_IUPAC_NAME',
    'SYNONYMS',
]

In [23]:
# Unpack 'structures.sdf' into the same directory as the downloaded archive.
# The target is derived from `structures_file` because the following cell
# reads '../../data/dl/structures.sdf'; the previous hard-coded '../data/dl/'
# placed the file one directory level off from where it is read.
with ZipFile(structures_file, 'r') as zh:
    zh.extract('structures.sdf', path=os.path.dirname(structures_file))

In [31]:
# Accumulator for one property dict per molecule.
data = []

# Path of the extracted SDF and the RDKit supplier that iterates over it.
sdf_file = '../../data/dl/structures.sdf'
molecules = Chem.SDMolSupplier(sdf_file)

In [25]:
# Collect the requested SD properties of every usable molecule.
# `data` is rebuilt from scratch in this cell, so re-running it no longer
# appends a second copy of every record to the existing list.
data = [
    # Keep only the properties the molecule actually carries.
    {prop: mol.GetProp(prop) for prop in columns if mol.HasProp(prop)}
    for mol in molecules
    # Skip falsy / non-Mol entries (RDKit suppliers yield None for records
    # they cannot parse).
    if mol and isinstance(mol, Chem.Mol)
]

In [26]:
# Create the DataFrame with its columns already in the order given by
# `columns`. Passing `columns=` (instead of re-indexing with `df[columns]`)
# means a property that no molecule carried becomes an all-NaN column, and an
# empty `data` list yields an empty frame, rather than raising KeyError.
df = DataFrame(data, columns=columns)

# Preview
df.head(3)

Unnamed: 0,GENERIC_NAME,CHEMICAL_FORMULA,INCHI_KEY,EXACT_MASS,ALOGPS_LOGP,ALOGPS_LOGS,ALOGPS_SOLUBILITY,MOLECULAR_WEIGHT,SMILES,HMDB_ID,...,JCHEM_DONOR_COUNT,JCHEM_LOGP,JCHEM_PHYSIOLOGICAL_CHARGE,JCHEM_POLARIZABILITY,JCHEM_POLAR_SURFACE_AREA,JCHEM_REFRACTIVITY,JCHEM_ROTATABLE_BOND_COUNT,IUPAC_NAME,TRADITIONAL_IUPAC_NAME,SYNONYMS
0,1-Methylhistidine,C7H11N3O2,InChIKey=BRMWTNUJHUMWMS-LURJTMIESA-N,169.085126611,-3.0,-1.4,6.93 g/L,169.1811,CN1C=NC(C[C@H](N)C(O)=O)=C1,HMDB00001,...,2,-3.1,0,16.95,81.14,42.39,3,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,1 methylhistidine,1 Methylhistidine;1-Methyl histidine;1-Methyl-...
1,"1,3-Diaminopropane",C3H10N2,InChIKey=XFNJVJPLKCPIBV-UHFFFAOYSA-N,74.08439833,-1.4,0.77,437 g/L,74.1249,NCCCN,HMDB00002,...,2,-1.4,2,9.06,52.04,22.73,2,"propane-1,3-diamine","1,3-diaminopropane","1,3-Diamino-N-propane;1,3-Propanediamine;1,3-P..."
2,2-Ketobutyric acid,C4H6O3,InChIKey=TYEYBOSBBBHJIV-UHFFFAOYSA-N,102.031694058,0.07,-0.11,79.2 g/L,102.0886,CCC(=O)C(O)=O,HMDB00005,...,1,0.77,-1,9.21,54.37,22.62,2,2-oxobutanoic acid,2-oxobutanoic acid,2-Ketobutanoate;2-Ketobutanoic acid;2-Ketobuty...


In [29]:
# Save the table as CSV, gzip-compressed when COMPRESS is set.
# pandas performs the compression itself when given a path, which avoids
# handing `to_csv` a binary gzip handle opened with mode 'w' (text written to
# a binary handle fails under Python 3 with older pandas releases).
if COMPRESS:
    df.to_csv('../../data/hmdb.csv.gz', encoding='utf-8', index=False,
              compression='gzip')
else:
    df.to_csv('../../data/hmdb.csv', encoding='utf-8', index=False)

In [32]:
# Remove the extracted sdf file; the zip archive stays in place.
# The existence check keeps this cell re-runnable: a second run after the
# file is already gone is a no-op instead of an OSError.
if os.path.exists(sdf_file):
    os.remove(sdf_file)