# Convert the HMDB XML export to pickled lists of Python dictionaries

In [1]:
import requests as req
from lxml import etree as ET
import pandas as pd
import pickle
import numpy as np
import json
from IPython.display import clear_output
import time

import data_utils as du

# Locate the raw HMDB export and decide where each parsed pickle is stored.
data_dir = du.find_data_dir('app')
source = du.get_file_path(data_dir, 'HMDB metabolites', 'Raw xml', 'hmdb_metabolites.xml')


def _parsed_pickle_path(file_name):
    """Return the path of ``file_name`` inside the 'Parsed pickle' folder."""
    return du.get_file_path(data_dir, 'HMDB metabolites', 'Parsed pickle', file_name)


# One output file per parse function defined further down.
hmdb_metabolites_secondary_accessions_file = _parsed_pickle_path('hmdb_metabolites_secondary_accessions.p')
hmdb_metabolites_direct_features_file = _parsed_pickle_path('hmdb_metabolites_direct_features.p')
hmdb_metabolites_synonyms_file = _parsed_pickle_path('hmdb_metabolites_synonyms.p')
hmdb_metabolites_taxonomy_file = _parsed_pickle_path('hmdb_metabolites_taxonomy.p')
hmdb_metabolites_biological_properties_file = _parsed_pickle_path('hmdb_metabolites_biological_properties.p')
hmdb_metabolites_physical_properties_file = _parsed_pickle_path('hmdb_metabolites_physical_properties.p')

In [2]:
# Tags that identify a metabolite; copied into every per-topic record.
ID_features = ['accession', 'name']

# Children of a metabolite whose text is copied as-is.
# NOTE: 'monisotopic_molecular_weight' is the exact tag name the parsers look
# up; do not "correct" the spelling without checking the XML export.
direct_features = [
    *ID_features,
    'description',
    'chemical_formula',
    'average_molecular_weight',
    'monisotopic_molecular_weight',
    'iupac_name',
    'traditional_iupac',
    'cas_registry_number',
    'smiles',
    'inchi',
    'inchikey',
    'state',
    'synthesis_reference',
]

# Children converted to a dictionary of their own children.
direct_dict_features = ['taxonomy', 'biological_properties']

# Children of 'taxonomy' that are stored as lists of texts.
taxonomy_list_names = [
    'alternative_parents',
    'substituents',
    'external_descriptors',
]

# Children stored as a flat list of texts (currently none).
indirect_list_features = []

# Children stored as a list with one dictionary per entry.
indirect_list_of_dicts_features = [
    'ontology',
    'experimental_properties',
    'predicted_properties',
    'spectra',
    'normal_concentrations',
    'abnormal_concentrations',
    'diseases',
    'general_references',
    'protein_associations',
]

# Subset of the list-of-dicts features handled by parse_physical_properties.
physical_properties_features = ['experimental_properties', 'predicted_properties']

# Cross-reference identifiers; a missing tag is stored as None by the parsers.
other_db_ids = [
    'kegg_id',
    'foodb_id',
    'chemspider_id',
    'drugbank_id',
    'pdb_id',
    'chebi_id',
    'pubchem_compound_id',
    'biocyc_id',
    'wikipedia_id',
    'knapsack_id',
    'phenol_explorer_compound_id',
    'bigg_id',
    'metlin_id',
    'vmh_id',
    'fbonto_id',
]

unprocessed = []

In [3]:
def get_child(elem, childname, namespace):
    """Look up the child of ``elem`` tagged ``namespace`` + ``childname``.

    Returns whatever ``elem.find`` returns for that qualified tag, i.e. the
    matching child element or ``None`` when there is no match.
    """
    qualified_tag = namespace + f'{childname}'
    return elem.find(qualified_tag)

def text_of_child(elem, childname, namespace):
    """Return the text of the child of ``elem`` named ``childname``.

    Raises ``AttributeError`` when the child does not exist, because the
    lookup then yields ``None``; several callers rely on that to detect
    missing tags.
    """
    child = get_child(elem, childname, namespace)
    return child.text

def list_childnames(elem):
    """Return the ``.text`` of every direct child of ``elem``, in order.

    An element without children gives an empty list.
    """
    if len(elem) == 0:
        return []

    texts = []
    for node in elem:
        texts.append(node.text)
    return texts

def create_dict_of_elem(elem, namespace):
    """Map each child of ``elem`` to a value, keyed by its tag without namespace.

    'descendants' and 'references' children become a list of dictionaries,
    'synonyms' becomes a list of texts, and any other child contributes its
    own text. A repeated tag keeps the value of its last occurrence.
    """
    nested_tags = ('descendants', 'references')
    result = {}
    for node in elem:
        key = node.tag.replace(namespace, '')
        if key in nested_tags:
            value = create_list_of_dicts(node, namespace)
        elif key == 'synonyms':
            value = list_childnames(node)
        else:
            value = node.text
        result[key] = value
    return result

def create_list_of_dicts(elem, namespace):
    """Return one dictionary (see ``create_dict_of_elem``) per child of ``elem``."""
    return [create_dict_of_elem(entry, namespace) for entry in elem]
        
def parse_metabolite(new_metabolite, namespace):
    """Convert one metabolite element into a single nested dictionary.

    The result combines, in this order:

    * the text of every tag in ``direct_features``;
    * a list of texts for every tag in ``indirect_list_features``;
    * a dictionary for every tag in ``direct_dict_features``, with the list
      valued children of 'taxonomy' and 'biological_properties' expanded;
    * a list of dictionaries for every tag in
      ``indirect_list_of_dicts_features``;
    * the text of every tag in ``other_db_ids``, or ``None`` when the tag is
      absent.

    Args:
        new_metabolite: the metabolite element to convert.
        namespace: namespace prefix in ``{uri}`` form that precedes every tag.

    Returns:
        dict keyed by feature name.

    Raises:
        AttributeError: when a tag listed in ``direct_features`` is missing.
    """
    new_dict = {}

    for feature in direct_features:
        new_dict[feature] = text_of_child(new_metabolite, feature, namespace)

    for indirect_list_feature in indirect_list_features:
        feature_elem = get_child(new_metabolite, indirect_list_feature, namespace)
        new_dict[indirect_list_feature] = list_childnames(feature_elem)

    for direct_dict_feature in direct_dict_features:
        feature_elem = get_child(new_metabolite, direct_dict_feature, namespace)
        feature_dict = create_dict_of_elem(feature_elem, namespace)

        if direct_dict_feature == 'taxonomy':
            for taxonomy_list_name in taxonomy_list_names:
                taxonomy_list_elem = get_child(feature_elem, taxonomy_list_name, namespace)
                # Compare with None explicitly: truth-testing an element is
                # deprecated and only reflects whether it has children.
                if taxonomy_list_elem is not None:
                    feature_dict[taxonomy_list_name] = list_childnames(taxonomy_list_elem)
                else:
                    feature_dict[taxonomy_list_name] = []

        if direct_dict_feature == 'biological_properties':
            # These children hold lists; replace the plain text stored for
            # them by create_dict_of_elem with their real content.
            feature_dict['cellular_locations'] = list_childnames(get_child(feature_elem, 'cellular_locations', namespace))
            feature_dict['biospecimen_locations'] = list_childnames(get_child(feature_elem, 'biospecimen_locations', namespace))
            feature_dict['tissue_locations'] = list_childnames(get_child(feature_elem, 'tissue_locations', namespace))
            feature_dict['pathways'] = create_list_of_dicts(get_child(feature_elem, 'pathways', namespace), namespace)

        new_dict[direct_dict_feature] = feature_dict

    for indirect_list_of_dicts_feature in indirect_list_of_dicts_features:
        feature_elem = get_child(new_metabolite, indirect_list_of_dicts_feature, namespace)
        new_dict[indirect_list_of_dicts_feature] = create_list_of_dicts(feature_elem, namespace)

    for db_id in other_db_ids:
        try:
            new_dict[db_id] = text_of_child(new_metabolite, db_id, namespace)
        except AttributeError:
            # The tag is absent, so the lookup returned None. Catching only
            # this error keeps KeyboardInterrupt and real bugs visible.
            new_dict[db_id] = None

    return new_dict

def parse_taxonomy(new_metabolite, namespace):
    """Extract the identifiers and the taxonomy of one metabolite element.

    Args:
        new_metabolite: the metabolite element to read.
        namespace: namespace prefix in ``{uri}`` form that precedes every tag.

    Returns:
        dict with the ``ID_features`` texts plus a 'taxonomy' dictionary in
        which every tag of ``taxonomy_list_names`` holds a list of texts
        (empty when the tag is absent or has no children).

    Raises:
        AttributeError: when a tag listed in ``ID_features`` is missing.
    """
    new_dict = {}

    for ID_feature in ID_features:
        new_dict[ID_feature] = text_of_child(new_metabolite, ID_feature, namespace)

    taxonomy_elem = get_child(new_metabolite, 'taxonomy', namespace)
    taxonomy_dict = create_dict_of_elem(taxonomy_elem, namespace)

    for taxonomy_list_name in taxonomy_list_names:
        taxonomy_list_elem = get_child(taxonomy_elem, taxonomy_list_name, namespace)
        # Compare with None explicitly: truth-testing an element is deprecated
        # and only reflects whether it has children. An element without
        # children still yields an empty list via list_childnames.
        if taxonomy_list_elem is not None:
            taxonomy_dict[taxonomy_list_name] = list_childnames(taxonomy_list_elem)
        else:
            taxonomy_dict[taxonomy_list_name] = []

    new_dict['taxonomy'] = taxonomy_dict

    return new_dict

def parse_biological_properties(new_metabolite, namespace):
    """Extract the identifiers and biological properties of one metabolite.

    Returns a dict with the ``ID_features`` texts plus a
    'biological_properties' dictionary whose location entries are lists of
    texts and whose 'pathways' entry is a list of dictionaries.
    """
    record = {}
    for id_tag in ID_features:
        record[id_tag] = text_of_child(new_metabolite, id_tag, namespace)

    section = get_child(new_metabolite, 'biological_properties', namespace)
    properties = create_dict_of_elem(section, namespace)

    # Children that hold lists of plain texts.
    for list_tag in ('cellular_locations', 'biospecimen_locations', 'tissue_locations'):
        list_elem = get_child(section, list_tag, namespace)
        properties[list_tag] = list_childnames(list_elem)

    pathways_elem = get_child(section, 'pathways', namespace)
    properties['pathways'] = create_list_of_dicts(pathways_elem, namespace)

    record['biological_properties'] = properties
    return record

def parse_physical_properties(new_metabolite, namespace):
    """Extract the identifiers and physical properties of one metabolite.

    Returns a dict with the ``ID_features`` texts plus, for every tag in
    ``physical_properties_features``, a list with one dictionary per entry.
    """
    record = {
        id_tag: text_of_child(new_metabolite, id_tag, namespace)
        for id_tag in ID_features
    }

    for property_tag in physical_properties_features:
        property_elem = get_child(new_metabolite, property_tag, namespace)
        record[property_tag] = create_list_of_dicts(property_elem, namespace)

    return record

def test_ontology(new_metabolite, namespace):
    """Debug helper: print each 'ontology' entry of a metabolite as JSON."""
    ontology_elem = new_metabolite.find(namespace + 'ontology')
    for entry in create_list_of_dicts(ontology_elem, namespace):
        print(json.dumps(entry, indent=1))

def test_parse(new_metabolite, namespace):
    """Debug helper: print ``tag : text`` for each direct child of a metabolite.

    The tag is printed with the namespace prefix removed.
    """
    # The former 'gene_properties' lookup was never used and has been dropped.
    for child in new_metabolite:
        print(child.tag.replace(namespace, ''), ':', child.text)

def parse_index(new_metabolite, namespace, start_line, end_line):
    """Build an index entry for one metabolite.

    The entry holds the ``ID_features`` texts followed by the given
    'start_line' and 'end_line' values, stored unchanged.
    """
    index_entry = {
        id_tag: text_of_child(new_metabolite, id_tag, namespace)
        for id_tag in ID_features
    }
    index_entry.update(start_line=start_line, end_line=end_line)
    return index_entry

def parse_direct_features(new_metabolite, namespace):
    """Extract the plain-text features and database ids of one metabolite.

    Args:
        new_metabolite: the metabolite element to read.
        namespace: namespace prefix in ``{uri}`` form that precedes every tag.

    Returns:
        dict with the text of every tag in ``direct_features`` followed by the
        text of every tag in ``other_db_ids`` (``None`` when a database id tag
        is absent).

    Raises:
        AttributeError: when a tag listed in ``direct_features`` is missing.
    """
    new_dict = {}

    for feature in direct_features:
        new_dict[feature] = text_of_child(new_metabolite, feature, namespace)

    for db_id in other_db_ids:
        try:
            new_dict[db_id] = text_of_child(new_metabolite, db_id, namespace)
        except AttributeError:
            # The tag is absent, so the lookup returned None. Catching only
            # this error keeps KeyboardInterrupt and real bugs visible.
            new_dict[db_id] = None

    return new_dict

def parse_synonyms(new_metabolite, namespace):
    """Return the name, accession and synonym texts of one metabolite.

    The result has the keys 'name', 'accession' and 'synonyms', the last one
    being a list with the text of each child of the 'synonyms' element.
    """
    synonyms_elem = new_metabolite.find(namespace + 'synonyms')
    synonym_texts = [entry.text for entry in synonyms_elem]

    name_elem = new_metabolite.find(namespace + 'name')
    accession_elem = new_metabolite.find(namespace + 'accession')

    return {
        'name': name_elem.text,
        'accession': accession_elem.text,
        'synonyms': synonym_texts,
    }

def parse_secondary_accesions(new_metabolite, namespace):
    """Return the accession of one metabolite with its secondary accessions.

    The result has the keys 'accession' and 'secondary_accessions', the
    latter being a list of texts.
    """
    secondary_elem = get_child(new_metabolite, 'secondary_accessions', namespace)
    secondary_texts = list_childnames(secondary_elem)

    accession_elem = new_metabolite.find(namespace + 'accession')
    return {
        'accession': accession_elem.text,
        'secondary_accessions': secondary_texts,
    }

def parse_hmdb_xml(filename, parse_function):
    """Stream an HMDB XML export and convert each metabolite to a dictionary.

    The file is read incrementally with ``iterparse``. Every completed
    metabolite element is handed to ``parse_function`` and then cleared, so
    its content does not stay in memory.

    Args:
        filename: path of the XML export.
        parse_function: callable taking ``(element, namespace)`` and returning
            the record for that metabolite, e.g. ``parse_synonyms``.

    Returns:
        list with one record per metabolite, in document order. The length
        follows the file; it is no longer fixed at 114222 entries.
    """
    namespace = '{http://www.hmdb.ca}'
    metabolite = namespace + 'metabolite'

    # Grow the list as records arrive. The earlier pre-sized list raised
    # IndexError on a larger export and, on a smaller one, left trailing
    # placeholders that were all the same dict object.
    metabolite_dicts = []
    t0 = time.time()

    # Only "end" events are needed: an element is complete, and therefore
    # parseable, once its closing tag has been read.
    for _event, elem in ET.iterparse(filename, events=("end",)):
        if elem.tag != metabolite:
            continue

        metabolite_dicts.append(parse_function(elem, namespace))

        metabolite_counter = len(metabolite_dicts)
        if metabolite_counter % 1000 == 0:
            # Replace the previous progress line instead of piling up output.
            clear_output(wait=True)
            duration = float(time.time() - t0)
            print(f'Finished {metabolite_counter} in {duration:.0f}s')

        elem.clear()

    return metabolite_dicts

## Parse raw XML

#### Secondary accessions

In [4]:
%%time
# Parse the export with parse_secondary_accesions, then hand the list to
# du.dump_in_pickle (project helper; presumably writes it to the given file).
hmdb_metabolites_secondary_accessions = parse_hmdb_xml(str(source), parse_secondary_accesions)
du.dump_in_pickle(hmdb_metabolites_secondary_accessions_file, hmdb_metabolites_secondary_accessions)

Finished 1000 in 8s


KeyboardInterrupt: 

#### Direct features

In [6]:
%%time
# Parse the export with parse_direct_features, then hand the list to
# du.dump_in_pickle (project helper; presumably writes it to the given file).
hmdb_metabolites_direct_features = parse_hmdb_xml(str(source), parse_direct_features)
du.dump_in_pickle(hmdb_metabolites_direct_features_file, hmdb_metabolites_direct_features)

Finished 114000 in 225s
CPU times: user 3min 43s, sys: 3.04 s, total: 3min 46s
Wall time: 3min 46s


#### Synonyms

In [8]:
%%time
# Parse the export with parse_synonyms, then hand the list to
# du.dump_in_pickle (project helper; presumably writes it to the given file).
hmdb_metabolites_synonyms = parse_hmdb_xml(str(source), parse_synonyms)
du.dump_in_pickle(hmdb_metabolites_synonyms_file, hmdb_metabolites_synonyms)

Finished 114000 in 183s
CPU times: user 3min 2s, sys: 1.18 s, total: 3min 3s
Wall time: 3min 3s


#### Taxonomy

In [19]:
%%time
# Parse the export with parse_taxonomy, then hand the list to
# du.dump_in_pickle (project helper; presumably writes it to the given file).
hmdb_metabolites_taxonomy = parse_hmdb_xml(str(source), parse_taxonomy)
du.dump_in_pickle(hmdb_metabolites_taxonomy_file, hmdb_metabolites_taxonomy)

Finished 114000 in 163s
CPU times: user 2min 43s, sys: 1.83 s, total: 2min 45s
Wall time: 2min 44s


#### Biological properties

In [18]:
%%time
# Parse the export with parse_biological_properties, then hand the list to
# du.dump_in_pickle (project helper; presumably writes it to the given file).
hmdb_metabolites_biological_properties = parse_hmdb_xml(str(source), parse_biological_properties)
du.dump_in_pickle(hmdb_metabolites_biological_properties_file, hmdb_metabolites_biological_properties)

CPU times: user 1.72 s, sys: 230 ms, total: 1.95 s
Wall time: 1.95 s


#### Physical properties

In [20]:
%%time
# Parse the export with parse_physical_properties, then hand the list to
# du.dump_in_pickle (project helper; presumably writes it to the given file).
hmdb_metabolites_physical_properties = parse_hmdb_xml(str(source), parse_physical_properties)
du.dump_in_pickle(hmdb_metabolites_physical_properties_file, hmdb_metabolites_physical_properties)

Finished 114000 in 186s
CPU times: user 3min 9s, sys: 2.97 s, total: 3min 12s
Wall time: 3min 11s


### Read from pickle

In [14]:
# Reload previously stored results through du.read_from_pickle so the slow XML
# parse does not have to be repeated. Secondary accessions are not reloaded here.
# NOTE(review): these are pickle files; only load files you produced yourself.
hmdb_metabolites_direct_features = du.read_from_pickle(hmdb_metabolites_direct_features_file)
hmdb_metabolites_synonyms = du.read_from_pickle(hmdb_metabolites_synonyms_file)
hmdb_metabolites_taxonomy = du.read_from_pickle(hmdb_metabolites_taxonomy_file)
hmdb_metabolites_biological_properties = du.read_from_pickle(hmdb_metabolites_biological_properties_file)
hmdb_metabolites_physical_properties = du.read_from_pickle(hmdb_metabolites_physical_properties_file)

In [15]:
# Display the first record as a quick sanity check of the reloaded data.
hmdb_metabolites_direct_features[0]

{'accession': 'HMDB0000001',
 'name': '1-Methylhistidine',
 'description': "1-Methylhistidine, also known as 1-MHis, belongs to the class of organic compounds known as histidine and derivatives. Histidine and derivatives are compounds containing cysteine or a derivative thereof resulting from a reaction of cysteine at the amino group or the carboxy group, or from the replacement of any hydrogen of glycine by a heteroatom. 1-Methylhistidine is derived mainly from the anserine of dietary flesh sources, especially poultry. The enzyme, carnosinase, splits anserine into beta-alanine and 1-MHis. High levels of 1-MHis tend to inhibit the enzyme carnosinase and increase anserine levels. Conversely, genetic variants with deficient carnosinase activity in plasma show increased 1-MHis excretions when they consume a high meat diet. Reduced serum carnosinase activity is also found in patients with Parkinson's disease and multiple sclerosis and patients following a cerebrovascular accident. Vitamin 