# Convert HMDB data from pythonic format to one-hot-encoded pandas DataFrames

In [1]:
import sys
import numpy as np
import pandas as pd
import data_utils as du
import scipy.sparse as sparse

from collections import OrderedDict

data_dir = du.find_data_dir('app')

hmdb_metabolites_direct_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Parsed pickle', 'hmdb_metabolites_direct_features.p')
hmdb_metabolites_synonyms_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Parsed pickle', 'hmdb_metabolites_synonyms.p')
hmdb_metabolites_taxonomy_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Parsed pickle', 'hmdb_metabolites_taxonomy.p')
hmdb_metabolites_biological_properties_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Parsed pickle', 'hmdb_metabolites_biological_properties.p')
hmdb_metabolites_physical_properties_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Parsed pickle', 'hmdb_metabolites_physical_properties.p')

#Direct features
metabolite_molecular_weight_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_molecular_weight.p')
metabolite_state_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_state.p')

#Chemical taxonomy
metabolite_kingdom_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_kingdom.p')
metabolite_super_class_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_super_class.p')
metabolite_class_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_class.p')
metabolite_sub_class_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_sub_class.p')
metabolite_direct_parent_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_direct_parent.p')
metabolite_molecular_framework_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_molecular_framework.p')
metabolite_alternative_parents_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_alternative_parents.p')
metabolite_substituents_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_substituents.p')
metabolite_external_descriptors_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_external_descriptors.p')

#Biological properties
metabolite_cellular_locations_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_cellular_locations.p')
metabolite_biospecimen_locations_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_biospecimen_locations.p')
metabolite_tissue_locations_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_tissue_locations.p')
metabolites_pathways_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_pathways.p')

#Physical properties
metabolite_physical_properties_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_physical_properties.p')

#All
metabolite_all_features_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Feature dfs pickle', 'hmdb_metabolites_all_features.p')

#Utils
metabolite_kegg_to_hmdb_file = du.get_file_path(data_dir, 'HMDB metabolites', 'Parsed pickle', 'kegg_id_to_hmdb_id.p')

### Load pickled lists of dicts

In [None]:
hmdb_metabolites_direct_features = du.read_from_pickle(hmdb_metabolites_direct_features_file)
# hmdb_metabolites_synonyms = du.read_from_pickle(hmdb_metabolites_synonyms_file)
hmdb_metabolites_taxonomy = du.read_from_pickle(hmdb_metabolites_taxonomy_file)
hmdb_metabolites_bio_prop = du.read_from_pickle(hmdb_metabolites_biological_properties_file)
hmdb_metabolites_phy_prop = du.read_from_pickle(hmdb_metabolites_physical_properties_file)

In [54]:
def get_value(metabofrom_spmatrix, key='kingdom'):
    try:
        if key=='accession':
            return metabolite['accession']
        if key=='kingdom':
            return metabolite['taxonomy']['kingdom']
        if key=='super_class':
            return metabolite['taxonomy']['super_class']
        if key=='class':
            return metabolite['taxonomy']['class']
        if key=='sub_class':
            return metabolite['taxonomy']['sub_class']
        if key=='molecular_framework':
            return metabolite['taxonomy']['molecular_framework']
        if key=='direct_parent':
            return metabolite['taxonomy']['direct_parent']
        if key=='alternative_parents':
            return  metabolite['taxonomy']['alternative_parents']
        if key=='substituents':
            return  metabolite['taxonomy']['substituents']
        if key=='external_descriptors':
            return  metabolite['taxonomy']['external_descriptors']
        
        if key=='cellular_locations':
            return metabolite['biological_properties']['cellular_locations']
        if key=='biospecimen_locations':
            return metabolite['biological_properties']['biospecimen_locations']
        if key=='tissue_locations':
            return metabolite['biological_properties']['tissue_locations']
        if key=='pathways':
            return [pathway for pathway in metabolite['biological_properties']['pathways']]
        if key=='pathways_name':
            return [pathway['name'] for pathway in metabolite['biological_properties']['pathways']]
        
        if key=='experimental_properties':
            return [experimental_property['kind'] for experimental_property in metabolite['experimental_properties']]
        if key=='predicted_properties':
            return [predicted_property['kind'] for predicted_property in metabolite['predicted_properties']]
    except:
        return None

def to_categorical(y, num_classes=None, dtype='float32'):
    y = np.array(y, dtype='int')
    input_shape = y.shape
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    y = y.ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes), dtype=dtype)
    categorical[np.arange(n), y] = 1
    output_shape = input_shape + (num_classes,)
    categorical = np.reshape(categorical, output_shape)
    return categorical

def get_cat(values, feature_vals_cat, df_feature_vals):
    return sparse.csr_matrix(feature_vals_cat[df_feature_vals.loc[values]['ID']].sum(axis=0))

def get_feature_categories(metabolite_feature, single_or_list='single', feature_name='{feature_name}'):
    if single_or_list=='single':
        feature_vals_str = metabolite_feature.unique()
    elif single_or_list=='list':
        feature_vals_str = list({x for l in metabolite_feature for x in l})
    feature_vals_int = np.arange(len(feature_vals_str))
    df_feature_vals = pd.DataFrame(list(zip(feature_vals_int, feature_vals_str)), columns=['ID', 'Value']).set_index('Value')
       
    feature_vals_cat = sparse.csr_matrix(to_categorical(feature_vals_int).astype(int))
    if single_or_list=='single':
        metabolite_feature_cat = feature_vals_cat[df_feature_vals.loc[metabolite_feature]['ID']].astype(int)
        column_names = [feature_name+'=="' +str(feature_val_str)+'"' for feature_val_str in feature_vals_str]
        metabolite_feature_cat_df = pd.DataFrame.sparse.from_spmatrix(metabolite_feature_cat, index=metabolite_feature.index, columns=column_names)
    elif single_or_list=='list':
        metabolite_feature_cat = sparse.vstack(metabolite_feature.apply(get_cat, args=(feature_vals_cat, df_feature_vals))).astype(int)
        column_names = ['"'+str(feature_val_str)+'" in '+feature_name for feature_val_str in feature_vals_str]    
        metabolite_feature_cat_df = pd.DataFrame.sparse.from_spmatrix(metabolite_feature_cat, index=metabolite_feature.index, columns=column_names)

    return metabolite_feature_cat_df

def get_metabolites_feature_vals(list_of_dicts, key):
    metabolite_feature_vals_list = [get_value(metabolite, key) for metabolite in list_of_dicts]
    hmdb_ids = [get_value(metabolite, 'accession') for metabolite in list_of_dicts]
    return pd.Series(metabolite_feature_vals_list, hmdb_ids)    

def metabolite_features(list_metabolite_dicts):
    df = pd.DataFrame(list_metabolite_dicts).set_index('accession', drop=True)
    return df

## Direct features

In [5]:
hmdb_metabolites_direct_features = du.read_from_pickle(hmdb_metabolites_direct_features_file)
metabolites_df = metabolite_features(hmdb_metabolites_direct_features)
metabolites_df

Unnamed: 0_level_0,name,description,chemical_formula,average_molecular_weight,monisotopic_molecular_weight,iupac_name,traditional_iupac,cas_registry_number,smiles,inchi,...,chebi_id,pubchem_compound_id,biocyc_id,wikipedia_id,knapsack_id,phenol_explorer_compound_id,bigg_id,metlin_id,vmh_id,fbonto_id
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HMDB0000001,1-Methylhistidine,"1-Methylhistidine, also known as 1-MHis, belon...",C7H11N3O2,169.1811,169.085126611,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,1 methylhistidine,332-80-9,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,...,50599,92105,,Methylhistidine,,,,3741,,
HMDB0000002,"1,3-Diaminopropane","1,3-Diaminopropane, also known as DAP or trime...",C3H10N2,74.1249,74.08439833,"propane-1,3-diamine","α,ω-propanediamine",109-76-2,NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,...,15725,428,CPD-313,"1,3-Diaminopropane",C00007404,,,,,
HMDB0000005,2-Ketobutyric acid,"2-Ketobutyric acid, also known as alpha-ketobu...",C4H6O3,102.0886,102.031694058,2-oxobutanoic acid,2-oxobutanoic acid,600-18-0,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",...,30831,58,2-OXOBUTANOATE,Alpha-Ketobutyric_acid,C00019675,,33889,,2OBUT,
HMDB0000008,2-Hydroxybutyric acid,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn...",C4H8O3,104.105,104.047344118,(2S)-2-hydroxybutanoic acid,(S)-2-hydroxybutyric acid,3347-90-8,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",...,50613,440864,CPD-3564,2-Hydroxybutyric_acid,,,,,,
HMDB0000010,2-Methoxyestrone,2-Methoxyestrone belongs to the class of organ...,C19H24O3,300.3921,300.172544634,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...","(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",362-08-3,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,...,1189,440624,,2-Methoxyestrone,,,,2578,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,Cer(d17:1/18:0),"Ceramides, also known as N-acylsphingosines, c...",C35H69NO3,551.941,551.527745086,"N-[(2S,3R,4E)-1,3-dihydroxyheptadec-4-en-2-yl]...","N-[(2S,3R,4E)-1,3-dihydroxyheptadec-4-en-2-yl]...",123065-42-9,[H][C@@](CO)(NC(=O)CCCCCCCCCCCCCCCCC)[C@H](O)\...,InChI=1S/C35H69NO3/c1-3-5-7-9-11-13-15-17-18-1...,...,,9985066,,,,,,,,
HMDB0240684,Cer(d20:1/18:0),"Ceramides, also known as N-acylsphingosines, c...",C38H75NO3,594.022,593.574695279,"N-[(2S,3R,4E)-1,3-dihydroxyicos-4-en-2-yl]octa...","N-[(2S,3R,4E)-1,3-dihydroxyicos-4-en-2-yl]octa...",27888-43-3,[H][C@@](CO)(NC(=O)CCCCCCCCCCCCCCCCC)[C@H](O)\...,InChI=1S/C38H75NO3/c1-3-5-7-9-11-13-15-17-19-2...,...,,101853667,,,,,,,,
HMDB0240685,Cer(d17:1/16:0),"Ceramides, also known as N-acylsphingosines, c...",C33H65NO3,523.887,523.496444957,"N-[(2S,3R,4E)-1,3-dihydroxyheptadec-4-en-2-yl]...","N-[(2S,3R,4E)-1,3-dihydroxyheptadec-4-en-2-yl]...",123065-40-7,[H][C@@](CO)(NC(=O)CCCCCCCCCCCCCCC)[C@H](O)\C=...,InChI=1S/C33H65NO3/c1-3-5-7-9-11-13-15-17-19-2...,...,,10256256,,,,,,,,
HMDB0240686,"Cer(d18:2(4E,14Z)/16:0)","Ceramides, also known as N-acylsphingosines, c...",C34H65NO3,535.898,535.496444957,"N-[(2S,3R,4E,14Z)-1,3-dihydroxyoctadeca-4,14-d...","N-[(2S,3R,4E,14Z)-1,3-dihydroxyoctadeca-4,14-d...",1919028-96-8,[H][C@@](CO)(NC(=O)CCCCCCCCCCCCCCC)[C@H](O)\C=...,InChI=1S/C34H65NO3/c1-3-5-7-9-11-13-15-17-19-2...,...,,52931118,,,,,,,,


## KEGG to HMDB 

In [19]:
kegg_id_to_hmdb = pd.DataFrame(metabolites_df['kegg_id']).reset_index()
kegg_id_to_hmdb = kegg_id_to_hmdb[kegg_id_to_hmdb['kegg_id'].notna()]
kegg_id_to_hmdb = kegg_id_to_hmdb.set_index('kegg_id')
du.dump_in_pickle(metabolite_kegg_to_hmdb_file, kegg_id_to_hmdb)
kegg_id_to_hmdb

Unnamed: 0_level_0,accession
kegg_id,Unnamed: 1_level_1
C01152,HMDB0000001
C00986,HMDB0000002
C00109,HMDB0000005
C05984,HMDB0000008
C05299,HMDB0000010
...,...
C10796,HMDB0240575
C06193,HMDB0240587
C06231,HMDB0240650
C07273,HMDB0240653


### Molecular weight

In [83]:
metabolites_molecular_weight_df = metabolites_df[['average_molecular_weight', 'monisotopic_molecular_weight']]
du.dump_in_pickle(metabolite_molecular_weight_features_file, metabolites_molecular_weight_df)
metabolites_molecular_weight_df

Unnamed: 0_level_0,average_molecular_weight,monisotopic_molecular_weight
accession,Unnamed: 1_level_1,Unnamed: 2_level_1
HMDB0000001,169.1811,169.085127
HMDB0000002,74.1249,74.084398
HMDB0000005,102.0886,102.031694
HMDB0000008,104.1050,104.047344
HMDB0000010,300.3921,300.172545
...,...,...
HMDB0240683,551.9410,551.527745
HMDB0240684,594.0220,593.574695
HMDB0240685,523.8870,523.496445
HMDB0240686,535.8980,535.496445


### State

In [84]:
metabolites_state_df = get_feature_categories(metabolites_df['state'], 'single', 'state')
du.dump_in_pickle(metabolite_state_features_file, metabolites_state_df)
metabolite_state_df

Unnamed: 0_level_0,"state==""Solid""","state==""Liquid""","state==""Gas""","state==""nan"""
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HMDB0000001,1,0,0,0
HMDB0000002,0,1,0,0
HMDB0000005,1,0,0,0
HMDB0000008,1,0,0,0
HMDB0000010,1,0,0,0
...,...,...,...,...
HMDB0240683,0,0,0,1
HMDB0240684,0,0,0,1
HMDB0240685,0,0,0,1
HMDB0240686,0,0,0,1


## Chemical taxonomy

### kingdom

In [25]:
%%time
metabolite_kingdoms_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'kingdom')
metabolite_kingdom_df = get_feature_categories(metabolite_kingdoms_series, 'single', 'kingdom')
du.dump_in_pickle(metabolite_kingdom_features_file, metabolite_kingdom_df)
metabolite_kingdom_df

CPU times: user 341 ms, sys: 9 µs, total: 341 ms
Wall time: 339 ms


Unnamed: 0,"kingdom==""Organic compounds""","kingdom==""Inorganic compounds""","kingdom==""None"""
HMDB0000001,1,0,0
HMDB0000002,1,0,0
HMDB0000005,1,0,0
HMDB0000008,1,0,0
HMDB0000010,1,0,0
...,...,...,...
HMDB0240683,1,0,0
HMDB0240684,1,0,0
HMDB0240685,1,0,0
HMDB0240686,1,0,0


In [42]:
metabolite_kingdom_df.sum()

kingdom=="Organic compounds"      113992
kingdom=="Inorganic compounds"       141
kingdom=="None"                       89
dtype: int64

### super_class

In [28]:
%%time
metabolite_super_class_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'super_class')
metabolite_super_class_df = get_feature_categories(metabolite_super_class_series, 'single', 'super_class')
du.dump_in_pickle(metabolite_super_class_features_file, metabolite_super_class_df)
metabolite_super_class_df

CPU times: user 318 ms, sys: 73 µs, total: 318 ms
Wall time: 315 ms


Unnamed: 0,"super_class==""Organic acids and derivatives""","super_class==""Organic nitrogen compounds""","super_class==""Lipids and lipid-like molecules""","super_class==""Nucleosides, nucleotides, and analogues""","super_class==""Organoheterocyclic compounds""","super_class==""Benzenoids""","super_class==""Organic oxygen compounds""","super_class==""Homogeneous non-metal compounds""","super_class==""Phenylpropanoids and polyketides""","super_class==""Homogeneous metal compounds""",...,"super_class==""None""","super_class==""Lignans, neolignans and related compounds""","super_class==""Acetylides""","super_class==""Organometallic compounds""","super_class==""Organohalogen compounds""","super_class==""Organic 1,3-dipolar compounds""","super_class==""Organic salts""","super_class==""Organophosphorus compounds""","super_class==""Hydrocarbon derivatives""","super_class==""Organooxygen compounds"""
HMDB0000001,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
metabolite_super_class_df.sum()

super_class=="Organic acids and derivatives"                 2648
super_class=="Organic nitrogen compounds"                     242
super_class=="Lipids and lipid-like molecules"              90688
super_class=="Nucleosides, nucleotides, and analogues"        285
super_class=="Organoheterocyclic compounds"                  3260
super_class=="Benzenoids"                                    2463
super_class=="Organic oxygen compounds"                      2943
super_class=="Homogeneous non-metal compounds"                 66
super_class=="Phenylpropanoids and polyketides"             10522
super_class=="Homogeneous metal compounds"                     53
super_class=="Alkaloids and derivatives"                      229
super_class=="Miscellaneous inorganic compounds"                1
super_class=="Organosulfur compounds"                         252
super_class=="Mixed metal/non-metal compounds"                 21
super_class=="Hydrocarbons"                                   195
super_clas

### class

In [29]:
%%time
metabolite_class_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'class')
metabolite_class_df = get_feature_categories(metabolite_class_series, 'single', 'class')
du.dump_in_pickle(metabolite_class_features_file, metabolite_class_df)
metabolite_class_df

CPU times: user 343 ms, sys: 0 ns, total: 343 ms
Wall time: 340 ms


Unnamed: 0,"class==""Carboxylic acids and derivatives""","class==""Organonitrogen compounds""","class==""Keto acids and derivatives""","class==""Hydroxy acids and derivatives""","class==""Steroids and steroid derivatives""","class==""Pyrimidine nucleosides""","class==""Pyridines and derivatives""","class==""Phenols""","class==""Sphingolipids""","class==""Organic carbonic acids and derivatives""",...,"class==""Thiolactams""","class==""Thienopyrimidines""","class==""Thiochromanes""","class==""Dithiocarbamic acids and derivatives""","class==""Organobromides""","class==""Acyl halides""","class==""Seleninic acids and derivatives""","class==""Azetidines""","class==""Propargyl-type 1,3-dipolar organic compounds""","class==""Orthocarboxylic acid derivatives"""
HMDB0000001,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
metabolite_class_df.sum()

class=="Carboxylic acids and derivatives"                2059
class=="Organonitrogen compounds"                         242
class=="Keto acids and derivatives"                        95
class=="Hydroxy acids and derivatives"                     92
class=="Steroids and steroid derivatives"                1254
                                                         ... 
class=="Acyl halides"                                       1
class=="Seleninic acids and derivatives"                    1
class=="Azetidines"                                         1
class=="Propargyl-type 1,3-dipolar organic compounds"       1
class=="Orthocarboxylic acid derivatives"                   1
Length: 328, dtype: int64

### sub_class

In [31]:
%%time
metabolite_sub_class_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'sub_class')
metabolite_sub_class_df = get_feature_categories(metabolite_sub_class_series, 'single', 'sub_class')
du.dump_in_pickle(metabolite_sub_class_features_file, metabolite_sub_class_df)
metabolite_sub_class_df

CPU times: user 372 ms, sys: 15 µs, total: 372 ms
Wall time: 369 ms


Unnamed: 0,"sub_class==""Amino acids, peptides, and analogues""","sub_class==""Amines""","sub_class==""Short-chain keto acids and derivatives""","sub_class==""Alpha hydroxy acids and derivatives""","sub_class==""Estrane steroids""","sub_class==""Beta hydroxy acids and derivatives""","sub_class==""Pyrimidine 2'-deoxyribonucleosides""","sub_class==""Hydroxysteroids""","sub_class==""Pyridinecarboxylic acids and derivatives""","sub_class==""1-hydroxy-2-unsubstituted benzenoids""",...,"sub_class==""Chloronaphthalenes""","sub_class==""Polyterpenoids""","sub_class==""Organic peroxynitrites""","sub_class==""Alkyldiazohydroxides""","sub_class==""Non-metal selenates""","sub_class==""Caprolactams""","sub_class==""Difurocoumarins""","sub_class==""Butyrophenones""","sub_class==""Anthocyanidins""","sub_class==""Isoflavonoid C-glycosides"""
HMDB0000001,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
metabolite_sub_class_df.sum()

sub_class=="Amino acids, peptides, and analogues"      1768
sub_class=="Amines"                                     136
sub_class=="Short-chain keto acids and derivatives"      27
sub_class=="Alpha hydroxy acids and derivatives"          8
sub_class=="Estrane steroids"                            60
                                                       ... 
sub_class=="Caprolactams"                                 1
sub_class=="Difurocoumarins"                              2
sub_class=="Butyrophenones"                               2
sub_class=="Anthocyanidins"                               3
sub_class=="Isoflavonoid C-glycosides"                    1
Length: 528, dtype: int64

### direct_parent

In [33]:
%%time
metabolite_direct_parent_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'direct_parent')
metabolite_direct_parent_df = get_feature_categories(metabolite_direct_parent_series, 'single', 'direct_parent')
du.dump_in_pickle(metabolite_direct_parent_features_file, metabolite_direct_parent_df)
metabolite_direct_parent_df

CPU times: user 426 ms, sys: 20 ms, total: 446 ms
Wall time: 444 ms


Unnamed: 0,"direct_parent==""Histidine and derivatives""","direct_parent==""Monoalkylamines""","direct_parent==""Short-chain keto acids and derivatives""","direct_parent==""Alpha hydroxy acids and derivatives""","direct_parent==""Estrogens and derivatives""","direct_parent==""Beta hydroxy acids and derivatives""","direct_parent==""Pyrimidine 2'-deoxyribonucleosides""","direct_parent==""21-hydroxysteroids""","direct_parent==""Pyridinecarboxylic acids""","direct_parent==""1-hydroxy-2-unsubstituted benzenoids""",...,"direct_parent==""6-hydroxyflavonoids""","direct_parent==""Anthocyanidins""","direct_parent==""Monohydroxyflavonoids""","direct_parent==""8-hydroxyflavonoids""","direct_parent==""N-acyldopamines""","direct_parent==""Carbamic acids""","direct_parent==""Dicarboximides""","direct_parent==""Isoflavonoid C-glycosides""","direct_parent==""Ergopeptines""","direct_parent==""Coumaric acids"""
HMDB0000001,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
metabolite_direct_parent_df.sum()

direct_parent=="Histidine and derivatives"                 19
direct_parent=="Monoalkylamines"                           21
direct_parent=="Short-chain keto acids and derivatives"    27
direct_parent=="Alpha hydroxy acids and derivatives"        8
direct_parent=="Estrogens and derivatives"                 43
                                                           ..
direct_parent=="Carbamic acids"                             2
direct_parent=="Dicarboximides"                             1
direct_parent=="Isoflavonoid C-glycosides"                  1
direct_parent=="Ergopeptines"                               1
direct_parent=="Coumaric acids"                             1
Length: 1527, dtype: int64

### molecular_framework

In [35]:
%%time
metabolite_molecular_framework_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'molecular_framework')
metabolite_molecular_framework_df = get_feature_categories(metabolite_molecular_framework_series, 'single', 'molecular_framework')
du.dump_in_pickle(metabolite_molecular_framework_features_file, metabolite_molecular_framework_df)
metabolite_molecular_framework_df

CPU times: user 332 ms, sys: 15 µs, total: 332 ms
Wall time: 329 ms


Unnamed: 0,"molecular_framework==""Aromatic heteromonocyclic compounds""","molecular_framework==""Aliphatic acyclic compounds""","molecular_framework==""Aromatic homopolycyclic compounds""","molecular_framework==""Aliphatic homopolycyclic compounds""","molecular_framework==""Aromatic homomonocyclic compounds""","molecular_framework==""Aliphatic heteromonocyclic compounds""","molecular_framework==""Aromatic heteropolycyclic compounds""","molecular_framework==""Aliphatic heteropolycyclic compounds""","molecular_framework==""None""","molecular_framework==""Aliphatic homomonocyclic compounds"""
HMDB0000001,1,0,0,0,0,0,0,0,0,0
HMDB0000002,0,1,0,0,0,0,0,0,0,0
HMDB0000005,0,1,0,0,0,0,0,0,0,0
HMDB0000008,0,1,0,0,0,0,0,0,0,0
HMDB0000010,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,1,0,0,0,0,0,0,0,0
HMDB0240684,0,1,0,0,0,0,0,0,0,0
HMDB0240685,0,1,0,0,0,0,0,0,0,0
HMDB0240686,0,1,0,0,0,0,0,0,0,0


In [36]:
metabolite_molecular_framework_df.sum()

molecular_framework=="Aromatic heteromonocyclic compounds"      4678
molecular_framework=="Aliphatic acyclic compounds"             86100
molecular_framework=="Aromatic homopolycyclic compounds"         468
molecular_framework=="Aliphatic homopolycyclic compounds"       1302
molecular_framework=="Aromatic homomonocyclic compounds"        4331
molecular_framework=="Aliphatic heteromonocyclic compounds"     2259
molecular_framework=="Aromatic heteropolycyclic compounds"     10559
molecular_framework=="Aliphatic heteropolycyclic compounds"     1413
molecular_framework=="None"                                     1893
molecular_framework=="Aliphatic homomonocyclic compounds"       1219
dtype: int64

### alternative_parents

In [100]:
%%time
metabolite_alternative_parents_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'alternative_parents')
metabolite_alternative_parents_df = get_feature_categories(metabolite_alternative_parents_series, 'list', 'alternative_parents')
du.dump_in_pickle(metabolite_alternative_parents_features_file, metabolite_alternative_parents_df)
metabolite_alternative_parents_df

CPU times: user 2min 8s, sys: 309 ms, total: 2min 8s
Wall time: 2min 8s


Unnamed: 0,"""2-benzimidazolylcarbamic acid esters"" in alternative_parents","""Carboxylic acids"" in alternative_parents","""Halogenated steroids"" in alternative_parents","""Azo compounds"" in alternative_parents","""Organotin compounds"" in alternative_parents","""Azoles"" in alternative_parents","""Alkylsilanes"" in alternative_parents","""Nicotinamide nucleotides"" in alternative_parents","""7-alpha-hydroxysteroids"" in alternative_parents","""Acyloins"" in alternative_parents",...,"""Organophosphorus compounds"" in alternative_parents","""Aminoquinolines and derivatives"" in alternative_parents","""Thiocarbamic acid derivatives"" in alternative_parents","""Tosyl compounds"" in alternative_parents","""Indanes"" in alternative_parents","""Steroid esters"" in alternative_parents","""Cyclopropanecarboxylic acids"" in alternative_parents","""Spirostanes and derivatives"" in alternative_parents","""6-hydroxysteroids"" in alternative_parents","""Cyclic olefins"" in alternative_parents"
HMDB0000001,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
metabolite_alternative_parents_df.sum()

"2-benzimidazolylcarbamic acid esters" in alternative_parents        1
"Carboxylic acids" in alternative_parents                        10728
"Halogenated steroids" in alternative_parents                       34
"Azo compounds" in alternative_parents                              36
"Organotin compounds" in alternative_parents                         2
                                                                 ...  
"Steroid esters" in alternative_parents                             39
"Cyclopropanecarboxylic acids" in alternative_parents                4
"Spirostanes and derivatives" in alternative_parents                38
"6-hydroxysteroids" in alternative_parents                          43
"Cyclic olefins" in alternative_parents                            134
Length: 1544, dtype: int64

In [102]:
metabolite_alternative_parents_df.sum().sum()

1030404

### substituents

In [103]:
%%time
metabolite_substituents_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'substituents')
metabolite_substituents_df = get_feature_categories(metabolite_substituents_series, 'list', 'substituents')
du.dump_in_pickle(metabolite_substituents_features_file, metabolite_substituents_df)
metabolite_substituents_df

CPU times: user 2min 11s, sys: 250 ms, total: 2min 11s
Wall time: 2min 11s


Unnamed: 0,"""3-prenylated chalcone"" in substituents","""2-halobenzoic acid"" in substituents","""Steviol glycoside"" in substituents","""Glycerol ether"" in substituents","""Hydroxy fatty acid"" in substituents","""Iodobenzene"" in substituents","""11-alpha-hydroxysteroid"" in substituents","""Orthocarboxylic acid derivative"" in substituents","""Purine ribonucleoside triphosphate"" in substituents","""6-oxopurine"" in substituents",...,"""Acyl monophosphate"" in substituents","""Precorrin"" in substituents","""Thiobarbiturate"" in substituents","""Polyphenyl ether"" in substituents","""Solanidane skeleton"" in substituents","""Pyrimidine-5-carboxylic acid or derivatives"" in substituents","""M-aminophenol"" in substituents","""Tetrahydroquinoline"" in substituents","""Pectenotoxin fragment"" in substituents","""Bromodiphenyl ether"" in substituents"
HMDB0000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
metabolite_substituents_df.sum()

"3-prenylated chalcone" in substituents                            57
"2-halobenzoic acid" in substituents                                6
"Steviol glycoside" in substituents                                 5
"Glycerol ether" in substituents                                 1380
"Hydroxy fatty acid" in substituents                              800
                                                                 ... 
"Pyrimidine-5-carboxylic acid or derivatives" in substituents       1
"M-aminophenol" in substituents                                    10
"Tetrahydroquinoline" in substituents                              10
"Pectenotoxin fragment" in substituents                             3
"Bromodiphenyl ether" in substituents                               8
Length: 2403, dtype: int64

In [105]:
metabolite_substituents_df.sum().sum()

2051564

### external_descriptors

In [98]:
%%time
metabolite_external_descriptors_series = get_metabolites_feature_vals(hmdb_metabolites_taxonomy, 'external_descriptors')
metabolite_external_descriptors_df = get_feature_categories(metabolite_external_descriptors_series, 'list', 'external_descriptors')
du.dump_in_pickle(metabolite_external_descriptors_features_file, metabolite_external_descriptors_df)
metabolite_external_descriptors_df

CPU times: user 1min 40s, sys: 152 ms, total: 1min 40s
Wall time: 1min 40s


Unnamed: 0,"""bismuth cation"" in external_descriptors","""3-(3-sn-phosphatidyl)-sn-glycerol 1-phosphate"" in external_descriptors","""beta-farnesene"" in external_descriptors","""aryl sulfate"" in external_descriptors","""glutamic semialdehyde"" in external_descriptors","""phenanthrol"" in external_descriptors","""hydroxyisoflavans"" in external_descriptors","""linalool"" in external_descriptors","""octadeca-10,12-dienoic acid"" in external_descriptors","""bromoalkane"" in external_descriptors",...,"""1-linoleoyl-2-oleoylglycerol"" in external_descriptors","""tropane alkaloid"" in external_descriptors","""Pyrazole herbicides"" in external_descriptors","""carbonyl compound"" in external_descriptors","""hydroxyflavone"" in external_descriptors","""hydroxy-L-tryptophan"" in external_descriptors","""heteroarenecarbaldehyde"" in external_descriptors","""Actinamines"" in external_descriptors","""benzenoid aromatic compound"" in external_descriptors","""hexenal"" in external_descriptors"
HMDB0000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
metabolite_external_descriptors_df.sum()

"bismuth cation" in external_descriptors                                    1
"3-(3-sn-phosphatidyl)-sn-glycerol 1-phosphate" in external_descriptors     1
"beta-farnesene" in external_descriptors                                    2
"aryl sulfate" in external_descriptors                                     17
"glutamic semialdehyde" in external_descriptors                             4
                                                                           ..
"hydroxy-L-tryptophan" in external_descriptors                              1
"heteroarenecarbaldehyde" in external_descriptors                           1
"Actinamines" in external_descriptors                                       1
"benzenoid aromatic compound" in external_descriptors                       1
"hexenal" in external_descriptors                                           2
Length: 3210, dtype: int64

In [100]:
metabolite_external_descriptors_df.sum().sum()

18602

# Biological properties

In [7]:
hmdb_metabolites_bio_prop[0]

{'accession': 'HMDB0000001',
 'name': '1-Methylhistidine',
 'biological_properties': {'cellular_locations': ['Cytoplasm'],
  'biospecimen_locations': ['Blood',
   'Cerebrospinal Fluid (CSF)',
   'Feces',
   'Saliva',
   'Urine'],
  'tissue_locations': ['Placenta', 'Skeletal Muscle'],
  'pathways': [{'name': 'Histidine Metabolism',
    'smpdb_id': 'SMP00044',
    'kegg_map_id': 'map00340'},
   {'name': 'Histidinemia', 'smpdb_id': 'SMP00191', 'kegg_map_id': None}]}}

### Cellular locations

In [3]:
%%time
metabolite_cellular_locations_series = get_metabolites_feature_vals(hmdb_metabolites_bio_prop, 'cellular_locations')
metabolite_cellular_locations_df = get_feature_categories(metabolite_cellular_locations_series, 'list', 'cellular_locations')
du.dump_in_pickle(metabolite_cellular_locations_features_file, metabolite_cellular_locations_df)
metabolite_cellular_locations_df

CPU times: user 1min 44s, sys: 252 ms, total: 1min 44s
Wall time: 1min 44s


Unnamed: 0,"""Mitochondria"" in cellular_locations","""Peroxisome"" in cellular_locations","""Microsomes"" in cellular_locations","""Endoplasmic reticulum"" in cellular_locations","""Extracellular"" in cellular_locations","""Inner mitochondrial membrane"" in cellular_locations","""Golgi apparatus"" in cellular_locations","""Nucleus"" in cellular_locations","""Membrane"" in cellular_locations","""Lysosome"" in cellular_locations","""Cytoplasm"" in cellular_locations"
HMDB0000001,0,0,0,0,0,0,0,0,0,0,1
HMDB0000002,0,0,0,0,0,0,0,0,0,0,1
HMDB0000005,0,0,0,0,0,0,0,0,0,0,1
HMDB0000008,0,0,0,0,1,0,0,0,0,0,1
HMDB0000010,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,0


In [4]:
metabolite_cellular_locations_df.sum().astype(int)

"Mitochondria" in cellular_locations                      310
"Peroxisome" in cellular_locations                         97
"Microsomes" in cellular_locations                          2
"Endoplasmic reticulum" in cellular_locations             131
"Extracellular" in cellular_locations                   29243
"Inner mitochondrial membrane" in cellular_locations        1
"Golgi apparatus" in cellular_locations                    45
"Nucleus" in cellular_locations                            79
"Membrane" in cellular_locations                        32841
"Lysosome" in cellular_locations                           75
"Cytoplasm" in cellular_locations                        7860
dtype: int64

In [5]:
metabolite_cellular_locations_df.sum().sum()

70684

### Biospecimen locations

In [6]:
%%time
metabolite_biospecimen_locations_series = get_metabolites_feature_vals(hmdb_metabolites_bio_prop, 'biospecimen_locations')
metabolite_biospecimen_locations_df = get_feature_categories(metabolite_biospecimen_locations_series, 'list', 'biospecimen_locations')
du.dump_in_pickle(metabolite_biospecimen_locations_features_file, metabolite_biospecimen_locations_df)
metabolite_biospecimen_locations_df

CPU times: user 1min 44s, sys: 140 ms, total: 1min 44s
Wall time: 1min 44s


Unnamed: 0,"""Semen"" in biospecimen_locations","""Blood"" in biospecimen_locations","""Amniotic Fluid"" in biospecimen_locations","""Breath"" in biospecimen_locations","""Prostate Tissue"" in biospecimen_locations","""Feces"" in biospecimen_locations","""Aqueous Humour"" in biospecimen_locations","""Pericardial Effusion"" in biospecimen_locations","""Urine"" in biospecimen_locations","""Lymph"" in biospecimen_locations","""Ascites Fluid"" in biospecimen_locations","""Cellular Cytoplasm"" in biospecimen_locations","""Breast Milk"" in biospecimen_locations","""Tears"" in biospecimen_locations","""Saliva"" in biospecimen_locations","""Cerebrospinal Fluid (CSF)"" in biospecimen_locations","""Sweat"" in biospecimen_locations","""Bile"" in biospecimen_locations"
HMDB0000001,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0
HMDB0000002,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
HMDB0000005,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0
HMDB0000008,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,1,1,0
HMDB0000010,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
metabolite_biospecimen_locations_df.sum()

"Amniotic Fluid" in biospecimen_locations                  18
"Lymph" in biospecimen_locations                            1
"Urine" in biospecimen_locations                         4364
"Aqueous Humour" in biospecimen_locations                   1
"Ascites Fluid" in biospecimen_locations                    1
"Cerebrospinal Fluid (CSF)" in biospecimen_locations      445
"Blood" in biospecimen_locations                        25411
"Breast Milk" in biospecimen_locations                    121
"Semen" in biospecimen_locations                            4
"Prostate Tissue" in biospecimen_locations                 12
"Bile" in biospecimen_locations                            18
"Saliva" in biospecimen_locations                        1245
"Tears" in biospecimen_locations                            1
"Feces" in biospecimen_locations                         6810
"Sweat" in biospecimen_locations                           91
"Pericardial Effusion" in biospecimen_locations             1
"Cellula

In [13]:
metabolite_biospecimen_locations_df.sum().sum()

38653

### Tissue locations

In [7]:
%%time
metabolite_tissue_locations_series = get_metabolites_feature_vals(hmdb_metabolites_bio_prop, 'tissue_locations')
metabolite_tissue_locations_df = get_feature_categories(metabolite_tissue_locations_series, 'list', 'tissue_locations')
du.dump_in_pickle(metabolite_tissue_locations_features_file, metabolite_tissue_locations_df)
metabolite_tissue_locations_df

CPU times: user 1min 44s, sys: 180 ms, total: 1min 45s
Wall time: 1min 45s


Unnamed: 0,"""Adrenal Gland"" in tissue_locations","""Leukocyte"" in tissue_locations","""Thyroid Gland"" in tissue_locations","""Skeletal Muscle"" in tissue_locations","""Adrenal Cortex"" in tissue_locations","""Adipose Tissue"" in tissue_locations","""Bone Marrow"" in tissue_locations","""Parathyroid"" in tissue_locations","""Bile"" in tissue_locations","""Cartilage"" in tissue_locations",...,"""Hair"" in tissue_locations","""Testes"" in tissue_locations","""Adrenal Medulla"" in tissue_locations","""Smooth Muscle"" in tissue_locations","""Epidermis"" in tissue_locations","""Retina"" in tissue_locations","""Basal Ganglia"" in tissue_locations","""Gall Bladder"" in tissue_locations","""Spleen"" in tissue_locations","""Bladder"" in tissue_locations"
HMDB0000001,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
metabolite_tissue_locations_df.sum()

"Semen" in tissue_locations                 8
"Brain" in tissue_locations              1888
"Umbilical cord" in tissue_locations        1
"Adipose Tissue" in tissue_locations      136
"All Tissues" in tissue_locations        4343
"Smooth Muscle" in tissue_locations         9
"Bladder" in tissue_locations              86
"Intestine" in tissue_locations           259
"Pineal Gland" in tissue_locations          1
"Neuron" in tissue_locations              328
"Hair" in tissue_locations                 14
"Pancreas" in tissue_locations            117
"Leukocyte" in tissue_locations            35
"Erythrocyte" in tissue_locations          70
"Adrenal Medulla" in tissue_locations      53
"Lung" in tissue_locations                 20
"Liver" in tissue_locations              1601
"Fibroblasts" in tissue_locations         337
"Basal Ganglia" in tissue_locations         5
"Adrenal Gland" in tissue_locations        88
"Cartilage" in tissue_locations            11
"Thyroid Gland" in tissue_location

In [27]:
metabolite_tissue_locations_df.sum().sum()

13374

In [28]:
feature_vals_str = list({x for l in metabolite_tissue_locations_series for x in l})
target_str= ''
for val in feature_vals_str:
    target_str+= f'"{val}", ' 
target_str

'"Semen", "Brain", "Umbilical cord", "Adipose Tissue", "All Tissues", "Smooth Muscle", "Bladder", "Intestine", "Pineal Gland", "Neuron", "Hair", "Pancreas", "Leukocyte", "Erythrocyte", "Adrenal Medulla", "Lung", "Liver", "Fibroblasts", "Basal Ganglia", "Adrenal Gland", "Cartilage", "Thyroid Gland", "Skeletal Muscle", "Bone Marrow", "Platelet", "Testes", "Placenta", "Epidermis", "Gall Bladder", "Prostate", "Parathyroid", "Urine", "Adrenal Cortex", "Spleen", "Retina", "Eye Lens", "Vitreous humor", "Kidney", "Heart", "Bile", "Ovary", "Testis", "Blood", '

### Pathways

For additional information on pathways: https://smpdb.ca/downloads

In [133]:
def get_all_unique_pathways():
    metabolite_pathways_series = get_metabolites_feature_vals(hmdb_metabolites_bio_prop, 'pathways')
    key = lambda d: tuple(sorted(d.items()))
    unique_pathway_dicts = list(OrderedDict((key(v), v) for row in metabolite_pathways_series for v in row).values())
    all_pathways = pd.DataFrame(unique_pathway_dicts)
    return all_pathways

def get_pathway_cat(pathway_names):
    return sparse.csr_matrix(pathway_cats[pathway_index_by_name[pathway_names].values].sum(axis=0))

In [134]:
%%time
metabolite_pathways_series = get_metabolites_feature_vals(hmdb_metabolites_bio_prop, 'pathways_name')
all_pathways = get_all_unique_pathways()
pathway_cats = sparse.csr_matrix(scipy.sparse.diags(np.ones(all_pathways.shape[0])))
pathway_index_by_name = all_pathways.reset_index().set_index('name', drop=True)['index']
pathway_column_names = [f"{smpdb_id} in pathways" for smpdb_id in all_pathways['smpdb_id']]
metabolites_pathways_df = sparse.vstack(metabolite_pathways_series.apply(get_pathway_cat)).astype(int)
metabolites_pathways_df = pd.DataFrame.sparse.from_spmatrix(metabolites_pathways_df, index=metabolite_pathways_series.index, columns=pathway_column_names)
du.dump_in_pickle(metabolites_pathways_features_file, metabolites_pathways_df)
metabolites_pathways_df

CPU times: user 3min 38s, sys: 76.4 ms, total: 3min 38s
Wall time: 3min 38s


Unnamed: 0,SMP00044 in pathways,SMP00191 in pathways,SMP00007 in pathways,SMP00493 in pathways,SMP00351 in pathways,SMP00492 in pathways,SMP00721 in pathways,SMP00177 in pathways,SMP00179 in pathways,SMP00242 in pathways,...,SMP00264 in pathways,SMP00645 in pathways,SMP00619 in pathways,SMP00263 in pathways,SMP31696 in pathways,SMP31697 in pathways,SMP00732 in pathways,SMP31698 in pathways,SMP31699 in pathways,SMP31700 in pathways
HMDB0000001,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000002,0,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000005,0,0,0,0,0,0,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
HMDB0000008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0000010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HMDB0240686,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Physical properties

In [75]:
experimental_properties = ['water_solubility', 'boiling_point', 'melting_point', 'logp']
predicted_properties_all = ['formal_charge', 'smiles', 'logp', 'pka_strongest_basic', 'refractivity', 'donor_count', 'bioavailability', 
                            'polar_surface_area', 'ghose_filter', 'solubility', 'acceptor_count', 'mono_mass', 'rule_of_five', 'average_mass', 
                            'polarizability', 'physiological_charge', 'inchikey', 'logs', 'veber_rule', 'formula', 'rotatable_bond_count', 
                            'iupac', 'mddr_like_rule', 'pka_strongest_acidic', 'inchi', 'number_of_rings']

predicted_properties_exclude = ['iupac', 'inchi', 'inchikey', 'smiles', 'formula']

predicted_properties = list(set(predicted_properties_all) - set(predicted_properties_exclude))

def get_physical_property_value(metabolite, key, experimental_predicted):
    value_str = next((item['value'] for item in metabolite[experimental_predicted] if item["kind"] == key), None)
    return value_str

def parse_value_str(value_str):
    if '< ' in value_str:
        value_str = value_str.replace('< ', '')
    if '> ' in value_str:
        value_str = value_str.replace('> ', '')

    if '(' in value_str and ')' in value_str:
        start = value_str.find('(')
        end = value_str.find(')')
        value_str = value_str[0:start]+value_str[end+1:]

    return value_str.strip()

def deal_with_specific_data_errors(value_str):
    if value_str == '200 g/kg':
        return '200', 'g/L'
    if value_str == '1.19e+00 g/l':
        return '1.19', 'g/L'
    if value_str == '145 - 146 (hydrochloride salt)':
        return '146', '°C'
    elif value_str == '243-246 °C at 1.00E-02 mm Hg':
        return '246', '°C'
    elif value_str == '232–234 °C':
        return '232', '°C'
    elif value_str == '40 – 50 °C':
        return '40', '°C'
    elif value_str == '618.2°C at 760 mmHg':
        return '618.2', '°C'
    elif value_str == '1000.0 mg/mL; 862 mg/mL (magnesium salt)':
        return '1000', 'mg/mL'
    elif '473°C' in value_str:
        return '473', '°C'
    elif value_str == '248-250°C':
        return '248', '°C'
    elif 'soluble' in value_str:
        return None
    else:
        return None

def get_physical_property(metabolite, key, experimental_predicted):
    value_str = get_physical_property_value(metabolite, key, experimental_predicted)

    if value_str:
        if 'soluble' in value_str:
            return None
        # Check for specific data errors -> manual correction
        exception_result = deal_with_specific_data_errors(value_str)
        if exception_result:
            return exception_result

        # Deal with more common exceptions
        value_str = parse_value_str(value_str)
        
        if ' at ' in value_str:
            if key=='water_solubility' and '°C' in value_str:
                value_str = value_str.split(' at ')[0] 
            else:
                return value_str.split(' at ')[0]
        
        if ' - ' in value_str:
            value, unit = value_str.split(' ')[-2:]
            return value, unit 
        
        if ' ' in value_str:
            value, unit = value_str.split(' ')[:2]
            if value.find('-')>0 and not 'e-' in value and not 'E-' in value:
                value = value.split('-')[0]
            return value, unit 
        else:
            return value_str
    else:
        return None

def metabolite_physical_properties(metabolite):
    new_dict={}
    
    new_dict['accession'] = get_value(metabolite, 'accession')
    
    for experimental_property in experimental_properties:
        result = get_physical_property(metabolite, experimental_property, 'experimental_properties')
        if type(result)==tuple:
            value, unit = result
            
            if unit=='soluble':
                print(value)
            
            if ',' in value:
                value = value.replace(',', '.')
            
            if not(experimental_property=='water_solubility' and unit=='mol/L'):
                new_dict[f'experimental_{experimental_property} ({unit})'] = value
        else:
            if result:
                if experimental_property=='melting_point':
                    print(result)
                
                if not(experimental_property=='water_solubility'):
                    new_dict[f'experimental_{experimental_property}'] = result
        
    for predicted_property in predicted_properties:
        result = get_physical_property(metabolite, predicted_property, 'predicted_properties')
        if type(result)==tuple:
            value, unit = result
            new_dict[f'predicted_{predicted_property} ({unit})'] = value
        else:
            if result:
                new_dict[f'predicted_{predicted_property}'] = result

    return new_dict

def binary_yes_no(yes_no_str):
    if(type(yes_no_str)==str):
        if yes_no_str=='Yes':
            return True
        elif yes_no_str=='No':
            return False
        else:
            return None
    else:
        return yes_no_str

def convert_df_column(df, column_name, type):
    df[column_name] = df[column_name].astype(type)
    
def convert_columns(df):
    convert_df_column(df, 'experimental_water_solubility (g/L)', float)
    convert_df_column(df, 'experimental_water_solubility (mg/mL)', float)
    convert_df_column(df, 'experimental_water_solubility (mg/L)', float)
    convert_df_column(df, 'experimental_melting_point (°C)', float)
    convert_df_column(df, 'experimental_boiling_point (°C)', float)
    convert_df_column(df, 'experimental_logp', float)

    convert_df_column(df, 'predicted_average_mass', float)
    convert_df_column(df, 'predicted_logp', float)
    convert_df_column(df, 'predicted_pka_strongest_basic', float)
    convert_df_column(df, 'predicted_polarizability', float)
    convert_df_column(df, 'predicted_refractivity', float)
    convert_df_column(df, 'predicted_logs', float)
    convert_df_column(df, 'predicted_mono_mass', float)
    convert_df_column(df, 'predicted_mono_mass', float)
    convert_df_column(df, 'predicted_polar_surface_area', float)
    convert_df_column(df, 'predicted_pka_strongest_acidic', float)
    convert_df_column(df, 'predicted_solubility (g/L)', float)
    convert_df_column(df, 'predicted_formal_charge', float)

    #Actually int but contains nans so has to be float
    convert_df_column(df, 'predicted_physiological_charge', float)
    convert_df_column(df, 'predicted_donor_count', float)
    convert_df_column(df, 'predicted_rotatable_bond_count', float)
    convert_df_column(df, 'predicted_acceptor_count', float)
    convert_df_column(df, 'predicted_number_of_rings', float)

    #Convert to binary from Yes/No strings
    df['predicted_veber_rule'] = df['predicted_veber_rule'].apply(binary_yes_no)
    df['predicted_bioavailability'] = df['predicted_bioavailability'].apply(binary_yes_no)
    df['predicted_mddr_like_rule'] = df['predicted_mddr_like_rule'].apply(binary_yes_no)
    df['predicted_ghose_filter'] = df['predicted_ghose_filter'].apply(binary_yes_no)
    df['predicted_rule_of_five'] = df['predicted_rule_of_five'].apply(binary_yes_no)   
    
    return df

def get_physical_properties_features(list_of_dicts):
    phys_prop_df = pd.DataFrame([metabolite_physical_properties(metabolite) for metabolite in list_of_dicts])
    phys_prop_df = phys_prop_df.set_index('accession')
    phys_prop_df = convert_columns(phys_prop_df) 
    return phys_prop_df

In [76]:
metabolite_physical_properties_df = get_physical_properties_features(hmdb_metabolites_phy_prop)
du.dump_in_pickle(metabolite_physical_properties_features_file, metabolite_physical_properties_df)
metabolite_physical_properties_df

Unnamed: 0_level_0,experimental_water_solubility (g/L),experimental_melting_point (°C),predicted_mddr_like_rule,predicted_formal_charge,predicted_polar_surface_area,predicted_acceptor_count,predicted_ghose_filter,predicted_veber_rule,predicted_solubility (g/L),predicted_pka_strongest_basic,...,predicted_rotatable_bond_count,predicted_number_of_rings,predicted_rule_of_five,predicted_polarizability,predicted_physiological_charge,experimental_logp,experimental_water_solubility (mg/mL),experimental_boiling_point (°C),experimental_water_solubility (mg/L),experimental_water_solubility (g/l)
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HMDB0000001,200.0,249.0,False,0.0,81.14,4.0,False,False,6.9300,9.25,...,3.0,1.0,True,17.11,0.0,,,,,
HMDB0000002,,-12.0,False,0.0,52.04,2.0,False,False,437.0000,10.17,...,2.0,0.0,True,9.06,2.0,-1.43,,,,
HMDB0000005,,33.0,False,0.0,54.37,3.0,False,False,79.2000,-9.70,...,2.0,0.0,True,9.20,-1.0,,119.0,,,
HMDB0000008,,44.2,False,0.0,57.53,3.0,False,False,484.0000,-3.80,...,2.0,0.0,True,9.98,-1.0,,,,,
HMDB0000010,,189.5,False,0.0,46.53,3.0,True,False,0.0075,-4.90,...,1.0,4.0,True,34.34,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,,,False,0.0,69.56,3.0,False,False,,-1.00,...,31.0,0.0,False,75.24,0.0,,,,,
HMDB0240684,,,False,0.0,69.56,3.0,False,False,,-1.00,...,34.0,0.0,False,81.63,0.0,,,,,
HMDB0240685,,,False,0.0,69.56,3.0,False,False,,-1.00,...,29.0,0.0,False,70.97,0.0,,,,,
HMDB0240686,,,False,0.0,69.56,3.0,False,False,,-1.00,...,29.0,0.0,False,71.83,0.0,,,,,


In [106]:
metabolite_physical_properties_df.notna().sum()

experimental_water_solubility (g/L)        1123
experimental_melting_point (°C)            6976
predicted_mddr_like_rule                 114194
predicted_formal_charge                  114219
predicted_polar_surface_area             114196
predicted_acceptor_count                 114196
predicted_ghose_filter                   114219
predicted_veber_rule                     114194
predicted_solubility (g/L)                41528
predicted_pka_strongest_basic            112726
predicted_bioavailability                114194
predicted_refractivity                   114193
predicted_logs                           113954
predicted_mono_mass                      114202
predicted_average_mass                   114202
predicted_pka_strongest_acidic            70245
predicted_logp                           114215
predicted_donor_count                    114196
predicted_rotatable_bond_count           114194
predicted_number_of_rings                114194
predicted_rule_of_five                  

# Combining feature columns

### Load feature dfs pickle

In [102]:
metabolite_molecular_weight_df = du.read_from_pickle(metabolite_molecular_weight_features_file)
metabolite_state_df = du.read_from_pickle(metabolite_state_features_file)
metabolite_kingdom_df = du.read_from_pickle(metabolite_kingdom_features_file)
metabolite_super_class_df = du.read_from_pickle(metabolite_super_class_features_file)
metabolite_class_df = du.read_from_pickle(metabolite_class_features_file)
metabolite_sub_class_df = du.read_from_pickle(metabolite_sub_class_features_file)
metabolite_direct_parent_df = du.read_from_pickle(metabolite_direct_parent_features_file)
metabolite_molecular_framework_df = du.read_from_pickle(metabolite_molecular_framework_features_file)
metabolite_alternative_parents_df = du.read_from_pickle(metabolite_alternative_parents_features_file)
metabolite_substituents_df = du.read_from_pickle(metabolite_substituents_features_file)
metabolite_external_descriptors_df = du.read_from_pickle(metabolite_external_descriptors_features_file)
metabolite_cellular_locations_df = du.read_from_pickle(metabolite_cellular_locations_features_file)
metabolite_biospecimen_locations_df = du.read_from_pickle(metabolite_biospecimen_locations_features_file)
metabolite_tissue_locations_df = du.read_from_pickle(metabolite_tissue_locations_features_file)
metabolite_physical_properties_df = du.read_from_pickle(metabolite_physical_properties_features_file)

KeyboardInterrupt: 

In [115]:
direct_features_dfs = [metabolites_molecular_weight_df, metabolite_state_df]
chemical_taxonomy_dfs = [metabolite_molecular_framework_df, metabolite_kingdom_df, metabolite_super_class_df, metabolite_class_df, metabolite_sub_class_df, metabolite_direct_parent_df]
chemical_taxonomy_additions_dfs = [metabolite_alternative_parents_df, metabolite_substituents_df, metabolite_external_descriptors_df]
biological_properties_dfs = [metabolite_cellular_locations_df, metabolite_biospecimen_locations_df, metabolite_tissue_locations_df]

metabolite_features_dfs = direct_features_dfs + chemical_taxonomy_dfs + chemical_taxonomy_additions_dfs + biological_properties_dfs + biological_properties_dfs + [metabolite_physical_properties_df]

metabolite_all_features_df = pd.concat(metabolite_features_dfs, axis=1)
du.dump_in_pickle(metabolite_all_features_file, metabolite_all_features_df)

print('Size:', sys.getsizeof(metabolite_all_features_df))
metabolite_all_features_df

Size: 101639736


Unnamed: 0,average_molecular_weight,monisotopic_molecular_weight,"state==""Solid""","state==""Liquid""","state==""Gas""","state==""nan""","molecular_framework==""Aromatic heteromonocyclic compounds""","molecular_framework==""Aliphatic acyclic compounds""","molecular_framework==""Aromatic homopolycyclic compounds""","molecular_framework==""Aliphatic homopolycyclic compounds""",...,predicted_rotatable_bond_count,predicted_number_of_rings,predicted_rule_of_five,predicted_polarizability,predicted_physiological_charge,experimental_logp,experimental_water_solubility (mg/mL),experimental_boiling_point (°C),experimental_water_solubility (mg/L),experimental_water_solubility (g/l)
HMDB0000001,169.1811,169.085127,1,0,0,0,1,0,0,0,...,3.0,1.0,True,17.11,0.0,,,,,
HMDB0000002,74.1249,74.084398,0,1,0,0,0,1,0,0,...,2.0,0.0,True,9.06,2.0,-1.43,,,,
HMDB0000005,102.0886,102.031694,1,0,0,0,0,1,0,0,...,2.0,0.0,True,9.20,-1.0,,119.0,,,
HMDB0000008,104.1050,104.047344,1,0,0,0,0,1,0,0,...,2.0,0.0,True,9.98,-1.0,,,,,
HMDB0000010,300.3921,300.172545,1,0,0,0,0,0,1,0,...,1.0,4.0,True,34.34,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HMDB0240683,551.9410,551.527745,0,0,0,1,0,1,0,0,...,31.0,0.0,False,75.24,0.0,,,,,
HMDB0240684,594.0220,593.574695,0,0,0,1,0,1,0,0,...,34.0,0.0,False,81.63,0.0,,,,,
HMDB0240685,523.8870,523.496445,0,0,0,1,0,1,0,0,...,29.0,0.0,False,70.97,0.0,,,,,
HMDB0240686,535.8980,535.496445,0,0,0,1,0,1,0,0,...,29.0,0.0,False,71.83,0.0,,,,,


In [112]:
metabolite_all_features_df.columns

Index(['average_molecular_weight', 'monisotopic_molecular_weight',
       'state=="Solid"', 'state=="Liquid"', 'state=="Gas"', 'state=="nan"',
       'molecular_framework=="Aromatic heteromonocyclic compounds"',
       'molecular_framework=="Aliphatic acyclic compounds"',
       'molecular_framework=="Aromatic homopolycyclic compounds"',
       'molecular_framework=="Aliphatic homopolycyclic compounds"',
       ...
       'predicted_rotatable_bond_count', 'predicted_number_of_rings',
       'predicted_rule_of_five', 'predicted_polarizability',
       'predicted_physiological_charge', 'experimental_logp',
       'experimental_water_solubility (mg/mL)',
       'experimental_boiling_point (°C)',
       'experimental_water_solubility (mg/L)',
       'experimental_water_solubility (g/l)'],
      dtype='object', length=9757)