# HMDB database importer

In [1]:
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import ast
import re

In [2]:
#tree = ET.parse('hmdb_urine/urine_metabolites.xml')
tree = ET.parse('hmdb_urine/serum_metabolites.xml')
root = tree.getroot()

In [3]:
def print_element(element, level=0):
    indent = "  " * level
    print(f"{indent}Tag: {element.tag}, Attributes: {element.attrib}")
    
    if element.text and element.text.strip():
        print(f"{indent}  Text: {element.text.strip()}")
    
    for child in element:
        print_element(child, level + 1)

first = True
for child in root:
    if first:
        #print_element(child)
        first = False

In [4]:
import pandas as pd
# List to hold metabolite data
# Function to extract text from an XML element, if it exists
def get_text(element, tag, namespace):
    child = element.find(f'ns:{tag}', namespace)
    return child.text if child is not None else None

# Function to extract a list of texts from XML elements, if they exist
def get_text_list(element, tag, namespace):
    children = element.findall(f'ns:{tag}', namespace)
    return [child.text for child in children] if children is not None else None

data = []
namespace = {'ns': 'http://www.hmdb.ca'}

# Iterate through each metabolite
for metabolite in root.findall('ns:metabolite', namespace):
    # Extract required elements
    accession = get_text(metabolite, 'accession', namespace)
    secondary_accessions = get_text_list(metabolite.find('ns:secondary_accessions', namespace), 'accession', namespace)
    name = get_text(metabolite, 'name', namespace)
    synonyms = get_text_list(metabolite.find('ns:synonyms', namespace), 'synonym', namespace)
    chemical_formula = get_text(metabolite, 'chemical_formula', namespace)
    iupac_name = get_text(metabolite, 'iupac_name', namespace)
    cas_registry_number = get_text(metabolite, 'cas_registry_number', namespace)
    smiles = get_text(metabolite, 'smiles', namespace)
    inchi = get_text(metabolite, 'inchi', namespace)
    inchikey = get_text(metabolite, 'inchikey', namespace)
    description = get_text(metabolite, 'description', namespace)
    monisotopic_molecular_weight = get_text(metabolite, 'monisotopic_molecular_weight', namespace)
    average_molecular_weight = get_text(metabolite, 'average_molecular_weight', namespace)
    
    # Append the metabolite data to the list
    data.append([accession, secondary_accessions, name, synonyms, chemical_formula, iupac_name, cas_registry_number, smiles, inchi, inchikey, monisotopic_molecular_weight, average_molecular_weight, description])

# Create a DataFrame
columns = ['accession', 'secondary_accessions', 'name', 'synonyms', 'chemical_formula', 'iupac_name', 'cas_registry_number', 'smiles', 'inchi', 'inchikey', 'monisotopic_molecular_weight', 'average_molecular_weight', 'description']
df = pd.DataFrame(data, columns=columns)

# Display the DataFrame
df.head()

# Save the DataFrame to a CSV file if needed
#df.to_csv('metabolites.csv', index=False)
#df.drop('description').to_csv('metabolites_no_description.csv', index=False)

Unnamed: 0,accession,secondary_accessions,name,synonyms,chemical_formula,iupac_name,cas_registry_number,smiles,inchi,inchikey,monisotopic_molecular_weight,average_molecular_weight,description
0,HMDB0000001,"[HMDB00001, HMDB0004935, HMDB0006703, HMDB0006...",1-Methylhistidine,[(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)pro...,C7H11N3O2,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,332-80-9,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,BRMWTNUJHUMWMS-LURJTMIESA-N,169.085126611,169.1811,"1-Methylhistidine, also known as 1-MHis or 1MH..."
1,HMDB0000002,"[HMDB00002, HMDB0060172, HMDB60172]","1,3-Diaminopropane","[1,3-Propanediamine, 1,3-Propylenediamine, Pro...",C3H10N2,"propane-1,3-diamine",109-76-2,NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,XFNJVJPLKCPIBV-UHFFFAOYSA-N,74.08439833,74.1249,"1,3-Diaminopropane, also known as DAP or trime..."
2,HMDB0000005,"[HMDB00005, HMDB0006544, HMDB06544]",2-Ketobutyric acid,"[2-Ketobutanoic acid, 2-Oxobutyric acid, 3-Met...",C4H6O3,2-oxobutanoic acid,600-18-0,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",TYEYBOSBBBHJIV-UHFFFAOYSA-N,102.031694058,102.0886,"2-Ketobutyric acid, also known as alpha-ketobu..."
3,HMDB0000008,[HMDB00008],2-Hydroxybutyric acid,"[(S)-2-Hydroxybutanoic acid, 2-Hydroxybutyrate...",C4H8O3,(2S)-2-hydroxybutanoic acid,3347-90-8,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",AFENDNXGAFYKQO-VKHMYHEASA-N,104.047344118,104.105,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn..."
4,HMDB0000010,"[HMDB00010, HMDB0004990, HMDB0004991, HMDB0499...",2-Methoxyestrone,"[2-(8S,9S,13S,14S)-3-Hydroxy-2-methoxy-13-meth...",C19H24O3,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",362-08-3,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,WHEUWNKSCXYKBU-QPWUGHHJSA-N,300.172544634,300.3921,2-Methoxyestrone (or 2-ME1) belongs to the cla...


In [5]:
import os

def is_1h_file(path):
    if os.path.isfile(path):
        with open(path) as f:
            first_line = f.readline()
            return "_13C_" not in first_line and "ADDRESS" not in first_line
    return False

def list_files(folder_path):
    try:
        # Get all entries in the folder
        entries = os.listdir(folder_path)
        # Filter out only files
        files = [f for f in entries if is_1h_file(os.path.join(folder_path, f))]
        return files
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
        return []
    except PermissionError:
        print(f"Permission denied: {folder_path}")
        return []

spectra_files = list_files('hmdb_urine/hmdb_nmr_peak_lists')
print(len(spectra_files))

1786


In [6]:
def create_dataframe(file_list):
    # List to hold the data for each file
    data = []

    for file_name in file_list:
        # Split the file name by "_" and take the first part as accession
        tokens  = file_name.split('_')
        # Append the data as a tuple (accession, file_name)
        data.append((tokens[0], file_name, tokens[1]))

    # Create the DataFrame
    df = pd.DataFrame(data, columns=['accession', 'file_name', 'dim'])

    return df

df_files = create_dataframe(spectra_files)

#Filter out only 1d files
df_files = df_files[df_files['dim'] == 'nmroned']
df_files.head()

Unnamed: 0,accession,file_name,dim
4,HMDB0003072,HMDB0003072_nmroned_5251_2734492.txt,nmroned
7,HMDB0000176,HMDB0000176_nmroned_1144_28428.txt,nmroned
9,HMDB0002873,HMDB0002873_nmroned_1910_33109.txt,nmroned
11,HMDB0000622,HMDB0000622_nmroned_1440_29992.txt,nmroned
12,HMDB0000548,HMDB0000548_nmroned_1415_29734.txt,nmroned


In [7]:
df = df.join(df_files.set_index('accession'), on='accession', how='left')
df = df.dropna(subset=['file_name'])
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,accession,secondary_accessions,name,synonyms,chemical_formula,iupac_name,cas_registry_number,smiles,inchi,inchikey,monisotopic_molecular_weight,average_molecular_weight,description,file_name,dim
0,HMDB0000001,"[HMDB00001, HMDB0004935, HMDB0006703, HMDB0006...",1-Methylhistidine,[(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)pro...,C7H11N3O2,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,332-80-9,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,BRMWTNUJHUMWMS-LURJTMIESA-N,169.085126611,169.1811,"1-Methylhistidine, also known as 1-MHis or 1MH...",HMDB0000001_nmroned_1022_27891.txt,nmroned
1,HMDB0000002,"[HMDB00002, HMDB0060172, HMDB60172]","1,3-Diaminopropane","[1,3-Propanediamine, 1,3-Propylenediamine, Pro...",C3H10N2,"propane-1,3-diamine",109-76-2,NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,XFNJVJPLKCPIBV-UHFFFAOYSA-N,74.08439833,74.1249,"1,3-Diaminopropane, also known as DAP or trime...",HMDB0000002_nmroned_1023_27894.txt,nmroned
2,HMDB0000005,"[HMDB00005, HMDB0006544, HMDB06544]",2-Ketobutyric acid,"[2-Ketobutanoic acid, 2-Oxobutyric acid, 3-Met...",C4H6O3,2-oxobutanoic acid,600-18-0,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",TYEYBOSBBBHJIV-UHFFFAOYSA-N,102.031694058,102.0886,"2-Ketobutyric acid, also known as alpha-ketobu...",HMDB0000005_nmroned_1024_27899.txt,nmroned
3,HMDB0000008,[HMDB00008],2-Hydroxybutyric acid,"[(S)-2-Hydroxybutanoic acid, 2-Hydroxybutyrate...",C4H8O3,(2S)-2-hydroxybutanoic acid,3347-90-8,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",AFENDNXGAFYKQO-VKHMYHEASA-N,104.047344118,104.105,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn...",HMDB0000008_nmroned_5245_2734397.txt,nmroned
4,HMDB0000010,"[HMDB00010, HMDB0004990, HMDB0004991, HMDB0499...",2-Methoxyestrone,"[2-(8S,9S,13S,14S)-3-Hydroxy-2-methoxy-13-meth...",C19H24O3,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",362-08-3,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,WHEUWNKSCXYKBU-QPWUGHHJSA-N,300.172544634,300.3921,2-Methoxyestrone (or 2-ME1) belongs to the cla...,HMDB0000010_nmroned_1026_27907.txt,nmroned


In [8]:
df.shape

(648, 15)

In [9]:
# Save the DataFrame to a CSV file if needed
df.to_csv('hmdb_urine/metabolites.csv', index=False)
df.drop(columns=['description']).to_csv('metabolites_no_description.csv', index=False)

## Prepare the dataset for approximate lookup using the metabolyte names and synomyms

In [10]:
df['synonyms_cat'] = df['name'] + " " + df['synonyms'].str.join(' ')
df['synonyms_cat'] = df['synonyms_cat'].str.lower()

In [11]:
from rapidfuzz import process, fuzz

# Function to perform approximate lookup
def approximate_lookup(df, column, search_string, scorer1, scorer2, limit=10):
    # Extract the column values as a list
    choices = df[column].str.lower().tolist()
    names = df['name'].values
    search_string = search_string.lower()
    # Get the best matches
    matches = process.extract(search_string, choices, scorer=scorer1, score_cutoff=100, limit=None)
    #print(matches)
    if len(matches) > 0:
        result = []
        for match in matches:
            matchings_names = df.loc[match[2],['synonyms']].values[0]
            best_name = ""
            best_score = 0
            for name in [names[match[2]]] + matchings_names:
                score = scorer2(search_string, name.lower())
                if score > best_score:
                    best_score = score
                    best_name = name
            result.append([names[match[2]], best_name, match[2], best_score,])
        # Map to the main name
        matches = pd.DataFrame(result).sort_values(3, ascending=False)
        #for synomym in data['synomyns'].
        return matches.head(limit).values
    return None

In [12]:
df['synonyms_cat']

0      1-methylhistidine (2s)-2-amino-3-(1-methyl-1h-...
1      1,3-diaminopropane 1,3-propanediamine 1,3-prop...
2      2-ketobutyric acid 2-ketobutanoic acid 2-oxobu...
3      2-hydroxybutyric acid (s)-2-hydroxybutanoic ac...
4      2-methoxyestrone 2-(8s,9s,13s,14s)-3-hydroxy-2...
                             ...                        
643    d-methionine (s)-s-oxide methionine sulphoxide...
644    3-[[(2s)-2,4-dihydroxy-3,3-dimethylbutanoyl]am...
645    glycyl-d-proline 1-(aminoacetyl)proline 1-glyc...
646    mannoheptulose altro-heptulose d-altro-2-heptu...
647    n-acetyl-1-aspartylglutamic acid n-acetylaspar...
Name: synonyms_cat, Length: 648, dtype: object

In [13]:
matches = approximate_lookup(df, 'synonyms_cat', 'citric acid', fuzz.partial_token_ratio, fuzz.ratio, limit=3)
print(matches)

[['Citric acid' 'Citric acid' 46 100.0]
 ['trans-Aconitic acid' 'Citridic acid' 376 91.66666666666666]
 ['cis-Aconitic acid' 'Citridic acid' 37 91.66666666666666]]


## Loook for p-cresol

In [14]:
#matches = approximate_lookup(df, 'name', "3-hydroxybutirate")
matches = approximate_lookup(df, 'synonyms_cat', "ethylendiaminotetraacetic acid", fuzz.partial_token_ratio, fuzz.ratio, limit=3)
print(matches)

[['Sarcosine' 'Methylaminoacetic acid' 147 80.76923076923077]
 ['Acetylglycine' 'Acetylaminoacetic acid' 218 76.92307692307692]
 ['Methylimidazoleacetic acid' 'Methylimidazoleacetic acid' 554 75.0]]


## Look for all metabolytes into the dataset

In [15]:
entries = ["2-Aminobutyric acid","2-Hydroxybutyric acid","2-Oxoglutaric acid","3-Hydroxybutyric acid","Acetic acid","Acetoacetic acid","Acetone","Alanine","Asparagine","Ca-EDTA","Choline","Citric acid","Creatine","Creatinine","D-Galactose","Dimethylsulfone","Ethanol","Formic acid","Glucose","Glutamic acid","Glutamine","Glycerol","Glycine","Histidine","Isoleucine","K-EDTA","Lactic acid","Leucine","Lysine","Methionine","N,N-Dimethylglycine","Ornithine","Phenylalanine","Proline","Pyruvic acid","Sarcosine","Succinic acid","Threonine","Trimethylamine-N-oxide","Tyrosine","Valine"]

for entry in entries:
    matches = approximate_lookup(df, 'synonyms_cat', entry, fuzz.partial_token_ratio, fuzz.ratio, limit=3)
    print(entry, matches)

2-Aminobutyric acid [['D-alpha-Aminobutyric acid' '2-Aminobutyric acid' 250 100.0]
 ['L-alpha-Aminobutyric acid' '2-Aminobutyric acid' 198 100.0]
 ['gamma-Aminobutyric acid' '4-Aminobutyric acid' 53 94.73684210526316]]
2-Hydroxybutyric acid [['2-Hydroxybutyric acid' '2-Hydroxybutyric acid' 3 100.0]
 ['(S)-3-Hydroxybutyric acid' '3-Hydroxybutyric acid' 194
  95.23809523809523]
 ['3-Hydroxybutyric acid' '3-Hydroxybutyric acid' 5 95.23809523809523]]
2-Oxoglutaric acid [['Oxoglutaric acid' '2-Oxoglutaric acid' 112 100.0]
 ['2-Ketobutyric acid' '2-Oxobutyric acid' 2 85.71428571428572]
 ['L-Glutamic acid' '2-Aminoglutaric acid' 74 84.21052631578947]]
3-Hydroxybutyric acid [['3-Hydroxybutyric acid' '3-Hydroxybutyric acid' 5 100.0]
 ['(S)-3-Hydroxybutyric acid' '3-Hydroxybutyric acid' 194 100.0]
 ['2-Hydroxybutyric acid' '2-Hydroxybutyric acid' 3 95.23809523809523]]
Acetic acid [['Acetic acid' 'Acetic acid' 23 100.0]
 ['Acetylglycine' 'Acetic acid' 218 100.0]
 ['Acetoacetic acid' 'Diacetic aci

In [16]:
"L-Isoleucine", "L-Leucine", 'L-Valine', 'Isobutyric acid', '3-Hydroxybutyric acid', 'L-Lactic acid', 'L-Alanine', 'Acetic acid'

('L-Isoleucine',
 'L-Leucine',
 'L-Valine',
 'Isobutyric acid',
 '3-Hydroxybutyric acid',
 'L-Lactic acid',
 'L-Alanine',
 'Acetic acid')

In [17]:
entries = ["Isoleucine","Leucine","Valine","Isoleucine","Valine","Isobutyrate","Ethanol","3-hydroxybutyrate","Lactate","Alanine","Acetate","3-hydroxybutyrate","Pyruvate","3-hydroxybutyrate","Succinate","Citrate","Citrate","Creatine","beta-Glucose","Ethanol","Alanine","Lactate","alpha-Glucose","Tyrosine","1-methylhistidine","Tyrosine","1-methylhistidine","Formate"]
for entry in entries:
    matches = approximate_lookup(df, 'synonyms_cat', entry, fuzz.partial_token_ratio, fuzz.ratio, limit=3)
    print(entry, matches)

Isoleucine [['L-Isoleucine' 'ISOLEUCINE' 91 100.0]
 ['L-Alloisoleucine' 'Isoleucine' 225 100.0]
 ['3-Methyl-2-oxovaleric acid' '2-Oxoisoleucine' 209 80.0]]
Leucine [['L-Leucine' 'LEUCINE' 265 100.0]
 ['Aminocaproic acid' 'e-Leucine' 482 87.5]
 ['Beta-Leucine' 'b-Leucine' 592 87.5]]
Valine [['L-Valine' 'VALINE' 343 100.0]
 ['alpha-Ketoisovaleric acid' 'Ketovaline' 10 75.0]
 ['L-Leucine' 'LEUCINE' 265 61.53846153846154]]
Isoleucine [['L-Isoleucine' 'ISOLEUCINE' 91 100.0]
 ['L-Alloisoleucine' 'Isoleucine' 225 100.0]
 ['3-Methyl-2-oxovaleric acid' '2-Oxoisoleucine' 209 80.0]]
Valine [['L-Valine' 'VALINE' 343 100.0]
 ['alpha-Ketoisovaleric acid' 'Ketovaline' 10 75.0]
 ['L-Leucine' 'LEUCINE' 265 61.53846153846154]]
Isobutyrate [['Isobutyric acid' 'Isobutyrate' 466 100.0]
 ['alpha-Hydroxyisobutyric acid' 'Hydroxyisobutyrate' 286
  75.86206896551724]
 ['2-Aminoisobutyric acid' 'a-Aminoisobutyrate' 484 75.86206896551724]]
Ethanol [['Ethanol' 'Ethanol' 52 100.0]
 ['Methanol' 'Methanol' 468 93.33

In [18]:
f = open('hmdb_urine/hmdb_nmr_peak_lists/HMDB0001370_nmroned_1690_31773.txt')# + df['file_name'].values[16], "r")
entry1 = f.read()
f.close()
print(entry1)

Table of Peaks 
No. 	(ppm) 	(Hz) 	Height 
1 	3.76 	2256.7 	0.1688 
2 	3.75 	2250.6 	0.3232 
3 	3.74 	2244.6 	0.1723 
4 	3.74 	2239.8 	0.1610 
5 	3.73 	2233.0 	0.2302 
6 	3.72 	2227.3 	0.1604 
7 	1.94 	1162.6 	0.0405 
8 	1.93 	1156.9 	0.0468 
9 	1.92 	1153.6 	0.0572 
10 	1.92 	1151.4 	0.0597 
11 	1.92 	1147.9 	0.1220 
12 	1.91 	1142.4 	0.1463 
13 	1.90 	1137.2 	0.1818 
14 	1.89 	1134.5 	0.1699 
15 	1.88 	1128.4 	0.1682 
16 	1.88 	1125.1 	0.0874 
17 	1.87 	1121.8 	0.1179 
18 	1.86 	1115.6 	0.0731 
19 	1.85 	1108.1 	0.0442 
20 	1.84 	1104.6 	0.0521 
21 	1.83 	1096.9 	0.0279 
22 	1.82 	1090.1 	0.0204 
23 	1.48 	884.7 	0.0301 
24 	1.46 	876.1 	0.0908 
25 	1.45 	868.4 	0.1181 
26 	1.43 	859.8 	0.0830 
27 	1.42 	851.9 	0.0456 

Table of Multiplets 
No. 	Shift1 (ppm) 	Hs 	Type 	J (Hz) 	Atom1 	Multiplet1 	(ppm) 
1 	1.45	2 	m 	- 	7 	M02 	1.36 .. 1.49 
2 	1.88	4 	m 	- 	8 6 	M01 	1.79 .. 1.96 
3 	3.74	2 	m 	- 	9 4 	M00 	3.70 .. 3.78 
 
Table of Assignments
No. 	Atom Exp. 	Shift (ppm) 	Multiplet 
1

# Guess the spectral frequency, compile the multiples, and recalculate the integrals that should match Hs, but sometimes has errors

In [19]:
import re
import pandas as pd
import math
import io

# Function to strip trailing commas
def strip_trailing_commas(s):
    if isinstance(s, str):
        return s.strip(',')
    return s


def parse_hmdb_data_clean(text, accession):
    """Parses HMDB NMR data and creates a data frame with peak information.

    Args:
        text: A string containing the HMDB NMR data in the specified format.

    Returns:
        A pandas DataFrame with one row per multiplet and columns for peak positions (ppm and Hz), heights, and assigned atoms.
    """
    text = text.lower()
    if( not( "peaks" in text )):
        text = "peaks" + text
        
    text = text.replace("muliplets", "multiplets")
    text = text.replace("mulitplets", "multiplets")
    text = text.replace("mnltiplets", "multiplets")
    text = text.replace("multuplets", "multiplets")
    text = text.replace("mutiplets", "multiplets")

    text = text.replace("praks", "peaks")
    text = text.replace("assignements", "assignments")
    
    text = text.replace("assignment\n", "assignments\n")

    text = text.replace("peaks", "peaks\n")
    text = text.replace("multiplets", "multiplets\n")
    text = text.replace("assignments", "assignments\n")
    text = text.replace("table", "\ntable")
    text = text.replace("atom exp.", "atom\texp.")

    #text = text.replace("\n\n\n", "\n\n")
    #text = text.replace(r"\n[\n]+", "\n\n")
    #print(text)
    try:
        # Extract tables using regular expressions
        table1 = re.search(r"peaks[\n ]+(.*?)\n([\t ]*)\n", text, re.DOTALL).group(1)
        #print(table1)
        table2 = re.search(r"multiplets[\n ]+(.*?)\n([\t ]*)\n", text, re.DOTALL).group(1)
        #print(multiplets_table)
        table3 = re.search(r"assignments[\n ]+(.*?)\n([\t ]*)\n", text, re.DOTALL)
        if table3 is not None:
           table3 = table3.group(1)
        else:
            table3 =  "no.\tatom\texp.shift(ppm)\tmultiplet\n1\t1\t1\1\ts0"
        #print(table2)
        # Guess which table is who
        assignments_table = None
        for table in [table1, table2, table3]:
            if "height" in table :
                peaks_table = table
            else:
                if "atom1" in table or "j (hz)" in table or "hs" in table:
                    multiplets_table = table
                else:
                    assignments_table = table
        if assignments_table == None:
            assignments_table = "no.\tatom\texp.shift(ppm)\tmultiplet\n1\t1\t1\1\ts0"
        
        multiplets_table = multiplets_table.replace("multiplet1 (ppm)", "multiplet1\t(ppm)")
        multiplets_table = re.sub(r"shift1[\s\t]*\(ppm\)", r"shift1(ppm)", multiplets_table)
        
        #print(assignments_table)

        #print(multiplets_table.replace(" ", ","))
        # Convert tables to pandas DataFrames
        peaks_df = pd.read_csv(io.StringIO(peaks_table.replace(" ", ",")), sep=r"\s+", engine="python")#sep="\t+", dtype=str)
        multiplets_df = pd.read_csv(io.StringIO(multiplets_table.replace(" ", ",")), sep=r"\s+", engine="python")
        assignments_df = pd.read_csv(io.StringIO(assignments_table.replace(" ", ",")), sep=r"\s+", engine="python")
        
        # Remove spaces in names
        peaks_df.columns = peaks_df.columns.str.replace(' ', '').str.replace(',', '')
        multiplets_df.columns = multiplets_df.columns.str.replace(' ', '').str.replace(',', '')
        assignments_df.columns = assignments_df.columns.str.replace(' ', '').str.replace(',', '')


        # Apply the function to each string column
        for col in peaks_df.select_dtypes(include=['object']).columns:
            peaks_df[col] = peaks_df[col].apply(strip_trailing_commas)
            
        # Apply the function to each string column
        for col in multiplets_df.select_dtypes(include=['object']).columns:
            multiplets_df[col] = multiplets_df[col].apply(strip_trailing_commas)  
        
        # Apply the function to each string column
        for col in assignments_df.select_dtypes(include=['object']).columns:
            assignments_df[col] = assignments_df[col].apply(strip_trailing_commas)  

        if "multiplet" not in assignments_df:
            assignments_df["multiplet"] = "x"

        peaks_df["(ppm)"] = pd.to_numeric(peaks_df["(ppm)"], errors='coerce')


        # Create an empty list to store the data for the new DataFrame
        data = []
    
        # Iterate over the multiplets
        for _, multiplet in multiplets_df.iterrows():
            if "m" not in multiplet["multiplet1"]:
                multiplet["(ppm)"] = multiplet["multiplet1"]
                multiplet["multiplet1"] = multiplet["atom1"]
                multiplet["atom1"] = ""
            if("atom1" not in multiplet):
                multiplet["atom1"] = ""
            if(isinstance(multiplet["atom1"], int) or isinstance(multiplet["atom1"], float)):
                multiplet["atom1"] = [multiplet["atom1"]]
            else:
                multiplet["atom1"] = multiplet["atom1"].split(",")

            #print(multiplet)
            if not ("j(hz)" in multiplet):
                multiplet["j(hz)"] = "-"
            if(isinstance(multiplet["j(hz)"], int) or isinstance(multiplet["j(hz)"], float)):
                 multiplet["j(hz)"] = [multiplet["j(hz)"]]
            else:
                multiplet["j(hz)"] = multiplet["j(hz)"].replace("-", "").split(",")
    
            ppm_range = multiplet["(ppm)"].replace(",", "").split("..")
            ppm_min = float(ppm_range[0])
            ppm_max = float(ppm_range[1])
            # Filter peaks within the multiplet range
            peaks_in_range = peaks_df[
                (peaks_df["(ppm)"] >= ppm_min) & (peaks_df["(ppm)"] <= ppm_max)
            ]
    
            # Extract peak information
            ppm_values = peaks_in_range["(ppm)"].tolist()
            heights = peaks_in_range["height"].tolist()
            if ("(hz)" in peaks_in_range):
                hz_values = peaks_in_range["(hz)"].tolist()
            else:
                hz_values = np.array(ppm_values) * 601
    
            # Find assigned atoms
            assigned_atoms = assignments_df[
                assignments_df["multiplet"] == multiplet["multiplet1"]
            ]["atom"].tolist()

            if "shift1(ppm)" not in multiplet:
                multiplet["shift1(ppm)"] = (ppm_min + ppm_max) / 2
            # Append data for the current multiplet to the list
            data.append(
                {
                    "accession": accession,
                    "multiplet": multiplet["multiplet1"],
                    "shift1(ppm)": multiplet["shift1(ppm)"],
                    "hs": multiplet["hs"],
                    "j(hz)": multiplet["j(hz)"],
                    "atom1": multiplet["atom1"],
                    "type": multiplet["type"],
                    "from": ppm_min,
                    "to": ppm_max,
                    "ppm": ppm_values,
                    "hz": hz_values,
                    "heights": heights,
                    "assigned atoms": assigned_atoms,
                }
            )
    
        # Create the final DataFrame
        return pd.DataFrame(data)
    except Exception as e:
        #print(text)
        print(e)
        return None;

In [20]:
assignment_entry = parse_hmdb_data_clean(
"""
Table of Peaks 
No. 	(ppm) 	(Hz) 	Height 
1 	3.76 	2256.7 	0.1688 
2 	3.75 	2250.6 	0.3232 
3 	3.74 	2244.6 	0.1723 
4 	3.74 	2239.8 	0.1610 
5 	3.73 	2233.0 	0.2302 
6 	3.72 	2227.3 	0.1604 
7 	1.94 	1162.6 	0.0405 
8 	1.93 	1156.9 	0.0468 
9 	1.92 	1153.6 	0.0572 
10 	1.92 	1151.4 	0.0597 
11 	1.92 	1147.9 	0.1220 
12 	1.91 	1142.4 	0.1463 
13 	1.90 	1137.2 	0.1818 
14 	1.89 	1134.5 	0.1699 
15 	1.88 	1128.4 	0.1682 
16 	1.88 	1125.1 	0.0874 
17 	1.87 	1121.8 	0.1179 
18 	1.86 	1115.6 	0.0731 
19 	1.85 	1108.1 	0.0442 
20 	1.84 	1104.6 	0.0521 
21 	1.83 	1096.9 	0.0279 
22 	1.82 	1090.1 	0.0204 
23 	1.48 	884.7 	0.0301 
24 	1.46 	876.1 	0.0908 
25 	1.45 	868.4 	0.1181 
26 	1.43 	859.8 	0.0830 
27 	1.42 	851.9 	0.0456 

Table of Multiplets 
No. 	Shift1 (ppm) 	Hs 	Type 	J (Hz) 	Atom1 	Multiplet1 	(ppm) 
1 	1.45 2 	m 	- 	7 	M02 	1.36 .. 1.49 
2 	1.88 4 	m 	- 	8 6 	M01 	1.79 .. 1.96 
3 	3.74 2 	m 	- 	9 4 	M00 	3.70 .. 3.78 
 
Table of Assignments
No. 	Atom Exp. 	Shift (ppm) 	Multiplet 
1 	4 	3.74 	M00 
2 	6 	1.88 	M01 
3 	7 	1.45 	M02 
4 	8 	1.88 	M01 
5 	9 	3.74 	M00 

""", "HMDB1")
assignment_entry

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
0,HMDB1,m02,"1.45,2",m,[7],[],-,1.36,1.49,"[1.48, 1.46, 1.45, 1.43, 1.42]","[884.7, 876.1, 868.4, 859.8, 851.9]","[0.0301, 0.0908, 0.1181, 0.0830, 0.0456]",[]
1,HMDB1,m01,"1.88,4",m,"[8, 6]",[],-,1.79,1.96,"[1.94, 1.93, 1.92, 1.92, 1.92, 1.91, 1.9, 1.89...","[1162.6, 1156.9, 1153.6, 1151.4, 1147.9, 1142....","[0.0405, 0.0468, 0.0572, 0.0597, 0.1220, 0.146...",[]
2,HMDB1,m00,"3.74,2",m,"[9, 4]",[],-,3.7,3.78,"[3.76, 3.75, 3.74, 3.74, 3.73, 3.72]","[2256.7, 2250.6, 2244.6, 2239.8, 2233.0, 2227.3]","[0.1688, 0.3232, 0.1723, 0.1610, 0.2302, 0.1604]",[]


In [21]:
assignment_entry = parse_hmdb_data_clean(entry1 + "\n\n", "HMDB1")
assignment_entry.head()

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
0,HMDB1,m02,1.45,2,[],[7],m,1.36,1.49,"[1.48, 1.46, 1.45, 1.43, 1.42]","[884.7, 876.1, 868.4, 859.8, 851.9]","[0.0301, 0.0908, 0.1181, 0.0830, 0.0456]",[]
1,HMDB1,m01,1.88,4,[],"[8, 6]",m,1.79,1.96,"[1.94, 1.93, 1.92, 1.92, 1.92, 1.91, 1.9, 1.89...","[1162.6, 1156.9, 1153.6, 1151.4, 1147.9, 1142....","[0.0405, 0.0468, 0.0572, 0.0597, 0.1220, 0.146...",[]
2,HMDB1,m00,3.74,2,[],"[9, 4]",m,3.7,3.78,"[3.76, 3.75, 3.74, 3.74, 3.73, 3.72]","[2256.7, 2250.6, 2244.6, 2239.8, 2233.0, 2227.3]","[0.1688, 0.3232, 0.1723, 0.1610, 0.2302, 0.1604]",[]


# Index the whole db

In [22]:
nmrdb = None
HMDB_PATH = 'hmdb_urine/hmdb_nmr_peak_lists/'
index = 0
count_errors = 0

for entry in df.iterrows():
    index += 1
    #531, 582
    if(index in [69, 301, 365, 387, 409, 412, 465, 493, 500, 501, 515, 531, 563, 580, 581]):
        continue
    f = open(HMDB_PATH + entry[1]['file_name'], "r")
    content = f.read()
    f.close()
    assignment_entry = parse_hmdb_data_clean(content + "\n\n", entry[1]['accession'])
    if assignment_entry is not None:
        if nmrdb is None:
            nmrdb = assignment_entry
        else:
            nmrdb = pd.concat([nmrdb, assignment_entry])
    else:
        count_errors += 1
        print(index)
        print(HMDB_PATH + entry[1]['file_name'])
        print(content)
        break;

nmrdb['from'] = pd.to_numeric(nmrdb['from'], errors='coerce')
nmrdb['to'] = pd.to_numeric(nmrdb['to'], errors='coerce')
nmrdb['shift1(ppm)'] = pd.to_numeric(nmrdb['shift1(ppm)'], errors='coerce')

nmrdb = nmrdb.sort_values("shift1(ppm)").reset_index(drop=True)

nmrdb.to_csv('inst/spectral1hnmr.csv', index=False)
print(count_errors, index)
nmrdb.shape

'(ppm)'
378
hmdb_urine/hmdb_nmr_peak_lists/HMDB0000965_nmroned_1626_31427.txt
Hypotaurine (HMDB00965)
1H NMR spectrum 600 MHz in H2O
Sample 25 mM at pH 7.0
Referenced to DSS

Table of Peaks

No. (ppm) (Hz) Height
1 2.648 1589.41 0.4276
2 2.659 1596.34 0.9079
3 2.671 1603.27 0.4484
4 3.326 1996.72 0.4964
5 3.338 2003.65 1.0000
6 3.349 2010.58 0.4572

Table of Multiplets

No. Shift1 (ppm) Hs Type J (Hz) Atom1 Multiplet1 (ppm)
1 2.66 2 t 6.93 4 M02 2.63 .. 2.70
2 3.34 2 t 6.93 5 M01 3.30 .. 3.37

Table of Assignments

No. Atom Exp. Shift (ppm) Multiplet
1 4 2.66 M02
2 5 3.34 M01
1 378


(2170, 13)

In [133]:
import bisect

def triangle_overlap_area(x_a, y_a, x_b, y_b, b):
    """
    Compute the overlap area between two isosceles triangles with:
        - same base length b
        - bases on y = 0
        - centers at x_a and x_b on the x-axis
        - heights y_a and y_b
    """
    def tri_height(x, xc, h, b):
        """Return the height of the triangle at position x (piecewise linear)."""
        half = b / 2
        left, right = xc - half, xc + half
        if x < left or x > right:
            return 0.0
        # Linear interpolation
        if x <= xc:
            return h * (x - left) / (xc - left)
        else:
            return h * (right - x) / (right - xc)

    # Compute global horizontal overlap range
    L = max(x_a - b/2, x_b - b/2)
    R = min(x_a + b/2, x_b + b/2)

    if R <= L:
        return 0.0  # No overlap

    # Integrate numerically (analytical solution is possible but long)
    # Fine grid for accuracy
    xs = np.linspace(L, R, 2001)
    h_min = np.minimum(
        [tri_height(x, x_a, y_a, b) for x in xs],
        [tri_height(x, x_b, y_b, b) for x in xs]
    )

    # Numerical integration using trapezoid rule
    area = np.trapz(h_min, xs)
    return area


# Range query function
def range_query_and(query, nmrdb):
    indexer = [(a, b) for a, b in zip(nmrdb['shift1(ppm)'].values, nmrdb.index.values)]
    result_ids = None
    for rng in query:
        range_i = rng['range']
        left_index = bisect.bisect_left(indexer,  (range_i[0], -1))
        right_index = bisect.bisect_right(indexer, (range_i[1], float('inf')))
        res = set(nmrdb.loc[[item[1] for item in indexer[left_index:(right_index + 1)]],['accession']]['accession'].values)
        if result_ids is None:
            result_ids = res
        else:
            result_ids = result_ids & res
    return list(result_ids)

def multiplet_match(test_ppm, test_heights, query_ppm, query_heights):
    #1.  Center both multiplets at center of mass
    cm_test = np.sum(test_ppm * test_heights) / np.sum(test_heights)
    test_ppm -= cm_test
    cm_query = np.sum(query_ppm * query_heights) / np.sum(query_heights)
    query_ppm -= cm_query
    #2. Compare area overlap when both multiplets has area 1.
    score =0
    base = 0.008 # TODO: We could change this. 
    for i in range(test_ppm.shape[0]):
        for j in range(query_ppm.shape[0]):
            score += triangle_overlap_area(test_ppm[i], test_heights[i], query_ppm[j], query_heights[j], base)
    assert (score >=0) & (score <= 1), "Score out of range"
    return score * 2 / base    
    

def multiple_query(query, nmrdb, metabolytes):
    res = range_query_and(query, nmrdb)
    filtered_df = metabolytes.loc[metabolytes['accession'].isin(res), ]
    result = []
    
    # Ensure correct query structure
    for query_i in query:
        if 'ppm' in query_i:
            query_i['ppm'] = np.array(query_i['ppm'])
            query_i['heights'] = np.array(query_i['heights'])
            query_i['heights'] /= np.sum(query_i['heights'])
        
    for row in filtered_df.iterrows():
        signals = nmrdb[nmrdb['accession'] == row[1]["accession"]]
        matches = 0
        for query_i in query:
            range_i = query_i['range']
            mult = query_i['mult']
            from_i = range_i[0]
            to_i = range_i[1]
            mul_i = mult
            for signal in signals.iterrows():
                if signal[1]['shift1(ppm)'] >= from_i and signal[1]['shift1(ppm)'] <= to_i and ((signal[1]['type'] == mul_i) or (mul_i == "*")):
                    matches += 1
                    # Check multiplet alignment
                    if 'ppm' in query_i:
                        matches += multiplet_match(signal[1]['ppm'], signal[1]['heights'], query_i['ppm'], query_i['heights'])
                    break
        result.append((row[1]["accession"], row[1]["name"], len(query) / signals.shape[0] + matches /  len(query)))
                    
    result = pd.DataFrame(result, columns=["accession", "name", "similarity"])
    result = result.sort_values("similarity", ascending=False)
    return result


In [134]:
import pandas as pd

nmrdb = pd.read_csv("inst/spectral1hnmr.csv")
df = pd.read_csv("inst/metabolites.csv")

In [135]:
# Convert ppm from string to np.array
nmrdb['ppm'] = nmrdb['ppm'].apply(lambda x: np.array(ast.literal_eval(x.replace("'",''))))

def h_converter(x):
    x = np.array(ast.literal_eval(x.replace("'",'')))
    return x / np.sum(x) 
    
nmrdb['heights'] = nmrdb['heights'].apply(lambda x: h_converter(x))

In [136]:
res = df[df['name'].isin(["L-Isoleucine", "L-Leucine", 'L-Valine', 'Isobutyric acid', '3-Hydroxybutyric acid', 'L-Lactic acid', 'L-Alanine', 'Acetic acid'])]
res = res['accession']
res

5      HMDB0000011
20     HMDB0000042
81     HMDB0000161
88     HMDB0000172
96     HMDB0000190
253    HMDB0000687
336    HMDB0000883
453    HMDB0001873
Name: accession, dtype: object

In [137]:
nmrlist = nmrdb[nmrdb['accession'].isin(res.values)]
nmrlist = nmrlist[['accession', 'ppm', 'heights', 'hs']]

nmrlist

Unnamed: 0,accession,ppm,heights,hs
128,HMDB0000172,"[0.913, 0.926, 0.938]","[0.22693594031386674, 0.5112297401595061, 0.26...",4
143,HMDB0000687,"[0.936, 0.948, 0.96]","[0.24292368326764602, 0.494199335301894, 0.262...",57
155,HMDB0000883,"[0.969, 0.983]","[0.4918957370052335, 0.5081042629947665]",3
170,HMDB0000172,"[0.991, 1.003]","[0.4922826969943136, 0.5077173030056864]",4
183,HMDB0000883,"[1.022, 1.036]","[0.4978061011282908, 0.5021938988717092]",3
246,HMDB0000011,"[1.199, 1.209]","[0.4984703345202869, 0.5015296654797131]",3
264,HMDB0000172,"[1.21, 1.223, 1.235, 1.238, 1.245, 1.25, 1.257...","[0.03087253818122261, 0.09118453916390287, 0.1...",1
319,HMDB0000190,"[1.31, 1.324]","[0.49952454832090487, 0.5004754516790951]",4
376,HMDB0000172,"[1.423, 1.432, 1.436, 1.444, 1.448, 1.457, 1.4...","[0.036775647293907573, 0.04651162790697675, 0....",1
384,HMDB0000161,"[1.459, 1.474]","[0.49510249419367874, 0.5048975058063213]",5


## Rank matches according to multiplet similarity

In [138]:
query = [ {'range': (3.87, 3.93), 'mult': 'dd', 'ppm': np.array([3.8892, 3.8930, 3.9097, 3.9134]), 'heights': np.array([0.25, 0.25, 0.25, 0.25])},]

result = multiple_query(query, nmrdb, df)
result.head(10) 

Unnamed: 0,accession,name,similarity
7,HMDB0000122,D-Glucose,2.126998
14,HMDB0000191,L-Aspartic acid,1.965755
40,HMDB0000884,Ribothymidine,1.952838
33,HMDB0000660,D-Fructose,1.942185
19,HMDB0000258,Sucrose,1.905719
36,HMDB0000742,Homocysteine,1.898036
21,HMDB0000296,Uridine,1.828641
10,HMDB0000158,L-Tyrosine,1.786886
15,HMDB0000195,Inosine,1.778414
24,HMDB0000423,"3,4-Dihydroxyhydrocinnamic acid",1.730708


In [139]:
query = [{'range': (2.5, 2.6), 'mult': 'd'}, {'range': (2.6, 2.7), 'mult': 'd'}]

result = multiple_query(query, nmrdb, df)
result.head()

Unnamed: 0,accession,name,similarity
1,HMDB0000094,Citric acid,2.0
3,HMDB0000402,2-Isopropylmalic acid,1.4
4,HMDB0000700,Hydroxypropionic acid,1.0
5,HMDB0000736,Isobutyryl-L-carnitine,0.722222
7,HMDB0000812,N-Acetyl-L-aspartic acid,0.4


In [140]:
query = [{'range':(1.25, 1.35), 'mult': 'd'}, {'range': (4.05, 4.15), 'mult': 'q'}]

result = multiple_query(query, nmrdb, df)
result.head(6)    

Unnamed: 0,accession,name,similarity
1,HMDB0000190,L-Lactic acid,2.0
0,HMDB0000030,Biotin,0.7
2,HMDB0000701,Hexanoylglycine,0.4


In [141]:
def count_atoms(formula):
    """
    Counts the number of atoms of each element in a chemical formula.

    Args:
        formula: A string representing the chemical formula.

    Returns:
        A dictionary where keys are element symbols and values are the number of atoms of that element.
    """
    atoms = {}
    for match in re.findall(r"([A-Z][a-z]*)(\d*)", formula):
        element = match[0]
        count = int(match[1]) if match[1] else 1
        atoms[element] = atoms.get(element, 0) + count
    return atoms

# Apply the function to the 'chemical_formula' column
atom_counts = df['chemical_formula'].apply(count_atoms)

# Expand the dictionary of atom counts into separate columns
#df = pd.concat([df, atom_counts.apply(pd.Series)], axis=1)
#print(df)

In [143]:
query = [{'range': (3.87, 3.93), 'mult': '*'},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
3,HMDB0000064,Creatine,1.5
1,HMDB0000043,Betaine,1.5
38,HMDB0000807,3-Phosphoglyceric acid,1.333333
36,HMDB0000742,Homocysteine,1.333333
39,HMDB0000819,Normetanephrine,1.25
4,HMDB0000086,Glycerophosphocholine,1.25


In [144]:
query = [{'range': (3.87, 3.93), 'mult': 'dd'},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
36,HMDB0000742,Homocysteine,1.333333
10,HMDB0000158,L-Tyrosine,1.2
27,HMDB0000479,3-Methylhistidine,1.2
13,HMDB0000181,L-Dopa,1.166667
24,HMDB0000423,"3,4-Dihydroxyhydrocinnamic acid",1.166667
14,HMDB0000191,L-Aspartic acid,1.166667


In [145]:
nmrdb[nmrdb['accession']=='HMDB0000742']

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
726,HMDB0000742,m03,2.14,1,"['14.86', '14.70', '7.47']",[3],m,2.06,2.22,"[2.07, 2.09, 2.1, 2.12, 2.13, 2.14, 2.15, 2.15...","[1037.1, 1045.1, 1051.7, 1058.8, 1066.2, 1068....","[0.018460648148148146, 0.028732638888888884, 0...",[3]
952,HMDB0000742,m02,2.65,2,"['10.73', '8.12', '6.73']",[2],ddd,2.59,2.72,"[2.6, 2.62, 2.63, 2.64, 2.64, 2.65, 2.66, 2.66...","[1300.1, 1307.2, 1313.7, 1320.7, 1321.7, 1324....","[0.029597332649397283, 0.036881251602975125, 0...",[2]
1443,HMDB0000742,m01,3.87,1,"['7.13', '5.62']",[4],dd,3.83,3.9,"[-0.01001467351430696, -1.4673514306728919e-05...","[1928.4, 1934.0, 1935.5, 1941.1]","[0.23097445474554795, 0.2749282998732742, 0.26...",[4]


In [146]:
# Looking for Leucine
query = [{'range': (0.94, 0.99), 'mult': 't', 'ppm': np.array([0.949542,0.96010, 0.970836]), 'heights': np.array([0.25,0.5,0.25])},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
9,HMDB0000452,L-alpha-Aminobutyric acid,2.241946
17,HMDB0000687,L-Leucine,2.154733
16,HMDB0000650,D-alpha-Aminobutyric acid,1.754231
7,HMDB0000339,2-Methylbutyrylglycine,1.563549
13,HMDB0000557,L-Alloisoleucine,0.25
21,HMDB0000883,L-Valine,0.25


In [147]:
# Looking for Threonine
query = [{'range': (4.22, 4.28), 'mult': '*', 'ppm': np.array([4.2351,4.243,4.2461,4.254 ,4.2571,4.265 ,4.2681,4.276]), 'heights': np.array([1,1,3,3,3,3,1,1]) / np.sum([1,1,3,3,3,3,1,1])},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
1,HMDB0000167,L-Threonine,2.232421
3,HMDB0000244,Riboflavin,2.166881
8,HMDB0000565,Galactonic acid,1.909901
13,HMDB0000884,Ribothymidine,1.884521
2,HMDB0000195,Inosine,1.882465
11,HMDB0000748,L-3-Phenyllactic acid,1.876399


In [151]:
# Looking for Tyrosine
query = [{'range': (7.16, 7.6), 'mult': '*', 'ppm': [7.185000,7.189691,7.192445,7.200093,7.203968,7.208965], 'heights': [0.14,1,0.3,0.28,0.92,0.14]},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
3,HMDB0000158,L-Tyrosine,2.297146
5,HMDB0000205,Phenylpyruvic acid,2.178898
2,HMDB0000152,Gentisic acid,2.119148
19,HMDB0000484,Vanillic acid,2.116746
41,HMDB0000840,Salicyluric acid,2.011233
4,HMDB0000197,Indoleacetic acid,1.974162


In [159]:
# Looking for Phenylalanine
query = [{'range': (7.2, 7.5), 'mult': '*', 'ppm': [7.414916,7.418995,7.421035,7.429193,7.431233,7.434292,7.440411,7.442960,7.4450005], 'heights': [0.16,0.8,0.2,0.53,1,0.2,0.13,0.33,0.2]},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
30,HMDB0000684,L-Kynurenine,2.466096
3,HMDB0000132,Guanine,2.396479
8,HMDB0000228,Phenol,2.286237
33,HMDB0000715,Kynurenic acid,2.23897
32,HMDB0000714,Hippuric acid,2.238588
6,HMDB0000205,Phenylpyruvic acid,2.234585


In [163]:
# Looking for Lactate
query = [{'range': (4, 4.2), 'mult': 'q', 'ppm': [4.095,4.1065,4.118,4.1295], 'heights': [0.33,1,1,0.33]},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
21,HMDB0000190,L-Lactic acid,2.205696
12,HMDB0000125,Glutathione,1.832804
20,HMDB0000174,L-Fucose,1.74853
10,HMDB0000101,Deoxyadenosine,1.740319
37,HMDB0000562,Creatinine,0.5
60,HMDB0000863,Isopropyl alcohol,0.5


In [164]:
nmrdb[nmrdb['accession'] == 'HMDB0000190']

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
319,HMDB0000190,m01,1.32,4,['6.96'],['3'],d,1.3,1.33,"[1.31, 1.324]","['654.7', '661.7']","[0.49952454832090487, 0.5004754516790951]",['3']
1579,HMDB0000190,m02,4.1,2,"['6.93', '6.93']",['2'],q,4.08,4.13,"[-0.020341658481829845, -0.006341658481829611,...","['2041.1', '2048.1', '2055.0', '2061.9']","[0.12670127683457275, 0.378981338571629, 0.373...",['2']


In [177]:
!pwd

/Users/andres/Documents/git/github/phenocare/hmdb_utils


In [175]:
# Looking for Lactate
query = [{'range': (4, 4.2), 'mult': 'q', 'ppm': [4.095,4.1065,4.118,4.1295], 'heights': [0.33,1,1,0.33]},
         {'range': (1.25, 1.36), 'mult': 'd', 'ppm': [1.31, 1.324]	, 'heights': [0.49952454832090487, 0.5004754516790951]	}]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
4,HMDB0000190,L-Lactic acid,2.852848
3,HMDB0000174,L-Fucose,1.822182
0,HMDB0000030,Biotin,0.824622
5,HMDB0000424,2-Hydroxydecanedioic acid,0.4
6,HMDB0000701,Hexanoylglycine,0.4
8,HMDB0000903,Tetrahydrocortisone,0.166667


In [176]:
nmrdb[nmrdb['accession'] == 'HMDB0000174']

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
249,HMDB0000174,m09,1.21,8,['6.65'],['22'],d,1.2,1.22,"[1.22, 1.21]","['729.2', '722.5']","[0.513590033975085, 0.486409966024915]",['22']
265,HMDB0000174,m08,1.25,18,['6.48'],['11'],d,1.24,1.27,"[0.004998405103668267, -0.005001594896331742]","['753.2', '746.7']","[0.5001594896331738, 0.4998405103668262]",['11']
1173,HMDB0000174,m07,3.45,9,"['9.89', '7.97']",['4'],dd,3.42,3.48,"[3.47, 3.45, 3.45, 3.44]","['2078.0', '2070.1', '2068.2', '2060.1']","[0.2532330827067669, 0.2706766917293233, 0.265...",['4']
1260,HMDB0000174,m06,3.64,8,"['9.92', '3.49']",['3'],dd,3.62,3.67,"[3.66, 3.65, 3.64, 3.63]","['2191.4', '2187.9', '2181.5', '2178.0']","[0.23790849673202616, 0.2173202614379085, 0.28...",['3']
1379,HMDB0000174,m05,3.79,24,[''],"['5', '14', '16', '6']",m,3.73,3.83,"[3.82, 3.81, 3.8, 3.79, 3.78, 3.78, 3.77, 3.76...","['2292.6', '2286.1', '2279.6', '2273.2', '2267...","[0.046151282905698095, 0.19976674441852715, 0....","['5', '14', '16', '6']"
1440,HMDB0000174,m04,3.86,4,"['10.30', '3.38']",['15'],dd,3.84,3.89,"[3.87, 3.87, 3.86, 3.85]","['2322.5', '2319.1', '2312.2', '2308.8']","[0.21412964311726146, 0.2199563000728332, 0.24...",['15']
1621,HMDB0000174,m03,4.2,3,['6.61'],['17'],q,4.17,4.23,"[0.016282352941176234, 0.006282352941176444, -...","['2527.9', '2521.3', '2514.7', '2508.1']","[0.11882352941176472, 0.3776470588235294, 0.38...",['17']
1716,HMDB0000174,m02,4.56,6,['7.91'],['2'],d,4.53,4.58,"[4.57, 4.55]","['2737.5', '2729.5']","[0.49450286806883365, 0.5054971319311663]",['2']
1754,HMDB0000174,m01,5.21,3,['3.90'],['13'],d,5.2,5.22,"[5.21, 5.21]","['3124.0', '3120.1']","[0.4947751022262608, 0.5052248977737392]",['13']
