# HMDB database importer

In [2]:
from rapidfuzz import process, fuzz

import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import ast
import re

import sys
sys.path.append("src")

from hmdb_local_tools import  multiple_query, approximate_lookup

In [3]:
#tree = ET.parse('hmdb_urine/urine_metabolites.xml')
dbname = "serum"
tree = ET.parse(f'hmdb/{dbname}/{dbname}_metabolites.xml')
root = tree.getroot()

In [4]:
def print_element(element, level=0):
    indent = "  " * level
    print(f"{indent}Tag: {element.tag}, Attributes: {element.attrib}")
    
    if element.text and element.text.strip():
        print(f"{indent}  Text: {element.text.strip()}")
    
    for child in element:
        print_element(child, level + 1)

first = True
for child in root:
    if first:
        #print_element(child)
        first = False

In [5]:
import pandas as pd
# List to hold metabolite data
# Function to extract text from an XML element, if it exists
def get_text(element, tag, namespace):
    child = element.find(f'ns:{tag}', namespace)
    return child.text if child is not None else None

# Function to extract a list of texts from XML elements, if they exist
def get_text_list(element, tag, namespace):
    children = element.findall(f'ns:{tag}', namespace)
    return [child.text for child in children] if children is not None else None

data = []
namespace = {'ns': 'http://www.hmdb.ca'}

# Iterate through each metabolite
for metabolite in root.findall('ns:metabolite', namespace):
    # Extract required elements
    accession = get_text(metabolite, 'accession', namespace)
    secondary_accessions = get_text_list(metabolite.find('ns:secondary_accessions', namespace), 'accession', namespace)
    name = get_text(metabolite, 'name', namespace)
    synonyms = get_text_list(metabolite.find('ns:synonyms', namespace), 'synonym', namespace)
    chemical_formula = get_text(metabolite, 'chemical_formula', namespace)
    iupac_name = get_text(metabolite, 'iupac_name', namespace)
    cas_registry_number = get_text(metabolite, 'cas_registry_number', namespace)
    smiles = get_text(metabolite, 'smiles', namespace)
    inchi = get_text(metabolite, 'inchi', namespace)
    inchikey = get_text(metabolite, 'inchikey', namespace)
    description = get_text(metabolite, 'description', namespace)
    monisotopic_molecular_weight = get_text(metabolite, 'monisotopic_molecular_weight', namespace)
    average_molecular_weight = get_text(metabolite, 'average_molecular_weight', namespace)
    
    # Append the metabolite data to the list
    data.append([accession, secondary_accessions, name, synonyms, chemical_formula, iupac_name, cas_registry_number, smiles, inchi, inchikey, monisotopic_molecular_weight, average_molecular_weight, description])

# Create a DataFrame
columns = ['accession', 'secondary_accessions', 'name', 'synonyms', 'chemical_formula', 'iupac_name', 'cas_registry_number', 'smiles', 'inchi', 'inchikey', 'monisotopic_molecular_weight', 'average_molecular_weight', 'description']
df = pd.DataFrame(data, columns=columns)

# Display the DataFrame
df.head()

# Save the DataFrame to a CSV file if needed
#df.to_csv('metabolites.csv', index=False)
#df.drop('description').to_csv('metabolites_no_description.csv', index=False)

Unnamed: 0,accession,secondary_accessions,name,synonyms,chemical_formula,iupac_name,cas_registry_number,smiles,inchi,inchikey,monisotopic_molecular_weight,average_molecular_weight,description
0,HMDB0000001,"[HMDB00001, HMDB0004935, HMDB0006703, HMDB0006...",1-Methylhistidine,[(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)pro...,C7H11N3O2,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,332-80-9,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,BRMWTNUJHUMWMS-LURJTMIESA-N,169.085126611,169.1811,"1-Methylhistidine, also known as 1-MHis or 1MH..."
1,HMDB0000002,"[HMDB00002, HMDB0060172, HMDB60172]","1,3-Diaminopropane","[1,3-Propanediamine, 1,3-Propylenediamine, Pro...",C3H10N2,"propane-1,3-diamine",109-76-2,NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,XFNJVJPLKCPIBV-UHFFFAOYSA-N,74.08439833,74.1249,"1,3-Diaminopropane, also known as DAP or trime..."
2,HMDB0000005,"[HMDB00005, HMDB0006544, HMDB06544]",2-Ketobutyric acid,"[2-Ketobutanoic acid, 2-Oxobutyric acid, 3-Met...",C4H6O3,2-oxobutanoic acid,600-18-0,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",TYEYBOSBBBHJIV-UHFFFAOYSA-N,102.031694058,102.0886,"2-Ketobutyric acid, also known as alpha-ketobu..."
3,HMDB0000008,[HMDB00008],2-Hydroxybutyric acid,"[(S)-2-Hydroxybutanoic acid, 2-Hydroxybutyrate...",C4H8O3,(2S)-2-hydroxybutanoic acid,3347-90-8,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",AFENDNXGAFYKQO-VKHMYHEASA-N,104.047344118,104.105,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn..."
4,HMDB0000010,"[HMDB00010, HMDB0004990, HMDB0004991, HMDB0499...",2-Methoxyestrone,"[2-(8S,9S,13S,14S)-3-Hydroxy-2-methoxy-13-meth...",C19H24O3,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",362-08-3,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,WHEUWNKSCXYKBU-QPWUGHHJSA-N,300.172544634,300.3921,2-Methoxyestrone (or 2-ME1) belongs to the cla...


In [7]:
import os

def is_1h_file(path):
    if os.path.isfile(path):
        with open(path) as f:
            first_line = f.readline()
            return "_13C_" not in first_line and "ADDRESS" not in first_line
    return False

def list_files(folder_path):
    try:
        # Get all entries in the folder
        entries = os.listdir(folder_path)
        # Filter out only files
        files = [f for f in entries if is_1h_file(os.path.join(folder_path, f))]
        return files
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
        return []
    except PermissionError:
        print(f"Permission denied: {folder_path}")
        return []
    except UnicodeDecodeError: 
        print(f"Other error: {folder_path}")
        return []

spectra_files = list_files('hmdb/nmr_peak_lists')
print(len(spectra_files))

1786


In [8]:
def create_dataframe(file_list):
    # List to hold the data for each file
    data = []

    for file_name in file_list:
        # Split the file name by "_" and take the first part as accession
        tokens  = file_name.split('_')
        # Append the data as a tuple (accession, file_name)
        data.append((tokens[0], file_name, tokens[1]))

    # Create the DataFrame
    df = pd.DataFrame(data, columns=['accession', 'file_name', 'dim'])

    return df

df_files = create_dataframe(spectra_files)

#Filter out only 1d files
df_files = df_files[df_files['dim'] == 'nmroned']
print(df_files.shape)
df_files.head()

(892, 3)


Unnamed: 0,accession,file_name,dim
4,HMDB0003072,HMDB0003072_nmroned_5251_2734492.txt,nmroned
7,HMDB0000176,HMDB0000176_nmroned_1144_28428.txt,nmroned
9,HMDB0002873,HMDB0002873_nmroned_1910_33109.txt,nmroned
11,HMDB0000622,HMDB0000622_nmroned_1440_29992.txt,nmroned
12,HMDB0000548,HMDB0000548_nmroned_1415_29734.txt,nmroned


In [9]:
# Save the DataFrame to a CSV file if needed
df.to_csv(f'inst/{dbname}_metabolites.csv', index=False)
df.drop(columns=['description']).to_csv(f'inst/{dbname}_metabolites_no_description.csv', index=False)

In [10]:
df = df.join(df_files.set_index('accession'), on='accession', how='left')
df = df.dropna(subset=['file_name'])
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,accession,secondary_accessions,name,synonyms,chemical_formula,iupac_name,cas_registry_number,smiles,inchi,inchikey,monisotopic_molecular_weight,average_molecular_weight,description,file_name,dim
0,HMDB0000001,"[HMDB00001, HMDB0004935, HMDB0006703, HMDB0006...",1-Methylhistidine,[(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)pro...,C7H11N3O2,(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)prop...,332-80-9,CN1C=NC(C[C@H](N)C(O)=O)=C1,InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11...,BRMWTNUJHUMWMS-LURJTMIESA-N,169.085126611,169.1811,"1-Methylhistidine, also known as 1-MHis or 1MH...",HMDB0000001_nmroned_1022_27891.txt,nmroned
1,HMDB0000002,"[HMDB00002, HMDB0060172, HMDB60172]","1,3-Diaminopropane","[1,3-Propanediamine, 1,3-Propylenediamine, Pro...",C3H10N2,"propane-1,3-diamine",109-76-2,NCCCN,InChI=1S/C3H10N2/c4-2-1-3-5/h1-5H2,XFNJVJPLKCPIBV-UHFFFAOYSA-N,74.08439833,74.1249,"1,3-Diaminopropane, also known as DAP or trime...",HMDB0000002_nmroned_1023_27894.txt,nmroned
2,HMDB0000005,"[HMDB00005, HMDB0006544, HMDB06544]",2-Ketobutyric acid,"[2-Ketobutanoic acid, 2-Oxobutyric acid, 3-Met...",C4H6O3,2-oxobutanoic acid,600-18-0,CCC(=O)C(O)=O,"InChI=1S/C4H6O3/c1-2-3(5)4(6)7/h2H2,1H3,(H,6,7)",TYEYBOSBBBHJIV-UHFFFAOYSA-N,102.031694058,102.0886,"2-Ketobutyric acid, also known as alpha-ketobu...",HMDB0000005_nmroned_1024_27899.txt,nmroned
3,HMDB0000008,[HMDB00008],2-Hydroxybutyric acid,"[(S)-2-Hydroxybutanoic acid, 2-Hydroxybutyrate...",C4H8O3,(2S)-2-hydroxybutanoic acid,3347-90-8,CC[C@H](O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(...",AFENDNXGAFYKQO-VKHMYHEASA-N,104.047344118,104.105,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn...",HMDB0000008_nmroned_5245_2734397.txt,nmroned
4,HMDB0000010,"[HMDB00010, HMDB0004990, HMDB0004991, HMDB0499...",2-Methoxyestrone,"[2-(8S,9S,13S,14S)-3-Hydroxy-2-methoxy-13-meth...",C19H24O3,"(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-methyl...",362-08-3,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,InChI=1S/C19H24O3/c1-19-8-7-12-13(15(19)5-6-18...,WHEUWNKSCXYKBU-QPWUGHHJSA-N,300.172544634,300.3921,2-Methoxyestrone (or 2-ME1) belongs to the cla...,HMDB0000010_nmroned_1026_27907.txt,nmroned


In [11]:
# Save the DataFrame to a CSV file if needed
df.to_csv(f'inst/{dbname}_metabolites_with_spectra.csv', index=False)
df.drop(columns=['description']).to_csv(f'inst/{dbname}_metabolites_with_spectra_no_description.csv', index=False)

## Prepare the dataset for approximate lookup using the metabolyte names and synomyms

In [13]:
df['synonyms_cat'] = df['name'] + " " + df['synonyms'].str.join(' ')
df['synonyms_cat'] = df['synonyms_cat'].str.lower()

In [14]:
matches = approximate_lookup(df, 'synonyms_cat', 'citric acid', fuzz.partial_token_ratio, fuzz.ratio, limit=3)
print(matches)

[['Citric acid' 'Citric acid' 46 100.0]
 ['trans-Aconitic acid' 'Citridic acid' 376 91.66666666666666]
 ['cis-Aconitic acid' 'Citridic acid' 37 91.66666666666666]]


In [15]:
matches = approximate_lookup(df, 'synonyms_cat', 'Ibupro', fuzz.partial_token_ratio, fuzz.ratio, limit=3)
print(matches)

[['Ibuprofen' 'Ibuprofen' 491 80.0]]


In [16]:
df[df['name'] == 'Ibuprofen']

Unnamed: 0,accession,secondary_accessions,name,synonyms,chemical_formula,iupac_name,cas_registry_number,smiles,inchi,inchikey,monisotopic_molecular_weight,average_molecular_weight,description,file_name,dim,synonyms_cat
491,HMDB0001925,"[HMDB0001872, HMDB01872, HMDB01925]",Ibuprofen,"[(+-)-2-(p-Isobutylphenyl)propionic acid, (+-)...",C13H18O2,2-[4-(2-methylpropyl)phenyl]propanoic acid,15687-27-1,CC(C)CC1=CC=C(C=C1)C(C)C(O)=O,InChI=1S/C13H18O2/c1-9(2)8-11-4-6-12(7-5-11)10...,HEFNNWSXXWATRW-UHFFFAOYSA-N,206.13067982,206.2808,Ibuprofen is a nonsteroidal anti-inflammatory ...,HMDB0001925_nmroned_1804_32666.txt,nmroned,ibuprofen (+-)-2-(p-isobutylphenyl)propionic a...


## Loook for p-cresol

In [17]:
#matches = approximate_lookup(df, 'name', "3-hydroxybutirate")
matches = approximate_lookup(df, 'synonyms_cat', "ethylendiaminotetraacetic acid", fuzz.partial_token_ratio, fuzz.ratio, limit=3)
print(matches)

[['Sarcosine' 'Methylaminoacetic acid' 147 80.76923076923077]
 ['Acetylglycine' 'Acetylaminoacetic acid' 218 76.92307692307692]
 ['Methylimidazoleacetic acid' 'Methylimidazoleacetic acid' 554 75.0]]


## Look for all metabolytes into the dataset

In [None]:
entries = ["2-Aminobutyric acid","2-Hydroxybutyric acid","2-Oxoglutaric acid","3-Hydroxybutyric acid","Acetic acid","Acetoacetic acid","Acetone","Alanine","Asparagine","Ca-EDTA","Choline","Citric acid","Creatine","Creatinine","D-Galactose","Dimethylsulfone","Ethanol","Formic acid","Glucose","Glutamic acid","Glutamine","Glycerol","Glycine","Histidine","Isoleucine","K-EDTA","Lactic acid","Leucine","Lysine","Methionine","N,N-Dimethylglycine","Ornithine","Phenylalanine","Proline","Pyruvic acid","Sarcosine","Succinic acid","Threonine","Trimethylamine-N-oxide","Tyrosine","Valine"]

for entry in entries:
    matches = approximate_lookup(df, 'synonyms_cat', entry, fuzz.partial_token_ratio, fuzz.ratio, limit=3)
    print(entry, matches)

In [None]:
entries = ["Isoleucine","Leucine","Valine","Isoleucine","Valine","Isobutyrate","Ethanol","3-hydroxybutyrate","Lactate","Alanine","Acetate","3-hydroxybutyrate","Pyruvate","3-hydroxybutyrate","Succinate","Citrate","Citrate","Creatine","beta-Glucose","Ethanol","Alanine","Lactate","alpha-Glucose","Tyrosine","1-methylhistidine","Tyrosine","1-methylhistidine","Formate"]
for entry in entries:
    matches = approximate_lookup(df, 'synonyms_cat', entry, fuzz.partial_token_ratio, fuzz.ratio, limit=3)
    print(entry, matches)

In [19]:
f = open('hmdb/nmr_peak_lists/HMDB0001925_nmroned_1804_32666.txt')# + df['file_name'].values[16], "r")
entry1 = f.read()
f.close()
print(entry1)

Table of Peaks
No.	(ppm)	(Hz)	Height
1	0.89	446.4	0.9597
2	0.91	453.0	1.0000
3	1.50	750.1	0.4451
4	1.52	757.3	0.4708
5	1.81	903.3	0.0068
6	1.82	910.3	0.0226
7	1.83	917.0	0.0463
8	1.85	923.7	0.0572
9	1.86	930.4	0.0451
10	1.88	937.3	0.0236
11	1.89	944.0	0.0071
12	2.44	1221.1	0.2454
13	2.46	1228.3	0.2314
14	3.70	1849.6	0.0197
15	3.71	1856.8	0.0681
16	3.73	1863.9	0.0724
17	3.74	1871.1	0.0227
18	7.10	3548.0	0.1472
19	7.11	3556.1	0.2058
20	7.22	3608.0	0.2120
21	7.23	3616.0	0.1681

Table of Multiplets
No.	Shift1 (ppm)	Hs	Type	J (Hz)	Atom1	Multiplet1	 (ppm)
1	0.90	6	d	6.61	9 12	M07	0.86 .. 0.94
2	1.51	3	d	7.16	11	M06	1.48 .. 1.54
3	1.85	1	m	-	8	M05	1.79 .. 1.91
4	2.45	2	d	7.18	7	M04	2.42 .. 2.48
5	3.72	1	q	7.17	10	M03	3.67 .. 3.77
6	7.11	2	d	8.10	2 6	M02	7.08 .. 7.13
7	7.23	2	d	8.03	3 5	M01	7.20 .. 7.25

Table of Assignments
No.	Atom	Exp. Shift (ppm)	Multiplet
1	9	0.90	M07
2	12	0.90	M07
3	11	1.51	M06
4	8	1.85	M05
5	7	2.45	M04
6	10	3.72	M03
7	2	7.11	M02
8	6	7.11	M02
9	3	7.23	M01
10	5	7.23	M01



# Guess the spectral frequency, compile the multiples, and recalculate the integrals that should match Hs, but sometimes has errors

In [20]:
import re
import pandas as pd
import math
import io

# Function to strip trailing commas
def strip_trailing_commas(s):
    if isinstance(s, str):
        return s.strip(',')
    return s


def parse_hmdb_data_clean(text, accession):
    """Parses HMDB NMR data and creates a data frame with peak information.

    Args:
        text: A string containing the HMDB NMR data in the specified format.

    Returns:
        A pandas DataFrame with one row per multiplet and columns for peak positions (ppm and Hz), heights, and assigned atoms.
    """
    text = text.lower()
    if( not( "peaks" in text )):
        text = "peaks" + text
        
    text = text.replace("muliplets", "multiplets")
    text = text.replace("mulitplets", "multiplets")
    text = text.replace("mnltiplets", "multiplets")
    text = text.replace("multuplets", "multiplets")
    text = text.replace("mutiplets", "multiplets")

    text = text.replace("praks", "peaks")
    text = text.replace("assignements", "assignments")
    
    text = text.replace("assignment\n", "assignments\n")

    text = text.replace("peaks", "peaks\n")
    text = text.replace("multiplets", "multiplets\n")
    text = text.replace("assignments", "assignments\n")
    text = text.replace("table", "\ntable")
    text = text.replace("atom exp.", "atom\texp.")

    #text = text.replace("\n\n\n", "\n\n")
    #text = text.replace(r"\n[\n]+", "\n\n")
    #print(text)
    try:
        # Extract tables using regular expressions
        table1 = re.search(r"peaks[\n ]+(.*?)\n([\t ]*)\n", text, re.DOTALL).group(1)
        #print(table1)
        table2 = re.search(r"multiplets[\n ]+(.*?)\n([\t ]*)\n", text, re.DOTALL).group(1)
        #print(multiplets_table)
        table3 = re.search(r"assignments[\n ]+(.*?)\n([\t ]*)\n", text, re.DOTALL)
        if table3 is not None:
           table3 = table3.group(1)
        else:
            table3 =  "no.\tatom\texp.shift(ppm)\tmultiplet\n1\t1\t1\1\ts0"
        #print(table2)
        # Guess which table is who
        assignments_table = None
        for table in [table1, table2, table3]:
            if "height" in table :
                peaks_table = table
            else:
                if "atom1" in table or "j (hz)" in table or "hs" in table:
                    multiplets_table = table
                else:
                    assignments_table = table
        if assignments_table == None:
            assignments_table = "no.\tatom\texp.shift(ppm)\tmultiplet\n1\t1\t1\1\ts0"
        
        multiplets_table = multiplets_table.replace("multiplet1 (ppm)", "multiplet1\t(ppm)")
        multiplets_table = re.sub(r"shift1[\s\t]*\(ppm\)", r"shift1(ppm)", multiplets_table)
        
        #print(assignments_table)

        #print(multiplets_table.replace(" ", ","))
        # Convert tables to pandas DataFrames
        peaks_df = pd.read_csv(io.StringIO(peaks_table.replace(" ", ",")), sep=r"\s+", engine="python")#sep="\t+", dtype=str)
        multiplets_df = pd.read_csv(io.StringIO(multiplets_table.replace(" ", ",")), sep=r"\s+", engine="python")
        assignments_df = pd.read_csv(io.StringIO(assignments_table.replace(" ", ",")), sep=r"\s+", engine="python")
        
        # Remove spaces in names
        peaks_df.columns = peaks_df.columns.str.replace(' ', '').str.replace(',', '')
        multiplets_df.columns = multiplets_df.columns.str.replace(' ', '').str.replace(',', '')
        assignments_df.columns = assignments_df.columns.str.replace(' ', '').str.replace(',', '')


        # Apply the function to each string column
        for col in peaks_df.select_dtypes(include=['object']).columns:
            peaks_df[col] = peaks_df[col].apply(strip_trailing_commas)
            
        # Apply the function to each string column
        for col in multiplets_df.select_dtypes(include=['object']).columns:
            multiplets_df[col] = multiplets_df[col].apply(strip_trailing_commas)  
        
        # Apply the function to each string column
        for col in assignments_df.select_dtypes(include=['object']).columns:
            assignments_df[col] = assignments_df[col].apply(strip_trailing_commas)  

        if "multiplet" not in assignments_df:
            assignments_df["multiplet"] = "x"

        peaks_df["(ppm)"] = pd.to_numeric(peaks_df["(ppm)"], errors='coerce')


        # Create an empty list to store the data for the new DataFrame
        data = []
    
        # Iterate over the multiplets
        for _, multiplet in multiplets_df.iterrows():
            if "m" not in multiplet["multiplet1"]:
                multiplet["(ppm)"] = multiplet["multiplet1"]
                multiplet["multiplet1"] = multiplet["atom1"]
                multiplet["atom1"] = ""
            if("atom1" not in multiplet):
                multiplet["atom1"] = ""
            if(isinstance(multiplet["atom1"], int) or isinstance(multiplet["atom1"], float)):
                multiplet["atom1"] = [multiplet["atom1"]]
            else:
                multiplet["atom1"] = multiplet["atom1"].split(",")

            #print(multiplet)
            if not ("j(hz)" in multiplet):
                multiplet["j(hz)"] = "-"
            if(isinstance(multiplet["j(hz)"], int) or isinstance(multiplet["j(hz)"], float)):
                 multiplet["j(hz)"] = [multiplet["j(hz)"]]
            else:
                multiplet["j(hz)"] = multiplet["j(hz)"].replace("-", "").split(",")
    
            ppm_range = multiplet["(ppm)"].replace(",", "").split("..")
            ppm_min = float(ppm_range[0])
            ppm_max = float(ppm_range[1])
            # Filter peaks within the multiplet range
            peaks_in_range = peaks_df[
                (peaks_df["(ppm)"] >= ppm_min) & (peaks_df["(ppm)"] <= ppm_max)
            ]
    
            # Extract peak information
            ppm_values = peaks_in_range["(ppm)"].tolist()
            heights = peaks_in_range["height"].tolist()
            if ("(hz)" in peaks_in_range):
                hz_values = peaks_in_range["(hz)"].tolist()
            else:
                hz_values = np.array(ppm_values) * 601
    
            # Find assigned atoms
            assigned_atoms = assignments_df[
                assignments_df["multiplet"] == multiplet["multiplet1"]
            ]["atom"].tolist()

            if "shift1(ppm)" not in multiplet:
                multiplet["shift1(ppm)"] = (ppm_min + ppm_max) / 2
            # Append data for the current multiplet to the list
            data.append(
                {
                    "accession": accession,
                    "multiplet": multiplet["multiplet1"],
                    "shift1(ppm)": multiplet["shift1(ppm)"],
                    "hs": multiplet["hs"],
                    "j(hz)": multiplet["j(hz)"],
                    "atom1": multiplet["atom1"],
                    "type": multiplet["type"],
                    "from": ppm_min,
                    "to": ppm_max,
                    "ppm": ppm_values,
                    "hz": hz_values,
                    "heights": heights,
                    "assigned atoms": assigned_atoms,
                }
            )
    
        # Create the final DataFrame
        return pd.DataFrame(data)
    except Exception as e:
        #print(text)
        print(e)
        return None;

In [21]:
assignment_entry = parse_hmdb_data_clean(entry1 + "\n\n", "HMDB1")
assignment_entry.head()

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
0,HMDB1,m07,0.9,6,[6.61],"[9, 12]",d,0.86,0.94,"[0.89, 0.91]","[446.4, 453.0]","[0.9597, 1.0]","[9, 12]"
1,HMDB1,m06,1.51,3,[7.16],[11],d,1.48,1.54,"[1.5, 1.52]","[750.1, 757.3]","[0.4451, 0.4708]",[11]
2,HMDB1,m05,1.85,1,[],[8],m,1.79,1.91,"[1.81, 1.82, 1.83, 1.85, 1.86, 1.88, 1.89]","[903.3, 910.3, 917.0, 923.7, 930.4, 937.3, 944.0]","[0.0068, 0.0226, 0.0463, 0.0572, 0.0451, 0.023...",[8]
3,HMDB1,m04,2.45,2,[7.18],[7],d,2.42,2.48,"[2.44, 2.46]","[1221.1, 1228.3]","[0.2454, 0.2314]",[7]
4,HMDB1,m03,3.72,1,[7.17],[10],q,3.67,3.77,"[3.7, 3.71, 3.73, 3.74]","[1849.6, 1856.8, 1863.9, 1871.1]","[0.0197, 0.0681, 0.0724, 0.0227]",[10]


# Index the whole db

In [22]:
print(df[df['accession'] == 'HMDB0001925'])

       accession                 secondary_accessions       name  \
491  HMDB0001925  [HMDB0001872, HMDB01872, HMDB01925]  Ibuprofen   

                                              synonyms chemical_formula  \
491  [(+-)-2-(p-Isobutylphenyl)propionic acid, (+-)...         C13H18O2   

                                     iupac_name cas_registry_number  \
491  2-[4-(2-methylpropyl)phenyl]propanoic acid          15687-27-1   

                            smiles  \
491  CC(C)CC1=CC=C(C=C1)C(C)C(O)=O   

                                                 inchi  \
491  InChI=1S/C13H18O2/c1-9(2)8-11-4-6-12(7-5-11)10...   

                        inchikey monisotopic_molecular_weight  \
491  HEFNNWSXXWATRW-UHFFFAOYSA-N                 206.13067982   

    average_molecular_weight  \
491                 206.2808   

                                           description  \
491  Ibuprofen is a nonsteroidal anti-inflammatory ...   

                              file_name      dim  \
491  HMDB0

In [23]:
nmrdb = None
HMDB_PATH = 'hmdb/nmr_peak_lists/'
index = 0
count_errors = 0

for entry in df.iterrows():
    index += 1
    #531, 582
    if(index in [69, 301, 365, 387, 409, 412, 465, 493, 500, 501, 515, 531, 563, 580, 581]):
        continue

    f = open(HMDB_PATH + entry[1]['file_name'], "r")
    content = f.read()
    f.close()

    assignment_entry = parse_hmdb_data_clean(content + "\n\n", entry[1]['accession'])
        
    if assignment_entry is not None:
        if nmrdb is None:
            nmrdb = assignment_entry
        else:
            nmrdb = pd.concat([nmrdb, assignment_entry])
    else:
        count_errors += 1
        print("ERROR")
        print(index)
        print(HMDB_PATH + entry[1]['file_name'])
        #print(content)
        #break;

nmrdb['from'] = pd.to_numeric(nmrdb['from'], errors='coerce')
nmrdb['to'] = pd.to_numeric(nmrdb['to'], errors='coerce')
nmrdb['shift1(ppm)'] = pd.to_numeric(nmrdb['shift1(ppm)'], errors='coerce')

nmrdb = nmrdb.sort_values("shift1(ppm)").reset_index(drop=True)

nmrdb.to_csv('inst/spectral1hnmr.csv', index=False)
print(count_errors, index)
nmrdb.shape

'(ppm)'
ERROR
378
hmdb/nmr_peak_lists/HMDB0000965_nmroned_1626_31427.txt
'NoneType' object has no attribute 'group'
ERROR
400
hmdb/nmr_peak_lists/HMDB0001262_nmroned_1673_31694.txt
'NoneType' object has no attribute 'group'
ERROR
413
hmdb/nmr_peak_lists/HMDB0001372_nmroned_1691_31776.txt
'NoneType' object has no attribute 'group'
ERROR
425
hmdb/nmr_peak_lists/HMDB0001432_nmroned_1706_31842.txt
'NoneType' object has no attribute 'group'
ERROR
428
hmdb/nmr_peak_lists/HMDB0001460_nmroned_1711_31864.txt
'NoneType' object has no attribute 'group'
ERROR
478
hmdb/nmr_peak_lists/HMDB0001891_nmroned_1784_32227.txt
'NoneType' object has no attribute 'group'
ERROR
526
hmdb/nmr_peak_lists/HMDB0002097_nmroned_1852_32866.txt
'NoneType' object has no attribute 'group'
ERROR
527
hmdb/nmr_peak_lists/HMDB0002107_nmroned_1854_32873.txt
'NoneType' object has no attribute 'group'
ERROR
542
hmdb/nmr_peak_lists/HMDB0002390_nmroned_1885_32999.txt
cannot access local variable 'peaks_table' where it is not asso

(3608, 13)

## Read imported dataset

In [24]:
import pandas as pd

nmrdb = pd.read_csv("inst/spectral1hnmr.csv")
df = pd.read_csv(f"inst/{dbname}_metabolites.csv")

In [25]:
# Convert ppm from string to np.array
nmrdb['ppm'] = nmrdb['ppm'].apply(lambda x: np.array(ast.literal_eval(x.replace("'",''))))

def h_converter(x):
    x = np.array(ast.literal_eval(x.replace("'",'')))
    return x / np.sum(x) 
    
nmrdb['heights'] = nmrdb['heights'].apply(lambda x: h_converter(x))

In [26]:
query = [ {'range': (3.87, 3.93), 'mult': 'dd', 'ppm': np.array([3.8892, 3.8930, 3.9097, 3.9134]), 'heights': np.array([0.25, 0.25, 0.25, 0.25])},]

result = multiple_query(query, nmrdb, df)
result.head(10) 

Unnamed: 0,accession,name,similarity
7,HMDB0000122,D-Glucose,2.126998
15,HMDB0000191,L-Aspartic acid,1.965755
42,HMDB0000884,Ribothymidine,1.952838
35,HMDB0000660,D-Fructose,1.942185
33,HMDB0000609,DL-Dopa,1.941478
20,HMDB0000258,Sucrose,1.905719
38,HMDB0000742,Homocysteine,1.898036
66,HMDB0002006,"2,3-Diaminopropionic acid",1.89419
67,HMDB0002545,Galacturonic acid,1.865068
22,HMDB0000296,Uridine,1.828641


In [27]:
query = [{'range': (2.5, 2.6), 'mult': 'd'}, {'range': (2.6, 2.7), 'mult': 'd'}]

result = multiple_query(query, nmrdb, df)
result.head()

Unnamed: 0,accession,name,similarity
1,HMDB0000094,Citric acid,2.0
3,HMDB0000402,2-Isopropylmalic acid,1.4
4,HMDB0000736,Isobutyryl-L-carnitine,0.722222
7,HMDB0001257,Spermidine,0.666667
8,HMDB0001844,Methylsuccinic acid,0.5


In [28]:
query = [{'range':(1.25, 1.35), 'mult': 'd'}, {'range': (4.05, 4.15), 'mult': 'q'}]

result = multiple_query(query, nmrdb, df)
result.head(6)    

Unnamed: 0,accession,name,similarity
1,HMDB0000190,L-Lactic acid,2.0
0,HMDB0000030,Biotin,0.7
7,HMDB0005000,Loratadine,0.642857
4,HMDB0000701,Hexanoylglycine,0.4
3,HMDB0000554,Dihydroandrosterone,0.2
2,HMDB0000546,Epietiocholanolone,0.181818


In [29]:
query = [{'range': (3.87, 3.93), 'mult': '*'},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
2,HMDB0000043,Betaine,1.5
3,HMDB0000064,Creatine,1.5
66,HMDB0002006,"2,3-Diaminopropionic acid",1.5
63,HMDB0001991,7-Methylxanthine,1.5
59,HMDB0001867,4-Aminohippuric acid,1.333333
53,HMDB0001398,Guaiacol,1.333333


In [30]:
query = [{'range': (3.87, 3.93), 'mult': 'dd'},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
66,HMDB0002006,"2,3-Diaminopropionic acid",1.5
38,HMDB0000742,Homocysteine,1.333333
11,HMDB0000158,L-Tyrosine,1.2
28,HMDB0000479,3-Methylhistidine,1.2
0,HMDB0000021,Iodotyrosine,1.166667
33,HMDB0000609,DL-Dopa,1.166667


In [31]:
nmrdb[nmrdb['accession']=='HMDB0000742']

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
1055,HMDB0000742,m03,2.14,1,"['14.86', '14.70', '7.47']",[3],m,2.06,2.22,"[2.07, 2.09, 2.1, 2.12, 2.13, 2.14, 2.15, 2.15...","[1037.1, 1045.1, 1051.7, 1058.8, 1066.2, 1068....","[0.018460648148148146, 0.028732638888888884, 0...",[3]
1415,HMDB0000742,m02,2.65,2,"['10.73', '8.12', '6.73']",[2],ddd,2.59,2.72,"[2.6, 2.62, 2.63, 2.64, 2.64, 2.65, 2.66, 2.66...","[1300.1, 1307.2, 1313.7, 1320.7, 1321.7, 1324....","[0.029597332649397283, 0.036881251602975125, 0...",[2]
2231,HMDB0000742,m01,3.87,1,"['7.13', '5.62']",[4],dd,3.83,3.9,"[-0.01001467351430696, -1.4673514306728919e-05...","[1928.4, 1934.0, 1935.5, 1941.1]","[0.23097445474554795, 0.2749282998732742, 0.26...",[4]


In [32]:
# Looking for Leucine
query = [{'range': (0.94, 0.99), 'mult': 't', 'ppm': np.array([0.949542,0.96010, 0.970836]), 'heights': np.array([0.25,0.5,0.25])},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
12,HMDB0000452,L-alpha-Aminobutyric acid,2.241946
22,HMDB0000687,L-Leucine,2.154733
21,HMDB0000650,D-alpha-Aminobutyric acid,1.754231
34,HMDB0001987,2-Hydroxy-2-methylbutyric acid,1.712065
8,HMDB0000339,2-Methylbutyrylglycine,1.563549
31,HMDB0001388,alpha-Linolenic acid,1.392834


In [33]:
# Looking for Threonine
query = [{'range': (4.22, 4.28), 'mult': '*', 'ppm': np.array([4.2351,4.243,4.2461,4.254 ,4.2571,4.265 ,4.2681,4.276]), 'heights': np.array([1,1,3,3,3,3,1,1]) / np.sum([1,1,3,3,3,3,1,1])},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

  cm_test = np.sum(test_ppm * test_heights) / np.sum(test_heights)


Unnamed: 0,accession,name,similarity
2,HMDB0000167,L-Threonine,2.232421
5,HMDB0000244,Riboflavin,2.166881
4,HMDB0000217,NADP,1.968235
14,HMDB0000565,Galactonic acid,1.909901
21,HMDB0000982,5-Methylcytidine,1.90507
25,HMDB0001563,1-Methylguanosine,1.892107


In [34]:
# Looking for Tyrosine
query = [{'range': (7.16, 7.6), 'mult': '*', 'ppm': [7.185000,7.189691,7.192445,7.200093,7.203968,7.208965], 'heights': [0.14,1,0.3,0.28,0.92,0.14]},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
4,HMDB0000158,L-Tyrosine,2.297146
103,HMDB0004811,"2,4-Dichlorophenol",2.268032
109,HMDB0005794,Quercetin,2.252696
6,HMDB0000205,Phenylpyruvic acid,2.178898
98,HMDB0003312,Daidzein,2.15463
85,HMDB0002055,o-Cresol,2.142429


In [35]:
# Looking for Phenylalanine
query = [{'range': (7.2, 7.5), 'mult': '*', 'ppm': [7.414916,7.418995,7.421035,7.429193,7.431233,7.434292,7.440411,7.442960,7.4450005], 'heights': [0.16,0.8,0.2,0.53,1,0.2,0.13,0.33,0.2]},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
22,HMDB0000684,L-Kynurenine,2.466096
80,HMDB0004812,"2,5-Furandicarboxylic acid",2.396479
60,HMDB0001942,Phenylpropanolamine,2.301726
5,HMDB0000228,Phenol,2.286237
24,HMDB0000715,Kynurenic acid,2.23897
3,HMDB0000205,Phenylpyruvic acid,2.234585


In [36]:
# Looking for Lactate
query = [{'range': (4, 4.2), 'mult': 'q', 'ppm': [4.095,4.1065,4.118,4.1295], 'heights': [0.33,1,1,0.33]},]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
24,HMDB0000190,L-Lactic acid,2.205696
14,HMDB0000125,Glutathione,1.832804
105,HMDB0005000,Loratadine,1.816799
22,HMDB0000174,L-Fucose,1.74853
11,HMDB0000101,Deoxyadenosine,1.740319
101,HMDB0003072,Quinic acid,1.375292


In [37]:
nmrdb[nmrdb['accession'] == 'HMDB0000190']

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
442,HMDB0000190,m01,1.32,4,['6.96'],['3'],d,1.3,1.33,"[1.31, 1.324]","['654.7', '661.7']","[0.49952454832090487, 0.5004754516790951]",['3']
2440,HMDB0000190,m02,4.1,2,"['6.93', '6.93']",['2'],q,4.08,4.13,"[-0.020341658481831004, -0.00634165848183077, ...","['2041.1', '2048.1', '2055.0', '2061.9']","[0.12670127683457275, 0.378981338571629, 0.373...",['2']


In [38]:
# Looking for Lactate
query = [{'range': (4, 4.2), 'mult': 'q', 'ppm': [4.095,4.1065,4.118,4.1295], 'heights': [0.33,1,1,0.33]},
         {'range': (1.25, 1.36), 'mult': 'd', 'ppm': [1.31, 1.324]	, 'heights': [0.49952454832090487, 0.5004754516790951]	}]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
4,HMDB0000190,L-Lactic acid,2.852848
3,HMDB0000174,L-Fucose,1.822182
16,HMDB0005000,Loratadine,1.015542
0,HMDB0000030,Biotin,0.824622
5,HMDB0000424,2-Hydroxydecanedioic acid,0.4
9,HMDB0000701,Hexanoylglycine,0.4


In [39]:
nmrdb[nmrdb['accession'] == 'HMDB0001925']

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
141,HMDB0001925,m07,0.9,6,['6.61'],"['9', '12']",d,0.86,0.94,"[0.89, 0.91]","[446.4, 453.0]","[0.48971781395111497, 0.5102821860488851]","[9, 12]"
562,HMDB0001925,m06,1.51,3,['7.16'],['11'],d,1.48,1.54,"[1.5, 1.52]","[750.1, 757.3]","[0.4859700840703134, 0.5140299159296867]",[11]
810,HMDB0001925,m05,1.85,1,[''],['8'],m,1.79,1.91,"[1.81, 1.82, 1.83, 1.85, 1.86, 1.88, 1.89]","[903.3, 910.3, 917.0, 923.7, 930.4, 937.3, 944.0]","[0.03258265452803066, 0.10828941063727837, 0.2...",[8]
1323,HMDB0001925,m04,2.45,2,['7.18'],['7'],d,2.42,2.48,"[2.44, 2.46]","[1221.1, 1228.3]","[0.5146812080536913, 0.4853187919463087]",[7]
2039,HMDB0001925,m03,3.72,1,['7.17'],['10'],q,3.67,3.77,"[3.7, 3.71, 3.73, 3.74]","[1849.6, 1856.8, 1863.9, 1871.1]","[0.10770913067249863, 0.37233460907599775, 0.3...",[10]
3176,HMDB0001925,m02,7.11,2,['8.10'],"['2', '6']",d,7.08,7.13,"[7.1, 7.11]","[3548.0, 3556.1]","[0.4169971671388102, 0.5830028328611898]","[2, 6]"
3237,HMDB0001925,m01,7.23,2,['8.03'],"['3', '5']",d,7.2,7.25,"[-0.004422520389371517, 0.005577479610629158]","[3608.0, 3616.0]","[0.5577479610628782, 0.44225203893712184]","[3, 5]"


In [40]:
# Looking for Ibupeofen
query = [{'range': (7.07, 7.199), 'mult': 'd', 'ppm': [7.1228, 7.136], 'heights': [0.83,1]},
        {'range': (7.21, 7.35), 'mult': 'd', 'ppm': [7.241, 7.2542], 'heights': [1, 0.83]}]
     #   {'range': (0.88, 0.96), 'mult': 'd', 'ppm': [0.916, 0.927], 'heights': [1.0,1.0]}]

result = multiple_query(query, nmrdb, df)
result.head(6) 

Unnamed: 0,accession,name,similarity
10,HMDB0001925,Ibuprofen,1.923175
15,HMDB0005000,Loratadine,1.489583
9,HMDB0001868,5-Methoxysalicylic acid,1.323297
8,HMDB0001713,m-Coumaric acid,1.222001
1,HMDB0000259,Serotonin,1.158999
3,HMDB0000706,Aspartylphenylalanine,1.155716
