# HMDB database importer

In [1]:
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import ast
import re

import bisect
from rapidfuzz import process, fuzz

import sys
sys.path.append("src")

from hmdb_local_tools import  multiple_query, approximate_lookup

## Read compiled DB

In [2]:
# Dataset name; it selects which "<dbname>_metabolites_with_spectra.csv" is read.
dbname = "serum"

# nmrdb: the 1H NMR multiplet table; df: the metabolite table for `dbname`.
nmrdb = pd.read_csv("inst/spectral1hnmr.csv")
df = pd.read_csv(f"inst/{dbname}_metabolites_with_spectra.csv")

# Each 'ppm' cell is read from the CSV as text: drop the single quotes,
# parse the remaining literal, and store it as a NumPy array.
nmrdb['ppm'] = nmrdb['ppm'].apply(
    lambda cell: np.array(ast.literal_eval(cell.replace("'", '')))
)

def h_converter(x):
    """Parse a stringified list of heights and rescale it so it sums to 1.

    Parameters
    ----------
    x : str
        Text of a list literal; any single quotes are removed before parsing,
        so both "[1, 3]" and "['1', '3']" are accepted.

    Returns
    -------
    numpy.ndarray
        The parsed values divided by their sum.
    """
    parsed = ast.literal_eval(x.replace("'", ''))
    raw_heights = np.array(parsed)
    total = np.sum(raw_heights)
    return raw_heights / total
    
# Replace the stringified heights with arrays rescaled to sum to 1.
nmrdb['heights'] = nmrdb['heights'].apply(h_converter)

## Prepare the dataset for approximate lookup using the metabolite names and synonyms
# Parse each stringified synonym list into a real Python list.
df['synonyms'] = df['synonyms'].apply(ast.literal_eval)

# One lower-cased search string per row: the name, a space, then the synonyms.
# NOTE(review): the synonyms are joined with an empty separator, so adjacent
# synonyms run together in the search string -- confirm that this is what
# approximate_lookup expects.
df['synonyms_cat'] = (df['name'] + " " + df['synonyms'].str.join('')).str.lower()

## Test approximate lookup

In [3]:
# Search the 'synonyms_cat' column for "citric acid" using two rapidfuzz
# scorers, with the number of results limited to 3.
matches = approximate_lookup(
    df,
    'synonyms_cat',
    'citric acid',
    fuzz.partial_token_ratio,
    fuzz.ratio,
    limit=3,
)
print(matches)

[['Citric acid' 'Citric acid' 46 100.0]
 ['trans-Aconitic acid' 'Citridic acid' 376 91.66666666666666]
 ['cis-Aconitic acid' 'Citridic acid' 37 91.66666666666666]]


## Rank matches according to multiplet similarity

In [4]:
# A single 'dd' multiplet in the 3.87-3.93 window, given as four peaks of
# equal height.
query = [
    {
        'range': (3.87, 3.93),
        'mult': 'dd',
        'ppm': np.array([3.8892, 3.8930, 3.9097, 3.9134]),
        'heights': np.array([0.25, 0.25, 0.25, 0.25]),
    },
]

result = multiple_query(query, nmrdb, df)
result.head(10)

Unnamed: 0,accession,name,similarity
7,HMDB0000122,D-Glucose,2.126998
15,HMDB0000191,L-Aspartic acid,1.965755
42,HMDB0000884,Ribothymidine,1.952838
35,HMDB0000660,D-Fructose,1.942185
33,HMDB0000609,DL-Dopa,1.941478
20,HMDB0000258,Sucrose,1.905719
38,HMDB0000742,Homocysteine,1.898036
66,HMDB0002006,"2,3-Diaminopropionic acid",1.89419
67,HMDB0002545,Galacturonic acid,1.865068
22,HMDB0000296,Uridine,1.828641


In [5]:
# Two 'd' multiplets in adjacent windows (2.5-2.6 and 2.6-2.7); no peak
# positions or heights are supplied.
query = [
    {'range': (2.5, 2.6), 'mult': 'd'},
    {'range': (2.6, 2.7), 'mult': 'd'},
]

result = multiple_query(query, nmrdb, df)
result.head()

Unnamed: 0,accession,name,similarity
1,HMDB0000094,Citric acid,2.0
3,HMDB0000402,2-Isopropylmalic acid,1.4
4,HMDB0000736,Isobutyryl-L-carnitine,0.722222
7,HMDB0001257,Spermidine,0.666667
8,HMDB0001844,Methylsuccinic acid,0.5


In [6]:
# A 'd' multiplet in 1.25-1.35 combined with a 'q' multiplet in 4.05-4.15.
query = [
    {'range': (1.25, 1.35), 'mult': 'd'},
    {'range': (4.05, 4.15), 'mult': 'q'},
]

result = multiple_query(query, nmrdb, df)
result.head(6)

Unnamed: 0,accession,name,similarity
1,HMDB0000190,L-Lactic acid,2.0
0,HMDB0000030,Biotin,0.7
7,HMDB0005000,Loratadine,0.642857
4,HMDB0000701,Hexanoylglycine,0.4
3,HMDB0000554,Dihydroandrosterone,0.2
2,HMDB0000546,Epietiocholanolone,0.181818


In [7]:
# Same 3.87-3.93 window, but with 'mult' set to '*'.
# NOTE(review): '*' presumably means "any multiplicity" -- confirm in
# hmdb_local_tools.multiple_query.
query = [
    {'range': (3.87, 3.93), 'mult': '*'},
]

result = multiple_query(query, nmrdb, df)
result.head(6)

Unnamed: 0,accession,name,similarity
2,HMDB0000043,Betaine,1.5
3,HMDB0000064,Creatine,1.5
66,HMDB0002006,"2,3-Diaminopropionic acid",1.5
63,HMDB0001991,7-Methylxanthine,1.5
59,HMDB0001867,4-Aminohippuric acid,1.333333
53,HMDB0001398,Guaiacol,1.333333


In [8]:
# The 3.87-3.93 window again, this time requesting a 'dd' multiplet and
# giving no peak positions or heights.
query = [
    {'range': (3.87, 3.93), 'mult': 'dd'},
]

result = multiple_query(query, nmrdb, df)
result.head(6)

Unnamed: 0,accession,name,similarity
66,HMDB0002006,"2,3-Diaminopropionic acid",1.5
38,HMDB0000742,Homocysteine,1.333333
11,HMDB0000158,L-Tyrosine,1.2
28,HMDB0000479,3-Methylhistidine,1.2
0,HMDB0000021,Iodotyrosine,1.166667
33,HMDB0000609,DL-Dopa,1.166667


In [9]:
# Show every row of the multiplet table stored under accession HMDB0000742.
nmrdb.loc[nmrdb['accession'] == 'HMDB0000742']

Unnamed: 0,accession,multiplet,shift1(ppm),hs,j(hz),atom1,type,from,to,ppm,hz,heights,assigned atoms
1055,HMDB0000742,m03,2.14,1,"['14.86', '14.70', '7.47']",[3],m,2.06,2.22,"[2.07, 2.09, 2.1, 2.12, 2.13, 2.14, 2.15, 2.15...","[1037.1, 1045.1, 1051.7, 1058.8, 1066.2, 1068....","[0.018460648148148146, 0.028732638888888884, 0...",[3]
1415,HMDB0000742,m02,2.65,2,"['10.73', '8.12', '6.73']",[2],ddd,2.59,2.72,"[2.6, 2.62, 2.63, 2.64, 2.64, 2.65, 2.66, 2.66...","[1300.1, 1307.2, 1313.7, 1320.7, 1321.7, 1324....","[0.029597332649397283, 0.036881251602975125, 0...",[2]
2231,HMDB0000742,m01,3.87,1,"['7.13', '5.62']",[4],dd,3.83,3.9,"[-0.01001467351430696, -1.4673514306728919e-05...","[1928.4, 1934.0, 1935.5, 1941.1]","[0.23097445474554795, 0.2749282998732742, 0.26...",[4]


In [10]:
# Looking for Leucine
# One 't' multiplet in 0.94-0.99 described by three peaks with 1:2:1 heights.
query = [
    {
        'range': (0.94, 0.99),
        'mult': 't',
        'ppm': np.array([0.949542, 0.96010, 0.970836]),
        'heights': np.array([0.25, 0.5, 0.25]),
    },
]

result = multiple_query(query, nmrdb, df)
result.head(6)

Unnamed: 0,accession,name,similarity
12,HMDB0000452,L-alpha-Aminobutyric acid,2.241946
22,HMDB0000687,L-Leucine,2.154733
21,HMDB0000650,D-alpha-Aminobutyric acid,1.754231
34,HMDB0001987,2-Hydroxy-2-methylbutyric acid,1.712065
8,HMDB0000339,2-Methylbutyrylglycine,1.563549
31,HMDB0001388,alpha-Linolenic acid,1.392834


In [11]:
# Looking for Threonine
# Eight peaks in 4.22-4.28 with 1:1:3:3:3:3:1:1 heights, divided by their
# sum so they add up to 1; 'mult' is left as '*'.
query = [
    {
        'range': (4.22, 4.28),
        'mult': '*',
        'ppm': np.array([4.2351, 4.243, 4.2461, 4.254, 4.2571, 4.265, 4.2681, 4.276]),
        'heights': np.array([1, 1, 3, 3, 3, 3, 1, 1]) / np.sum([1, 1, 3, 3, 3, 3, 1, 1]),
    },
]

result = multiple_query(query, nmrdb, df)
result.head(6)

  cm_test = np.sum(test_ppm * test_heights) / np.sum(test_heights)


Unnamed: 0,accession,name,similarity
2,HMDB0000167,L-Threonine,2.232421
5,HMDB0000244,Riboflavin,2.166881
4,HMDB0000217,NADP,1.968235
14,HMDB0000565,Galactonic acid,1.909901
21,HMDB0000982,5-Methylcytidine,1.90507
25,HMDB0001563,1-Methylguanosine,1.892107


In [12]:
# Looking for Tyrosine
# Six peaks inside the 7.16-7.6 window; 'ppm' and 'heights' are plain lists
# here and the heights are given as entered (they do not sum to 1).
query = [
    {
        'range': (7.16, 7.6),
        'mult': '*',
        'ppm': [7.185000, 7.189691, 7.192445, 7.200093, 7.203968, 7.208965],
        'heights': [0.14, 1, 0.3, 0.28, 0.92, 0.14],
    },
]

result = multiple_query(query, nmrdb, df)
result.head(6)

Unnamed: 0,accession,name,similarity
4,HMDB0000158,L-Tyrosine,2.297146
103,HMDB0004811,"2,4-Dichlorophenol",2.268032
109,HMDB0005794,Quercetin,2.252696
6,HMDB0000205,Phenylpyruvic acid,2.178898
98,HMDB0003312,Daidzein,2.15463
85,HMDB0002055,o-Cresol,2.142429


In [None]:
# Looking for Phenylalanine
# Nine peaks inside the 7.2-7.5 window, with 'mult' left as '*'.
query = [
    {
        'range': (7.2, 7.5),
        'mult': '*',
        'ppm': [
            7.414916, 7.418995, 7.421035,
            7.429193, 7.431233, 7.434292,
            7.440411, 7.442960, 7.4450005,
        ],
        'heights': [0.16, 0.8, 0.2, 0.53, 1, 0.2, 0.13, 0.33, 0.2],
    },
]

result = multiple_query(query, nmrdb, df)
result.head(6)

In [None]:
# Looking for Lactate
# A single 'q' multiplet in the 4-4.2 window, given as four peaks.
query = [
    {
        'range': (4, 4.2),
        'mult': 'q',
        'ppm': [4.095, 4.1065, 4.118, 4.1295],
        'heights': [0.33, 1, 1, 0.33],
    },
]

result = multiple_query(query, nmrdb, df)
result.head(6)

In [None]:
# Show every row of the multiplet table stored under accession HMDB0000190.
nmrdb.loc[nmrdb['accession'] == 'HMDB0000190']

In [None]:
# Looking for Lactate
# Same 'q' multiplet as the single-multiplet lactate query, now combined with
# a two-peak 'd' multiplet in the 1.25-1.36 window.
query = [
    {
        'range': (4, 4.2),
        'mult': 'q',
        'ppm': [4.095, 4.1065, 4.118, 4.1295],
        'heights': [0.33, 1, 1, 0.33],
    },
    {
        'range': (1.25, 1.36),
        'mult': 'd',
        'ppm': [1.31, 1.324],
        'heights': [0.49952454832090487, 0.5004754516790951],
    },
]

result = multiple_query(query, nmrdb, df)
result.head(6)

In [None]:
# Show every row of the multiplet table stored under accession HMDB0000174.
nmrdb.loc[nmrdb['accession'] == 'HMDB0000174']

In [None]:
# Looking for Ibuprofen
# All four peaks between 7.07 and 7.35 submitted as one multiplet with
# 'mult' left as '*'.
query = [
    {
        'range': (7.07, 7.35),
        'mult': '*',
        'ppm': [7.122, 7.136, 7.241, 7.254],
        'heights': [0.8, 1, 1, 0.8],
    },
    # Disabled extra multiplet, kept for reference:
    #   {'range': (0.88, 0.96), 'mult': 'd', 'ppm': [0.916, 0.927], 'heights': [1.0,1.0]}
]

result = multiple_query(query, nmrdb, df)
result.head(6)

In [None]:
# Show every row of the multiplet table stored under accession HMDB0000667.
nmrdb.loc[nmrdb['accession'] == 'HMDB0000667']

In [None]:
# Looking for Ibuprofen
# The aromatic region split into two separate 'd' multiplets
# (7.07-7.2 and 7.2-7.35), each given as two peaks.
query = [
    {
        'range': (7.07, 7.2),
        'mult': 'd',
        'ppm': [7.1228, 7.136],
        'heights': [0.83, 1],
    },
    {
        'range': (7.2, 7.35),
        'mult': 'd',
        'ppm': [7.241, 7.2542],
        'heights': [1, 0.83],
    },
    # Disabled extra multiplet, kept for reference:
    #   {'range': (0.88, 0.96), 'mult': 'd', 'ppm': [0.916, 0.927], 'heights': [1.0,1.0]}
]

result = multiple_query(query, nmrdb, df)
result.head(6)

In [None]:
# Show every row of the multiplet table stored under accession HMDB0001925.
# Accession IDs noted next to this lookup (purpose not stated):
#[HMDB0001872, HMDB01872, HMDB01925]
nmrdb.loc[nmrdb['accession'] == 'HMDB0001925']

In [None]:
# Looking for Ethyl vinyl ether
# Two multiplets are submitted together:
#   * a 'q' multiplet in 3.72-3.82 given as four peaks with 1:3:3:1 heights;
#   * a 't' multiplet in 1.25-1.36 given as three peaks with 1:2:1 heights.
# The second entry was previously labelled 'd', which contradicts its own
# three-peak 1:2:1 pattern (the Leucine query labels the same pattern 't');
# the label is corrected here so the requested multiplicity matches the peaks.
query = [
    {
        'range': (3.72, 3.82),
        'mult': 'q',
        'ppm': [3.7440, 3.7620, 3.7799, 3.7969],
        'heights': [1.0, 3.0, 3.0, 1.0],
    },
    {
        'range': (1.25, 1.36),
        'mult': 't',
        'ppm': [1.2903, 1.3081, 1.3256],
        'heights': [1.0, 2.0, 1.0],
    },
]

result = multiple_query(query, nmrdb, df)
result.head(6)