In [10]:
from __future__ import print_function
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pubchempy
from pandas import Series, DataFrame
from rdkit import Chem
from IPython.core.pylabtools import figsize

%matplotlib inline
figsize(16, 8)

In [28]:
# Load both standards dilution tables, tidy them and keep the usable rows.
# The same steps are applied to each table, so they live in two small helpers
# instead of being copy-pasted. This also avoids leaving a loop variable named
# `df` in the notebook namespace.

# Text columns whose values carry padding whitespace in the CSV files
str_columns = ['Name', 'Formula']

# Redundant (for now...) information: the CSV's unnamed first column and every
# dilution column other than Dil_1_1
redundant_columns = ['Unnamed: 0', 'Dil_1_5', 'Dil_1_10', 'Dil_1_50', 'Dil_1_100', 'Dil_1_1000']


def _load_dilutions(csv_path):
    """Read one dilution CSV, strip its text columns and drop unused columns.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The table without ``redundant_columns`` and with every column listed
        in ``str_columns`` stripped of leading/trailing whitespace.
    """
    # keep_default_na=False stops pandas from turning empty/"NA"-like fields
    # into NaN; presumably this keeps every cell in the text columns a str so
    # that str.strip can be applied -- confirm against the data files.
    table = pd.read_csv(csv_path, keep_default_na=False)

    for column in str_columns:
        table[column] = table[column].apply(str.strip)

    return table.drop(redundant_columns, axis=1)


def _select_detected_mh(table):
    """Return the rows with a positive ``Dil_1_1`` value and an ``M+H`` adduct."""
    return table[(table['Dil_1_1'] > 0) & (table['Adduct'] == 'M+H')]


df_std1 = _load_dilutions('../data/std1_dilutions.csv')
df_std2 = _load_dilutions('../data/std2_dilutions.csv')

# Pick metabolites that have useful information
df_std1_mh = _select_detected_mh(df_std1)
df_std2_mh = _select_detected_mh(df_std2)

In [29]:
# Preview the first rows of the filtered std1 table (M+H adduct, Dil_1_1 > 0).
df_std1_mh.head()

Unnamed: 0,Name,Formula,Adduct,Dil_1_1
6,Spermidine,C7H19N3,M+H,24278942
28,Glycerol,C3H8O3,M+H,2168900
160,Inosine,C10H12N4O5,M+H,70263872
182,L-Phenylalanine,C9H11NO2,M+H,158189072
204,L-Leucine,C6H13NO2,M+H,197374192


In [14]:
def get_mincid_compound(compounds):
    """Return the compound with the lowest CID.

    ``compounds`` is a sequence of objects exposing a ``cid`` attribute.
    ``None`` is returned when the sequence is empty (or falsy, e.g. ``None``).
    If several entries share the lowest CID, the first of them is returned.
    """
    if not compounds:
        return None

    return min(compounds, key=lambda entry: entry.cid)

def exctract_compound_properties(compound, get_inchi=False):
    """Collect selected descriptors of ``compound`` into a plain dict.

    Parameters
    ----------
    compound : object
        Any object exposing the attributes named in ``descriptor_names``
        below (in this notebook: a result of ``pubchempy.get_compounds``).
    get_inchi : bool, optional
        When true, the ``inchikey`` and ``inchi`` attributes are also read
        and stored under the keys ``'InChI_Key'`` and ``'InChI'``.

    Returns
    -------
    dict
        Maps each descriptor name to the attribute value of the same name.
    """
    descriptor_names = (
        'cid',
        'exact_mass',
        'complexity',
        'xlogp',
        'h_bond_acceptor_count',
        'h_bond_donor_count',
        'rotatable_bond_count',
        'heavy_atom_count',
        'tpsa',
        'molecular_formula',
    )

    properties = {}
    for descriptor in descriptor_names:
        properties[descriptor] = getattr(compound, descriptor)

    if not get_inchi:
        return properties

    properties['InChI_Key'] = compound.inchikey
    properties['InChI'] = compound.inchi
    return properties

In [15]:
compounds_info = []

# Look every selected metabolite up on PubChem by name and keep the properties
# of the lowest-CID hit. Metabolites without any hit are skipped, so they end
# up with no entry in compounds_info.
for row_label, metabolite in df_std1_mh.iterrows():
    metabolite_name = metabolite['Name']

    compounds = pubchempy.get_compounds(metabolite_name.strip(), namespace='name')
    compound = get_mincid_compound(compounds)

    if not compound:
        continue

    print('Loading info for', str.strip(metabolite_name))

    info = exctract_compound_properties(compound, get_inchi=True)

    # Remember the row label of the source row so the PubChem data can be
    # aligned with df_std1_mh later on.
    info['index'] = row_label

    compounds_info.append(info)

Loading info for Spermidine
Loading info for Glycerol
Loading info for Inosine
Loading info for L-Phenylalanine
Loading info for L-Leucine
Loading info for L-Tryptophan
Loading info for 2-Phenylglycine
Loading info for Selenomethionine
Loading info for L-Methionine
Loading info for Guanine
Loading info for Pyridoxine
Loading info for Imidazole-4-acetate
Loading info for L-Valine
Loading info for Adenine
Loading info for L-Proline
Loading info for Serotonin
Loading info for Taurine
Loading info for trans-4-Hydroxy-L-proline
Loading info for Creatinine
Loading info for N2-Acetyl-L-lysine
Loading info for L-Threonine
Loading info for L-Aspartate
Loading info for N-Acetyl-D-glucosamine
Loading info for L-Glutamine
Loading info for beta-Alanine
Loading info for L-Asparagine
Loading info for dGMP
Loading info for L-Serine
Loading info for L-Citrulline
Loading info for Cytidine
Loading info for Ethanolamine phosphate
Loading info for Glycine
Loading info for sn-glycero-3-Phosphocholine
Loadin

In [16]:
# One row per metabolite for which a PubChem compound was found; the 'index'
# column holds the row label of the corresponding row in df_std1_mh.
df = DataFrame(compounds_info)
df.head(3)

Unnamed: 0,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,index,molecular_formula,rotatable_bond_count,tpsa,xlogp
0,"InChI=1S/C7H19N3/c8-4-1-2-6-10-7-3-5-9/h10H,1-9H2",ATHGHQPFGPMSJY-UHFFFAOYSA-N,1102,56.8,145.157898,3,3,10,6,C7H19N3,7,64.1,-1.0
1,"InChI=1S/C3H8O3/c4-1-3(6)2-5/h3-6H,1-2H2",PEDCQBHIVMGVHV-UHFFFAOYSA-N,753,25.2,92.047344,3,3,6,28,C3H8O3,2,60.7,-1.8
2,InChI=1S/C10H12N4O5/c15-1-4-6(16)7(17)10(19-4)...,UGQMRVRMYYASKQ-KQYNXXCUSA-N,6021,405.0,268.08077,6,4,19,160,C10H12N4O5,2,129.0,-2.1


In [17]:
# Use the original df_std1_mh row labels as the index of the PubChem table.
# set_index consumes the 'index' column, so the guard keeps this cell safe to
# re-run: a second execution is a no-op instead of raising KeyError.
if 'index' in df.columns:
    df = df.set_index('index')
df.head(3)

Unnamed: 0_level_0,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,molecular_formula,rotatable_bond_count,tpsa,xlogp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6,"InChI=1S/C7H19N3/c8-4-1-2-6-10-7-3-5-9/h10H,1-9H2",ATHGHQPFGPMSJY-UHFFFAOYSA-N,1102,56.8,145.157898,3,3,10,C7H19N3,7,64.1,-1.0
28,"InChI=1S/C3H8O3/c4-1-3(6)2-5/h3-6H,1-2H2",PEDCQBHIVMGVHV-UHFFFAOYSA-N,753,25.2,92.047344,3,3,6,C3H8O3,2,60.7,-1.8
160,InChI=1S/C10H12N4O5/c15-1-4-6(16)7(17)10(19-4)...,UGQMRVRMYYASKQ-KQYNXXCUSA-N,6021,405.0,268.08077,6,4,19,C10H12N4O5,2,129.0,-2.1


In [18]:
# Put the measured data and the PubChem properties side by side; the two
# frames are aligned on their index.
df_merged = pd.concat([df_std1_mh, df], axis=1)

# Normalise both formula columns to plain strings before comparing them.
formula_measured = df_merged['Formula'].map(str.strip)
formula_pubchem = df_merged['molecular_formula'].map(str)

df_merged['Formula'] = formula_measured
df_merged['molecular_formula'] = formula_pubchem
df_merged['Formula_Match'] = formula_measured.eq(formula_pubchem)

# Metabolites we could not find, or whose PubChem formula does not match

In [19]:
# Rows whose PubChem formula differs from the measured formula (this includes
# rows for which no PubChem data was merged in).
df_merged[~df_merged['Formula_Match']].head()

Unnamed: 0,Name,Formula,Adduct,Dil_1_1,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,molecular_formula,rotatable_bond_count,tpsa,xlogp,Formula_Match
908,meso-2_6-Diaminoheptanedioate,C7H14N2O4,M+H,18883046.0,,,,,,,,,,,,,False
952,L-2_4-Diaminobutanoate,C4H10N2O2,M+H,1768055.0,,,,,,,,,,,,,False
1018,Thiamin,C12H16N4OS,M+H,20085794.0,InChI=1S/C12H17N4OS/c1-8-11(3-4-17)18-7-16(8)6...,JZRWCGZRTZMZEH-UHFFFAOYSA-N,1130.0,269.0,265.112307,5.0,2.0,18.0,C12H17N4OS+,4.0,104.0,1.0,False
1150,Phthalate,C8H6O4,M+H,5324067.0,InChI=1S/C8H6O4/c9-7(10)5-3-1-2-4-6(5)8(11)12/...,XNGIFLGASWRNHJ-UHFFFAOYSA-L,181977.0,166.0,164.010959,4.0,0.0,12.0,C8H4O4-2,0.0,80.3,2.0,False
1568,4-Aminobenzoate,C7H7NO2,M+H,3753899.5,InChI=1S/C7H7NO2/c8-6-3-1-5(2-4-6)7(9)10/h1-4H...,ALYNCZNDIQEVRV-UHFFFAOYSA-M,4876.0,122.0,136.039853,3.0,1.0,10.0,C7H6NO2-,0.0,66.2,1.5,False


# Best match

In [24]:
# Keep only the rows whose PubChem formula equals the measured formula.
formula_matches = df_merged['Formula_Match']
df_match = df_merged[formula_matches]
df_match.head()

Unnamed: 0,Name,Formula,Adduct,Dil_1_1,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,molecular_formula,rotatable_bond_count,tpsa,xlogp,Formula_Match
6,Spermidine,C7H19N3,M+H,24278942,"InChI=1S/C7H19N3/c8-4-1-2-6-10-7-3-5-9/h10H,1-9H2",ATHGHQPFGPMSJY-UHFFFAOYSA-N,1102,56.8,145.157898,3,3,10,C7H19N3,7,64.1,-1.0,True
28,Glycerol,C3H8O3,M+H,2168900,"InChI=1S/C3H8O3/c4-1-3(6)2-5/h3-6H,1-2H2",PEDCQBHIVMGVHV-UHFFFAOYSA-N,753,25.2,92.047344,3,3,6,C3H8O3,2,60.7,-1.8,True
160,Inosine,C10H12N4O5,M+H,70263872,InChI=1S/C10H12N4O5/c15-1-4-6(16)7(17)10(19-4)...,UGQMRVRMYYASKQ-KQYNXXCUSA-N,6021,405.0,268.08077,6,4,19,C10H12N4O5,2,129.0,-2.1,True
182,L-Phenylalanine,C9H11NO2,M+H,158189072,InChI=1S/C9H11NO2/c10-8(9(11)12)6-7-4-2-1-3-5-...,COLNVLDHVKWLRT-QMMMGPOBSA-N,6140,153.0,165.078979,3,2,12,C9H11NO2,3,63.3,-1.5,True
204,L-Leucine,C6H13NO2,M+H,197374192,"InChI=1S/C6H13NO2/c1-4(2)3-5(7)6(8)9/h4-5H,3,7...",ROHFNLRQFUQHCH-YFKPBYRVSA-N,6106,101.0,131.094629,3,2,9,C6H13NO2,3,63.3,-1.5,True


In [30]:
# Remove the helper columns before exporting. DataFrame.drop returns a new
# frame rather than modifying df_match, so the result must be kept and used;
# otherwise the helper columns would still be written to the CSV.
df_export = df_match.drop(['Formula_Match', 'molecular_formula'], axis=1)

csv_output_file = '../data/std1_mh_11_pubchem.csv'

# NOTE(review): index=False leaves the row labels (the original row positions
# in the std1 table) out of the file -- confirm that this is intended.
df_export.to_csv(csv_output_file, encoding='utf-8', index=False)