In [16]:
from __future__ import print_function
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pubchempy
from pandas import Series, DataFrame
from rdkit import Chem
from IPython.core.pylabtools import figsize

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Default size for subsequent figures: 16 wide by 8 tall.
figsize(16, 8)

In [17]:
# Read the dilution tables for both standards. keep_default_na=False stops
# pandas from turning its default NA marker strings into NaN.
df_std1 = pd.read_csv('../data/std1_dilutions.csv', keep_default_na=False)
df_std2 = pd.read_csv('../data/std2_dilutions.csv', keep_default_na=False)

# The text columns come with padding whitespace; trim it on both frames.
str_columns = ['Name', 'Formula']

for df in (df_std1, df_std2):
    for c in str_columns:
        df[c] = df[c].map(str.strip)

# Only the 1:1 dilution is used for now, so discard the other dilution
# columns together with the 'Unnamed: 0' column.
redundant_columns = ['Unnamed: 0', 'Dil_1_5', 'Dil_1_10', 'Dil_1_50', 'Dil_1_100', 'Dil_1_1000']
df_std1 = df_std1.drop(redundant_columns, axis=1)
df_std2 = df_std2.drop(redundant_columns, axis=1)

# Keep metabolites that have a positive 1:1 reading and the M+H adduct.
std1_is_useful = (df_std1['Dil_1_1'] > 0) & (df_std1['Adduct'] == 'M+H')
std2_is_useful = (df_std2['Dil_1_1'] > 0) & (df_std2['Adduct'] == 'M+H')

df_std1_mh = df_std1[std1_is_useful]
df_std2_mh = df_std2[std2_is_useful]

In [18]:
# Preview the first two rows of the filtered std1 (M+H, positive 1:1 reading) table.
df_std1_mh.head(2)

Unnamed: 0,Name,Formula,Adduct,Dil_1_1
6,Spermidine,C7H19N3,M+H,24278942
28,Glycerol,C3H8O3,M+H,2168900


In [19]:
# Preview the first two rows of the filtered std2 (M+H, positive 1:1 reading) table.
df_std2_mh.head(2)

Unnamed: 0,Name,Formula,Adduct,Dil_1_1
6,Hypoxanthine,C5H4N4O,M+H,64196896
28,Nicotinamide,C6H6N2O,M+H,949647488


In [20]:
def get_mincid_compound(compounds):
    """Return the compound with the lowest CID.

    ``compounds`` is a sequence of objects exposing a ``cid`` attribute.
    When several share the lowest CID the first one in the sequence wins.
    Returns ``None`` when ``compounds`` is empty or ``None``.
    """
    if not compounds:
        return None

    return min(compounds, key=lambda candidate: candidate.cid)

def exctract_compound_properties(compound, get_inchi=False):
    """Collect selected attributes of ``compound`` into a plain dict.

    The attributes listed below are copied under keys of the same name, and
    ``molecular_formula`` is stored after conversion with ``str``. When
    ``get_inchi`` is true the dict additionally gets ``'InChI_Key'`` (from
    ``compound.inchikey``) and ``'InChI'`` (from ``compound.inchi``).
    """
    copied_fields = (
        'cid',
        'exact_mass',
        'complexity',
        'xlogp',
        'h_bond_acceptor_count',
        'h_bond_donor_count',
        'rotatable_bond_count',
        'heavy_atom_count',
        'tpsa',
    )

    properties = {field: getattr(compound, field) for field in copied_fields}
    properties['molecular_formula'] = str(compound.molecular_formula)

    if not get_inchi:
        return properties

    properties['InChI_Key'] = compound.inchikey
    properties['InChI'] = compound.inchi
    return properties

In [21]:
def get_pubchem_info(df):
    """Look up PubChem properties for every metabolite in ``df``.

    Each row's ``Name`` (whitespace-stripped) is searched on PubChem by name.
    When the search returns several compounds, the one with the lowest CID is
    used. Rows with no hit are skipped silently. A progress line is printed
    for every metabolite that was resolved.

    Parameters
    ----------
    df : DataFrame
        Table with a ``Name`` column holding metabolite names as strings.

    Returns
    -------
    DataFrame
        One row per resolved metabolite, with the columns produced by
        ``exctract_compound_properties(..., get_inchi=True)``. Its index is
        named ``'index'`` and holds the label of the originating row in
        ``df``, so the result can be aligned with ``df``. The frame is empty
        (no columns) when nothing could be resolved.
    """
    compounds_info = []

    for row_label, metabolite in df.iterrows():
        # Strip once and reuse the cleaned name for both the query and the log.
        name = metabolite['Name'].strip()

        compounds = pubchempy.get_compounds(name, namespace='name')
        compound = get_mincid_compound(compounds)

        if not compound:
            continue

        print('Loading info for', name)

        info = exctract_compound_properties(compound, get_inchi=True)
        # Remember which input row this record belongs to.
        info['index'] = row_label
        compounds_info.append(info)

    if not compounds_info:
        # Without any record there is no 'index' column to promote, and
        # set_index would raise KeyError; return an empty, correctly named frame.
        return DataFrame(index=pd.Index([], name='index'))

    return DataFrame(compounds_info).set_index('index')

In [22]:
def merge_and_save(left, right, csv_output_file):
    """Join two frames column-wise, split by formula agreement, save matches.

    ``left`` and ``right`` are concatenated side by side (aligned on their
    indexes). Rows whose ``Formula`` equals ``molecular_formula`` are the
    matches; all other rows, including those where either value is missing,
    are the non-matches.

    Parameters
    ----------
    left : DataFrame
        Must provide a ``Formula`` column.
    right : DataFrame
        Must provide a ``molecular_formula`` column.
    csv_output_file : str
        Path the matching rows are written to (UTF-8, without the index).

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(df_match, df_no_match)``. ``df_match`` has the now-redundant
        ``molecular_formula`` column removed; ``df_no_match`` keeps it so the
        disagreement can be inspected.
    """
    df_merged = pd.concat([left, right], axis=1)

    # Compute the comparison once; NaN never compares equal, so rows missing
    # either formula end up in the non-matching set.
    formulas_agree = df_merged['Formula'] == df_merged['molecular_formula']

    df_no_match = df_merged[~formulas_agree]
    # drop() returns a new frame, so the result has to be kept; the original
    # code discarded it and the column was never actually removed.
    df_match = df_merged[formulas_agree].drop(['molecular_formula'], axis=1)

    df_match.to_csv(csv_output_file, encoding='utf-8', index=False)

    return (df_match, df_no_match)

In [23]:
# Query PubChem for every std1 M+H metabolite (prints one progress line per hit).
df_std1_mh_pubchem_info = get_pubchem_info(df_std1_mh)

# Join the measurements with the PubChem properties and write the rows whose
# formulas agree to CSV. Note that df_match / df_no_match are reassigned by
# the std2 cell further down.
df_match, df_no_match = merge_and_save(df_std1_mh, df_std1_mh_pubchem_info, '../data/std1_mh_11_pubchem.csv')

# Show the metabolites whose formula did not match PubChem's for manual review.
df_no_match

Loading info for Spermidine
Loading info for Glycerol
Loading info for Inosine
Loading info for L-Phenylalanine
Loading info for L-Leucine
Loading info for L-Tryptophan
Loading info for 2-Phenylglycine
Loading info for Selenomethionine
Loading info for L-Methionine
Loading info for Guanine
Loading info for Pyridoxine
Loading info for Imidazole-4-acetate
Loading info for L-Valine
Loading info for Adenine
Loading info for L-Proline
Loading info for Serotonin
Loading info for Taurine
Loading info for trans-4-Hydroxy-L-proline
Loading info for Creatinine
Loading info for N2-Acetyl-L-lysine
Loading info for L-Threonine
Loading info for L-Aspartate
Loading info for N-Acetyl-D-glucosamine
Loading info for L-Glutamine
Loading info for beta-Alanine
Loading info for L-Asparagine
Loading info for dGMP
Loading info for L-Serine
Loading info for L-Citrulline
Loading info for Cytidine
Loading info for Ethanolamine phosphate
Loading info for Glycine
Loading info for sn-glycero-3-Phosphocholine
Loadin

Unnamed: 0,Name,Formula,Adduct,Dil_1_1,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,molecular_formula,rotatable_bond_count,tpsa,xlogp
908,meso-2_6-Diaminoheptanedioate,C7H14N2O4,M+H,18883050.0,,,,,,,,,,,,
952,L-2_4-Diaminobutanoate,C4H10N2O2,M+H,1768055.0,,,,,,,,,,,,
1018,Thiamin,C12H16N4OS,M+H,20085790.0,InChI=1S/C12H17N4OS/c1-8-11(3-4-17)18-7-16(8)6...,JZRWCGZRTZMZEH-UHFFFAOYSA-N,1130.0,269.0,265.112307,5.0,2.0,18.0,C12H17N4OS+,4.0,104.0,1.0
1150,Phthalate,C8H6O4,M+H,5324067.0,InChI=1S/C8H6O4/c9-7(10)5-3-1-2-4-6(5)8(11)12/...,XNGIFLGASWRNHJ-UHFFFAOYSA-L,181977.0,166.0,164.010959,4.0,0.0,12.0,C8H4O4-2,0.0,80.3,2.0
1568,4-Aminobenzoate,C7H7NO2,M+H,3753900.0,InChI=1S/C7H7NO2/c8-6-3-1-5(2-4-6)7(9)10/h1-4H...,ALYNCZNDIQEVRV-UHFFFAOYSA-M,4876.0,122.0,136.039853,3.0,1.0,10.0,C7H6NO2-,0.0,66.2,1.5
1678,3'_5'-Cyclic AMP,C10H12N5O6P,M+H,110906200.0,,,,,,,,,,,,
1766,D-ribose 5-phosphate,C5H11O8P,M+H,786923.4,,,,,,,,,,,,
1810,Nicotinate,C6H5NO2,M+H,890650700.0,"InChI=1S/C6H5NO2/c8-6(9)5-2-1-3-7-4-5/h1-4H,(H...",PVNIIMVLHYAWGP-UHFFFAOYSA-M,937.0,108.0,122.024203,3.0,0.0,9.0,C6H4NO2-,0.0,53.0,0.9
1832,D-glucose 6-phosphate,C6H13O9P,M+H,980880.0,,,,,,,,,,,,


In [24]:
# Query PubChem for every std2 M+H metabolite (prints one progress line per hit).
df_std2_mh_pubchem_info = get_pubchem_info(df_std2_mh)

# Join the measurements with the PubChem properties and write the rows whose
# formulas agree to CSV. This reassigns df_match / df_no_match from the std1 cell.
df_match, df_no_match = merge_and_save(df_std2_mh, df_std2_mh_pubchem_info, '../data/std2_mh_11_pubchem.csv')

# Show the metabolites whose formula did not match PubChem's for manual review.
df_no_match

Loading info for Hypoxanthine
Loading info for Nicotinamide
Loading info for 5-Oxoproline
Loading info for 5'-Methylthioadenosine
Loading info for Deoxyadenosine
Loading info for L-Kynurenine
Loading info for Adenosine
Loading info for Guanosine
Loading info for 6-Methylaminopurine
Loading info for L-isoleucine
Loading info for Picolinic acid
Loading info for Pyridoxal
Loading info for L-Tyrosine
Loading info for 4-Trimethylammoniobutanoate
Loading info for Betaine
Loading info for L-2-Aminoadipate
Loading info for 4-(beta-Acetylaminoethyl)imidazole
Loading info for 1-Aminocyclopropane-1-carboxylate
Loading info for L-Carnitine
Loading info for Menadione
Loading info for L-Alanine
Loading info for dAMP
Loading info for 4-Aminobutanoate
Loading info for L-homoserine
Loading info for Choline
Loading info for AMP
Loading info for cytosine
Loading info for N-Acetylornithine
Loading info for 1-Aminopropan-2-ol
Loading info for Ala-Gly
Loading info for Glycylglycine
Loading info for S-Adenos

Unnamed: 0,Name,Formula,Adduct,Dil_1_1,InChI,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,molecular_formula,rotatable_bond_count,tpsa,xlogp
336,4-Trimethylammoniobutanoate,C7H15NO2,M+H,4638722000.0,"InChI=1S/C7H15NO2/c1-8(2,3)6-4-5-7(9)10/h4-6H2...",JHPNVNIEXXLNTR-UHFFFAOYSA-O,134,115.0,146.118104,2,1,10,C7H16NO2+,4,37.3,0.1
600,Choline,C5H13NO,M+H,145715300.0,"InChI=1S/C5H14NO/c1-6(2,3)4-5-7/h7H,4-5H2,1-3H...",OEYIOHPDSNJKLS-UHFFFAOYSA-N,305,46.5,104.107539,1,1,7,C5H14NO+,2,20.2,-0.4
842,Choline phosphate,C5H14NO4P,M+H,3903888.0,"InChI=1S/C5H14NO4P/c1-6(2,3)4-5-10-11(7,8)9/h4...",YHHSONZFOIEMCP-UHFFFAOYSA-O,1014,158.0,184.07387,4,2,11,C5H15NO4P+,4,66.8,-1.5
1282,succinate semialdehyde,C4H6O3,M+H,894314.8,"InChI=1S/C4H6O3/c5-3-1-2-4(6)7/h3H,1-2H2,(H,6,...",UIUJIQZEACWQSV-UHFFFAOYSA-M,9543238,71.5,101.023869,3,0,7,C4H5O3-,2,57.2,-0.2
1326,Succinate,C4H6O4,M+H,1203196.0,"InChI=1S/C4H6O4/c5-3(6)1-2-4(7)8/h1-2H2,(H,5,6...",KDYFGRWQOYBRFD-UHFFFAOYSA-L,160419,81.6,116.010959,4,0,8,C4H4O4-2,1,80.3,0.7
