In [25]:
from __future__ import print_function
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pubchempy
from pandas import Series, DataFrame
from rdkit import Chem
from IPython.core.pylabtools import figsize

# Render matplotlib figures inline in the notebook and use a 16 x 8 default figure size.
%matplotlib inline
figsize(16, 8)

# Load Data and compute InChI Keys

In [26]:
# Read the training table; keep_default_na=False stops pandas from turning
# empty cells or strings such as 'NA' into NaN.
df_train = pd.read_csv('../data/TableS2_training.csv', keep_default_na=False)

# Derive the InChIKey of every compound from its InChI string
df_train['InChI_Key'] = df_train['InChI'].apply(Chem.inchi.InchiToInchiKey)

# Load information for L-Asparagine

In [27]:
# Positional row 11 of the training table is the L-Asparagine entry used as a worked example.
metabolite = df_train.iloc[11]
# Show only the compound name and its InChIKey.
metabolite[['Name', 'InChI_Key']]

Name                        L-Asparagine
InChI_Key    DCXYFEDJOCDNAF-REOHCLBHSA-N
Name: 11, dtype: object

In [28]:
# Ask PubChem for every compound record registered under this InChIKey.
compounds = pubchempy.get_compounds(metabolite['InChI_Key'], 'inchikey')

In [29]:
# Display the matches; one InChIKey can map to more than one CID.
compounds

[Compound(6992089), Compound(6267)]

In [30]:
def get_mincid_compound(compounds):
    """Return the compound with the lowest CID.

    Parameters
    ----------
    compounds : list
        Compound objects exposing a ``cid`` attribute, e.g. the result of
        ``pubchempy.get_compounds``.

    Returns
    -------
    The element of ``compounds`` whose ``cid`` is smallest (the first such
    element on ties), or ``None`` when ``compounds`` is empty or ``None``.
    """
    if not compounds:
        return None

    # min() with a key already returns the element itself, so there is no
    # need to enumerate the list and index back into it.
    return min(compounds, key=lambda compound: compound.cid)

In [31]:
# Of the matches found above, keep the record with the lowest CID.
min_cid_compound = get_mincid_compound(compounds)

In [32]:
# Spot-check two descriptors of the selected record: the xlogp and tpsa attributes.
print("xLogP", min_cid_compound.xlogp)
print("tpsa", min_cid_compound.tpsa)

xLogP -3.4
tpsa 106


# Get compound properties from PubChem

In [33]:
def exctract_compound_properties(compound):
    """Collect selected descriptors of ``compound`` into a plain dict.

    Parameters
    ----------
    compound : object
        Compound record exposing the attributes listed in ``fields`` below
        (e.g. a ``pubchempy.Compound``).

    Returns
    -------
    dict
        Maps each attribute name to the value read from ``compound``.
    """
    fields = (
        'cid',
        'exact_mass',
        'complexity',
        'xlogp',
        'h_bond_acceptor_count',
        'h_bond_donor_count',
        'rotatable_bond_count',
        'heavy_atom_count',
        'tpsa',
    )

    return {field: getattr(compound, field) for field in fields}

def get_pubchem_properties(inchi_keys):
    """Look up each InChIKey on PubChem and gather its descriptors.

    For every key, the matching compounds are fetched with
    ``pubchempy.get_compounds`` and the one with the lowest CID is kept.
    Keys without a usable match are skipped, so the result may be shorter
    than ``inchi_keys``.

    Parameters
    ----------
    inchi_keys : iterable of str
        InChIKeys to look up, one PubChem query per key.

    Returns
    -------
    list of dict
        One record per resolved key: the extracted compound properties plus
        an ``'InChI_Key'`` entry holding the queried key.
    """
    collected = []

    for key in inchi_keys:
        best_match = get_mincid_compound(pubchempy.get_compounds(key, 'inchikey'))

        # Nothing found for this key: leave no record behind.
        if not best_match:
            continue

        print('Loading info for CID({}) - {}'.format(best_match.cid, best_match.iupac_name))

        record = exctract_compound_properties(best_match)
        # Remember the key so the record can be joined back onto the table.
        record['InChI_Key'] = key
        collected.append(record)

    return collected

In [34]:
def merge_and_save(df, properties, csv_output_file):
    """Join compound properties onto ``df`` and write the result to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``InChI_Key`` column.
    properties : list of dict
        Property records; each carries an ``'InChI_Key'`` entry that is used
        as the join key. May be empty.
    csv_output_file : str
        Destination of the UTF-8 encoded CSV file.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``df`` and the properties on ``InChI_Key``. Rows of
        ``df`` without a matching record are dropped.
    """
    df_properties = DataFrame(properties)

    if 'InChI_Key' in df_properties.columns:
        # The same key can appear in several records (for instance when it is
        # repeated in the input table). Left as is, the merge would pair every
        # occurrence with every record and multiply those rows.
        df_properties = df_properties.drop_duplicates(subset='InChI_Key')
    else:
        # No records at all: keep the join column so the merge yields an
        # empty frame instead of raising KeyError.
        df_properties = DataFrame(columns=['InChI_Key'])

    df_merged = pd.merge(df, df_properties, on='InChI_Key')
    df_merged.to_csv(csv_output_file, encoding='utf-8')

    return df_merged

# Load info for the training data

In [35]:
# Get properties from PubChem for the training set.
# One PubChem lookup is made per InChIKey in df_train.
train_properties = get_pubchem_properties(df_train['InChI_Key'])
# Join the properties onto the training table and write the result to CSV.
df = merge_and_save(df_train, train_properties, '../data/TableS2_training_pubchem.csv')

# Preview the first three merged rows.
df.head(3)

Loading info for CID(5950) - (2S)-2-aminopropanoic acid
Loading info for CID(239) - 3-aminopropanoic acid
Loading info for CID(119) - 4-aminobutanoic acid
Loading info for CID(774) - 2-(1H-imidazol-5-yl)ethanamine
Loading info for CID(586) - 2-[carbamimidoyl(methyl)amino]acetic acid
Loading info for CID(6287) - (2S)-2-amino-3-methylbutanoic acid
Loading info for CID(6288) - (2S,3R)-2-amino-3-hydroxybutanoic acid
Loading info for CID(936) - pyridine-3-carboxamide
Loading info for CID(938) - pyridine-3-carboxylic acid
Loading info for CID(6106) - (2S)-2-amino-4-methylpentanoic acid
Loading info for CID(6306) - (2S,3S)-2-amino-3-methylpentanoic acid
Loading info for CID(6267) - (2S)-2,4-diamino-4-oxobutanoic acid
Loading info for CID(5960) - (2S)-2-aminobutanedioic acid
Loading info for CID(190) - 7H-purin-6-amine
Loading info for CID(778) - 2-amino-4-sulfanylbutanoic acid
Loading info for CID(978) - 4-aminobenzoic acid
Loading info for CID(5610) - 4-(2-aminoethyl)phenol
Loading info for 

Unnamed: 0,Name,Annotation,Short Name,InChI,MH+ Fragment,MH+ Isotope,fragment/isotope,MV,logD,abs_mob,...,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,rotatable_bond_count,tpsa,xlogp
0,L-Alanine,a,Ala,"InChI=1S/C3H7NO2/c1-2(4)3(5)6/h2H,4H2,1H3,(H,5...",90,,,70.3,-2.79,3.14,...,QNAYBMKLOCPYGJ-REOHCLBHSA-N,5950,61.8,89.047678,3,2,6,1,63.3,-3.0
1,β-Alanine,a,β-Ala,"InChI=1S/C3H7NO2/c4-2-1-3(5)6/h1-2,4H2,(H,5,6)",90,,,70.4,-3.01,4.14,...,UCMIRNVEIXFBKS-UHFFFAOYSA-N,239,52.8,89.047678,3,2,6,2,63.3,-3.0
2,γ-Aminobutyric acid,a,GABA,"InChI=1S/C4H9NO2/c5-3-1-2-4(6)7/h1-3,5H2,(H,6,7)",104,,,89.2,-3.1,0.000409,...,BTCSSZJGUNDROE-UHFFFAOYSA-N,119,62.7,103.063329,3,2,7,3,63.3,-3.2


# Load info for the test data

In [36]:
# Read the test table; keep_default_na=False stops pandas from turning
# empty cells or strings such as 'NA' into NaN.
# NOTE(review): the test set is read from a file named "TableS3_training.csv" - confirm the filename.
df_test = pd.read_csv('../data/TableS3_training.csv', keep_default_na=False)

# Derive the InChIKey of every compound from its InChI string
df_test['InChI_Key'] = df_test['InChI'].apply(Chem.inchi.InchiToInchiKey)

In [37]:
# Get properties from PubChem for the test set.
# One PubChem lookup is made per InChIKey in df_test.
test_properties = get_pubchem_properties(df_test['InChI_Key'])
# Join the properties onto the test table and write the result to CSV.
# Note: this rebinds `df`, which previously held the merged training table.
df = merge_and_save(df_test, test_properties, '../data/TableS3_training_pubchem.csv')

# Preview the first three merged rows.
df.head(3)

Loading info for CID(6262) - (2S)-2,5-diaminopentanoic acid
Loading info for CID(22880) - (2R)-2-(methylamino)butanedioic acid
Loading info for CID(64969) - (2S)-2-amino-3-(3-methylimidazol-4-yl)propanoic acid
Loading info for CID(439378) - (2S)-2-amino-5-(ethylamino)-5-oxopentanoic acid
Loading info for CID(9750) - (2S)-2-amino-5-(carbamoylamino)pentanoic acid
Loading info for CID(188824) - (3R)-3-propanoyloxy-4-(trimethylazaniumyl)butanoate
Loading info for CID(439224) - (2S)-2-(3-aminopropanoylamino)-3-(1H-imidazol-5-yl)propanoic acid
Loading info for CID(439829) - 3-butanoyloxy-4-(trimethylazaniumyl)butanoate
Loading info for CID(27476) - (2R,3S,4R,5R)-2-(hydroxymethyl)-5-(6-imino-1-methylpurin-9-yl)oxolane-3,4-diol
Loading info for CID(11953814) - (3R)-3-octanoyloxy-4-(trimethylazaniumyl)butanoate


Unnamed: 0,Name,Short Name,InChI,MH+,MV,logD,abs_mob,zeff,RRF_Measured,RRF_Predicted,InChI_Key,cid,complexity,exact_mass,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,rotatable_bond_count,tpsa,xlogp
0,L-Ornithine,Orn,"InChI=1S/C5H12N2O2/c6-3-1-2-4(7)5(8)9/h4H,1-3,...",133,113,-4.22,0.000686,1.82,0.281,0.631,AHLPHDHHMVZTML-BYPYZUCNSA-N,6262,95,132.089878,4,3,9,4,89.3,-4.4
1,N-Methyl-Aspartic acid,MeAsp,"InChI=1S/C5H9NO4/c1-6-3(5(9)10)2-4(7)8/h3,6H,2...",148,114,-2.58,0.000348,0.71,0.842,0.667,HOKKHZGPKSLGJE-GSVOUGTGSA-N,22880,145,147.053158,5,3,10,4,86.6,-3.4
2,3-Methyl-L-Histidine,MeHis,InChI=1S/C7H11N3O2/c1-10-4-9-3-5(10)2-6(8)7(11...,170,144,-2.94,0.00062,1.49,1.4,2.28,JDHILDINMRGULE-LURJTMIESA-N,64969,174,169.085127,4,2,12,3,81.1,-3.3
