<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Fixing-GNPS-annotation-output" data-toc-modified-id="Fixing-GNPS-annotation-output-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Fixing GNPS annotation output</a></span><ul class="toc-item"><li><span><a href="#Import-libraries" data-toc-modified-id="Import-libraries-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Import libraries</a></span></li><li><span><a href="#Glob-GNPS-output-file" data-toc-modified-id="Glob-GNPS-output-file-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Glob GNPS output file</a></span></li><li><span><a href="#Select-columns-of-interest" data-toc-modified-id="Select-columns-of-interest-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Select columns of interest</a></span></li><li><span><a href="#Translate-InChIKey-to-xref-in-KEGG,-BioCyc,-HMDB,-etc." data-toc-modified-id="Translate-InChIKey-to-xref-in-KEGG,-BioCyc,-HMDB,-etc.-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Translate InChIKey to xref in KEGG, BioCyc, HMDB, etc.</a></span></li><li><span><a href="#Use-InChIKey-to-extract-Smiles,-Molecular-Formulas-from-PubChem" data-toc-modified-id="Use-InChIKey-to-extract-Smiles,-Molecular-Formulas-from-PubChem-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Use InChIKey to extract Smiles, Molecular Formulas from PubChem</a></span></li><li><span><a href="#Save" data-toc-modified-id="Save-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Save</a></span></li></ul></li></ul></div>

# Fixing GNPS annotation output

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import glob
import requests
import time
import json
import io

## Glob GNPS output file

In [2]:
# Locate the GNPS library-search identification table inside the downloaded
# ProteoSAFe folder. glob makes no ordering guarantee, so the matches are
# sorted to make the selected file reproducible, and an explicit error is
# raised when nothing matches instead of a bare IndexError.
gnps_matches = sorted(glob.glob('gnps/ProteoSAFe-MOLECULAR-LIBRARYSEARCH-*/*.tsv'))
if not gnps_matches:
    raise FileNotFoundError(
        "No GNPS .tsv file found matching "
        "'gnps/ProteoSAFe-MOLECULAR-LIBRARYSEARCH-*/*.tsv'"
    )
gnps_file = gnps_matches[0]
gnps_file

'gnps/ProteoSAFe-MOLECULAR-LIBRARYSEARCH-V2-d46f30ee-download_all_identifications/MOLECULAR-LIBRARYSEARCH-V2-d46f30ee-download_all_identifications-main.tsv'

In [3]:
# read the tab-separated GNPS identification table
df = pd.read_csv(gnps_file, sep='\t')

# keep only the hits acquired in negative ion mode (drops the positive ones);
# reset_index() stores the previous row labels in an 'index' column
is_negative = df['IonMode'].eq('Negative')
df = df.loc[is_negative].reset_index()

print(df.shape)
df.head()

(35, 45)


Unnamed: 0,index,SpectrumID,Compound_Name,Ion_Source,Instrument,Compound_Source,PI,Data_Collector,Adduct,Precursor_MZ,...,NumberHits,full_CCMS_path,tags,MoleculeExplorerDatasets,MoleculeExplorerFiles,InChIKey,InChIKey-Planar,superclass,class,subclass
0,10,CCMSLIB00003140214,Spectral Match to (-)-Catechin gallate from NI...,ESI,IT/ion trap,Isolated,Data from Norberto Peporine Lopes,Data deposited by faustocn,M-H,441.084,...,1,nathaliagg/URen/consensus/ms2spectra_consensus...,,18,613,,,,,
1,11,CCMSLIB00003107838,PE(14:0/0:0); [M-H]- C19H39N1O7P1,LC-ESI,CID; Velos,Commercial,Thomas Metz,Thomas Metz,M-H,425.254,...,1,nathaliagg/URen/consensus/ms2spectra_consensus...,,0,0,,,,,
2,17,CCMSLIB00003099089,PE(18:0/0:0); [M-H]- C23H47N1O7P1,LC-ESI,CID; Velos,Commercial,Thomas Metz,Thomas Metz,M-H,481.317,...,1,nathaliagg/URen/consensus/ms2spectra_consensus...,,57,3123,,,,,
3,21,CCMSLIB00005740087,Massbank:PR305485 isorhamnetin-3-O-rutinoside,ESI,qTof,Isolated,Massbank,Massbank,M-H,623.162,...,1,nathaliagg/URen/consensus/ms2spectra_consensus...,,0,0,UIDGLYUNOUKLBM-UHFFFAOYSA-N,UIDGLYUNOUKLBM,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides
4,23,CCMSLIB00005771063,Massbank:MT000020 Docosapentaenoic acid (22:5(...,ESI,Ion Trap,Isolated,Massbank,Massbank,M-H,329.4,...,1,nathaliagg/URen/consensus/ms2spectra_consensus...,,0,0,YUFFSWGQGVEMMI-UHFFFAOYSA-N,YUFFSWGQGVEMMI,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates


## Select columns of interest

These columns also match the ones in the Sirius output.

In [4]:
# GNPS columns carried over into the summary table
columns_of_interest = [
    'Adduct',
    'IonMode',
    'InChIKey',
    '#Scan#',
    'superclass',
    'class',
    'subclass',
]

# columns that are filled in later from the external lookups
new_columns = ['MolecularFormula', 'ExactMass', 'InChI', 'CanonicalSmiles',
               'PubChem_CID', 'KEGG', 'BioCyc', 'LIPIDMAPS', 'ChEBI', 'HMDB']

# keep only the rows that carry an InChIKey, restricted to the columns above
has_inchikey = df['InChIKey'].notna()
subset_df = df.loc[has_inchikey, columns_of_interest].reset_index(drop=True)

# turn the scan number into a feature label: "FT" + scan left-padded with zeros to width 4
subset_df['#Scan#'] = "FT" + subset_df['#Scan#'].apply('{0:0>4}'.format)

# rename the label column and move it to the front (set_index + reset_index)
subset_df = (
    subset_df
    .rename(columns={'#Scan#': 'Features'})
    .set_index('Features')
    .reset_index()
)

# append the still-empty annotation columns, filled with empty strings
list_columns = list(subset_df.columns) + new_columns
subset_df = subset_df.reindex(columns=list_columns, fill_value='')

print(subset_df.shape)
subset_df.head()

(19, 17)


Unnamed: 0,Features,Adduct,IonMode,InChIKey,superclass,class,subclass,MolecularFormula,ExactMass,InChI,CanonicalSmiles,PubChem_CID,KEGG,BioCyc,LIPIDMAPS,ChEBI,HMDB
0,FT2518,M-H,Negative,UIDGLYUNOUKLBM-UHFFFAOYSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,,,,,,
1,FT0551,M-H,Negative,YUFFSWGQGVEMMI-UHFFFAOYSA-N,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,,,,,,,,,,
2,FT2417,M+H,Negative,YPZRWBKMTBYPTK-BJDJZHNGSA-N,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",,,,,,,,,,
3,FT2103,M-H,Negative,XFZJEEAOWLFHDH-NFJBMHMQSA-N,Phenylpropanoids and polyketides,Flavonoids,Biflavonoids and polyflavonoids,,,,,,,,,,
4,FT1134,M-H,Negative,JPUKWEQWGBDDQB-QSOFNFLRSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,,,,,,


## Translate InChIKey to xref in KEGG, BioCyc, HMDB, etc.

I used the REST API provided by the [Chemical Translation Service - CTS](https://cts.fiehnlab.ucdavis.edu/).

In [5]:
# Translate every InChIKey into database cross-references with the
# Chemical Translation Service (CTS) REST API.
main_url = "https://cts.fiehnlab.ucdavis.edu/rest/"

# CTS target database name -> subset_df column that stores its identifiers
cts_db_to_column = {
    'KEGG': 'KEGG',
    'BioCyc': 'BioCyc',
    'LIPIDMAPS': 'LIPIDMAPS',
    'ChEBI': 'ChEBI',
    'Human Metabolome Database': 'HMDB',
    'PubChem CID': 'PubChem_CID',
}
list_dbs = list(cts_db_to_column)

# One session for all requests (created once instead of once per row).
# The name `s` is kept because the PubChem cell further down reads it.
s = requests.Session()


def _cts_translate(session, inchikey, db, timeout=60):
    """Return the CTS identifiers of `inchikey` in `db` as a comma-joined string.

    Parameters
    ----------
    session : requests.Session
        Session used to send the GET request.
    inchikey : str
        InChIKey to translate.
    db : str
        CTS name of the target database (e.g. 'KEGG').
    timeout : float, optional
        Seconds to wait for the service before requests raises a timeout.

    Returns
    -------
    str
        The identifiers joined with ',' (a single hit is returned as-is);
        an empty string when the payload is not a non-empty list or it
        carries no 'results'.
    """
    response = session.get(main_url + 'convert/InChIKey/' + db + '/' + inchikey,
                           timeout=timeout)
    payload = json.loads(response.text)

    # a dict payload is treated as an error; an empty list has nothing to read
    if not isinstance(payload, list) or not payload:
        return ""

    first_hit = payload[0]
    hits = first_hit.get('results') if isinstance(first_hit, dict) else None
    return ",".join(str(hit) for hit in (hits or []))


for index, inchikey in subset_df['InChIKey'].items():
    for db, column in cts_db_to_column.items():
        subset_df.loc[index, column] = _cts_translate(s, inchikey, db)

print('Done')
subset_df.head()

Done


Unnamed: 0,Features,Adduct,IonMode,InChIKey,superclass,class,subclass,MolecularFormula,ExactMass,InChI,CanonicalSmiles,PubChem_CID,KEGG,BioCyc,LIPIDMAPS,ChEBI,HMDB
0,FT2518,M-H,Negative,UIDGLYUNOUKLBM-UHFFFAOYSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,6223069,,,,,HMDB0037746
1,FT0551,M-H,Negative,YUFFSWGQGVEMMI-UHFFFAOYSA-N,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,,,,,3145,,,,,
2,FT2417,M+H,Negative,YPZRWBKMTBYPTK-BJDJZHNGSA-N,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",,,,,1121565265359,"C00127,D00031",,,CHEBI:17858,
3,FT2103,M-H,Negative,XFZJEEAOWLFHDH-NFJBMHMQSA-N,Phenylpropanoids and polyketides,Flavonoids,Biflavonoids and polyflavonoids,,,,,122738,C17639,CPD-1981,LMPK12030002,CHEBI:75632,HMDB0033973
4,FT1134,M-H,Negative,JPUKWEQWGBDDQB-QSOFNFLRSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,5282102,C12249,,,CHEBI:30200,HMDB0037429


## Use InChIKey to extract Smiles, Molecular Formulas from PubChem

GNPS does not provide this information in its output.

I used the Pug REST Service by PubChem.

In [6]:
# Look up every InChIKey in PubChem (PUG REST) and fill in the molecular
# formula, exact mass, SMILES and InChI columns of subset_df.
pubchem_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/'
properties = 'MolecularFormula,MolecularWeight,ExactMass,CanonicalSMILES,InChI'

# Dedicated session, so this cell does not rely on a variable created inside
# the loop of an earlier cell.
s = requests.Session()


def _join_unique(values):
    """Comma-join the distinct non-null values of a pandas Series as strings."""
    return ",".join(str(value) for value in values.dropna().unique())


for index, row in subset_df.iterrows():

    query = row['InChIKey']

    response = s.get(pubchem_url + query + '/property/' + properties + '/CSV',
                     timeout=60)

    if response.status_code == 404:
        # InChIKey not found in PubChem: leave the columns blank
        pass

    else:
        # any other HTTP error is raised here rather than surfacing later as
        # a KeyError on a missing column
        response.raise_for_status()

        result_df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))

        if not result_df.empty:
            # several records can share one InChIKey; distinct values are joined with ','
            subset_df.loc[index, 'MolecularFormula'] = _join_unique(result_df['MolecularFormula'])
            subset_df.loc[index, 'ExactMass'] = _join_unique(result_df['ExactMass'])
            subset_df.loc[index, 'InChI'] = _join_unique(result_df['InChI'])

            # NOTE(review): newer PubChem responses may label this column
            # 'ConnectivitySMILES' instead of 'CanonicalSMILES' - confirm
            # against the current PUG REST documentation.
            smiles_column = next(
                (name for name in ('CanonicalSMILES', 'ConnectivitySMILES')
                 if name in result_df.columns),
                None,
            )
            if smiles_column is not None:
                subset_df.loc[index, 'CanonicalSmiles'] = _join_unique(result_df[smiles_column])

    # pause 3 s on every fifth row (including the first); presumably to stay
    # within the service's request limits
    if index % 5 == 0:
        time.sleep(3)

print('Done')
subset_df.head()

Done


Unnamed: 0,Features,Adduct,IonMode,InChIKey,superclass,class,subclass,MolecularFormula,ExactMass,InChI,CanonicalSmiles,PubChem_CID,KEGG,BioCyc,LIPIDMAPS,ChEBI,HMDB
0,FT2518,M-H,Negative,UIDGLYUNOUKLBM-UHFFFAOYSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,C28H32O16,624.169035,InChI=1S/C28H32O16/c1-9-18(32)21(35)23(37)27(4...,CC1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3=C(OC4=CC(=CC(...,6223069,,,,,HMDB0037746
1,FT0551,M-H,Negative,YUFFSWGQGVEMMI-UHFFFAOYSA-N,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,C22H34O2,330.25588,InChI=1S/C22H34O2/c1-2-3-4-5-6-7-8-9-10-11-12-...,CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)O,3145,,,,,
2,FT2417,M+H,Negative,YPZRWBKMTBYPTK-BJDJZHNGSA-N,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",C20H32N6O12S2,612.151963,InChI=1S/C20H32N6O12S2/c21-9(19(35)36)1-3-13(2...,C(CC(=O)NC(CSSCC(C(=O)NCC(=O)O)NC(=O)CCC(C(=O)...,1121565265359,"C00127,D00031",,,CHEBI:17858,
3,FT2103,M-H,Negative,XFZJEEAOWLFHDH-NFJBMHMQSA-N,Phenylpropanoids and polyketides,Flavonoids,Biflavonoids and polyflavonoids,C30H26O12,578.142426,InChI=1S/C30H26O12/c31-13-7-20(37)24-23(8-13)4...,C1C(C(OC2=C1C(=CC(=C2C3C(C(OC4=CC(=CC(=C34)O)O...,122738,C17639,CPD-1981,LMPK12030002,CHEBI:75632,HMDB0033973
4,FT1134,M-H,Negative,JPUKWEQWGBDDQB-QSOFNFLRSA-N,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,C21H20O11,448.100561,InChI=1S/C21H20O11/c22-7-13-15(26)17(28)18(29)...,C1=CC(=CC=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)OC4C...,5282102,C12249,,,CHEBI:30200,HMDB0037429


## Save

In [7]:
# write the annotated table to a CSV file, without the row index
output_csv = 'summary_output_GNPS.csv'
subset_df.to_csv(output_csv, index=False)