<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Now,-let's-deal-with-SIRIUS-output" data-toc-modified-id="Now,-let's-deal-with-SIRIUS-output-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Now, let's deal with SIRIUS output</a></span><ul class="toc-item"><li><span><a href="#Import-libraries" data-toc-modified-id="Import-libraries-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Import libraries</a></span></li><li><span><a href="#Glob-compound-identifications" data-toc-modified-id="Glob-compound-identifications-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Glob compound identifications</a></span></li><li><span><a href="#Glob-compound-classification-output-(CANOPUS)" data-toc-modified-id="Glob-compound-classification-output-(CANOPUS)-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Glob compound classification output (CANOPUS)</a></span></li><li><span><a href="#Open-these-files-as-dataframes-and-merge-them" data-toc-modified-id="Open-these-files-as-dataframes-and-merge-them-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Open these files as dataframes and merge them</a></span></li><li><span><a href="#Let's-deal-with-the-annotations..." data-toc-modified-id="Let's-deal-with-the-annotations...-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Let's deal with the annotations...</a></span></li><li><span><a href="#Save" data-toc-modified-id="Save-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Save</a></span></li></ul></li><li><span><a href="#For-MAGI" data-toc-modified-id="For-MAGI-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>For MAGI</a></span><ul class="toc-item"><li><span><a href="#PUGREST" data-toc-modified-id="PUGREST-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>PUGREST</a></span></li><li><span><a href="#Merge-the-exploded-dataframe-with-the-dataframe-of-pubchem-hits" data-toc-modified-id="Merge-the-exploded-dataframe-with-the-dataframe-of-pubchem-hits-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Merge the exploded dataframe with the dataframe of pubchem hits</a></span></li></ul></li></ul></div>

# Now, let's deal with SIRIUS output

... which is TOTALLY different than GNPS!

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import glob,os,re,sys
from natsort import natsorted
import requests
import time
import json
import io
import pprint

s = requests.Session()
main_url = "https://cts.fiehnlab.ucdavis.edu/rest/"


def split(a, n):
    """
    Function to split list a in n groups
    """
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

## Glob compound identifications

In [2]:
str_candidates_files = natsorted(glob.glob('sirius/SIRIUS_output/group_*/compound_identifications.tsv'))
print(len(str_candidates_files))
pprint.pprint(str_candidates_files[:2])

305
['sirius/SIRIUS_output/group_1/compound_identifications.tsv',
 'sirius/SIRIUS_output/group_2/compound_identifications.tsv']


## Glob compound classification output (CANOPUS)

In [3]:
canopus_files = natsorted(glob.glob('sirius/SIRIUS_output/group_*/canopus_summary.tsv'))
print(len(canopus_files))
pprint.pprint(canopus_files[:2])

313
['sirius/SIRIUS_output/group_1/canopus_summary.tsv',
 'sirius/SIRIUS_output/group_2/canopus_summary.tsv']


## Open these files as dataframes and merge them

In [4]:
summary_df = pd.DataFrame()

for file in str_candidates_files:

    intermediate_df = pd.read_csv(file, sep='\t')

    intermediate_df['Features'] = intermediate_df['id'].str.split('_').str[-1]

    col_list = [
        'Features', 'molecularFormula',
        'adduct', 'InChI', 'smiles', 'links',
    ]
    intermediate_df = intermediate_df.loc[:, col_list]

    summary_df = pd.concat([summary_df, intermediate_df], axis=0)


print(summary_df.shape)

canopus_df = pd.DataFrame()

for file in canopus_files:

    intermediate_df = pd.read_csv(file, sep='\t', index_col=False)

    if not intermediate_df.empty:

        intermediate_df['Features'] = intermediate_df['name'].str.split(
            '_').str[-1]
        intermediate_df = intermediate_df.loc[:, [
            'Features', 'all classifications', 'superclass', 'class',
            'subclass'
        ]]

        canopus_df = pd.concat([canopus_df, intermediate_df],
                            axis=0).reset_index(drop=True)

    else:
        continue

summary_df = summary_df.merge(canopus_df, on="Features", how='left')

print(summary_df.shape)

summary_df.head()

(916, 6)
(916, 10)


Unnamed: 0,Features,molecularFormula,adduct,InChI,smiles,links,all classifications,superclass,class,subclass
0,FT0158,C6H9N4O5P,[M - H]-,"InChI=1S/C6H9N4O5P/c1-14-16(13,15-2)6-4(10(11)...",COP(=O)(C1=NC=NC(=C1[N+](=O)[O-])N)OC,PubChem:(3055774);PubChem class - safety and t...,Organic compounds; Organoheterocyclic compound...,Organoheterocyclic compounds,Heteroaromatic compounds,
1,FT0216,C17H12OS,[M - H]-,InChI=1S/C17H12OS/c1-10-4-12-6-15-9-17-11(2-3-...,CC1=CC2=CC3=C(C=C4C=CSC4=C3)C=C2C=C1O,PubChem:(122448813),Organic compounds; Organosulfur compounds; Phe...,Benzenoids,Phenols,1-hydroxy-2-unsubstituted benzenoids
2,FT0221,C15H18BN3O,[M - H]-,InChI=1S/C15H18BN3O/c1-9(2)19(10(3)4)13-8-17-7...,[B]C(=O)C1=NC2=C(C=NC=C2C=C1)N(C(C)C)C(C)C,PubChem:(58495757),Organic compounds; Organic acids and derivativ...,Benzenoids,,
3,FT0199,C10H28N8,[M - H]-,InChI=1S/C10H28N8/c11-1-2-13-3-4-14-5-6-15-7-8...,C(CNCCNCCNCCNCCN=NN)N,PubChem:(118629314),Organic compounds; Organoheterocyclic compound...,Organoheterocyclic compounds,Azacyclic compounds,
4,FT0227,C10H12N4O5,[M - H]-,InChI=1S/C10H12N4O5/c15-2-5-4(16)1-6(19-5)14-3...,C1C(C(OC1N2C=NC3=C2NC(=O)NC3=O)CO)O,PubChem:(65372 14282796 54124565 57169553 5864...,Organic compounds; Halobenzenes; Hydrazines an...,Benzenoids,Benzene and substituted derivatives,Halobenzenes


## Let's deal with the annotations...

All possible annotations are in the `links` column. Extract the annotation identifiers using `re`.

I am not extracting entries from all databases. I am only extracting the most important ones like HMDB, YMDB, BioCyc, KEGG, PubChem, CHEBI, and COCONUT because this database name is great lol.

In [5]:
annotation_df = summary_df.copy()

## Extract HMDB, YMDB and KNApSAcK hits
## it will match to everything inside two parenthesis ()
## preceded by the name of the database like HMDB(*)
## but pay attention that in some instances, there is more than one entry/database
reg_expr = '\(([^)]+)\)'
to_extract = [('HMDB', 'HMDB:'+reg_expr),
              ('YMDB', 'YMDB:'+reg_expr), 
              ('KNApSAcK', 'KNApSAcK:'+reg_expr),
              ('CHEBI', 'CHEBI:'+reg_expr),
              ('PlantCyc', 'Plantcyc:'+reg_expr),
              ('BioCyc', 'Biocyc:'+reg_expr), 
              ('KEGG', 'KEGG:'+reg_expr),
              ('COCONUT', 'COCONUT:'+reg_expr),
              ('PubChem_CID', 'PubChem:'+reg_expr)]

for t in to_extract:
    annotation_df[t[0]] = annotation_df['links'].str.extract(t[1]).fillna('')

## Fix HMDB, YMDB and KNApSAcK entries!!
to_fix = [('HMDB',  'HMDB{0:0>7}'),
          ('YMDB',  'YMDB{0:0>5}'), 
          ('KNApSAcK', 'C{0:0>8}'),
          ('CHEBI', 'CHEBI:{}')]

for t in to_fix:
    # print(t)
    for index, row in annotation_df.iterrows():

        if row[t[0]] == "":
            continue
        elif " " in row[t[0]]:
            item_list = [t[1].format(x) for x in row[t[0]].split(' ')]
            item_str = ",".join(item_list)
            # print(t[0], index, row[t[0]], item_str)
            annotation_df.loc[index, t[0]] = item_str
        else:
            item_str = t[1].format(row[t[0]])
            # print(t[0], index, row[t[0]], item_str)
            annotation_df.loc[index, t[0]] = item_str


annotation_df.head()

Unnamed: 0,Features,molecularFormula,adduct,InChI,smiles,links,all classifications,superclass,class,subclass,HMDB,YMDB,KNApSAcK,CHEBI,PlantCyc,BioCyc,KEGG,COCONUT,PubChem_CID
0,FT0158,C6H9N4O5P,[M - H]-,"InChI=1S/C6H9N4O5P/c1-14-16(13,15-2)6-4(10(11)...",COP(=O)(C1=NC=NC(=C1[N+](=O)[O-])N)OC,PubChem:(3055774);PubChem class - safety and t...,Organic compounds; Organoheterocyclic compound...,Organoheterocyclic compounds,Heteroaromatic compounds,,,,,,,,,,3055774
1,FT0216,C17H12OS,[M - H]-,InChI=1S/C17H12OS/c1-10-4-12-6-15-9-17-11(2-3-...,CC1=CC2=CC3=C(C=C4C=CSC4=C3)C=C2C=C1O,PubChem:(122448813),Organic compounds; Organosulfur compounds; Phe...,Benzenoids,Phenols,1-hydroxy-2-unsubstituted benzenoids,,,,,,,,,122448813
2,FT0221,C15H18BN3O,[M - H]-,InChI=1S/C15H18BN3O/c1-9(2)19(10(3)4)13-8-17-7...,[B]C(=O)C1=NC2=C(C=NC=C2C=C1)N(C(C)C)C(C)C,PubChem:(58495757),Organic compounds; Organic acids and derivativ...,Benzenoids,,,,,,,,,,,58495757
3,FT0199,C10H28N8,[M - H]-,InChI=1S/C10H28N8/c11-1-2-13-3-4-14-5-6-15-7-8...,C(CNCCNCCNCCNCCN=NN)N,PubChem:(118629314),Organic compounds; Organoheterocyclic compound...,Organoheterocyclic compounds,Azacyclic compounds,,,,,,,,,,118629314
4,FT0227,C10H12N4O5,[M - H]-,InChI=1S/C10H12N4O5/c15-2-5-4(16)1-6(19-5)14-3...,C1C(C(OC1N2C=NC3=C2NC(=O)NC3=O)CO)O,PubChem:(65372 14282796 54124565 57169553 5864...,Organic compounds; Halobenzenes; Hydrazines an...,Benzenoids,Benzene and substituted derivatives,Halobenzenes,,,,,,,,,65372 14282796 54124565 57169553 58648961 6962...


## Save

In [6]:
annotation_df.to_csv('summary_output_SIRIUS.csv', index=False)

# For MAGI

MAGI is the genome-metabolite integration tool I'll use next, and it requires the chemical structure of compounds (the freaking InChIKey). 

SIRIUS does **not** resolve the full 3D structure because it's *in silico*, so the output is just a partial or 2D structure, which does not work with MAGI. **Luckily**, SIRIUS xreferences to PubChem, and I can retrieve the 3D InChIKey from PubChem using the PUGRest. 

Some of these Sirius predictions have more than one PubChem hit, which could be different isomers. For MAGI, it's important to include these isomers, so I'll use the function `explode()` to expand rows from features that have more than one PubChem hit.

In [7]:
exploded_df = annotation_df.copy()

exploded_df['PubChem_CID'] = exploded_df['PubChem_CID'].replace("", 'NA')

exploded_df['PubChem_CID'] = exploded_df['PubChem_CID'].str.split()

## explode!!
exploded_df = exploded_df.explode('PubChem_CID')

## PUGREST

PUGREST allows the search of multiple compounds at once, but still doesn't like too many requests. So I am dividing the PubChem ids in lists of 10 ids, and searching 10 compounds at once. I am also setting a 20 s sleep time to give some time in between requests.

In [8]:
## list of pubchem ids 
list_cid = [x for x in exploded_df['PubChem_CID'].values if x != 'NA' ]

## split the list every 10 items
query_list = list(split(list_cid, 10))

## merge items in sublists for search in pugrest
query_string = [",".join(x) for x in query_list]

properties = 'MolecularFormula,ExactMass,InChIKey'

all_pubchem_hits = pd.DataFrame()

for query in query_string:

    r = s.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' +
              query + '/property/' + properties + '/CSV').content

    result_df = pd.read_csv(io.StringIO(r.decode('utf-8')))
    
    all_pubchem_hits = pd.concat([all_pubchem_hits, result_df], axis=0)
    
    time.sleep(20)
    
    
all_pubchem_hits  

Unnamed: 0,CID,MolecularFormula,ExactMass,InChIKey
0,3055774,C6H9N4O5P,248.031056,BSDBYSKGESWCFF-UHFFFAOYSA-N
1,122448813,C17H12OS,264.060886,IDBSUEYMOGXINU-UHFFFAOYSA-N
2,58495757,C15H18BN3O,267.154292,RJKKLMSSEUZHKT-UHFFFAOYSA-N
3,118629314,C10H28N8,260.243693,BEWWACPMZXTGDS-UHFFFAOYSA-N
4,65372,C10H12N4O5,268.080769,NQAZHXBSLFDVKM-KVQBGUIXSA-N
...,...,...,...,...
129,10486436,C60H85N11O12,1151.637917,HYTNEVIAWLBLPM-IPHVNIDGSA-N
130,11665321,C56H81N15O11S,1171.596070,XYLCCDVVSFIDAE-CLQPFZBRSA-N
131,90872839,C48H56N2O33,1188.276532,SAIFNDSBLPJMTH-UHFFFAOYSA-N
132,23428107,C59H98O24,1190.644804,SISUGQHDWWHKGJ-ZYHIYBPQSA-N


## Merge the exploded dataframe with the dataframe of pubchem hits

and save!

In [9]:
pubchem_hits = all_pubchem_hits.copy()

pubchem_hits['CID'] = pubchem_hits['CID'].astype(str)

merged = exploded_df.merge(pubchem_hits, left_on = "PubChem_CID", right_on = "CID", how = "left")

merged['PubChem_CID'] = merged['PubChem_CID'].replace('NA', '')

merged = merged.drop('CID', axis=1)

print(merged.shape)

merged.to_csv('summary_output_SIRIUS_exploded.csv', index=False)

merged.head()

(1449, 22)


Unnamed: 0,Features,molecularFormula,adduct,InChI,smiles,links,all classifications,superclass,class,subclass,...,KNApSAcK,CHEBI,PlantCyc,BioCyc,KEGG,COCONUT,PubChem_CID,MolecularFormula,ExactMass,InChIKey
0,FT0158,C6H9N4O5P,[M - H]-,"InChI=1S/C6H9N4O5P/c1-14-16(13,15-2)6-4(10(11)...",COP(=O)(C1=NC=NC(=C1[N+](=O)[O-])N)OC,PubChem:(3055774);PubChem class - safety and t...,Organic compounds; Organoheterocyclic compound...,Organoheterocyclic compounds,Heteroaromatic compounds,,...,,,,,,,3055774,C6H9N4O5P,248.031056,BSDBYSKGESWCFF-UHFFFAOYSA-N
1,FT0216,C17H12OS,[M - H]-,InChI=1S/C17H12OS/c1-10-4-12-6-15-9-17-11(2-3-...,CC1=CC2=CC3=C(C=C4C=CSC4=C3)C=C2C=C1O,PubChem:(122448813),Organic compounds; Organosulfur compounds; Phe...,Benzenoids,Phenols,1-hydroxy-2-unsubstituted benzenoids,...,,,,,,,122448813,C17H12OS,264.060886,IDBSUEYMOGXINU-UHFFFAOYSA-N
2,FT0221,C15H18BN3O,[M - H]-,InChI=1S/C15H18BN3O/c1-9(2)19(10(3)4)13-8-17-7...,[B]C(=O)C1=NC2=C(C=NC=C2C=C1)N(C(C)C)C(C)C,PubChem:(58495757),Organic compounds; Organic acids and derivativ...,Benzenoids,,,...,,,,,,,58495757,C15H18BN3O,267.154292,RJKKLMSSEUZHKT-UHFFFAOYSA-N
3,FT0199,C10H28N8,[M - H]-,InChI=1S/C10H28N8/c11-1-2-13-3-4-14-5-6-15-7-8...,C(CNCCNCCNCCNCCN=NN)N,PubChem:(118629314),Organic compounds; Organoheterocyclic compound...,Organoheterocyclic compounds,Azacyclic compounds,,...,,,,,,,118629314,C10H28N8,260.243693,BEWWACPMZXTGDS-UHFFFAOYSA-N
4,FT0227,C10H12N4O5,[M - H]-,InChI=1S/C10H12N4O5/c15-2-5-4(16)1-6(19-5)14-3...,C1C(C(OC1N2C=NC3=C2NC(=O)NC3=O)CO)O,PubChem:(65372 14282796 54124565 57169553 5864...,Organic compounds; Halobenzenes; Hydrazines an...,Benzenoids,Benzene and substituted derivatives,Halobenzenes,...,,,,,,,65372,C10H12N4O5,268.080769,NQAZHXBSLFDVKM-KVQBGUIXSA-N
