# You're not done... Merge GNPS and SIRIUS Outputs

I am using the SIRIUS output that is not `exploded`. Exploded is just for MAGI.

## Let's gooooo

In [1]:
import pandas as pd
import numpy as np
import os
import re
from natsort import natsorted
from molmass import Formula


# -------------------------------------------------------------------------
def om_calculations(df, list_elements):
    """
    Calculations of O:C, H:C, NOSC, GFE, etc.
    """
    
    df[list_elements] = df[list_elements].replace(np.nan,0)
    df[list_elements] = df[list_elements].astype(float)
    
    df['OC'] = df['O']/df['C']
    df['HC'] = df['H']/df['C']
    df['NOSC'] = - ((4*df['C'] + df['H'] - 3*df['N'] - 2*df['O'] + 5*df['P'] - 2*df['S']) / (df['C'])) + 4
    df['GFE']= -(28.5 * df['NOSC']) + 60.3
    
    #Koch, B. P. and Dittmar, T.: From mass to structure: an aromaticity index for high-resolution mass data of natural organic matter, Rapid Commun. Mass Spectrom., 20(5), 926–932, doi:10.1002/rcm.2386, 2006.
    df['DBE'] = 1 + 0.5*(2* df['C'] - df['H'] + df['N'] + df['P'])
    df['DBE_O'] = (1 + 0.5*(2* df['C'] - df['H'] + df['N'] + df['P'])) - df['O']
    df['AI'] = (1 + df['C'] - df['O'] - df['S'] - ( (df['H']+df['P']+df['N'])*0.5) ) / (df['C'] - df['O'] - df['S'] - df['N'] - df['P'])
    df['AImod']  = (1 + df['C'] - (df['O']*0.5) - df['S'] - ( (df['H']+df['P']+df['N'])*0.5) ) / (df['C'] - (df['O']*0.5) - df['S'] - df['N'] - df['P'])
    df['DBE_AI'] = 1 + df['C'] - df['O'] - df['S'] - (0.5*(df['H'] + df['N'] + df['P']))
    
    indices = ['OC', 'HC', 'NOSC', 'GFE', 'DBE', 'DBE_O', 'AI', 'AImod', 'DBE_AI']
    
    df[indices] = df[indices].replace([-np.inf,+np.inf],[0,0])
    
    return df


# -------------------------------------------------------------------------

## Get GNPS

and fix a few column names so they concatenate correctly.

In [3]:
gnps = pd.read_csv('summary_output_GNPS.csv')

gnps = gnps.sort_values('Features').reset_index(drop=True)

features_gnps = list(gnps['Features'].values)

gnps = gnps.drop(['IonMode','LIPIDMAPS', 'InChIKey'], axis=1)

print(gnps.shape)

gnps_rename_dict = {
    'CanonicalSmiles':'Smiles'
}

gnps = gnps.rename(columns=gnps_rename_dict)

gnps.head(2)

(19, 14)


Unnamed: 0,Features,Adduct,superclass,class,subclass,MolecularFormula,ExactMass,InChI,Smiles,PubChem_CID,KEGG,BioCyc,ChEBI,HMDB
0,FT0158,M-H,Organic acids and derivatives,Organic phosphoric acids and derivatives,Phosphate esters,C9H12ClO4P,250.016174,"InChI=1S/C9H12ClO4P/c1-12-15(11,13-2)14-9-7-5-...",COP(=O)(OC)OC1=C(C2C1CC=C2)Cl,62773.0,C18660,,,HMDB0031793
1,FT0369,M-H,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",C12H23NO7,293.147452,InChI=1S/C12H23NO7/c1-3-6(2)8(11(17)18)13-5-12...,CCC(C)C(C(=O)O)NCC1(C(C(C(O1)CO)O)O)O,,,,,


## Get SIRIUS

Need some data wrangling here too...

In [4]:
sirius = pd.read_csv('summary_output_SIRIUS.csv')

sirius = sirius.sort_values('Features')

print(sirius.shape)

for ft in features_gnps:
    for index,row in sirius.iterrows():
        if row['Features'] == ft:
            sirius.drop(index, axis=0, inplace=True)
        else:
            continue

print(sirius.shape)

sirius = sirius.drop(['links','all classifications',
                      'YMDB', 'KNApSAcK','PlantCyc'], axis=1)

sirius_rename_dict = {
    'molecularFormula':'MolecularFormula',
    'adduct':'Adduct', 
    'smiles':'Smiles'
}

sirius = sirius.rename(columns=sirius_rename_dict)

sirius = sirius.reset_index(drop=True)

sirius.head(2)

(916, 19)
(902, 19)


Unnamed: 0,Features,MolecularFormula,Adduct,InChI,Smiles,superclass,class,subclass,HMDB,CHEBI,BioCyc,KEGG,COCONUT,PubChem_CID
0,FT0199,C10H28N8,[M - H]-,InChI=1S/C10H28N8/c11-1-2-13-3-4-14-5-6-15-7-8...,C(CNCCNCCNCCNCCN=NN)N,Organoheterocyclic compounds,Azacyclic compounds,,,,,,,118629314
1,FT0216,C17H12OS,[M - H]-,InChI=1S/C17H12OS/c1-10-4-12-6-15-9-17-11(2-3-...,CC1=CC2=CC3=C(C=C4C=CSC4=C3)C=C2C=C1O,Benzenoids,Phenols,1-hydroxy-2-unsubstituted benzenoids,,,,,,122448813


## Concat GNPS and SIRIUS

In [5]:
ms2_concat = pd.concat([sirius, gnps], axis=0).reset_index(drop=True)

ms2_concat = ms2_concat.replace('', np.nan)

ms2_concat = ms2_concat.sort_values('Features').reset_index(drop=True)

ms2_concat.to_csv('summary_ms2_annotation.csv', index=False)

ms2_concat.head()

Unnamed: 0,Features,MolecularFormula,Adduct,InChI,Smiles,superclass,class,subclass,HMDB,CHEBI,BioCyc,KEGG,COCONUT,PubChem_CID,ExactMass,ChEBI
0,FT0158,C9H12ClO4P,M-H,"InChI=1S/C9H12ClO4P/c1-12-15(11,13-2)14-9-7-5-...",COP(=O)(OC)OC1=C(C2C1CC=C2)Cl,Organic acids and derivatives,Organic phosphoric acids and derivatives,Phosphate esters,HMDB0031793,,,C18660,,62773,250.016174,
1,FT0199,C10H28N8,[M - H]-,InChI=1S/C10H28N8/c11-1-2-13-3-4-14-5-6-15-7-8...,C(CNCCNCCNCCNCCN=NN)N,Organoheterocyclic compounds,Azacyclic compounds,,,,,,,118629314,,
2,FT0216,C17H12OS,[M - H]-,InChI=1S/C17H12OS/c1-10-4-12-6-15-9-17-11(2-3-...,CC1=CC2=CC3=C(C=C4C=CSC4=C3)C=C2C=C1O,Benzenoids,Phenols,1-hydroxy-2-unsubstituted benzenoids,,,,,,122448813,,
3,FT0221,C15H18BN3O,[M - H]-,InChI=1S/C15H18BN3O/c1-9(2)19(10(3)4)13-8-17-7...,[B]C(=O)C1=NC2=C(C=NC=C2C=C1)N(C(C)C)C(C)C,Benzenoids,,,,,,,,58495757,,
4,FT0227,C10H12N4O5,[M - H]-,InChI=1S/C10H12N4O5/c15-2-5-4(16)1-6(19-5)14-3...,C1C(C(OC1N2C=NC3=C2NC(=O)NC3=O)CO)O,Benzenoids,Benzene and substituted derivatives,Halobenzenes,,,,,,65372 14282796 54124565 57169553 58648961 6962...,,


## From molecular formula, extract elements and calculate energetics like GFE, NOSC

and save!

In [6]:
df = ms2_concat.copy()

list_formulas = list(df['MolecularFormula'].unique())

unique_elements = set()

for formula in list_formulas:
    el_composition = [x[0] for x in Formula(formula).composition()]
    for el in el_composition:
        unique_elements.add(el)
        
for el in unique_elements:
    df[el] = df['MolecularFormula'].str.extract('('+el+'[\d]{0,3})')
    df[el] = df[el].replace(el,'1')
    df[el] = df[el].replace(el,"", regex=True)
    
list_columns = ['MolecularFormula']
list_columns.extend(list(unique_elements))

df = om_calculations(df, list(unique_elements))

df.to_csv('summary_ms2_annotation_energetics.csv', index=False)

df.head()

Unnamed: 0,Features,MolecularFormula,Adduct,InChI,Smiles,superclass,class,subclass,HMDB,CHEBI,...,N,OC,HC,NOSC,GFE,DBE,DBE_O,AI,AImod,DBE_AI
0,FT0158,C9H12ClO4P,M-H,"InChI=1S/C9H12ClO4P/c1-12-15(11,13-2)14-9-7-5-...",COP(=O)(OC)OC1=C(C2C1CC=C2)Cl,Organic acids and derivatives,Organic phosphoric acids and derivatives,Phosphate esters,HMDB0031793,,...,0.0,0.444444,1.333333,-1.0,88.8,4.5,0.5,-0.125,0.25,-0.5
1,FT0199,C10H28N8,[M - H]-,InChI=1S/C10H28N8/c11-1-2-13-3-4-14-5-6-15-7-8...,C(CNCCNCCNCCNCCN=NN)N,Organoheterocyclic compounds,Azacyclic compounds,,,,...,8.0,0.0,2.8,-0.4,71.7,1.0,1.0,-3.5,-3.5,-7.0
2,FT0216,C17H12OS,[M - H]-,InChI=1S/C17H12OS/c1-10-4-12-6-15-9-17-11(2-3-...,CC1=CC2=CC3=C(C=C4C=CSC4=C3)C=C2C=C1O,Benzenoids,Phenols,1-hydroxy-2-unsubstituted benzenoids,,,...,0.0,0.058824,0.705882,-0.470588,73.711765,12.0,11.0,0.666667,0.677419,10.0
3,FT0221,C15H18BN3O,[M - H]-,InChI=1S/C15H18BN3O/c1-9(2)19(10(3)4)13-8-17-7...,[B]C(=O)C1=NC2=C(C=NC=C2C=C1)N(C(C)C)C(C)C,Benzenoids,,,,,...,3.0,0.066667,1.2,-0.466667,73.6,8.5,7.5,0.409091,0.434783,4.5
4,FT0227,C10H12N4O5,[M - H]-,InChI=1S/C10H12N4O5/c15-2-5-4(16)1-6(19-5)14-3...,C1C(C(OC1N2C=NC3=C2NC(=O)NC3=O)CO)O,Benzenoids,Benzene and substituted derivatives,Halobenzenes,,,...,4.0,0.5,1.2,1.0,31.8,7.0,2.0,-2.0,0.142857,-2.0
