# GNPS Preprocessing Notebook

This notebook downloads the GNPS spectral library and filters it to remove obviously low-quality spectra, as well as spectra that are unlikely to have been acquired at a collision energy of 35 (CE=35; written out as `NCE` below).

This requires the NIST-20 TSVs generated by the preprocessing script; they are read from `../nist-20/hr_msms_nist_*.tsv`.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from rdkit import Chem, RDLogger
# turn off RDKit's 'rdApp.*' log channels for the rest of the notebook
RDLogger.DisableLog('rdApp.*')

from pandarallel import pandarallel
from multiprocessing import cpu_count
# set up pandarallel (used below via `parallel_apply`) with half of the reported CPUs as workers
# NOTE(review): `cpu_count()//2` evaluates to 0 on a single-CPU machine -- confirm pandarallel tolerates that
pandarallel.initialize(progress_bar=False, verbose=0, nb_workers=cpu_count()//2)

from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem.Descriptors import ExactMolWt
from pyteomics.mass import Composition

import sys
# extend the import path so the `src` package imported in later cells can be found
# (presumably the repository root sits two directories up -- confirm)
sys.path.append('../..')

In [2]:
!wget https://gnps-external.ucsd.edu/gnpslibrary/GNPS-LIBRARY.msp

In [3]:
from src.io import read_msp

# parse the downloaded MSP file with the project's reader, using its parallel mode
msp_file = 'GNPS-LIBRARY.msp'
df = read_msp(msp_file, parallel=True)
df.shape

(13756, 19)

In [4]:
# keep only rows with an instrument annotation ...
df = df.loc[df['INSTRUMENT'].notna()]
# ... and, of those, only the ones whose annotation mentions an Orbitrap
orbitrap_mask = df['INSTRUMENT'].str.contains('Orbitrap')
df = df.loc[orbitrap_mask]
df.shape

(5494, 19)

In [5]:
# keep only the two adduct labels that are mapped to [M+H]+ / [M-H]- later on
kept_adducts = ['M+H', 'M-H']
df = df.loc[df['PRECURSORTYPE'].isin(kept_adducts)]
df.shape

(4232, 19)

In [6]:
# discard rows whose SMILES is missing or a placeholder ('' or 'N/A')
smiles = df['SMILES']
has_smiles = smiles.notna() & ~smiles.isin(['', 'N/A'])
df = df.loc[has_smiles]

# discard SMILES containing a literal '.' (i.e. more than one disconnected component)
has_dot = df['SMILES'].str.contains(r'\.')
df = df.loc[~has_dot]
df.shape

(3653, 19)

In [7]:
# basic quantities
# spectrum identifier: the value of the first `key=value` pair in the ';'-separated Comment field
df['Spectrum'] = df['Comment'].str.split(';').str[0].str.split('=').str[1]
df['Precursor_type'] = df['PRECURSORTYPE'].map({'M+H':'[M+H]+','M-H':'[M-H]-'})
df['PrecursorMZ'] = df['PRECURSORMZ'].astype(float)

# parse structures; rows whose SMILES did not yield a molecule (null `mol`) are dropped
df['mol'] = df['SMILES'].apply(Chem.MolFromSmiles)
df = df.dropna(subset=['mol'])
df['InChIKey'] = df['mol'].apply(Chem.MolToInchiKey)
# first '-'-separated block of the InChIKey, used later for 2D structure matching
df['InChIKey2D'] = df['InChIKey'].str.split('-').str[0]
# the formula is computed from the parsed structure; the earlier assignment from the
# INCHI field was dead code (immediately overwritten here) and has been removed
df['Formula'] = df['mol'].apply(CalcMolFormula)

# fill these covariates with the modal value in the training set
df['NCE'] = 35.
df['Instrument'] = 'Thermo Finnigan Elite Orbitrap'

df.shape

(3648, 28)

In [8]:
# organics
atom_types = {'C','H','N','O','P','S','F','Cl','Br','I'}

def _only_allowed_elements(formula):
    """Return True when every element in `formula` belongs to `atom_types`."""
    # strip '+' / '-' characters before handing the formula to pyteomics
    stripped = formula.replace('+','').replace('-','')
    elements = set(Composition(formula=stripped))
    return elements.issubset(atom_types)

is_organic = df['Formula'].parallel_apply(_only_allowed_elements)
df = df.loc[is_organic]
df.shape

(3636, 28)

In [9]:
# small molecules: keep precursors at or below m/z 1000
max_precursor_mz = 1000
df = df.loc[df['PrecursorMZ'] <= max_precursor_mz]
df.shape

(3575, 28)

In [10]:
# remove anything with dubious MS1 m/z
# [M+H]+ / [M-H]- differ from the neutral molecule by a proton, not by a hydrogen atom;
# the two masses differ by one electron mass (~0.00055 Da), so use the proton mass here.
m_proton = 1.007276466621  # proton mass in unified atomic mass units (CODATA 2018)
theoretical_mz = df['mol'].apply(ExactMolWt)
# add a proton for positive mode, subtract one for negative mode
theoretical_mz += np.where(df['Precursor_type']=='[M+H]+', m_proton, -m_proton)
# keep spectra whose reported precursor m/z is within 0.1 of the theoretical value
df = df.loc[(df['PrecursorMZ'] - theoretical_mz).abs() < 0.1]
df.shape

(3429, 28)

In [11]:
# deduplicate spectra -- shuffle with a fixed seed, then keep the first row of every
# (SMILES, adduct, peak count) group, which amounts to sampling one at random
df = (
    df.assign(num_peaks=df['mzs'].str.len())
      .sample(frac=1, random_state=0)
      .drop_duplicates(subset=['SMILES','Precursor_type','num_peaks'], keep='first')
)
df.shape

(3116, 29)

In [12]:
# reject spectra that have too many or too few peaks to plausibly be our chosen energy
normed_num_peaks = df['num_peaks'].div(df['PrecursorMZ'])
# these are 10th and 90th percentiles for this statistic @ NCE=35 in our NIST-20 subset
lower, upper = 0.02, 0.16
# `between` is inclusive at both ends by default
df = df.loc[normed_num_peaks.between(lower, upper)]
df.shape

(919, 29)

In [13]:
# pick a replicate at random: seeded shuffle, then keep the first row per (SMILES, adduct)
shuffled = df.sample(frac=1, random_state=0)
df = shuffled.drop_duplicates(subset=['SMILES','Precursor_type'], keep='first')
df.shape

(891, 29)

In [14]:
# remove 2D structure matches to NIST
nist_smiles = !cat ../nist-20/hr_msms_nist_*.tsv | cut -f2 | sort | uniq
nist_inchikeys = [Chem.MolToInchiKey(Chem.MolFromSmiles(s)) for s in nist_smiles]
nist_inchikey2ds = {x.split('-')[0] for x in nist_inchikeys}

df = df.loc[~df['InChIKey2D'].isin(nist_inchikey2ds)]

print(len(df),'spectra')
print(df['InChIKey'].nunique(),'structures')

707 spectra
632 structures


In [15]:
from src.io import write_msp

tsv_path = 'gnps.tsv'
msp_path = 'gnps.msp'

# headerless, index-free TSV with one row per remaining spectrum
cols = ['Spectrum','SMILES','Precursor_type','NCE','Instrument']
df.loc[:, cols].to_csv(tsv_path, sep='\t', header=False, index=False)

# the MSP gets the same fields plus structure/precursor metadata, passed as keyword lists
cols = cols + ['InChIKey','Formula','PrecursorMZ']
metadata = {name: df[name].tolist() for name in cols}
write_msp(msp_path, df['mzs'].tolist(), df['intensities'].tolist(), **metadata)

In [None]:
# clean up: delete the downloaded library file that was fetched with wget at the top of the notebook
!rm GNPS-LIBRARY.msp