# CASMI-16 Preprocessing Notebook

This notebook downloads the CASMI-16 dataset, which is spread across several files in different formats, and converts it into files usable by our evaluation scripts.

To remove structures that also occur in NIST-20, this notebook requires the TSVs generated by the NIST-20 preprocessing script (read from `../nist-20/hr_msms_nist_*.tsv`).

In [1]:
# Download step (disabled by default). Uncomment these lines and run the cell
# once to fetch the CASMI 2016 category 2/3 files into the current directory;
# the cells below unzip the `*_mgf.zip` archives and read the CSVs from here.
# !wget http://www.casmi-contest.org/2016/CASMI2016_Cat2and3_Training.csv
# !wget http://www.casmi-contest.org/2016/CASMI2016_Cat2and3_Challenge.csv
# !wget http://www.casmi-contest.org/2016/solutions_casmi2016_cat2and3.csv
# !wget http://www.casmi-contest.org/2016/CASMI2016_Cat2and3_Training_positive_mgf.zip
# !wget http://www.casmi-contest.org/2016/CASMI2016_Cat2and3_Training_negative_mgf.zip
# !wget http://www.casmi-contest.org/2016/CASMI2016_Cat2and3_Challenge_positive_mgf.zip
# !wget http://www.casmi-contest.org/2016/CASMI2016_Cat2and3_Challenge_negative_mgf.zip

In [2]:
import numpy as np
import numpy.random as npr
import pandas as pd
from rdkit import Chem, RDLogger
RDLogger.DisableLog('rdApp.*')
import sys
sys.path.append('../..')

In [3]:
from src.io import read_mgf

!for f in *.zip; do unzip $f; done
casmi_mgfs = !ls *.mgf

df = pd.Series(casmi_mgfs).apply(read_mgf)
df = pd.DataFrame.from_records(sum(df,[]))

casmi_train = pd.read_csv('CASMI2016_Cat2and3_Training.csv',index_col=0)
casmi_train = casmi_train.rename(columns={'IUPAC':'INCHI','challengename':'ChallengeName'})
casmi_test = pd.read_csv('solutions_casmi2016_cat2and3.csv',index_col=0)

df['ChallengeName'] = df['filename'].str.split('.').str[0]

df = df.merge(pd.concat([casmi_train,casmi_test]),on='ChallengeName')

for c in df.columns:
    if isinstance(df[c].iloc[0],str):
        df[c] = df[c].str.strip()

df['Spectrum'] = df['ChallengeName']
df['Precursor_type'] = df['ION_MODE'].map({'POSITIVE':'[M+H]+','NEGATIVE':'[M-H]-'})
df['Formula'] = df['INCHI'].str.split('/').str[1]
df['InChIKey'] = df['INCHIKEY']
df['InChIKey2D'] = df['INCHIKEY'].str.split('-').str[0]
df['PrecursorMZ'] = df['PRECURSOR_MZ']

# fill covariates with the modal value in the training set
df['NCE'] = 35.
df['Instrument'] = 'Thermo Finnigan Elite Orbitrap'

from pyteomics.mass import Composition
atom_types = {'C','H','N','O','P','S','F','Cl','Br','I'}
df = df.loc[df['Formula'].map(lambda x: {*Composition(formula=x)} <= atom_types)]

print(df.shape)
df.head(3)

Archive:  CASMI2016_Cat2and3_Challenge_negative_mgf.zip
  inflating: Challenge-001.mgf       
  inflating: Challenge-002.mgf       
  inflating: Challenge-003.mgf       
  inflating: Challenge-004.mgf       
  inflating: Challenge-005.mgf       
  inflating: Challenge-006.mgf       
  inflating: Challenge-007.mgf       
  inflating: Challenge-008.mgf       
  inflating: Challenge-009.mgf       
  inflating: Challenge-010.mgf       
  inflating: Challenge-011.mgf       
  inflating: Challenge-012.mgf       
  inflating: Challenge-013.mgf       
  inflating: Challenge-014.mgf       
  inflating: Challenge-015.mgf       
  inflating: Challenge-016.mgf       
  inflating: Challenge-017.mgf       
  inflating: Challenge-018.mgf       
  inflating: Challenge-019.mgf       
  inflating: Challenge-020.mgf       
  inflating: Challenge-021.mgf       
  inflating: Challenge-022.mgf       
  inflating: Challenge-023.mgf       
  inflating: Challenge-024.mgf       
  inflating: Challenge-025.mgf  

Unnamed: 0,filename,precursor_mz,precursor_charge,mzs,intensities,ChallengeName,PRECURSOR_MZ,ION_MODE,RT,nPeaks,...,CSID,PC_CID,Spectrum,Precursor_type,Formula,InChIKey,InChIKey2D,PrecursorMZ,NCE,Instrument
0,Challenge-001.mgf,222.023,1,"[107.0491, 131.0502, 157.0533, 158.0611, 222.023]","[11496.1, 12601.5, 11285.7, 2146837.0, 1058536...",Challenge-001,222.023,NEGATIVE,0.803,5,...,8101,8408.0,Challenge-001,[M-H]-,C10H9NO3S,UWPJYQYRSWYIGZ-UHFFFAOYSA-N,UWPJYQYRSWYIGZ,222.023,35.0,Thermo Finnigan Elite Orbitrap
1,Challenge-002.mgf,286.969,1,"[143.0503, 159.0455, 206.0044, 207.0122, 223.0...","[730689.2, 4983.9, 6564.5, 2890253.8, 15932.3,...",Challenge-002,286.969,NEGATIVE,1.964,6,...,60073,66707.0,Challenge-002,[M-H]-,C10H8O6S2,VILFVXYKHXVYAB-UHFFFAOYSA-N,VILFVXYKHXVYAB,286.969,35.0,Thermo Finnigan Elite Orbitrap
2,Challenge-003.mgf,223.0071,1,"[143.0502, 158.0374, 159.0451, 221.9993, 223.007]","[255766.4, 55048.3, 832484.6, 13045.9, 7569678.5]",Challenge-003,223.0071,NEGATIVE,2.371,5,...,60378,67025.0,Challenge-003,[M-H]-,C10H8O4S,YLKCHWCYYNKADS-UHFFFAOYSA-N,YLKCHWCYYNKADS,223.0071,35.0,Thermo Finnigan Elite Orbitrap


In [4]:
# remove 2D structure matches to NIST
nist_smiles = !cat ../nist-20/hr_msms_nist_*.tsv | cut -f2 | sort | uniq
nist_inchikeys = [Chem.MolToInchiKey(Chem.MolFromSmiles(s)) for s in nist_smiles]
nist_inchikey2ds = {x.split('-')[0] for x in nist_inchikeys}

df = df.loc[~df['InChIKey2D'].isin(nist_inchikey2ds)]

print(len(df),'spectra')
print(df['InChIKey'].nunique(),'structures')

166 spectra
151 structures


In [5]:
from src.io import write_msp

# Metadata table: one headerless, tab-separated row per spectrum.
tsv_path = 'casmi-16.tsv'
cols = ['Spectrum','SMILES','Precursor_type','NCE','Instrument']
df.loc[:, cols].to_csv(tsv_path, sep='\t', header=False, index=False)

# Spectral library: the same metadata plus three extra fields, passed to
# write_msp as keyword lists next to the peak m/z and intensity lists.
# NOTE(review): the exact MSP layout is decided by src.io.write_msp - not
# visible from this notebook.
msp_path = 'casmi-16.msp'
cols = cols + ['InChIKey','Formula','PrecursorMZ']
msp_fields = {}
for field in cols:
    msp_fields[field] = df[field].tolist()

write_msp(msp_path, df['mzs'].tolist(), df['intensities'].tolist(), **msp_fields)

In [6]:
# clean up
# Deletes every file matching *.mgf in the current working directory, i.e.
# the spectra unpacked from the zip archives (and any other .mgf file that
# happens to be there). The archives and CSVs themselves are left in place.
!rm *.mgf