# ChEMBL Preprocessing Notebook

This notebook selects structures from ChEMBL to construct an MS1 decoy library relative to NIST-20, and writes the files needed to generate their spectra.

This requires that the NIST-20 preprocessing be run first, because this notebook reads its output (`../nist-20/hr_msms_nist.pkl`).

In [1]:
import numpy as np
import numpy.random as npr
import pandas as pd
from tqdm import tqdm

from rdkit import Chem, RDLogger
# Turn off RDKit's 'rdApp.*' log channels so parsing messages do not flood the output.
RDLogger.DisableLog('rdApp.*')

from pandarallel import pandarallel
from os import cpu_count
# Use half of the available cores, but never fewer than one worker:
# cpu_count() may return None, and on a single-core machine `1 // 2` would
# otherwise request zero workers.
pandarallel.initialize(progress_bar=False, verbose=0, nb_workers=max(1, (cpu_count() or 1)//2))

import sys
# Add the directory two levels up to the import path
# (presumably the repository root that holds the `src` package imported later).
sys.path.append('../..')

In [2]:
!wget https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_31/chembl_31_chemreps.txt.gz

In [3]:
# Read the tab-separated ChEMBL table and discard every row that has a
# missing value in any column.
chembl = pd.read_csv('chembl_31_chemreps.txt.gz', sep='\t').dropna(axis=0, how='any')
chembl.shape

(2304866, 4)

In [4]:
# deduplicate: keep only the first row seen for each canonical SMILES string
is_repeat = chembl.duplicated(subset='canonical_smiles', keep='first')
chembl = chembl.loc[~is_repeat]
chembl.shape

(2304723, 4)

In [5]:
# connected molecules: drop any entry whose SMILES or formula contains a '.'
smiles_has_dot = chembl['canonical_smiles'].str.contains(r'\.')
chembl = chembl.loc[~smiles_has_dot]

# the formula is taken as the second '/'-delimited field of the standard InChI
chembl['formula'] = chembl['standard_inchi'].str.split('/').str[1]

formula_has_dot = chembl['formula'].str.contains(r'\.')
chembl = chembl.loc[~formula_has_dot]
chembl.shape

(2194254, 5)

In [6]:
# CHNOPSX: keep formulas built exclusively from the allowed elements
from pyteomics.mass import Composition

atom_types = {'C','H','N','O','P','S','F','Cl','Br','I'}

# True when every element key parsed from the formula is in `atom_types`
only_allowed_elements = chembl['formula'].parallel_apply(
    lambda formula: atom_types.issuperset(Composition(formula=formula))
)
chembl = chembl.loc[only_allowed_elements]
chembl.shape

(2183953, 5)

In [7]:
# valid smiles: parse every SMILES with RDKit and keep the rows that parsed
chembl['mol'] = chembl['canonical_smiles'].parallel_apply(lambda s: Chem.MolFromSmiles(s))
# Drop only on the new 'mol' column, so this step removes unparseable SMILES
# and nothing else (a bare dropna() would also discard rows for a NaN in any
# unrelated column).
chembl = chembl.dropna(subset=['mol'])
chembl.shape

(2183952, 6)

In [8]:
# small molecules: keep structures whose exact mass is at most 1000
from rdkit.Chem.Descriptors import ExactMolWt

chembl['mw'] = chembl['mol'].map(ExactMolWt)
within_mass_limit = chembl['mw'] <= 1000
chembl = chembl.loc[within_mass_limit]
chembl.shape

(2141950, 7)

In [9]:
# neutrally charged: keep structures whose total formal charge is zero
chembl['charge'] = chembl['mol'].map(Chem.GetFormalCharge)
is_neutral = chembl['charge'] == 0
chembl = chembl.loc[is_neutral]
chembl.shape

(2121673, 8)

In [10]:
# NIST observed MS1 masses: the distinct non-missing 'ExactMass' values, as floats
nist_df = pd.read_pickle('../nist-20/hr_msms_nist.pkl')
exact_masses = nist_df['ExactMass'].dropna().astype(float)
nist_mws = exact_masses.unique()
len(nist_mws)

12457

In [None]:
# take anything within tolerance of neutral mass

from sklearn.neighbors import BallTree
from rdkit.Chem.Descriptors import ExactMolWt

# Index the unique NIST masses as one-dimensional points.
tree = BallTree(nist_mws[:,None])

# absolute tolerance, in the same units as the 'mw' / 'ExactMass' values
precursor_tol = 0.1

chembl_mws = chembl['mw'].values[:,None]

# count_only=True returns just the number of NIST masses within the radius of
# each ChEMBL mass, which avoids building one index array per row and then
# looping over them in Python.
num_matches = tree.query_radius(chembl_mws, precursor_tol, count_only=True)
has_match = num_matches > 0

chembl = chembl.loc[has_match]
chembl.shape

In [None]:
# to reproduce the library used in the paper (which did not fix a random seed)
# The key list is read with plain Python rather than `!cat`: a missing file now
# raises FileNotFoundError instead of silently filtering `chembl` down to nothing.
with open('chembl_decoys.txt') as f:
    # one InChIKey per line; strip line endings and skip blank lines
    inchikeys = {line.strip() for line in f if line.strip()}
chembl = chembl.loc[chembl['standard_inchi_key'].isin(inchikeys)]
chembl.shape

In [None]:
# randomly subsample
# (left disabled: the cell above restricts `chembl` to the InChIKeys listed in
# chembl_decoys.txt instead; uncomment to draw a seeded 10% sample)
# chembl = chembl.sample(frac=0.1, random_state=0)
# chembl.shape

In [None]:
# fill in polarities and energies

# Mass of a proton in Da (CODATA). The m/z of a singly charged [M+H]+ / [M-H]-
# ion differs from the neutral mass by a proton, not by a neutral hydrogen
# atom: the hydrogen-atom mass (~1.007825) is off by one electron mass.
PROTON_MASS = 1.007276466621

# One copy of the candidate table per (collision energy, adduct) combination,
# with the energy varying slowest.
df = pd.concat([
    chembl.assign(Precursor_type=precursor_type, NCE=nce)
    for nce in [20.,35.,50.]
    for precursor_type in ['[M+H]+','[M-H]-']
])

# modal instrument in NIST-20
df['Instrument'] = 'Thermo Finnigan Elite Orbitrap'

# Spectrum identifier: <chembl_id>_<adduct>_<integer NCE>
df['Spectrum'] = df['chembl_id'] + '_' + df['Precursor_type'] + '_' + df['NCE'].astype(int).astype(str)
df['SMILES'] = df['canonical_smiles']
df['InChIKey'] = df['standard_inchi_key']
df['Formula'] = df['formula']
# neutral exact mass shifted by the adduct: +proton for [M+H]+, -proton for [M-H]-
df['PrecursorMZ'] = df['mw'] + df['Precursor_type'].map({
    '[M+H]+': PROTON_MASS,
    '[M-H]-': -PROTON_MASS,
})

df.shape

In [None]:
# include the ground truth structures: append the NIST rows recorded at one of
# the three collision energies used for the decoys
nist_nce = nist_df['NCE']
at_decoy_energy = (nist_nce == 20) | (nist_nce == 35) | (nist_nce == 50)
df = pd.concat([df, nist_df.loc[at_decoy_energy]])
df.shape

In [None]:
from src.io import write_msp

# Write the selected columns, in this order, as a tab-separated file with no
# header row and no index column.
tsv_path = 'nist-20_chembl_decoys.tsv'
cols = ['Spectrum','SMILES','Precursor_type','NCE','Instrument']
df.to_csv(tsv_path, sep='\t', columns=cols, header=False, index=False)

In [None]:
# clean up: delete the downloaded ChEMBL archive now that the TSV has been written
!rm chembl_31_chemreps.txt.gz