# Import and clean large GNPS MS/MS dataset

## Using matchms to harmonize, clean, and complete metadata
Here we run matchms filters and additional custom filters to do extensive cleaning of a large spectral dataset (>210,000 spectra).

This notebook will expect that you have ``spec2vec`` installed via conda, which (currently) automatically include ``matchms`` and ``rdkit`` as well.
You can do this by creating an environment in anaconda and installing matchms via conda:

``conda create --name spec2vec_analysis python=3.8
conda activate spec2vec_analysis
conda install --channel bioconda --channel conda-forge spec2vec``

In [1]:
import os
from pathlib import Path
import pickle
import numpy as np
from matchms.importing import load_from_mgf
#import tensorflow as tf
#from tensorflow.keras.utils import to_categorical

ROOT = Path(os.getcwd()).parents[0]

#path_data = os.path.join(Path(ROOT).parents[0], "Data", "ms_ms_data_230201")  # add your local data folder here
path_data = os.path.join(Path(ROOT), "ms_ms_data_230201")



## Load spectra raw data from GNPS and convert to pickle files
- Retrieved on 01/02/2023 from https://gnps-external.ucsd.edu/gnpslibrary

In [13]:
def load_and_pickle(filename):
    spectrums = list(load_from_mgf(filename))
    print(f"number of spectra: {len(spectrums)}")
    count_annotations(spectrums)
    
    # pickle
    filename_pickle = filename.split(".")[0] + ".pickle"
    pickle.dump(spectrums, 
        open(os.path.join(path_data, filename_pickle), "wb"))


def count_annotations(spectra):
    inchi_lst = []
    smiles_lst = []
    inchikey_lst = []
    for i, spec in enumerate(spectra):
        inchi_lst.append(spec.get("inchi"))
        smiles_lst.append(spec.get("smiles"))
        inchikey = spec.get("inchikey")
        if inchikey is None:
            inchikey = spec.get("inchikey_inchi")
        inchikey_lst.append(inchikey)

    inchi_count = sum([1 for x in inchi_lst if x])
    smiles_count = sum([1 for x in smiles_lst if x])
    inchikey_count = sum([1 for x in inchikey_lst if x])
    print("Inchis:", inchi_count, "--", len(set(inchi_lst)), "unique")
    print("Smiles:", smiles_count, "--", len(set(smiles_lst)), "unique")
    print("Inchikeys:", inchikey_count, "--", 
          len(set([x[:14] for x in inchikey_lst if x])), "unique (first 14 characters)")

### Convert all mgf to pickle files (faster to work with later on...)

In [10]:
for filename in os.listdir(path_data):
    if filename.endswith(".mgf"):
        load_and_pickle(os.path.join(path_data, filename))

number of spectra: 587756
Inchis: 582926 -- 81535 unique
Smiles: 584940 -- 38613 unique
Inchikeys: 0 -- 0 unique (first 14 characters)
number of spectra: 7232
Inchis: 7232 -- 1 unique
Smiles: 6947 -- 550 unique
Inchikeys: 0 -- 0 unique (first 14 characters)
number of spectra: 7058
Inchis: 7058 -- 1 unique
Smiles: 6908 -- 548 unique
Inchikeys: 0 -- 0 unique (first 14 characters)
number of spectra: 227307
Inchis: 227307 -- 2579 unique
Smiles: 227307 -- 2581 unique
Inchikeys: 0 -- 0 unique (first 14 characters)
number of spectra: 562
Inchis: 562 -- 489 unique
Smiles: 562 -- 450 unique
Inchikeys: 0 -- 0 unique (first 14 characters)
number of spectra: 2235
Inchis: 2235 -- 743 unique
Smiles: 2235 -- 746 unique
Inchikeys: 0 -- 0 unique (first 14 characters)
number of spectra: 64783
Inchis: 64782 -- 7097 unique
Smiles: 64783 -- 9207 unique
Inchikeys: 0 -- 0 unique (first 14 characters)
number of spectra: 1492
Inchis: 1492 -- 654 unique
Smiles: 1492 -- 694 unique
Inchikeys: 0 -- 0 unique (first

In [None]:
import pickle
outfile = os.path.join(path_data, 'GNPS_all', 'ALL_GNPS.pickle')
with open(outfile, 'rb') as file:
    spectrums = pickle.load(file)

In [None]:
count_annotations(spectrums) 

## Run basic matchs filters

In [4]:
# set logger
from matchms.logging_functions import add_logging_to_file, reset_matchms_logger, set_matchms_logger_level

reset_matchms_logger(logger_name="matchms")
set_matchms_logger_level("INFO")
add_logging_to_file("ms_ms_data_cleaning_230203.log",
                    loglevel="INFO",
                    remove_stream_handlers=True)

In [None]:
formulas = []
name_to_formulas = []
for spec in spectrums:
    if spec.get("formula"):
        formulas.append(spec.get("formula"))
        name_to_formulas.append(spec.get("compound_name") + "---" + spec.get("formula"))

In [None]:
name_to_formulas[:1000]

In [None]:
len(formulas)

In [None]:
len(list(set(formulas)))

In [None]:
list(set(formulas))

## Clean (and extend) metadata

### 1) Harmization
+ Here, undefiend entries will be harmonized (instead of having a huge variation of None,"", "N/A" etc.)
+ The ``repair_inchi_inchikey_smiles`` function will correct misplaced metadata (e.g. inchikeys entered as inchi etc.) and harmonize the entry strings.

In [5]:
from tqdm.notebook import tqdm
import matchms.filtering as ms_filters

def apply_basic_filters(s):
    s = ms_filters.default_filters(s)
    s = ms_filters.derive_adduct_from_name(s)
    s = ms_filters.add_parent_mass(s, estimate_from_adduct=True)
    return s


def clean_metadata(s):
    s = ms_filters.harmonize_undefined_inchikey(s)
    s = ms_filters.harmonize_undefined_inchi(s)
    s = ms_filters.harmonize_undefined_smiles(s)
    s = ms_filters.repair_inchi_inchikey_smiles(s)
    return s

#spectrums = [clean_metadata(apply_basic_filters(s)) for s in tqdm(spectrums)]

In [None]:
#count_annotations(spectrums)

### 2) Convert entries where possible
Where possible (and necessary, i.e. missing): Convert between smiles, inchi, inchikey to complete metadata. This is done using functions from rdkit.

In [6]:
from tqdm.notebook import tqdm
import matchms.filtering as ms_filters

def clean_metadata2(s):
    s = ms_filters.derive_inchi_from_smiles(s)
    s = ms_filters.derive_smiles_from_inchi(s)
    s = ms_filters.derive_inchikey_from_inchi(s)
    return s

#spectrums = [clean_metadata2(s) for s in tqdm(spectrums)]

### Below part was not really necessary. All seems to be included in ALL_GNPS.mgf already!!

In [14]:
for filename in os.listdir(path_data):
    if not filename.endswith(".pickle") or "cleaned_by_matchms" in filename:
        continue
        
    # Load pickle file:
    with open(os.path.join(path_data, filename), 'rb') as file:
        spectrums = pickle.load(file)
    count_annotations(spectrums)

    # First processing part
    spectrums = [clean_metadata(apply_basic_filters(s)) for s in tqdm(spectrums)]
    count_annotations(spectrums)
    
    # Second processing part
    spectrums = [clean_metadata2(s) for s in tqdm(spectrums)]
    count_annotations(spectrums) 

    # Save as pickle
    filename_new = filename.split(".pickle")[0] + "cleaned_by_matchms" + ".pickle"
    pickle.dump(spectrums, 
        open(filename_new, "wb"))

Inchis: 582926 -- 81535 unique
Smiles: 584940 -- 38613 unique
Inchikeys: 0 -- 0 unique (first 14 characters)


  0%|          | 0/587756 [00:00<?, ?it/s]

In [7]:
# Load pickle file:
filename = "ALL_GNPS.pickle"

with open(os.path.join(path_data, filename), 'rb') as file:
    spectrums = pickle.load(file)
count_annotations(spectrums)

# First processing part
spectrums = [clean_metadata(apply_basic_filters(s)) for s in tqdm(spectrums)]
count_annotations(spectrums)

# Second processing part
spectrums = [clean_metadata2(s) for s in tqdm(spectrums)]
count_annotations(spectrums) 

# Save as pickle
filename_new = filename.split(".pickle")[0] + "cleaned_by_matchms" + ".pickle"
pickle.dump(spectrums, 
    open(filename_new, "wb"))

len(spectrums)

Inchis: 582926 -- 81535 unique
Smiles: 584940 -- 38613 unique
Inchikeys: 0 -- 0 unique (first 14 characters)


  0%|          | 0/587756 [00:00<?, ?it/s]

In [7]:
import pickle
filename = os.path.join(path_data, 'ALL_GNPS_cleaned_by_matchms.pickle')
with open(filename, 'rb') as file:
    spectrums = pickle.load(file)

In [10]:
"""
# Load ALL GNPS:
with open(os.path.join(path_data, "ALL_GNPScleaned_by_matchms.pickle"), 'rb') as file:
    spectrums = pickle.load(file)

spectrum_ids = set([s.get("spectrum_id") for s in spectrums])

for filename in os.listdir(path_data):
    if not filename.endswith("cleaned_by_matchms.pickle"):
        continue
    if "ALL_GNPS" in filename:
        continue
        
    # Load pickle file:
    with open(os.path.join(path_data, filename), 'rb') as file:
        additional_spectrums = pickle.load(file)
        
    for spec in additional_spectrums:
        if spec.get("spectrum_id") not in spectrum_ids:
            print(f"Found spectrum not contained in ALL_GNPS.mgf --> {spec.get('spectrum_id')}")
            spectrums.append(spec)
            
# Save as pickle
filename_new = "all_spectra_230201_matchms_filtered.pickle"
pickle.dump(spectrums, 
    open(filename_new, "wb"))  
"""

## Manual + additional cleaning of compound names
--> as preparation for later pubchem searches

In [8]:
print(len(spectrums))

587756


In [10]:
import logging
import re


logger = logging.getLogger("matchms")
logger.info("\n-----Additional heuristic compound name cleaning-----\n")


def remove_misplaced_collision_energy(name):
    """Remove occasionally occurring collision energy addition to name."""
    regex_collision1 = r"[Cc]ollision[Ee]nergy:[0-9]*"
    regex_collision2 = r"[ -]*[0-9]{1,2}.[0-9 ]*e[vV]"
    regex_collision3 = r"[ _]*[0-9]{1,2}e[vV]"
    name = re.sub(regex_collision1, "", name)
    name = re.sub(regex_collision2, "", name)
    return re.sub(regex_collision3, "", name)


final_names = []
def clean_compound_name(spectrum_in):
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()

    name_original = spectrum.get("compound_name")
    name = name_original.replace("F dial M", "")
    name = re.sub(r"_1[0-9]{4,6}", "", name)  # Notoamide R_130070
    name = re.sub(r"dereplictor_pv_[0-9.e-]+", "", name)  # symplostatin_1_11773_dereplictor_pv_1.19169e-18
    name = name.replace("_", " ")
    #removals = ["F dial M", ""]

    name = remove_misplaced_collision_energy(name)

    # Remove last word if likely not correct:
    if name.split(" ")[-1] in ["M", "M?", "?", "M+2H/2", "MS34+Na", "M]", "Cat+M]", "Unk", "--"]:
        name = " ".join(name.split(" ")[:-1]).strip()

    name = name.strip("._ ")
    if name != name_original:
        final_names.append((name_original, name))
        logger.info(f"Changed compound name from {name_original} to: {name}")
        spectrum.set("compound_name", name)
    return spectrum

spectrums = [clean_compound_name(s) for s in tqdm(spectrums)]

# now re-run formula extractor (since "_" was removed/replaced)
spectrums = [ms_filters.derive_formula_from_name(s) for s in tqdm(spectrums)]

  0%|          | 0/587756 [00:00<?, ?it/s]

  0%|          | 0/587756 [00:00<?, ?it/s]

In [11]:
# Save as pickle
filename_new = "all_spectra_230201_matchms_manual_filtered.pickle"
#pickle.dump(spectrums, 
#    open(filename_new, "wb"))

In [7]:
#!pip install pubchempy
#!pip install --upgrade matchmsextras

In [7]:
filename = "all_spectra_230201_matchms_manual_filtered.pickle"

# Load pickle file:
with open(os.path.join(path_data, filename), 'rb') as file:
    spectrums = pickle.load(file)

count_annotations(spectrums)

Inchis: 505984 -- 81571 unique
Smiles: 445998 -- 41299 unique
Inchikeys: 446006 -- 25532 unique (first 14 characters)


## Run spectra with missing SMILES/InChI against PubChem

- Using function from `matchmsextras` which can be installed with `pip install matchmsextras`

In [9]:
n_batches = 10
binsize = int(np.ceil(len(spectrums)/n_batches))
batches = []
for i in range(n_batches):
    batches.append((i*binsize, (i+1)*binsize))
batches, len(spectrums)

([(0, 58776),
  (58776, 117552),
  (117552, 176328),
  (176328, 235104),
  (235104, 293880),
  (293880, 352656),
  (352656, 411432),
  (411432, 470208),
  (470208, 528984),
  (528984, 587760)],
 587756)

In [10]:
from matchmsextras.pubchem_lookup import pubchem_metadata_lookup

for i, batch in enumerate(batches):
    print(40 * "**" + f"\n ----- {i} ----- \n")
    spectrums_pubchem = [pubchem_metadata_lookup(s, name_search_depth=10, match_precursor_mz=False, formula_search=True) for s in tqdm(spectrums[batch[0]:batch[1]])]

    # Save as pickle
    filename_new = f"all_spectra_230201_matchms_manual_pubchem_{i}.pickle"
    pickle.dump(spectrums_pubchem, 
        open(filename_new, "wb"))

********************************************************************************
 ----- 0 ----- 



  0%|          | 0/58776 [00:00<?, ?it/s]

UnboundLocalError: local variable 'inchikey_pubchem' referenced before assignment

In [None]:
import csv

filename_pubchem_results = "pubchem_matches.csv"
header = f"i,spectrum_id,compound_name,inchikey,inchi,smiles\n"
with open(filename_pubchem_results, 'a', newline ='') as file:
    file.write(header)

In [16]:
import logging
import re
import time
import pubchempy as pcp
import numpy as np
from matchms.metadata_utils import is_valid_inchikey


logger = logging.getLogger("matchms")


def pubchem_metadata_lookup(spectrum_in, name_search_depth=10, match_precursor_mz=False,
                            formula_search=False,
                            mass_tolerance=2.0,
                            allowed_differences=[(18.03, 0.01)],
                            min_formula_length=6,
                            formula_search_depth=25,
                            pause_per_request=0,
                            verbose=2):
    """
    Parameters
    ----------
    spectrum_in
        Matchms type spectrum as input.
    name_search_depth: int
        How many of the most relevant name matches to explore deeper. Default = 10.
    """
    if spectrum_in is None:
        return None

    # Only run search if no valid-looking inchikey is found
    if is_valid_inchikey(spectrum_in.get("inchikey")):
        return spectrum_in

    spectrum = spectrum_in.clone()

    def _plausible_name(compound_name):
        return (isinstance(compound_name, str) and len(compound_name) > 4)

    # Only run search if (more or less) plausible name is found
    compound_name = spectrum.get("compound_name")
    if not _plausible_name(compound_name):
        logger.info("No plausible compound name found (%s)", compound_name)
        return spectrum

    # Start pubchem search
    time.sleep(pause_per_request)
    inchi = spectrum.get("inchi")
    parent_mass = spectrum.get("parent_mass")
    if isinstance(parent_mass, np.ndarray):
        parent_mass = parent_mass[0]
    formula = spectrum.get("formula")

    # 1) Search for matching compound name
    results_pubchem = pubchem_name_search(compound_name, name_search_depth=name_search_depth,
                                          verbose=verbose)

    if len(results_pubchem) > 0:
        logger.info("Found potential matches for compound name (%s) on PubChem",
                   compound_name)

        # 1a) Search for matching inchi
        if likely_has_inchi(inchi):
            inchi_pubchem, inchikey_pubchem, smiles_pubchem = find_pubchem_inchi_match(results_pubchem, inchi,
                                                                                       verbose=verbose)
        # 1b) Search for matching parent mass
        if not likely_has_inchi(inchi) or inchikey_pubchem is None:
            inchi_pubchem, inchikey_pubchem, smiles_pubchem = find_pubchem_mass_match(results_pubchem,
                                                                                      parent_mass,
                                                                                      given_mass="parent mass",
                                                                                      mass_tolerance=mass_tolerance,
                                                                                      allowed_differences=allowed_differences,
                                                                                      verbose=verbose)

        # 1c) Search for matching precursor mass (optional)
        if match_precursor_mz and inchikey_pubchem is None:
            precursor_mz = spectrum.get("precursor_mz")
            inchi_pubchem, inchikey_pubchem, smiles_pubchem = find_pubchem_mass_match(results_pubchem,
                                                                                      precursor_mz,
                                                                                      given_mass="precursor mass",
                                                                                      mass_tolerance=mass_tolerance,
                                                                                      allowed_differences=allowed_differences,
                                                                                      verbose=verbose)

        if inchikey_pubchem is not None and inchi_pubchem is not None:
            logger.info("Matching compound name: %s", compound_name)
            spectrum.set("inchikey", inchikey_pubchem)
            spectrum.set("inchi", inchi_pubchem)
            spectrum.set("smiles", smiles_pubchem)
            return spectrum

        if verbose >= 2:
            logger.info("No matches found for compound name: %s", compound_name)

    else:
        logger.info("No matches for compound name (%s) on PubChem",
                   compound_name)

    # 2) Search for matching formula
    if formula_search and formula and len(formula) >= min_formula_length:
        results_pubchem = pubchem_formula_search(formula, formula_search_depth=formula_search_depth,
                                                 verbose=verbose)

        if len(results_pubchem) > 0:
            inchikey_pubchem = None
            logger.info("Found potential matches for formula (%s) on PubChem",
                       formula)
            # 2a) Search for matching inchi
            if likely_has_inchi(inchi):
                inchi_pubchem, inchikey_pubchem, smiles_pubchem = find_pubchem_inchi_match(results_pubchem, inchi,
                                                                                           verbose=verbose)
            # 2b) Search for matching parent mass
            if inchikey_pubchem is None:
                inchi_pubchem, inchikey_pubchem, smiles_pubchem = find_pubchem_mass_match(results_pubchem,
                                                                                          parent_mass,
                                                                                          given_mass="parent mass",
                                                                                          mass_tolerance=mass_tolerance,
                                                                                          allowed_differences=allowed_differences,
                                                                                          verbose=verbose)
            # 2c) Search for matching precursor mass (optional)
            if match_precursor_mz and inchikey_pubchem is None:
                precursor_mz = spectrum.get("precursor_mz")
                inchi_pubchem, inchikey_pubchem, smiles_pubchem = find_pubchem_mass_match(results_pubchem,
                                                                                          precursor_mz,
                                                                                          given_mass="precursor mass",
                                                                                          mass_tolerance=mass_tolerance,
                                                                                          allowed_differences=allowed_differences,
                                                                                          verbose=verbose)
            if inchikey_pubchem is not None and inchi_pubchem is not None:
                logger.info("Matching formula: %s", formula)
                if verbose >= 1:
                    logger.info("Matching formula: %s", formula)
                spectrum.set("inchikey", inchikey_pubchem)
                spectrum.set("inchi", inchi_pubchem)
                spectrum.set("smiles", smiles_pubchem)
                return spectrum

            if verbose >= 2:
                logger.info("No matches found for formula: %s", formula)
        else:
            logger.info("No matches for formula (%s) on PubChem",
                       formula)

    return spectrum


def likely_has_inchi(inchi):
    """Quick test to avoid excess in-depth testing"""
    if inchi is None:
        return False
    inchi = inchi.strip('"')
    regexp = r"(InChI=1|1)(S\/|\/)[0-9, A-Z, a-z,\.]{2,}\/(c|h)[0-9]"
    if not re.search(regexp, inchi):
        return False
    return True


def likely_inchi_match(inchi_1, inchi_2, min_agreement=3):
    """Try to match defective inchi to non-defective ones.
    Compares inchi parts seperately. Match is found if at least the first
    'min_agreement' parts are a good enough match.
    The main 'defects' this method accounts for are missing '-' in the inchi.
    In addition, differences between '-', '+', and '?'will be ignored.
    Parameters
    ----------
    inchi_1: str
        inchi of molecule.
    inchi_2: str
        inchi of molecule.
    min_agreement: int
        Minimum number of first parts that MUST be a match between both input
        inchi to finally consider it a match. Default is min_agreement=3.
    """
    if min_agreement < 2:
        logger.warning("Warning! 'min_agreement' < 2 has no discriminative power. Should be => 2.")
    if min_agreement == 2:
        logger.warning("Warning! 'min_agreement' == 2 has little discriminative power",
                       "(only looking at structure formula. Better use > 2.")
    agreement = 0

    # Remove spaces and '"' to account for different notations.
    # Remove everything with little discriminative power.
    ignore_lst = ['"', ' ', '-', '+', '?']
    for ignore in ignore_lst:
        inchi_1 = inchi_1.replace(ignore, '')
        inchi_2 = inchi_2.replace(ignore, '')

    # Split inchi in parts.
    inchi_1_parts = inchi_1.split('/')
    inchi_2_parts = inchi_2.split('/')

    # Check if both inchi have sufficient parts (seperated by '/')
    if len(inchi_1_parts) >= min_agreement and len(
            inchi_2_parts) >= min_agreement:
        # Count how many parts agree well
        for i in range(min_agreement):
            agreement += (inchi_1_parts[i] == inchi_2_parts[i])

    return bool(agreement == min_agreement)


def likely_inchikey_match(inchikey_1, inchikey_2, min_agreement=1):
    """Try to match inchikeys.
    Compares inchikey parts seperately. Match is found if at least the first
    'min_agreement' parts are a good enough match.
    Parameters
    ----------
    inchikey_1: str
        inchikey of molecule.
    inchikey_2: str
        inchikey of molecule.
    min_agreement: int
        Minimum number of first parts that MUST be a match between both input
        inchikey to finally consider it a match. Default is min_agreement=1.
    """
    if min_agreement not in [1, 2, 3]:
        logger.error("Warning! 'min_agreement' should be 1, 2, or 3.")
    agreement = 0

    # Harmonize strings
    inchikey_1 = inchikey_1.upper().replace('"', '').replace(' ', '')
    inchikey_2 = inchikey_2.upper().replace('"', '').replace(' ', '')

    # Split inchikey in parts.
    inchikey_1_parts = inchikey_1.split('-')
    inchikey_2_parts = inchikey_2.split('-')

    # Check if both inchikey have sufficient parts (seperated by '/')
    if len(inchikey_1_parts) >= min_agreement and len(
            inchikey_2_parts) >= min_agreement:
        # Count how many parts mostly agree
        for i in range(min_agreement):
            agreement += (inchikey_1_parts[i] == inchikey_2_parts[i])

    return agreement == min_agreement


def pubchem_name_search(compound_name: str, name_search_depth=10, verbose=1):
    """Search pubmed for compound name"""
    results_pubchem = pcp.get_compounds(compound_name,
                                        'name',
                                        listkey_count=name_search_depth)
    if len(results_pubchem) == 0 and "_" in compound_name:
        results_pubchem = pcp.get_compounds(compound_name.replace("_", " "),
                                            'name',
                                            listkey_count=name_search_depth)
    if len(results_pubchem) == 0:
        return []

    logger.debug("Found at least %s compounds of that name on pubchem.", len(results_pubchem))
    return results_pubchem


def pubchem_formula_search(compound_formula: str, formula_search_depth=25, verbose=1):
    """Search pubmed for compound formula"""
    sids_pubchem = pcp.get_sids(compound_formula,
                                'formula',
                                listkey_count=formula_search_depth)

    results_pubchem = []
    for sid in sids_pubchem:
        result = pcp.Compound.from_cid(sid['CID'])
        results_pubchem.append(result)

    logger.debug("Found at least %s compounds of with formula: %s.",
                 len(results_pubchem), compound_formula)
    return results_pubchem


def find_pubchem_inchi_match(results_pubchem,
                             inchi,
                             min_inchi_match=3,
                             verbose=1):
    """Searches pubmed matches for inchi match.
    Then check if inchi can be matched to (defective) input inchi.
    Outputs found inchi and found inchikey (will be None if none is found).
    Parameters
    ----------
    results_pubchem: List[dict]
        List of name search results from Pubchem.
    inchi: str
        Inchi (correct, or defective...). Set to None to ignore.
    min_inchi_match: int
        Minimum number of first parts that MUST be a match between both input
        inchi to finally consider it a match. Default is min_inchi_match=3.
    """

    inchi_pubchem = None
    inchikey_pubchem = None
    smiles_pubchem = None

    # Loop through first 'name_search_depth' results found on pubchem. Stop once first match is found.
    for result in results_pubchem:
        inchi_pubchem = '"' + result.inchi + '"'
        inchikey_pubchem = result.inchikey
        smiles_pubchem = result.isomeric_smiles
        if smiles_pubchem is None:
            smiles_pubchem = result.canonical_smiles

        match_inchi = likely_inchi_match(inchi, inchi_pubchem,
                                         min_agreement=min_inchi_match)

        if match_inchi:
            logger.info("Matching inchi: %s", inchi)
            if verbose >= 1:
                logger.info("Found matching compound for inchi: %s (Pubchem: %s)",
                            inchi, inchi_pubchem)
            break

    if not match_inchi:
        inchi_pubchem = None
        inchikey_pubchem = None
        smiles_pubchem = None

        if verbose >= 2:
            logger.info("No matches found for inchi %s.", inchi)

    return inchi_pubchem, inchikey_pubchem, smiles_pubchem


def find_pubchem_mass_match(results_pubchem,
                            parent_mass,
                            mass_tolerance,
                            given_mass="parent mass",
                            allowed_differences=[(18.03, 0.01)],
                            verbose=1):
    """Searches pubmed matches for inchi match.
    Then check if inchi can be matched to (defective) input inchi.
    Outputs found inchi and found inchikey (will be None if none is found).
    Parameters
    ----------
    results_pubchem: List[dict]
        List of name search results from Pubchem.
    parent_mass: float
        Spectrum"s guessed parent mass.
    mass_tolerance: float
        Acceptable mass difference between query compound and pubchem result.
    given_mass
        String to specify the type of the given mass (e.g. "parent mass").
    """
    inchi_pubchem = None
    inchikey_pubchem = None
    smiles_pubchem = None
    mass_difference = None
    lowest_mass_difference = [np.inf, None]

    for result in results_pubchem:
        inchi_pubchem = '"' + result.inchi + '"'
        inchikey_pubchem = result.inchikey
        smiles_pubchem = result.isomeric_smiles
        if smiles_pubchem is None:
            smiles_pubchem = result.canonical_smiles

        pubchem_mass = float(results_pubchem[0].exact_mass)
        mass_difference = np.abs(pubchem_mass - parent_mass)
        if mass_difference < lowest_mass_difference[0]:
            lowest_mass_difference[0] = mass_difference
            lowest_mass_difference[1] = inchi_pubchem
        match_mass = (mass_difference <= mass_tolerance)
        for diff in allowed_differences:
            match_mass = match_mass or np.isclose(mass_difference, diff[0], atol=diff[1])

        if match_mass:
            logger.info("Matching molecular weight (%s vs %s of %s)",
                        pubchem_mass, given_mass, parent_mass)
            break

    if not match_mass:
        inchi_pubchem = None
        inchikey_pubchem = None
        smiles_pubchem = None

        logger.info("No matching molecular weight (best mass difference was %s for inchi: %s)",
                    lowest_mass_difference[0], lowest_mass_difference[1])

    return inchi_pubchem, inchikey_pubchem, smiles_pubchem

In [30]:
import time
#from matchmsextras.pubchem_lookup import pubchem_metadata_lookup
filename_pubchem_results = "pubchem_matches.csv"

i_start = 173403 #158460  #136144 # 73870  #61315  # 59915# 57915 #44272


for i_count, spec in enumerate(tqdm(spectrums[i_start:])):
    name_lower = spec.get("compound_name", "unknown").lower()
    if "suspect" in name_lower:
        continue
    if ("putative" in name_lower) and ("delta m" in name_lower):
        continue
    def get_pubchem_results(spec):
        spectrum = pubchem_metadata_lookup(spec, name_search_depth=10,
                                                match_precursor_mz=False,
                                                formula_search=False,
                                                pause_per_request=0.15)
        return spectrum

    i = i_count + i_start
    spectrum = get_pubchem_results(spec)
    
    compound_name = spectrum.get("compound_name")
    spectrum_id = spectrum.get("spectrum_id")
    inchikey = spectrum.get("inchikey")
    inchi = spectrum.get("inchi")
    smiles = spectrum.get("smiles")
    # Original content
    inchikey_orig = spec.get("inchikey")
    inchi_orig = spec.get("inchi")
    smiles_orig = spec.get("smiles")
    if (inchi_orig != inchi) or (inchikey_orig != inchikey) or (smiles_orig != smiles):
        data = f"{i},{spectrum_id},{compound_name},{inchikey},{inchi},{smiles}\n"  # "," is a BAD delimiter here!
        with open(filename_pubchem_results, 'a', newline ='') as file:
            file.write(data)
    

  0%|          | 0/414353 [00:00<?, ?it/s]

In [73]:
filename = "pubchem_matches.csv"
filename_new = "pubchem_matches_metadata.csv"
with open(filename, "r") as file:
    for line in file:
        if line.startswith("i,spectrum_id"):
            new_line = line.replace(",", "\t")
        else:
            new_line = re.sub(r"(,)([A-Z]{14}-[A-Z]{10}-[A-Z])(,)", r"\t\g<2>\t", line)
            new_line = re.sub(r"(^[0-9]*)(,)(CCMSLIB[0-9]*)(,)", r"\g<1>\t\g<3>\t", new_line)
        with open(filename_new, "a") as newfile:
            newfile.write(new_line)
        

In [53]:
import re
testline = """4845,CCMSLIB00005436304,Tris(2-butoxyethyl) phosphate,WTLBZVNBAKMVDP-UHFFFAOYSA-N,"InChI=1S/C18H39O7P/c1-4-7-10-20-13-16-23-26(19,24-17-14-21-11-8-5-2)25-18-15-22-12-9-6-3/h4-18H2,1-3H3",CCCCOCCOP(=O)(OCCOCCCC)OCCOCCCC
\n4960,CCMSLIB00005436437,N-[1,3-dihydroxyoctadec-4-en-2-yl]hexadecanamide,YDNKGFDKKRUKPY-UHFFFAOYSA-N,"InChI=1S/C34H67NO3/c1-3-5-7-9-11-13-15-17-19-21-23-25-27-29-33("""
re.sub(r"(,)([A-Z]{14}-[A-Z]{10}-N)(,)", r"\t\g<2>\t", testline)

'4845,CCMSLIB00005436304,Tris(2-butoxyethyl) phosphate\tWTLBZVNBAKMVDP-UHFFFAOYSA-N\t"InChI=1S/C18H39O7P/c1-4-7-10-20-13-16-23-26(19,24-17-14-21-11-8-5-2)25-18-15-22-12-9-6-3/h4-18H2,1-3H3",CCCCOCCOP(=O)(OCCOCCCC)OCCOCCCC\n\n4960,CCMSLIB00005436437,N-[1,3-dihydroxyoctadec-4-en-2-yl]hexadecanamide\tYDNKGFDKKRUKPY-UHFFFAOYSA-N\t"InChI=1S/C34H67NO3/c1-3-5-7-9-11-13-15-17-19-21-23-25-27-29-33('

#### Problem here: still saved "inchi,smiles" --> split:

In [2]:
filename = "pubchem_matches_metadata.csv"
filename_new = "pubchem_matches_metadata_cleaned.csv"
with open(filename, "r") as file:
    for line in file:
        new_line = line.replace("\",", "\"\t")    
        with open(filename_new, "a") as newfile:
            newfile.write(new_line)

In [3]:
import pandas as pd

metadata_pubchem = pd.read_csv("pubchem_matches_metadata_cleaned.csv", delimiter="\t")
metadata_pubchem.head()

Unnamed: 0,i,spectrum_id,compound_name,inchikey,inchi,smiles
0,273,CCMSLIB00000006817,,DOGSUNBGLRKXBL-UHFFFAOYSA-N,InChI=1S/C18H23NO/c1-2-3-4-5-6-7-8-11-15-14-18...,CCCCCCCC=CC1=NC2=CC=CC=C2C(=O)C1
1,279,CCMSLIB00000006826,,GEKPVLASKAWWBZ-UHFFFAOYSA-N,InChI=1S/C18H25NO/c1-2-3-4-5-6-7-8-11-15-14-18...,CCCCCCCCCC1=NC2=CC=CC=C2C(=O)C1
2,487,CCMSLIB00000068208,,LCVJEYNJCXTRIQ-UHFFFAOYSA-N,"InChI=1S/C15H21Br2ClO/c1-12(2)9(16)7-11-14(4,1...",CC1(C(=CC2C(C13CCC(C(C3)Br)(C)Cl)(O2)C)Br)C
3,488,CCMSLIB00000068209,,LCVJEYNJCXTRIQ-UHFFFAOYSA-N,"InChI=1S/C15H21Br2ClO/c1-12(2)9(16)7-11-14(4,1...",CC1(C(=CC2C(C13CCC(C(C3)Br)(C)Cl)(O2)C)Br)C
4,489,CCMSLIB00000068210,,GZNRNQVZDUCYFB-SOVGUPCDSA-N,InChI=1S/C18H26O3/c1-2-15-14-12-16-18(21-16)13...,CC[C@H]1[C@@H]2C[C@H]3[C@@H]([C@H]2/C=C\C/C=C\...


In [4]:
metadata_pubchem.iloc[555, :]

i                                                             5444
spectrum_id                                     CCMSLIB00005467761
compound_name                                            Succinate
inchikey                               KDYFGRWQOYBRFD-UHFFFAOYSA-L
inchi            InChI=1S/C4H6O4/c5-3(6)1-2-4(7)8/h1-2H2,(H,5,6...
smiles                                      C(CC(=O)[O-])C(=O)[O-]
Name: 555, dtype: object

In [5]:
ms_filters.derive_smiles_from_inchi(spectrums[id]).get("inchi")

NameError: name 'ms_filters' is not defined

In [8]:
len(spectrums)

587756

In [9]:
for i, row in metadata_pubchem.iterrows():
    id = int(row["i"])
    spectrum_id = row["spectrum_id"]
    if spectrums[id].get("spectrum_id") == spectrum_id:
        spectrums[id].set("inchikey", row["inchikey"])
        spectrums[id].set("inchi", row["inchi"])
        spectrums[id].set("metadata_processing", "matchms + pubchem for inchi and inchikey")
    else:
        print(f"------ {spectrum_id}")

In [10]:
row, i

(i                                                           573243
 spectrum_id                                     CCMSLIB00004683917
 compound_name                                Acacetin-5-O-xyloside
 inchikey                               NCCOYBTYDAMVTO-ZQEFQCJFSA-N
 inchi            InChI=1S/C21H20O9/c1-27-12-4-2-10(3-5-12)15-8-...
 smiles           COC1=CC=C(C=C1)C2=CC(=O)C3=C(O2)C=C(C=C3O[C@H]...
 Name: 34624, dtype: object,
 34624)

In [11]:
spectrums[555].metadata

{'pepmass': (950.487, None),
 'charge': 1,
 'mslevel': '2',
 'source_instrument': 'LC-ESI-qToF',
 'seq': '*..*',
 'ionmode': 'positive',
 'organism': 'GNPS-LIBRARY',
 'smiles': '',
 'inchi': 'InChI=1S/C47H69N5O14/c1-26-18-32(65-47(48)58)20-42(57)66-40(21-39(59-8)27(2)14-15-37(55)30(5)43(60-9)28(3)16-17-52(7)25-53)29(4)36(54)12-11-13-41-49-34(23-62-41)45-51-35(24-64-45)46-50-33(22-63-46)44(61-10)31(6)38(56)19-26/h11,13,16-17,22-32,36,38-40,43-44,54,56H,12,14-15,18-21H2,1-10H3,(H2,48,58)/b13-11-,17-16+',
 'inchiaux': 'N/A',
 'pubmed': '21410162',
 'libraryquality': '2',
 'scans': '1964',
 'file_name': '2C7_KabiramideB_MNa_950.mzXML',
 'compound_name': 'Kabiramide B',
 'principal_investigator': 'Dorrestein/Gerwick',
 'data_collector': 'Luzzatto/Garg',
 'submit_user': 'tal_lk',
 'spectrum_id': 'CCMSLIB00000070267',
 'precursor_mz': 950.487,
 'adduct': '[M+Na]+',
 'parent_mass': 927.4977819999999,
 'inchikey': 'SVDMAXBQMZIUPX-YHQKBTMFSA-N',
 'metadata_processing': 'matchms + pubchem for inc

In [12]:
# Save as pickle
filename_new = f"all_spectra_230201_matchms_manual_pubchem.pickle"
pickle.dump(spectrums,
    open(filename_new, "wb"))

In [14]:
count_annotations(spectrums)

Inchis: 540214 -- 82486 unique
Smiles: 445998 -- 41299 unique
Inchikeys: 480631 -- 26352 unique (first 14 characters)
