# Summary
Train a Spec2Vec model on the AllNegative dataset: all negative-mode spectra gathered from GNPS on 09-04-2021 (2021-04-09).

In [1]:
import os
import pickle

# Base directory: the input pickle is read from here and the trained model
# files are written here.
# NOTE(review): the directory is named "AllPositive2" although this notebook
# works on negative-mode data -- confirm this location is intended.
data_path = "/mnt/scratch/louwe015/AllPositive2/"

In [4]:
# Pickle file with the cleaned negative-mode GNPS spectra, located in data_path.
all_neg_name = "ALL_GNPS_210409_negative_cleaned.pickle"
all_neg = os.path.join(data_path, all_neg_name)
# Sanity check: the cell output shows whether the file exists before loading.
os.path.exists(all_neg)

True

In [5]:
# Load the pickled spectra into memory.
# NOTE: pickle.load can execute arbitrary code while unpickling; only use it
# on files from a trusted source.
with open(all_neg, 'rb') as inf:
    spectrums = pickle.load(inf)

In [10]:
# Report how many spectra were loaded from the pickle.
print("AllNegative spectra:", len(spectrums))

AllNegative spectra: 65583


## Processing of spectra
- normalize peaks (maximum intensity to 1)
- remove peaks outside [0, 1000] m/z window
- remove spectra with < 10 peaks
- reduce the number of peaks to at most 0.5 * parent mass
- remove peaks with intensities < 0.001 of the maximum intensity (unless this would leave fewer than 10 peaks)
- add losses (loss m/z between 5.0 and 200.0)

In [8]:
from typing import Union
from matchms.filtering import normalize_intensities
from matchms.filtering import select_by_mz
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses
from matchms.typing import SpectrumType

def post_process(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Normal processing of spectra for Spec2Vec.

    Works on a clone of the input, so ``spectrum_in`` itself is not modified.
    The steps, in order:

    1. normalize peak intensities,
    2. keep only peaks within the 0-1000 m/z window,
    3. require at least ``min_peaks`` peaks,
    4. reduce the number of peaks (``ratio_desired=0.5``),
    5. drop peaks below 0.001 relative intensity, but only if at least
       ``min_peaks`` peaks remain afterwards,
    6. add losses between 5.0 and 200.0.

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None).
        The same threshold guards the low-intensity peak removal.

    Returns
    -------
    The processed spectrum, or None when the input is None or the filters
    reject the spectrum.
    """
    if spectrum_in is None:
        return None

    s = spectrum_in.clone()
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None
    # Remove low peaks unless fewer than min_peaks peaks would be left.
    # (Previously compared against a hard-coded 10, ignoring min_peaks.)
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= min_peaks:
        s = s_remove_low_peaks
    # Add losses to the normally processed spectrum.
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s

In [9]:
# Run every loaded spectrum through post_process and keep only the survivors;
# rejected spectra come back as None and are filtered out.
processed_spectrums = [
    processed for processed in map(post_process, spectrums) if processed
]
print("Remaining processed spectra:", len(processed_spectrums))

Remaining processed spectra: 45709


## Prepare spectra for model training

In [15]:
from spec2vec import SpectrumDocument
# Wrap each processed spectrum in a SpectrumDocument (n_decimals=2) for training.
# The former enumerate() index was never used, so iterate the list directly.
documents_processed = [SpectrumDocument(s, n_decimals=2) for s in processed_spectrums]

## Train new Spec2Vec model

In [16]:
import gensim
from spec2vec.model_building import train_new_word2vec_model
# Epoch counts passed to the trainer; the training log shows a model file
# being saved (with an "_iter_<n>" suffix) after each of these epochs.
iterations = [1, 3, 5, 10, 15, 20]
# Base file name for the saved models, stored next to the input data.
model_file = os.path.join(data_path, "ALL_GNPS_210409_negative_cleaned_spec2vec.model")

model = train_new_word2vec_model(documents_processed, iterations, model_file)

  Epoch 1 of 20.Change in loss after epoch 1: 2005503.0
Saving model with name: /mnt/scratch/louwe015/AllPositive2/ALL_GNPS_210409_negative_cleaned_spec2vec_iter_1.model
  Epoch 2 of 20.Change in loss after epoch 2: 1639341.25
  Epoch 3 of 20.Change in loss after epoch 3: 1238081.25
Saving model with name: /mnt/scratch/louwe015/AllPositive2/ALL_GNPS_210409_negative_cleaned_spec2vec_iter_3.model
  Epoch 4 of 20.Change in loss after epoch 4: 1091349.5
  Epoch 5 of 20.Change in loss after epoch 5: 1090830.5
Saving model with name: /mnt/scratch/louwe015/AllPositive2/ALL_GNPS_210409_negative_cleaned_spec2vec_iter_5.model
  Epoch 6 of 20.Change in loss after epoch 6: 1101830.0
  Epoch 7 of 20.Change in loss after epoch 7: 941625.5
  Epoch 8 of 20.Change in loss after epoch 8: 856778.0
  Epoch 9 of 20.Change in loss after epoch 9: 838428.0
  Epoch 10 of 20.Change in loss after epoch 10: 752245.0
Saving model with name: /mnt/scratch/louwe015/AllPositive2/ALL_GNPS_210409_negative_cleaned_spec2v