# Summary
This notebook trains Spec2Vec models on the new versions of the AllPositive (GNPS positive ion mode) dataset.

In [1]:
import os
import gensim
import pickle
import time
import sys
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from typing import Tuple, List
from copy import deepcopy

# Root directory holding the input spectra pickle and the "trained_models" folder
# used by the first part of this notebook.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
# The name `data_path` is re-bound further down for the April 2021 dataset.
data_path = "/mnt/scratch/louwe015/Mass_differences/data/"

In [3]:
# Switch between the two AllPositive dataset base names (True selects version 1).
all_pos_version1 = False
all_pos = (
    "gnps_positive_ionmode_cleaned_by_matchms_and_lookups"
    if all_pos_version1
    else "ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups"
)

print(all_pos)

ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups


## Reading spectra

In [4]:
# Load the cleaned spectra. Fail loudly when the file is missing: merely printing
# a message would leave `spectrums` undefined (or stale) for the cells below.
all_positive_file = os.path.join(data_path, all_pos + ".pickle")
if not os.path.exists(all_positive_file):
    raise FileNotFoundError("Spectrum file not found: {}".format(all_positive_file))
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(all_positive_file, 'rb') as inf:
    spectrums = pickle.load(inf)  # list of matchms.Spectrum.Spectrum

In [5]:
# Report how many spectra were loaded from the pickle file.
print("number of spectra:", len(spectrums))

number of spectra: 144691


## Preprocessing

In [47]:
from matchms.filtering import normalize_intensities
from matchms.filtering import select_by_mz
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

def post_process_normal(s, min_peaks=10):
    """Preprocess one spectrum for Spec2Vec training.

    The spectrum is passed through these matchms filters, in order:
    normalize_intensities, select_by_mz (0 to 1000), require_minimum_number_of_peaks,
    reduce_to_number_of_peaks (ratio_desired=0.5), select_by_relative_intensity
    (from 0.001, applied only if at least `min_peaks` peaks remain afterwards)
    and add_losses (5.0 to 200.0).

    Parameters
    ----------
    s
        Input spectrum (presumably a matchms Spectrum) or None.
    min_peaks
        Minimum number of peaks a spectrum must have to be kept. It is also the
        number of peaks that must remain for the low-intensity filter to be applied.

    Returns
    -------
    The processed spectrum, or None when the input is None, when the normalized
    intensities contain NaN, or when a filter rejects the spectrum.
    """
    if s is None:
        return None
    s = normalize_intensities(s)
    # `.intensities` is used instead of `peaks[1]` so the check does not depend on
    # how the peaks container implements indexing.
    if s is None or np.isnan(s.peaks.intensities).any():
        return None  # remove spectra that have max intensity of 0 (all intensities 0)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    if s is None:
        return None
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None
    # Remove low peaks unless fewer than `min_peaks` peaks would be left
    # (this threshold was previously hard-coded to 10, ignoring the parameter).
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= min_peaks:
        s = s_remove_low_peaks
    # Add losses to normally processed spectra
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s

In [6]:
# Preprocess every loaded spectrum and keep only those that pass all filters.
spectrums_processed = [
    processed
    for processed in map(post_process_normal, spectrums)
    if processed is not None
]
print("{} remaining spectra in normally processed data for s2v.".format(len(spectrums_processed)))

115388 remaining spectra in normally processed data for s2v.


## Spec2Vec processing into SpectrumDocuments

In [7]:
from spec2vec import SpectrumDocument
# Wrap each processed spectrum in a SpectrumDocument (n_decimals=2), keeping the order.
documents_spectrums_processed = [
    SpectrumDocument(spectrum, n_decimals=2) for spectrum in spectrums_processed
]

In [8]:
# Sanity check: number of documents built from the processed spectra.
print(len(documents_spectrums_processed))

115388


## Train Spec2Vec model with default parameters

In [9]:
from spec2vec.model_building import train_new_word2vec_model

path_models = os.path.join(data_path, "trained_models")
# Create the output directory up front so that saving a model cannot fail
# after training time has already been spent.
os.makedirs(path_models, exist_ok=True)
model_file = os.path.join(path_models, all_pos+"_spec2vec_embedding.model")

# Epoch counts at which a model is saved; training runs up to the largest value
# (the log below shows "Epoch 1 of 20" and files suffixed with "_iter_<n>").
iterations = [1, 3, 5, 10, 15, 20]
# Train model with default parameters
model = train_new_word2vec_model(documents_spectrums_processed, iterations, model_file)

  Epoch 1 of 20.Change in loss after epoch 1: 6127955.0
Saving model with name: /mnt/scratch/louwe015/Mass_differences/data/trained_models/ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups_spec2vec_embedding_iter_1.model
  Epoch 2 of 20.Change in loss after epoch 2: 4617147.0
  Epoch 3 of 20.Change in loss after epoch 3: 4132037.0
Saving model with name: /mnt/scratch/louwe015/Mass_differences/data/trained_models/ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups_spec2vec_embedding_iter_3.model
  Epoch 4 of 20.Change in loss after epoch 4: 3615045.0
  Epoch 5 of 20.Change in loss after epoch 5: 3164398.0
Saving model with name: /mnt/scratch/louwe015/Mass_differences/data/trained_models/ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups_spec2vec_embedding_iter_5.model
  Epoch 6 of 20.Change in loss after epoch 6: 3102232.0
  Epoch 7 of 20.Change in loss after epoch 7: 2972210.0
  Epoch 8 of 20.Change in loss after epoch 8: 2874354.0
  Epoch 9 of 20.Change in loss after ep

# New version (April 2021)
- Also train a model on the UniqueInchikey subset

In [59]:
# Locations for the April 2021 dataset: input spectra and output embeddings.
# NOTE(review): absolute, machine-specific paths. `data_path` re-binds the name
# defined at the top of the notebook, so cells above this point that use
# `data_path` will resolve to this directory if they are re-run afterwards.
data_path = "/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/datasets/ALL_GNPS_210409_positive/"
embedding_path = "/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/embeddings/ALL_GNPS_210409_positive/"

# Base name shared by the spectrum pickle and the model files written below.
base = "ALL_GNPS_210409_positive_cleaned"
spectrum_file = os.path.join(data_path, base+".pickle")
# Displayed as the cell output: whether the spectrum pickle is present.
os.path.exists(spectrum_file)

True

### Read spectra

In [4]:
# Fail loudly when the file is missing: merely printing a message would silently
# leave the previously loaded `spectrums` (older dataset) in place for the cells below.
if not os.path.exists(spectrum_file):
    raise FileNotFoundError("Spectrum file not found: {}".format(spectrum_file))
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(spectrum_file, 'rb') as inf:
    spectrums = pickle.load(inf)  # list of matchms.Spectrum.Spectrum

In [6]:
# Number of spectra loaded from the April 2021 pickle file.
print(len(spectrums))

373405


### Process spectra + save

In [48]:
# Preprocess every loaded spectrum and keep only those that pass all filters.
spectrums_processed = [
    processed
    for processed in map(post_process_normal, spectrums)
    if processed is not None
]
print("{} remaining spectra in normally processed data for s2v.".format(len(spectrums_processed)))

199780 remaining spectra in normally processed data for s2v.


In [49]:
# Save the processed spectra next to the input data as a pickle file.
processed_spectrums_file = os.path.join(
    data_path,
    "ALL_GNPS_210409_positive_cleaned_peaks_processed_s2v.pickle",
)
with open(processed_spectrums_file, 'wb') as handle:
    pickle.dump(spectrums_processed, handle)

### Turn into documents

In [50]:
from spec2vec import SpectrumDocument
# Wrap each processed spectrum in a SpectrumDocument (n_decimals=2), keeping the order.
documents_spectrums_processed = [
    SpectrumDocument(spectrum, n_decimals=2) for spectrum in spectrums_processed
]
print(len(documents_spectrums_processed))

199780


### UniqueInchikey
Select the 'best' spectra per InChIKey (following the approach of Huber et al. 2020)

In [58]:
# Make the "scripts" directory that sits next to the current working directory
# importable. The guard keeps sys.path free of duplicates when the cell is re-run.
scripts_dir = os.path.join(os.path.dirname(os.getcwd()), "scripts")
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir)
from mass_differences.processing import get_ids_for_unique_inchikeys

# The ids returned for `spectrums_processed` are used to index the document list,
# so both lists must be aligned; a stale kernel state would break that silently.
assert len(documents_spectrums_processed) == len(spectrums_processed), \
    "documents and processed spectra are out of sync; re-run the document cell"
uniq_ids = get_ids_for_unique_inchikeys(spectrums_processed)
documents_uniq_processed = [documents_spectrums_processed[i] for i in uniq_ids]

### Train Spec2Vec model
Both on all spectra and UniqueInchikey spectra

In [60]:
from spec2vec.model_building import train_new_word2vec_model

# Create the output directory up front so that saving the model cannot fail
# after training time has already been spent.
os.makedirs(embedding_path, exist_ok=True)
model_file = os.path.join(embedding_path, base+"_spec2vec_embedding_iter_15.model")
print(model_file)

# Single entry: train for 15 epochs (the log below shows "Epoch 1 of 15").
iterations = [15]
# Train model with default parameters
model = train_new_word2vec_model(documents_spectrums_processed, iterations, model_file)

/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/embeddings/ALL_GNPS_210409_positive/ALL_GNPS_210409_positive_cleaned_spec2vec_embedding_iter_15.model
  Epoch 1 of 15.Change in loss after epoch 1: 7276212.5
  Epoch 2 of 15.Change in loss after epoch 2: 5372783.5
  Epoch 3 of 15.Change in loss after epoch 3: 4765236.0
  Epoch 4 of 15.Change in loss after epoch 4: 3981888.0
  Epoch 5 of 15.Change in loss after epoch 5: 3842090.0
  Epoch 6 of 15.Change in loss after epoch 6: 3787958.0
  Epoch 7 of 15.Change in loss after epoch 7: 3563100.0
  Epoch 8 of 15.Change in loss after epoch 8: 2718620.0
  Epoch 9 of 15.Change in loss after epoch 9: 2395256.0
  Epoch 10 of 15.Change in loss after epoch 10: 2311848.0
  Epoch 11 of 15.Change in loss after epoch 11: 2322268.0
  Epoch 12 of 15.Change in loss after epoch 12: 2244200.0
  Epoch 13 of 15.Change in loss after epoch 13: 2259668.0
  Epoch 14 of 15.Change in loss after epoch 14: 2190144.0
  Epoch 15 of 15.Change in loss after epoch 15: 2247

In [62]:
# UniqueInchikey spectra
# Create the output directory up front so that saving the model cannot fail
# after training time has already been spent.
os.makedirs(embedding_path, exist_ok=True)
model_file_ui = os.path.join(embedding_path, base+"_spec2vec_embedding_unique_inchikey_iter_50.model")
print(model_file_ui)

# Single entry: train for 50 epochs (the log below shows "Epoch 1 of 50").
iterations = [50]
# Train model with default parameters
model_ui = train_new_word2vec_model(documents_uniq_processed, iterations, model_file_ui)

/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/embeddings/ALL_GNPS_210409_positive/ALL_GNPS_210409_positive_cleaned_spec2vec_embedding_unique_inchikey_iter_50.model
  Epoch 1 of 50.Change in loss after epoch 1: 1798409.875
  Epoch 2 of 50.Change in loss after epoch 2: 1473076.875
  Epoch 3 of 50.Change in loss after epoch 3: 1261797.75
  Epoch 4 of 50.Change in loss after epoch 4: 1107461.5
  Epoch 5 of 50.Change in loss after epoch 5: 1136961.0
  Epoch 6 of 50.Change in loss after epoch 6: 1125906.5
  Epoch 7 of 50.Change in loss after epoch 7: 1020314.5
  Epoch 8 of 50.Change in loss after epoch 8: 946987.0
  Epoch 9 of 50.Change in loss after epoch 9: 900281.0
  Epoch 10 of 50.Change in loss after epoch 10: 891026.0
  Epoch 11 of 50.Change in loss after epoch 11: 903226.0
  Epoch 12 of 50.Change in loss after epoch 12: 798432.0
  Epoch 13 of 50.Change in loss after epoch 13: 845645.0
  Epoch 14 of 50.Change in loss after epoch 14: 819686.0
  Epoch 15 of 50.Change in loss after 