# Summary
Notebook to train a Spec2Vec model for the new AllPositive version.

In [2]:
import os
import gensim
import pickle
import time
import sys
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from typing import Tuple, List
from copy import deepcopy

# Root directory for input data and trained models. The original absolute path
# stays the default; set MASS_DIFFERENCES_DATA_PATH to run the notebook on
# another machine without editing this cell.
data_path = os.environ.get(
    "MASS_DIFFERENCES_DATA_PATH",
    "/mnt/scratch/louwe015/Mass_differences/data/",
)

In [3]:
# Flag choosing which of the two dataset base names is used below;
# `all_pos` is the file name without extension.
all_pos_version1 = False
all_pos = (
    "gnps_positive_ionmode_cleaned_by_matchms_and_lookups"
    if all_pos_version1
    else "ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups"
)

print(all_pos)

ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups


## Reading spectra

In [4]:
all_positive_file = os.path.join(data_path, all_pos + ".pickle")
# Fail loudly when the input is missing: merely printing a message would leave
# `spectrums` undefined and make the following cells fail with a NameError.
if not os.path.exists(all_positive_file):
    raise FileNotFoundError(
        "Spectra pickle not found: {}".format(all_positive_file)
    )
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files from a trusted source.
with open(all_positive_file, 'rb') as inf:
    spectrums = pickle.load(inf)  # list of matchms.Spectrum.Spectrum

In [5]:
# Report how many spectra were loaded from the pickle.
print("number of spectra: {}".format(len(spectrums)))

number of spectra: 144691


## Preprocessing

In [6]:
from matchms.filtering import normalize_intensities
from matchms.filtering import select_by_mz
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

def post_process_normal(s, min_peaks=10):
    """Filter the peaks of a spectrum and add losses to it.

    The steps are applied in this order: normalize intensities, keep peaks
    with m/z between 0 and 1000, require at least `min_peaks` peaks, reduce
    the number of peaks, optionally drop peaks below a relative intensity of
    0.001, and finally add losses between 5.0 and 200.0.

    Args:
        s: Spectrum to process (the notebook passes the unpickled spectra,
            noted at load time as matchms Spectrum objects).
        min_peaks: Minimum number of peaks required to keep the spectrum. It
            is also passed as `n_required` when reducing the number of peaks
            and is the threshold that guards the low-intensity peak removal.

    Returns:
        The processed spectrum, or None when the spectrum is None after the
        minimum-peak and peak-reduction filters.
    """
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None
    # Remove low peaks unless fewer than `min_peaks` peaks would be left.
    # The threshold follows `min_peaks` (previously a hard-coded 10) so the
    # guard stays consistent when a non-default value is passed.
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= min_peaks:
        s = s_remove_low_peaks
    # Add losses to the normally processed spectrum.
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s

# Run every spectrum through the post-processing and keep only those that
# were not discarded (post_process_normal returns None for discarded spectra).
spectrums_processed = [
    processed
    for processed in map(post_process_normal, spectrums)
    if processed is not None
]
print("{} remaining spectra in normally processed data for s2v.".format(len(spectrums_processed)))

115388 remaining spectra in normally processed data for s2v.


## Spec2Vec processing into SpectrumDocuments

In [7]:
from spec2vec import SpectrumDocument
# Wrap each processed spectrum in a SpectrumDocument (n_decimals=2).
documents_spectrums_processed = [
    SpectrumDocument(spectrum, n_decimals=2)
    for spectrum in spectrums_processed
]

In [8]:
# Number of SpectrumDocuments created (one per processed spectrum).
print(len(documents_spectrums_processed))

115388


## Train Spec2Vec model with default parameters

In [9]:
from spec2vec.model_building import train_new_word2vec_model

path_models = os.path.join(data_path, "trained_models")
# Create the output directory up front so that saving the model does not fail
# on a missing folder after the (long) training has already run.
os.makedirs(path_models, exist_ok=True)
model_file = os.path.join(path_models, all_pos+"_spec2vec_embedding.model")

# Epoch counts passed to the training call; the recorded output shows 20
# epochs in total with a model saved as "..._iter_<N>.model" at listed epochs.
iterations = [1, 3, 5, 10, 15, 20]
# Train model with default parameters
model = train_new_word2vec_model(documents_spectrums_processed, iterations, model_file)

  Epoch 1 of 20.Change in loss after epoch 1: 6127955.0
Saving model with name: /mnt/scratch/louwe015/Mass_differences/data/trained_models/ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups_spec2vec_embedding_iter_1.model
  Epoch 2 of 20.Change in loss after epoch 2: 4617147.0
  Epoch 3 of 20.Change in loss after epoch 3: 4132037.0
Saving model with name: /mnt/scratch/louwe015/Mass_differences/data/trained_models/ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups_spec2vec_embedding_iter_3.model
  Epoch 4 of 20.Change in loss after epoch 4: 3615045.0
  Epoch 5 of 20.Change in loss after epoch 5: 3164398.0
Saving model with name: /mnt/scratch/louwe015/Mass_differences/data/trained_models/ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups_spec2vec_embedding_iter_5.model
  Epoch 6 of 20.Change in loss after epoch 6: 3102232.0
  Epoch 7 of 20.Change in loss after epoch 7: 2972210.0
  Epoch 8 of 20.Change in loss after epoch 8: 2874354.0
  Epoch 9 of 20.Change in loss after ep