Derive spec2vec embeddings of MS/MS spectra

In [1]:
import os
import sys
import gensim
import numpy as np

ROOT = os.path.dirname(os.getcwd())
#path_data = os.path.join(ROOT, 'data')
path_data = 'C:\\Users\\Gosia\\Desktop\\'
sys.path.insert(0, ROOT)

In [2]:
from matchms.importing import load_from_json
spectrums_lib = []
path_lcms = 'C:\\Users\\Gosia\\Desktop\\gnps_from_simon'
counter = 0
for s in os.listdir(path_lcms):
    if counter <= 5: 
        spectrums_lib += load_from_json(os.path.join(path_lcms,s))
        counter += 1

In [3]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses
def post_process_s2v(s):
    
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=10)
    s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
    if s is None:
        return None
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks
        
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s

In [4]:
# apply post processing steps to the data

spectrums_lib = [post_process_s2v(s) for s in spectrums_lib]

# omit spectrums that didn't qualify for analysis

spectrums_lib = [s for s in spectrums_lib if s is not None]


Create spectrum "documents"

In [5]:
from spec2vec import Spec2Vec
from spec2vec import SpectrumDocument

documents_lib = [SpectrumDocument(s, n_decimals=2) for s in spectrums_lib]


In [6]:
from spec2vec.model_building import train_new_word2vec_model
path_models = os.path.join(path_data, "trained_models")

model_file = os.path.join(path_models, "spec2vec_librarymatching_size_3.model")

iterations = [1, 3, 5, 10]

#Train model with size 10 and default parameters

model = train_new_word2vec_model(documents_lib, iterations, model_file, size = 3)

The value of size is set from 300 (default) to 3
  Epoch 1 of 10.Change in loss after epoch 1: 125506.109375
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_3_iter_1.model
  Epoch 2 of 10.Change in loss after epoch 2: 125567.125
  Epoch 3 of 10.Change in loss after epoch 3: 125132.890625
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_3_iter_3.model
  Epoch 4 of 10.Change in loss after epoch 4: 106334.03125
  Epoch 5 of 10.Change in loss after epoch 5: 91926.15625
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_3_iter_5.model
  Epoch 6 of 10.Change in loss after epoch 6: 87820.75
  Epoch 7 of 10.Change in loss after epoch 7: 83405.125
  Epoch 8 of 10.Change in loss after epoch 8: 81045.3125
  Epoch 9 of 10.Change in loss after epoch 9: 81739.0
  Epoch 10 of 10.Change in loss after epoch 10: 79668.8125
Saving model with name: C:\Users\Gosia\Desktop\trained_


Derive embeddings

In [105]:
from tqdm.notebook import tqdm  # optional, just to get a progress bar
from spec2vec.vector_operations import calc_vector


intensity_weighting_power = 0.5
allowed_missing_percentage = 15 # specify the maximum (weighted) fraction of the spectrum that is allowed to be missing

vector_size = model.vector_size
print(f"Embedding vector size: {vector_size}")

embeddings_spec2vec_lib = np.zeros((len(documents_lib), vector_size), dtype="float")
for i, doc in enumerate(tqdm(documents_lib)):
    embeddings_spec2vec_lib[i, 0:vector_size] = calc_vector(model, doc,)

Embedding vector size: 3


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=573.0), HTML(value='')))




In [106]:
embeddings_spec2vec_query = embeddings_spec2vec_lib

In [107]:
print([np.round(x, 4) for x in embeddings_spec2vec_lib])

[array([-28.7226,  39.6422, -55.4355]), array([  -5.5959,   61.5135, -103.6425]), array([-26.7974,  24.2837, -71.2194]), array([ -24.3973,  231.2826, -135.7283]), array([ -76.0284,  567.9523, -375.1597]), array([-109.5058, 1008.2638, -769.6439]), array([-166.1142, 1113.4421, -879.6349]), array([-199.5607, 1013.7787, -815.3101]), array([  25.0763,  387.7781, -810.9878]), array([-34.1186,   5.0048, -46.3929]), array([-28.2508,   6.5709, -39.0017]), array([ -18.5365,   84.4125, -125.3482]), array([ -98.2311,  153.1556, -250.0005]), array([-40.5643,  42.3847, -69.8166]), array([  80.5984,  740.4262, -892.2452]), array([-109.7528,  924.2487, -727.9415]), array([  -2.9181,  223.7927, -233.1898]), array([ -75.2812,  744.7568, -729.9123]), array([ -83.0759,  878.2758, -869.2554]), array([-156.4886,  798.2658, -843.8853]), array([-135.8989,   46.7828, -188.1453]), array([-56.3202,  24.97  , -85.6823]), array([ -23.9925,   95.7475, -102.6257]), array([ -27.035 ,  448.0081, -234.171 ]), array([  

In [108]:
import numpy as np
from sklearn.mixture import GaussianMixture

gm = GaussianMixture().fit(embeddings_spec2vec_lib)
gm.means_
gm.covariances_[0]
np.dot(np.dot(D,np.linalg.inv(Sigma)),mu.T)
np.dot(A, point.T)
point,mu

(array([-46.50241708,  35.74422895, -98.19068345]),
 array([-27.02690245,  42.74319223, -85.46827176]))

# Creating knockoffs


In [109]:
import copy
import numpy as np
from scipy.interpolate import griddata
import matplotlib.pyplot as plt
import numpy.ma as ma
from numpy.random import uniform, seed
from matplotlib import cm
from scipy.stats import multivariate_normal
from spec2vec.SpectrumDocument import SpectrumDocument
from matchms import Spectrum


def is_pos_def(x):
    return np.all(np.linalg.eigvals(x) > 0)

# seed(1234)
nDim = len(embeddings_spec2vec_lib[0])
# define the mean and covariance 
mu = gm.means_[0]
Sigma = gm.covariances_[0]
D = np.eye(nDim)*0.13

joint_cov = np.hstack((Sigma, Sigma-D))
joint_cov = np.vstack((joint_cov, np.hstack((Sigma-D,Sigma))))

print(is_pos_def(joint_cov))

A = np.eye(nDim) - np.dot(D,np.linalg.inv(Sigma))

all_knockoffs = []
# generate a sample
for point in embeddings_spec2vec_lib:
    # generate N knock-offs
    kmu = np.dot(np.dot(D,np.linalg.inv(Sigma)),mu)
    B = np.dot(A, point.T)
    kmu += B
    kSigma = 2*D - np.dot(np.dot(D,np.linalg.inv(Sigma)), D)
    ko = np.random.multivariate_normal(kmu.flatten(), kSigma, 1)
    all_knockoffs.append(ko)


knockoff_documents = []
for ko,v,d in zip(all_knockoffs,embeddings_spec2vec_lib,documents_lib):
    #print("knockoff:",ko,"vector",v,"document",d)
    #print("\n")
    e = copy.deepcopy(d)
    e._obj.set('inchi', 'knockoff')
    e._obj.set('vector', ko[0])
    knockoff_documents.append(e)
print(knockoff_documents)

True
[<spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729997040>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729997610>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x00000257299A3700>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x00000257299A3C40>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729A92DC0>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729A92DF0>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x00000257299DCF70>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x00000257299DCC70>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729A954F0>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729A95880>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729A950D0>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729A99280>, <spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729A996D0>, <spec2vec.SpectrumD

In [110]:
import importlib
import cosine_calc
importlib.reload(cosine_calc)
hits, _ = cosine_calc.get_hits(documents_lib, knockoff_documents, decoys=True, spec2vec_model=model)


In [111]:
print (documents_lib[0], knockoff_documents[0], all_knockoffs[0], embeddings_spec2vec_lib[0] )

['peak@77.10', 'peak@78.02', 'peak@104.13', 'peak@124.96', 'peak@127.05', 'peak@128.10', 'peak@129.04', 'peak@130.15', 'peak@153.00', 'peak@153.98', 'peak@154.99', 'peak@164.04', 'peak@179.06', 'loss@16.96', 'loss@26.01', 'loss@27.02', 'loss@28.00', 'loss@50.85', 'loss@51.96', 'loss@52.90', 'loss@53.95', 'loss@56.04', 'loss@76.87', 'loss@102.98', 'loss@103.90'] ['peak@77.10', 'peak@78.02', 'peak@104.13', 'peak@124.96', 'peak@127.05', 'peak@128.10', 'peak@129.04', 'peak@130.15', 'peak@153.00', 'peak@153.98', 'peak@154.99', 'peak@164.04', 'peak@179.06', 'loss@16.96', 'loss@26.01', 'loss@27.02', 'loss@28.00', 'loss@50.85', 'loss@51.96', 'loss@52.90', 'loss@53.95', 'loss@56.04', 'loss@76.87', 'loss@102.98', 'loss@103.90'] [[-29.56194061  39.51309631 -55.956831  ]] [-28.72262837  39.64217337 -55.43547055]


In [114]:
hits[0]
hit = hits[0]
from spec2vec.vector_operations import calc_vector, cosine_similarity
print( calc_vector( model, documents_lib[0] ) )
print( hit.target._obj.metadata['vector'] )
print( cosine_similarity( calc_vector( model, hit.query ), hit.target._obj.metadata['vector'] ) )
hit.query._obj.metadata['parent_mass'], hit.target._obj.metadata['parent_mass']
hit

[-28.72262837  39.64217337 -55.43547055]
[-29.56194061  39.51309631 -55.956831  ]
0.9999484772562907


Hit(query=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025726158BB0>, target=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x0000025729997040>, score=0.9999484772562907, hit='decoy')