Derive spec2vec embeddings of MS/MS spectra

In [1]:
import os
import sys
import gensim
import numpy as np

ROOT = os.path.dirname(os.getcwd())
#path_data = os.path.join(ROOT, 'data')
path_data = 'C:\\Users\\Gosia\\Desktop\\'
sys.path.insert(0, ROOT)

In [2]:
from matchms.importing import load_from_json
spectrums_lib = []
path_lcms = 'C:\\Users\\Gosia\\Desktop\\gnps_from_simon'
counter = 0
for s in os.listdir(path_lcms):
    if counter <= 5: 
        spectrums_lib += load_from_json(os.path.join(path_lcms,s))
        counter += 1

In [3]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses
def post_process_s2v(s):
    
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=10)
    s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
    if s is None:
        return None
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks
        
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s

In [4]:
# apply post processing steps to the data

spectrums_lib = [post_process_s2v(s) for s in spectrums_lib]

# omit spectrums that didn't qualify for analysis

spectrums_lib = [s for s in spectrums_lib if s is not None]


Create spectrum "documents"

In [48]:
from spec2vec import Spec2Vec
from spec2vec import SpectrumDocument

documents_lib = [SpectrumDocument(s, n_decimals=2) for s in spectrums_lib]


In [49]:
from spec2vec.model_building import train_new_word2vec_model
path_models = os.path.join(path_data, "trained_models")

model_file = os.path.join(path_models, "spec2vec_librarymatching_size_3.model")

iterations = [1, 3, 5, 10]

#Train model with size 10 and default parameters

model = train_new_word2vec_model(documents_lib, iterations, model_file, size = 3)

The value of size is set from 300 (default) to 3
  Epoch 1 of 10.Change in loss after epoch 1: 125696.0234375
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_3_iter_1.model
  Epoch 2 of 10.Change in loss after epoch 2: 125161.1953125
  Epoch 3 of 10.Change in loss after epoch 3: 124253.65625
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_3_iter_3.model
  Epoch 4 of 10.Change in loss after epoch 4: 103942.59375
  Epoch 5 of 10.Change in loss after epoch 5: 89982.40625
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_3_iter_5.model
  Epoch 6 of 10.Change in loss after epoch 6: 85497.3125
  Epoch 7 of 10.Change in loss after epoch 7: 85433.75
  Epoch 8 of 10.Change in loss after epoch 8: 83720.0
  Epoch 9 of 10.Change in loss after epoch 9: 79256.5
  Epoch 10 of 10.Change in loss after epoch 10: 77582.125
Saving model with name: C:\Users\Gosia\Desktop\trained


Derive embeddings

In [50]:
from tqdm.notebook import tqdm  # optional, just to get a progress bar
from spec2vec.vector_operations import calc_vector


intensity_weighting_power = 0.5
allowed_missing_percentage = 15 # specify the maximum (weighted) fraction of the spectrum that is allowed to be missing

vector_size = model.vector_size
print(f"Embedding vector size: {vector_size}")

embeddings_spec2vec_lib = np.zeros((len(documents_lib), vector_size), dtype="float")
for i, doc in enumerate(tqdm(documents_lib)):
    embeddings_spec2vec_lib[i, 0:vector_size] = calc_vector(model, doc,
                                                        intensity_weighting_power,
                                                        allowed_missing_percentage)

Embedding vector size: 3


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=573.0), HTML(value='')))




In [51]:
embeddings_spec2vec_query = embeddings_spec2vec_lib

In [52]:
print([np.round(x, 4) for x in embeddings_spec2vec_lib])

[array([ -48.3013, -155.4087,  182.0275]), array([-21.7717, -84.6197,  89.4971]), array([-0.7704, -8.6817, 10.7109]), array([ -2.5069, -17.0009,  19.4846]), array([213.9596, -73.2814, 188.9841]), array([ -6.3942, -33.602 ,  41.2221]), array([ -2.7108, -14.9356,  13.3315]), array([  66.9246, -124.727 ,  183.3473]), array([ -0.4014, -13.9481,  16.4011]), array([-1.0791, -4.9361,  5.6672]), array([147.4471, -59.1829, 201.8406]), array([-0.3845, -4.4761,  5.5947]), array([  0.4561, -12.5179,  16.638 ]), array([ 19.7795, -25.6551,  32.2803]), array([-10.4391, -78.435 ,  84.8948]), array([ 75.6978, -35.6468, 108.1179]), array([ -0.3167, -11.0699,  12.2016]), array([ -38.4406, -138.3364,  147.4806]), array([ -3.1272, -41.8362,  82.2231]), array([  3.7635, -24.1274,  26.7945]), array([ -8.382 , -55.2809,  62.4847]), array([ 0.5425, -9.162 ,  9.0044]), array([ 41.7231, -43.9335,  79.9013]), array([ -6.2085, -26.7911,  28.8112]), array([-14.2791, -47.5245,  53.5697]), array([ 0.4984, -5.3387,  6

In [53]:
hits

[Hit(query=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CDA15B0>, target=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CDA15B0>, score=1.0, hit=True),
 Hit(query=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CDA1430>, target=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CDA1430>, score=1.0, hit=True),
 Hit(query=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CDA10D0>, target=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CDA10D0>, score=1.0, hit=True),
 Hit(query=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CF24910>, target=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CF24910>, score=1.0, hit=True),
 Hit(query=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CF24970>, target=<spec2vec.SpectrumDocument.SpectrumDocument object at 0x000001FB4CF24970>, score=1.0, hit=True),
 Hit(query=<spec2vec.SpectrumDocument.SpectrumDocu

In [54]:
# Calculating cosine similarity for target-query match
import cosine_calc
hits, misses = cosine_calc.get_hits(documents_lib, documents_lib, spec2vec_model=model)

In [57]:
import numpy as np
from sklearn.mixture import GaussianMixture

gm = GaussianMixture().fit(embeddings_spec2vec_lib)
gm.means_
gm.covariances_[0]
np.dot(np.dot(D,np.linalg.inv(Sigma)),mu.T)
np.dot(A, point.T)
point,mu

(array([ 20.98208782, -63.03643032,  92.45110207]),
 array([ 36.6106666 , -53.70783099,  74.0898787 ]))

# Creating knockoffs


In [84]:
import numpy as np
from scipy.interpolate import griddata
import matplotlib.pyplot as plt
import numpy.ma as ma
from numpy.random import uniform, seed
from matplotlib import cm
from scipy.stats import multivariate_normal
from spec2vec.SpectrumDocument import SpectrumDocument
from matchms import Spectrum


def is_pos_def(x):
    return np.all(np.linalg.eigvals(x) > 0)

# seed(1234)
nDim = len(embeddings_spec2vec_lib[0])
# define the mean and covariance 
mu = gm.means_[0]
Sigma = gm.covariances_[0]
D = np.eye(nDim)*0.13

joint_cov = np.hstack((Sigma, Sigma-D))
joint_cov = np.vstack((joint_cov, np.hstack((Sigma-D,Sigma))))

print(is_pos_def(joint_cov))

A = np.eye(nDim) - np.dot(D,np.linalg.inv(Sigma))

all_knockoffs = []
# generate a sample
for point in embeddings_spec2vec_lib:
    # generate N knock-offs
    kmu = np.dot(np.dot(D,np.linalg.inv(Sigma)),mu)
    B = np.dot(A, point.T)
    kmu += B
    kSigma = 2*D - np.dot(np.dot(D,np.linalg.inv(Sigma)), D)
    ko = np.random.multivariate_normal(kmu.flatten(), kSigma, 1)
    all_knockoffs.append(ko)


knockoff_documents = []
for ko,v,d in zip(all_knockoffs,embeddings_spec2vec_lib,documents_lib):
    #print("knockoff:",ko,"vector",v,"document",d)
    #print("\n")
    knockoff_documents.append(SpectrumDocument(Spectrum(np.array(d._obj.metadata.get("parent_mass")),np.array(1))))
print(knockoff_documents)

True


AssertionError: Input argument 'intensities' should be an array of type float.