Derive spec2vec embeddings of MS/MS spectra

In [1]:
import os
import sys
import gensim
import numpy as np

ROOT = os.path.dirname(os.getcwd())
#path_data = os.path.join(ROOT, 'data')
path_data = 'C:\\Users\\Gosia\\Desktop\\'
sys.path.insert(0, ROOT)

In [2]:
from matchms.importing import load_from_json
spectrums_lib = []
path_lcms = 'C:\\Users\\Gosia\\Desktop\\gnps_from_simon'
counter = 0
for s in os.listdir(path_lcms):
    if counter <= 5: 
        spectrums_lib += load_from_json(os.path.join(path_lcms,s))
        counter += 1

In [3]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses
def post_process_s2v(s):
    
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=10)
    s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
    if s is None:
        return None
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks
        
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s

In [4]:
# apply post processing steps to the data

spectrums_lib = [post_process_s2v(s) for s in spectrums_lib]

# omit spectrums that didn't qualify for analysis

spectrums_lib = [s for s in spectrums_lib if s is not None]


Create spectrum "documents"

In [5]:
from spec2vec import Spec2Vec
from spec2vec import SpectrumDocument

documents_lib = [SpectrumDocument(s, n_decimals=2) for s in spectrums_lib]


In [17]:
from spec2vec.model_building import train_new_word2vec_model
path_models = os.path.join(path_data, "trained_models")

model_file = os.path.join(path_models, "spec2vec_librarymatching_size_2.model")

iterations = [1, 3, 5, 10]

#Train model with size 10 and default parameters

model = train_new_word2vec_model(documents_lib, iterations, model_file, size = 2)

The value of size is set from 300 (default) to 2
  Epoch 1 of 10.Change in loss after epoch 1: 126087.96875
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_2_iter_1.model
  Epoch 2 of 10.Change in loss after epoch 2: 125198.765625
  Epoch 3 of 10.Change in loss after epoch 3: 114185.671875
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_2_iter_3.model
  Epoch 4 of 10.Change in loss after epoch 4: 94912.78125
  Epoch 5 of 10.Change in loss after epoch 5: 86137.25
Saving model with name: C:\Users\Gosia\Desktop\trained_models\spec2vec_librarymatching_size_2_iter_5.model
  Epoch 6 of 10.Change in loss after epoch 6: 86076.25
  Epoch 7 of 10.Change in loss after epoch 7: 85445.75
  Epoch 8 of 10.Change in loss after epoch 8: 84224.1875
  Epoch 9 of 10.Change in loss after epoch 9: 82612.75
  Epoch 10 of 10.Change in loss after epoch 10: 78908.4375
Saving model with name: C:\Users\Gosia\Desktop\trained_mo


Derive embeddings

In [18]:
from tqdm.notebook import tqdm  # optional, just to get a progress bar
from spec2vec.vector_operations import calc_vector


intensity_weighting_power = 0.5
allowed_missing_percentage = 15 # specify the maximum (weighted) fraction of the spectrum that is allowed to be missing

vector_size = model.vector_size
print(f"Embedding vector size: {vector_size}")

embeddings_spec2vec_lib = np.zeros((len(documents_lib), vector_size), dtype="float")
for i, doc in enumerate(tqdm(documents_lib)):
    embeddings_spec2vec_lib[i, 0:vector_size] = calc_vector(model, doc,
                                                        intensity_weighting_power,
                                                        allowed_missing_percentage)

Embedding vector size: 2


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=573.0), HTML(value='')))




In [19]:
print([np.round(x, 4) for x in embeddings_spec2vec_lib[0,:]])

[-136.6311, 188.3401]


# Creating knockoffs


In [20]:
import gensim
from sklearn.mixture import GaussianMixture as GMM
from sklearn.datasets import make_spd_matrix
from spec2vec import calc_vector

In [21]:
#Fitting Gaussian mixture model with 25 components, full covariance structure

gmm = GMM(n_components=25, covariance_type="full")
model = gmm.fit(np.array(embeddings_spec2vec_lib))


In [22]:
#Finding Dk matrix in sampling process

def find_Dk(covariance_matrix, embedding_dimension):
    
    eigs = np.linalg.eig(covariance_matrix)[0]
    min_eig = min(eigs)
    s = min(2*min_eig, 1)
    Dk = np.diag([s]*embedding_dimension)
    return Dk
               
def is_pos_semi_def(A, epsilon = 1e-10):    
    eigs = np.linalg.eig(A)[0]
    min_eig = min(eigs)
    return min_eig >= -epsilon

In [23]:
def create_knockoffs(model,vectors):
    embedding_dimension = len(vectors[0])
    covariances = model.covariances_
    means = model.means_
    Dks = []
    for cov in covariances:
        Dk = find_Dk( cov, embedding_dimension )
        if not is_pos_semi_def( 2*cov-Dk ):
            return
        Dks.append(Dk)

    knock_means_comps_1 = []
    knock_means_comps_2 = []
    knock_covs = []
    Id = np.diag([1]*embedding_dimension)
    for cov,mean,Dk in zip(covariances,means,Dks):
        knock_cov = 2*Dk - Dk@(cov@Dk)
        knock_mean_comp_1 = Dk@(cov@mean)
        knock_mean_comp_2 = Id - Dk@cov
        knock_means_comps_1.append(knock_mean_comp_1)
        knock_means_comps_2.append(knock_mean_comp_2)
        knock_covs.append(knock_cov)
        
    knockoffs = []
    bad_is = []
    components = np.arange(len(model.weights_))
    probs = model.predict_proba(vectors)
    for i, x in enumerate(vectors):        
        x_probs = probs[i]
        k_posterior = np.random.choice(components, p=x_probs)
        knock_mean = knock_means_comps_1[k_posterior] + knock_means_comps_2[k_posterior]@x
        knock_cov = knock_covs[k_posterior]
        if i and not i%100:
            print( 'trying',i )
        if not is_pos_semi_def(knock_cov):
            bad_is.append(i)
            continue
        try:
            knockoff_sample = np.random.multivariate_normal(knock_mean, knock_cov)
        except:
            bad_is.append(i)
            continue
        knockoffs.append(knockoff_sample)
        print('success',i)        
    return knockoffs, bad_is 

In [24]:
knockoffs = create_knockoffs(model,np.array(embeddings_spec2vec_lib))

success 86
success 88
trying 100
success 111
trying 200
trying 300
trying 400
success 402
success 462
trying 500
success 529


In [34]:
for i,j in enumerate([86,88,111,402,462,529]):
    print(knockoffs[0][i],embeddings_spec2vec_lib[j])


[304.67222185 513.5507422 ] [304.65259373 513.55843477]
[349.75359173 500.83476762] [349.77825101 500.82925441]
[-137.14737412  351.26883636] [-137.10734541  351.31322076]
[179.88687467 478.56567589] [179.91519688 478.57459379]
[133.041637   463.96743423] [133.01137738 463.95526022]
[-180.47844031  307.57050187] [-180.52146339  307.52899428]
