## Creation of knockoffs from Spec2Vec embeddings


Load the file with the spectra. Replace `folder_name` in the cell below with the path to your own data directory.

In [1]:
import numpy as np
import os
import sys
from matchms.importing import load_from_json
from spec2vec import Spec2Vec
from spec2vec import SpectrumDocument

# Base directory that holds the `FDR-datsets` data folder and the
# `FDR-Metabolomics` checkout. Set the FDR_BASE_DIR environment variable to
# point at your own location without editing the notebook; the original
# author's path is kept as the fallback.
folder_name = os.environ.get('FDR_BASE_DIR', 'C:\\Users\\Gosia\\Desktop')
json_file_name = os.path.join(folder_name, 'FDR-datsets', 'specs.json')

# Make the project sources importable for the later cells. The membership
# check keeps sys.path free of duplicates when this cell is re-run.
src_dir = os.path.join(folder_name, 'FDR-Metabolomics', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)


In [2]:
# Read every spectrum from the JSON file and keep only those whose metadata
# carries a non-empty 'inchikey' entry.
spectrums = [
    spectrum
    for spectrum in load_from_json(json_file_name)
    if spectrum.metadata.get('inchikey')
]

In [3]:
# Number of spectra left after dropping those without an InChIKey.
print(len(spectrums))


31147
31147


Create a mapping from each InChIKey prefix (the part of the key before the first `-`) to the spectra that share it. Spectra with the same prefix are treated as matching spectra.

In [4]:
# Group the spectra by the first block of their InChIKey, i.e. the text that
# precedes the first '-'. Each prefix maps to the list of spectra sharing it.
inchi_dict = {}
for s in spectrums:
    prefix = s.metadata['inchikey'].partition('-')[0]
    inchi_dict.setdefault(prefix, []).append(s)

Dividing the spectra into library and queries

In [5]:
# Number of query spectra to hold out of the library.
query_size = 1_000

Select the queries and build the library: the library keeps at least one true match for every query, plus all remaining spectra, which act as noise.

In [6]:
# Dedicated, seeded generator so that the query/library split is the same on
# every run and the global NumPy random state stays untouched.
split_seed = 42
rng = np.random.default_rng(split_seed)

# Only prefixes with more than one spectrum may supply a query: once the query
# is taken out, at least one true match is still left in the library.
multis = {prefix for prefix, specs in inchi_dict.items() if len(specs) > 1}

# Sample from a sorted list: iteration order of a set of strings can differ
# between interpreter runs, which would defeat the seed.
matching_keys = rng.choice(sorted(multis), size=query_size, replace=False)

# Pick one spectrum per selected prefix as the query. The pick is made by
# index so that the list of spectrum objects is never converted to an array.
# Queries are keyed by their spectrum_id for the membership test below.
query_spec = {}
for q in matching_keys:
    candidates = inchi_dict[q]
    chosen = candidates[rng.integers(len(candidates))]
    query_spec[chosen.metadata['spectrum_id']] = chosen

# Every spectrum that was not drawn as a query goes into the library.
spectrums_lib = [s for s in spectrums if s.metadata['spectrum_id'] not in query_spec]

spectrums_query = list(query_spec.values())

Create spectrum "documents"

In [7]:
# Turn the query and library spectra into SpectrumDocument objects, using
# two decimals for both sets (queries first, then the library).
documents_query, documents_lib = (
    [SpectrumDocument(spectrum, n_decimals=2) for spectrum in collection]
    for collection in (spectrums_query, spectrums_lib)
)


Load pretrained model

In [8]:
import gensim

# Build the model path from `folder_name` so the base directory is configured
# in a single place instead of being repeated as a second absolute path.
model_file = os.path.join(folder_name, 'trained_models_1', 'spec2vec_size_170.model')
model = gensim.models.Word2Vec.load(model_file)


In [9]:
from cosine_calc import get_hits

# Search settings; the same two values are reused for the knockoff search
# further down in the notebook.
intensity_weighting_power = 0.5
allowed_missing_percentage = 15

# Score every query document against the real library documents.
hits = get_hits(
    documents_query,
    documents_lib,
    spec2vec_model=model,
    allowed_missing_percentage=allowed_missing_percentage,
    intensity_weighting_power=intensity_weighting_power,
    passatutto=False,
)

In [10]:
# Calculate the true q-values from the hits against the real library only.
# NOTE(review): later cells read entry[0] as the q-value and entry[2] as the
# score, and walk the list as if it were ordered by descending score;
# confirm this against q_value_calc.
from q_value_calc import calculate_q_value
q_list_true = calculate_q_value(hits)

## Creation of knockoffs

In [None]:
from knockoffs import generate_knockoffs

# Parameter grid for the knockoff generation. Every (diag, comp) combination
# is evaluated independently of the others.
diags = [50]
comps = [10]

# Maps (diag, comp) -> q-value list estimated with the knockoffs as decoys.
q_list_knockoffs = {}
for diag in diags:
    for comp in comps:
        try:
            knockoff_documents = generate_knockoffs(
                model,
                documents_lib,
                allowed_missing_percentage=allowed_missing_percentage,
                n_components=comp,
                diagonal_matrix=diag,
            )
            # NOTE(review): precursor_tol=3 is passed here but not in the
            # search against the real library; confirm this is intended.
            hits_knockoffs = get_hits(
                documents_query,
                knockoff_documents,
                decoys=True,
                spec2vec_model=model,
                precursor_tol=3,
                intensity_weighting_power=intensity_weighting_power,
                allowed_missing_percentage=allowed_missing_percentage,
                passatutto=False,
            )
            # Real and knockoff hits are pooled before the q-values are computed.
            q_list_knockoffs[(diag, comp)] = calculate_q_value(hits + hits_knockoffs, True)
        except Exception as e:
            # Best-effort sweep: report exactly which grid point failed and
            # with what kind of error, then carry on with the next one.
            print((diag, comp), "failed:", repr(e))

Embedding vector size: 170
Embedding vector size: 170




In [None]:
import plot_q_vals

# plot estimated and true q-values
def combine_true_est(q_val_true,q_val_est):
    """Pair every estimated q-value with the true q-value at the same score.

    Both arguments are sequences of 3-item entries whose first item is a
    q-value and whose third item is a score. The estimated list is walked in
    order while a cursor moves forward through the true list; the cursor
    never moves back, so both lists are expected to be ordered by
    descending score.

    Returns a list of ``(score, true_q, estimated_q)`` tuples, one per entry
    of ``q_val_est``.
    """
    last_true = len(q_val_true) - 1
    cursor = 0
    combined = []
    for estimated_q, _, est_score in q_val_est:
        # Move to the last true entry whose score is still >= this score.
        while cursor < last_true:
            next_true_score = q_val_true[cursor + 1][2]
            if not next_true_score >= est_score:
                break
            cursor += 1
        combined.append((est_score, q_val_true[cursor][0], estimated_q))
    return combined
        
# For every knockoff setting, collect the (true q-values, estimated q-values)
# pair of series to plot. The combination is computed once per setting and
# then split into its columns.
to_plot = {}
for k, v in q_list_knockoffs.items():
    combined_columns = list(zip(*combine_true_est(q_list_true, v)))
    # Column 0 holds the scores, column 1 the true and column 2 the estimated q-values.
    to_plot[k] = combined_columns[1], combined_columns[2]
plot_q_vals.plot_q_vals( to_plot )

In [None]:
# Rank the library hits from highest to lowest score, then list the ranks
# within the top 20 whose `hit` flag is falsy.
hits_sorted = sorted(hits, key=lambda candidate: candidate.score, reverse=True)
for rank, ranked_hit in enumerate(hits_sorted[:20]):
    if ranked_hit.hit:
        continue
    print(rank)

In [None]:
# NOTE(review): this cell repeats the sorting and the top-20 report of the
# previous cell and yields the same output; it looks like a leftover and is
# a candidate for removal.
# Sort the hits by descending score and print the ranks in the top 20 whose
# `hit` flag is falsy.
hits_sorted = sorted(hits, key=lambda h:h.score, reverse=True)
for i, h in enumerate( hits_sorted[:20] ):
    if not h.hit:
        print(i)

In [None]:
from pprint import pprint

# Inspect one hit by hand: the entry at index 5 of the score-ranked list.
hit = hits_sorted[5]

# Print the peaks of the query side of that hit. Use `.metadata` in place of
# `.peaks` to look at the annotations instead.
query_peaks = [peak for peak in hit.query._obj.peaks]
pprint(query_peaks)

In [None]:
# Print the peaks of the target side of the inspected hit (presumably the
# matched library spectrum). Use `.metadata` in place of `.peaks` to look at
# the annotations instead.
target_peaks = [peak for peak in hit.target._obj.peaks]
pprint(target_peaks)

In [None]:
# Display the true q-value list for manual inspection.
q_list_true

In [None]:
# Display the knockoff-based q-value list of the first stored (diag, comp) setting.
list(q_list_knockoffs.values())[0]