# Similarity score comparison

In [1]:
!pip install matchms
!pip install pubchempy
!pip install rdkit-pypi
!pip install spec2vec --user



In [1]:
import os
import numpy as np
from matchms.importing import load_from_mgf
import matplotlib.pyplot as plt
import pubchempy
import pandas as pd
from itertools import compress
from rdkit import Chem
from rdkit.Chem import Draw

In [2]:
from matchms import calculate_scores
from matchms.similarity import CosineGreedy
from matchms.similarity import ModifiedCosine
from matchms.similarity import NeutralLossesCosine

In [3]:
import gensim
from spec2vec import Spec2Vec
from matchms.filtering import default_filters
from matchms.filtering import normalize_intensities

In [4]:
## Directory that holds the input data. The default is the original author's local
## checkout; set the environment variable SIMILARITY_DATA_DIR to run the notebook
## on another machine without editing this cell.
path_data = os.environ.get(
    "SIMILARITY_DATA_DIR",
    "C:/Users/naake/Documents/GitHub/SimilarityHackathon/data/")
file_mgf_reference = os.path.join(path_data, "GNPS-LIBRARY.mgf")
## keep the loaded spectra in a list, they are iterated several times further below
reference_spectra = list(load_from_mgf(file_mgf_reference))

In [5]:
#file_mgf_query = os.path.join(path_data, "pseudo_query_neg_v1.mgf")
#query_spectra = list(load_from_mgf(file_mgf_query))

[<matchms.Spectrum.Spectrum at 0x26824b695e0>,
 <matchms.Spectrum.Spectrum at 0x2682b4e2520>,
 <matchms.Spectrum.Spectrum at 0x26824a99400>,
 <matchms.Spectrum.Spectrum at 0x2682e2054f0>,
 <matchms.Spectrum.Spectrum at 0x2682e205160>,
 <matchms.Spectrum.Spectrum at 0x2682e205850>,
 <matchms.Spectrum.Spectrum at 0x2682e2059d0>,
 <matchms.Spectrum.Spectrum at 0x2682e205b80>,
 <matchms.Spectrum.Spectrum at 0x2682e205d00>,
 <matchms.Spectrum.Spectrum at 0x2682e205e80>,
 <matchms.Spectrum.Spectrum at 0x2682e2120d0>,
 <matchms.Spectrum.Spectrum at 0x2682e212340>,
 <matchms.Spectrum.Spectrum at 0x2682e212310>,
 <matchms.Spectrum.Spectrum at 0x2682e2124f0>,
 <matchms.Spectrum.Spectrum at 0x2682e212670>,
 <matchms.Spectrum.Spectrum at 0x2682e2127f0>,
 <matchms.Spectrum.Spectrum at 0x2682e212970>,
 <matchms.Spectrum.Spectrum at 0x2682e212af0>,
 <matchms.Spectrum.Spectrum at 0x2682e212c70>,
 <matchms.Spectrum.Spectrum at 0x2682e212df0>,
 <matchms.Spectrum.Spectrum at 0x2682e212f40>,
 <matchms.Spe

# Expand grid function

Define a function that builds a data frame containing all possible combinations of the given parameter values. The sections below iterate over the rows of these data frames.

In [6]:
from itertools import product
import pandas as pd

def expand_grid(dictionary):
    """Build a data frame with every combination of the given parameter values.

    Parameters
    ----------
    dictionary : dict
        Maps a parameter name to an iterable of candidate values.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``dictionary`` and one row per element of the
        Cartesian product of its values (the last key varies fastest).
    """
    value_combinations = list(product(*dictionary.values()))
    return pd.DataFrame(value_combinations, columns=dictionary.keys())


## Parameter spaces to scan, one grid per similarity measure. Every grid is
## expanded right away into a data frame with one row per parameter combination.
combinations_cosine = expand_grid({
    'tolerance': [0.005, 0.01],
    'mz_power': [0, 0.5, 1, 2],
    'intensity_power': [0, 0.5, 1, 2],
})

combinations_modified_cosine = expand_grid({
    'tolerance': [0.005, 0.01],
    'mz_power': [0, 0.5, 1, 2],
    'intensity_power': [0, 0.5, 1, 2],
})

combinations_neutralloss_cosine = expand_grid({
    'tolerance': [0.01],
    'mz_power': [0, 0.5, 1, 2],
    'intensity_power': [0, 0.5, 1, 2],
    'ignore_peaks_above_precursor': [True],
})

combinations_spec2vec = expand_grid({
    'intensity_weighting_power': [0, 0.5, 1, 2],
    'allowed_missing_percentage': [1, 5, 10, 20],
})

## Spec2Vec

In [7]:
## Directory that holds the pretrained Spec2Vec model. The default is the original
## author's local checkout; set the environment variable SIMILARITY_DATA_DIR to
## run the notebook on another machine without editing this cell.
path_model = os.environ.get(
    "SIMILARITY_DATA_DIR",
    "C:/Users/naake/Documents/GitHub/SimilarityHackathon/data/")
filename_model = "spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model"
filename = os.path.join(path_model, filename_model)
## load the Word2Vec model that is handed to Spec2Vec further below
model = gensim.models.Word2Vec.load(filename)

In [22]:
def peak_processing(spectrum):
    """Apply the matchms default filters to a spectrum, then normalize its intensities.

    Returns the result of ``normalize_intensities`` for the filtered spectrum.
    """
    return normalize_intensities(default_filters(spectrum))

## Spec2Vec input: processed versions of all reference spectra that contain peaks
reference_s2v_spectra = [
    peak_processing(spectrum)
    for spectrum in reference_spectra
    if len(spectrum.peaks) > 0
]















































































































































































































































































































































































































In [9]:
## show the expanded Spec2Vec parameter grid (one row per parameter combination)
combinations_spec2vec

Unnamed: 0,intensity_weighting_power,allowed_missing_percentage
0,0.0,1
1,0.0,5
2,0.0,10
3,0.0,20
4,0.5,1
5,0.5,5
6,0.5,10
7,0.5,20
8,1.0,1
9,1.0,5


In [None]:
## Scan the Spec2Vec parameter grid: for every parameter combination compute the
## all-vs-all similarities of the processed reference spectra and collect them in
## long format (one value per spectrum pair).
new_colnames = []   ## one column name per parameter combination
new_cols = []       ## the long-format score vector belonging to each name

## the spectrum ids label both axes of every score matrix, so collect them once
spectrum_ids = [s.metadata["spectrum_id"] for s in reference_s2v_spectra]

for index, row in combinations_spec2vec.iterrows():
    ## similarity measure for the current parameter combination
    spec2vec_similarity = Spec2Vec(model=model,
                               intensity_weighting_power=row["intensity_weighting_power"],
                               allowed_missing_percentage=row["allowed_missing_percentage"])
    ## the similarity measure is passed by keyword: a positional argument must not
    ## follow keyword arguments (this was a SyntaxError before)
    spec2vec_scores = calculate_scores(references=reference_s2v_spectra, queries=reference_s2v_spectra,
                          similarity_function=spec2vec_similarity, is_symmetric=False)
    scores = spec2vec_scores.scores
    ## copy the scores into a float matrix in one step instead of element by element
    SCORES = np.array(scores, dtype=float)

    ## add row- and colnames to data.frame
    SCORES_df = pd.DataFrame(SCORES, index=spectrum_ids, columns=spectrum_ids)
    SCORES_df['rows'] = SCORES_df.index

    ## from wide to long
    SCORES_df = pd.melt(SCORES_df, id_vars ='rows', value_vars = list(SCORES_df.columns[:-1]))
    new_column = 'Spec2Vec' + "_" + str(row['intensity_weighting_power']) + "_" + str(row['allowed_missing_percentage'])
    new_colnames.append(new_column)
    new_cols.append(SCORES_df['value'].values)

    ## after the loop SCORES_df is the long table of the last parameter combination;
    ## the columns of the other combinations are attached from new_cols afterwards
    SCORES_df = SCORES_df.rename(columns={'value': new_column})
    

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [None]:
## attach the scores of every parameter combination as its own column
for colname, values in zip(new_colnames, new_cols):
    SCORES_df[colname] = list(values)

In [None]:
## inspect the long-format score table (one column per parameter combination)
SCORES_df

In [None]:
## only keep the comparisons between same collision energies
## (the part after the last "_" of both spectrum ids has to be identical;
## vectorised string operations avoid one Python call per row of the long table)
SCORES_df["matching_CE"] = (
    SCORES_df["rows"].str.rsplit("_", n=1).str[-1]
    == SCORES_df["variable"].str.rsplit("_", n=1).str[-1]
)

In [None]:
## drop all spectrum pairs that were not flagged in the "matching_CE" column
SCORES_df = SCORES_df.loc[SCORES_df["matching_CE"]]

In [None]:
## write the filtered Spec2Vec score table to a CSV file in the current working directory
SCORES_df.to_csv("similarities_Spec2Vec2_GNPS.csv")

## Neutral loss score

In [None]:
## show the expanded neutral loss parameter grid (one row per parameter combination)
combinations_neutralloss_cosine

In [None]:
## neutral loss input: only the reference spectra that have more than two peaks
reference_spectra_red = [
    spectrum
    for spectrum in reference_spectra
    if len(spectrum.peaks) >= 3
]

In [None]:
## Scan the neutral loss parameter grid: for every parameter combination compute the
## all-vs-all similarities of the reduced reference spectra and collect them in
## long format (one value per spectrum pair).
new_colnames = []   ## one column name per parameter combination
new_cols = []       ## the long-format score vector belonging to each name

## the spectrum ids label both axes of every score matrix, so collect them once
spectrum_ids = [s.metadata["spectrum_id"] for s in reference_spectra_red]

for index, row in combinations_neutralloss_cosine.iterrows():
    ## progress indicator: position in the parameter grid
    print(index)
    neutralloss_similarity = NeutralLossesCosine(tolerance=row["tolerance"],
            mz_power=row["mz_power"], intensity_power=row["intensity_power"],
            ignore_peaks_above_precursor=row["ignore_peaks_above_precursor"])
    ## the similarity measure is passed by keyword: a positional argument must not
    ## follow keyword arguments (this was a SyntaxError before)
    neutralloss_scores = calculate_scores(references=reference_spectra_red, queries=reference_spectra_red,
                          similarity_function=neutralloss_similarity, is_symmetric=False)
    scores = neutralloss_scores.scores
    ## every entry of the score matrix is a record; only its first element is kept
    SCORES = np.array([[entry[0] for entry in score_row] for score_row in scores],
                      dtype=float).reshape(scores.shape)

    ## add row- and colnames to data.frame
    SCORES_df = pd.DataFrame(SCORES, index=spectrum_ids, columns=spectrum_ids)
    SCORES_df['rows'] = SCORES_df.index

    ## from wide to long
    SCORES_df = pd.melt(SCORES_df, id_vars ='rows', value_vars = list(SCORES_df.columns[:-1]))
    new_column = 'Neutralloss' + "_" + str(row['tolerance']) + "_" + str(row['mz_power']) + "_" + str(row["intensity_power"]) + "_" + str(row["ignore_peaks_above_precursor"])
    new_colnames.append(new_column)
    new_cols.append(SCORES_df['value'].values)

    ## after the loop SCORES_df is the long table of the last parameter combination;
    ## the columns of the other combinations are attached from new_cols afterwards
    SCORES_df = SCORES_df.rename(columns={'value': new_column})
    

In [None]:
## attach the scores of every parameter combination as its own column
for colname, values in zip(new_colnames, new_cols):
    SCORES_df[colname] = list(values)

In [None]:
## inspect the long-format score table (one column per parameter combination)
SCORES_df

In [None]:
## only keep the comparisons between same collision energies
## (the part after the last "_" of both spectrum ids has to be identical;
## vectorised string operations avoid one Python call per row of the long table)
SCORES_df["matching_CE"] = (
    SCORES_df["rows"].str.rsplit("_", n=1).str[-1]
    == SCORES_df["variable"].str.rsplit("_", n=1).str[-1]
)

In [None]:
## drop all spectrum pairs that were not flagged in the "matching_CE" column
SCORES_df = SCORES_df.loc[SCORES_df["matching_CE"]]

In [None]:
## write the filtered neutral loss score table to a CSV file in the current working directory
SCORES_df.to_csv("similarities_NeutralLoss_GNPS.csv")