# Similarity score comparison

In [1]:
!pip install matchms
!pip install pubchempy
!pip install rdkit-pypi
!pip install spec2vec --user



In [1]:
import os
import numpy as np
from matchms.importing import load_from_mgf
import matplotlib.pyplot as plt
import pubchempy
import pandas as pd
from itertools import compress
from rdkit import Chem
from rdkit.Chem import Draw

In [2]:
from matchms import calculate_scores
from matchms.similarity import CosineGreedy
from matchms.similarity import ModifiedCosine
from matchms.similarity import NeutralLossesCosine

In [3]:
import gensim
from spec2vec import Spec2Vec
from matchms.filtering import default_filters
from matchms.filtering import normalize_intensities

In [4]:
# Directory holding the GNPS library export. It can be overridden with the
# SIMILARITY_DATA_DIR environment variable; the fallback is the path that was
# hard-coded originally, so existing setups keep working unchanged.
path_data = os.environ.get(
    "SIMILARITY_DATA_DIR",
    "C:/Users/naake/Documents/GitHub/SimilarityHackathon/data/",
)
file_mgf_reference = os.path.join(path_data, "GNPS-LIBRARY.mgf")
# Materialize the loaded spectra as a list so they can be iterated repeatedly below.
reference_spectra = list(load_from_mgf(file_mgf_reference))

# Expand grid function

Define a function that builds a data frame containing every possible combination of the given parameter values, one combination per row. The sections below iterate over the rows of these data frames.

In [5]:
from itertools import product
import pandas as pd

def expand_grid(dictionary):
    """Return a data frame with one row per combination of the given values.

    `dictionary` maps each parameter name to a sequence of candidate values.
    The result has one column per key (in the dictionary's order) and one row
    for every element of the Cartesian product of the value sequences.
    """
    every_combination = list(product(*dictionary.values()))
    return pd.DataFrame(every_combination, columns=dictionary.keys())


## Parameter spaces, expanded directly into one-row-per-combination data frames.
## Keyword order determines the column order of each grid.
combinations_cosine = expand_grid(dict(
    tolerance=[0.005, 0.01],
    mz_power=[0, 0.5, 1, 2],
    intensity_power=[0, 0.5, 1, 2],
))
combinations_modified_cosine = expand_grid(dict(
    tolerance=[0.005, 0.01],
    mz_power=[0, 0.5, 1, 2],
    intensity_power=[0, 0.5, 1, 2],
))
combinations_neutralloss_cosine = expand_grid(dict(
    tolerance=[0.01],
    mz_power=[0, 1, 2],
    intensity_power=[0, 0.5, 1],
    ignore_peaks_above_precursor=[True],
))
combinations_spec2vec = expand_grid(dict(
    intensity_weighting_power=[0, 0.5, 1, 2],
    allowed_missing_percentage=[5, 10, 20],
))

## Spec2Vec

In [6]:
# Directory holding the pretrained Spec2Vec word2vec model. It can be overridden
# with the SIMILARITY_DATA_DIR environment variable; the fallback is the path
# that was hard-coded originally, so existing setups keep working unchanged.
path_model = os.environ.get(
    "SIMILARITY_DATA_DIR",
    "C:/Users/naake/Documents/GitHub/SimilarityHackathon/data/",
)
filename_model = "spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model"
filename = os.path.join(path_model, filename_model)
# NOTE(review): gensim model files are presumably pickle-based -- only load
# model files that come from a trusted source.
model = gensim.models.Word2Vec.load(filename)

In [7]:
def peak_processing(spectrum):
    """Run `default_filters` and then `normalize_intensities` on `spectrum`.

    Returns whatever `normalize_intensities` returns for the filtered spectrum.
    """
    return normalize_intensities(default_filters(spectrum))

# Preprocess every reference spectrum that has at least one peak.
reference_s2v_spectra = [
    peak_processing(spec)
    for spec in reference_spectra
    if len(spec.peaks) > 0
]















































































































































































































































































































































































































In [8]:
# Show the expanded Spec2Vec parameter grid (one row per parameter combination)
combinations_spec2vec

Unnamed: 0,intensity_weighting_power,allowed_missing_percentage
0,0.0,5
1,0.0,10
2,0.0,20
3,0.5,5
4,0.5,10
5,0.5,20
6,1.0,5
7,1.0,10
8,1.0,20
9,2.0,5


In [13]:
# Score all reference spectra against each other once per Spec2Vec parameter
# combination. For every combination the long-format score values and their
# column name are collected in `new_cols` / `new_colnames`; after the loop
# `SCORES_df` holds the long-format table of the last combination.
new_colnames = []
new_cols = []

# The spectrum ids label both axes of every score matrix and do not depend on
# the parameter combination, so they are collected once up front.
spectrum_ids = [s.metadata["spectrum_id"] for s in reference_s2v_spectra]

for index, row in combinations_spec2vec.iterrows():
    print(index)
    spec2vec_similarity = Spec2Vec(model=model,
                               intensity_weighting_power=row["intensity_weighting_power"],
                               allowed_missing_percentage=row["allowed_missing_percentage"])
    # all-vs-all comparison: the same spectra are passed as references and queries
    spec2vec_scores = calculate_scores(reference_s2v_spectra, reference_s2v_spectra, spec2vec_similarity,
                          is_symmetric=False)
    scores = spec2vec_scores.scores
    # Convert to a float matrix in one vectorized step instead of copying
    # element by element in a Python-level double loop.
    # NOTE(review): assumes `scores` is a plain 2-D numeric array (one float
    # per spectrum pair) -- confirm for the installed matchms version.
    SCORES = np.asarray(scores, dtype=float)

    ## add row- and colnames to data.frame
    SCORES_df = pd.DataFrame(SCORES, index=spectrum_ids, columns=spectrum_ids)
    SCORES_df['rows'] = SCORES_df.index

    ## from wide to long ('rows' is the last column and is excluded from value_vars)
    SCORES_df = pd.melt(SCORES_df, id_vars='rows', value_vars=list(SCORES_df.columns[:-1]))
    new_column = "_".join(["Spec2Vec",
                           str(row['intensity_weighting_power']),
                           str(row['allowed_missing_percentage'])])
    new_colnames.append(new_column)
    new_cols.append(SCORES_df['value'].values)

    SCORES_df = SCORES_df.rename(columns={'value': new_column})
    

0




















































































































































1








































































































2


































































3






























































































































4


























































































5




































































6




























































































7










































































8


























































9






























































10
























































11




















































In [14]:
# Attach the scores of every parameter combination as its own column. The NumPy
# arrays are assigned directly: wrapping each one in `list(...)` would first
# create one Python object per score, which is slow and memory-hungry.
for colname, values in zip(new_colnames, new_cols):
    SCORES_df[colname] = values

In [15]:
# Show the long-format score table (one row per spectrum pair, one column per parameter combination)
SCORES_df

Unnamed: 0,rows,variable,Spec2Vec_2.0_20.0,Spec2Vec_0.0_5.0,Spec2Vec_0.0_10.0,Spec2Vec_0.0_20.0,Spec2Vec_0.5_5.0,Spec2Vec_0.5_10.0,Spec2Vec_0.5_20.0,Spec2Vec_1.0_5.0,Spec2Vec_1.0_10.0,Spec2Vec_1.0_20.0,Spec2Vec_2.0_5.0,Spec2Vec_2.0_10.0
0,CCMSLIB00000001547,CCMSLIB00000001547,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,CCMSLIB00000001548,CCMSLIB00000001547,0.225854,0.488936,0.488936,0.488936,0.502385,0.502385,0.502385,0.472098,0.472098,0.472098,0.225854,0.225854
2,CCMSLIB00000001549,CCMSLIB00000001547,0.009459,-0.166572,-0.166572,-0.166572,-0.066494,-0.066494,-0.066494,0.050126,0.050126,0.050126,0.009459,0.009459
3,CCMSLIB00000001550,CCMSLIB00000001547,0.200955,0.205473,0.205473,0.205473,0.197767,0.197767,0.197767,0.203799,0.203799,0.203799,0.200955,0.200955
4,CCMSLIB00000001551,CCMSLIB00000001547,0.041285,-0.161674,-0.161674,-0.161674,-0.070635,-0.070635,-0.070635,0.034687,0.034687,0.034687,0.041285,0.041285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162970751,CCMSLIB00010129217,CCMSLIB00010129221,-0.051378,0.006951,0.006951,0.006951,-0.030488,-0.030488,-0.030488,-0.058455,-0.058455,-0.058455,-0.051378,-0.051378
162970752,CCMSLIB00010129218,CCMSLIB00010129221,0.859743,0.719353,0.719353,0.719353,0.864900,0.864900,0.864900,0.875463,0.875463,0.875463,0.859743,0.859743
162970753,CCMSLIB00010129219,CCMSLIB00010129221,0.999873,0.577909,0.577909,0.577909,0.935467,0.935467,0.935467,0.992311,0.992311,0.992311,0.999873,0.999873
162970754,CCMSLIB00010129220,CCMSLIB00010129221,0.999784,0.443625,0.443625,0.443625,0.881790,0.881790,0.881790,0.987035,0.987035,0.987035,0.999784,0.999784


In [16]:
## only keep the comparisons between same collision energies
# The collision energy is taken to be the text after the last "_" of a spectrum
# id. Each distinct id is split once and the result is mapped back onto the
# columns; the previous row-wise `apply(axis=1)` ran out of memory on this table.
# NOTE(review): ids without "_" are compared as whole strings, so for such ids
# only self-comparisons are flagged as matching -- confirm this is intended.
distinct_ids = set(SCORES_df["rows"].unique()).union(SCORES_df["variable"].unique())
ce_by_id = {spectrum_id: spectrum_id.split("_")[-1] for spectrum_id in distinct_ids}
SCORES_df["matching_CE"] = SCORES_df["rows"].map(ce_by_id) == SCORES_df["variable"].map(ce_by_id)

MemoryError: 

In [None]:
# Keep only the rows flagged in the boolean "matching_CE" column
SCORES_df = SCORES_df.loc[SCORES_df["matching_CE"]]

In [17]:
# Write the Spec2Vec score table to a CSV file (relative path, i.e. the current working directory)
SCORES_df.to_csv("similarities_Spec2Vec2_GNPS.csv")

## Neutral loss score

In [18]:
# Show the expanded neutral-loss parameter grid (one row per parameter combination)
combinations_neutralloss_cosine

Unnamed: 0,tolerance,mz_power,intensity_power,ignore_peaks_above_precursor
0,0.01,0,0.0,True
1,0.01,0,0.5,True
2,0.01,0,1.0,True
3,0.01,1,0.0,True
4,0.01,1,0.5,True
5,0.01,1,1.0,True
6,0.01,2,0.0,True
7,0.01,2,0.5,True
8,0.01,2,1.0,True


In [26]:
# Keep spectra that have more than two peaks and a positive precursor m/z.
# The peak-count check comes first, so the precursor m/z is only looked up
# for spectra that pass it.
reference_spectra_red = [
    spec
    for spec in reference_spectra
    if len(spec.peaks) > 2 and spec.metadata["precursor_mz"] > 0
]

In [None]:
# Score all reduced reference spectra against each other once per neutral-loss
# parameter combination. For every combination the long-format score values and
# their column name are collected in `new_cols` / `new_colnames`; after the loop
# `SCORES_df` holds the long-format table of the last combination.
new_colnames = []
new_cols = []

# The spectrum ids label both axes of every score matrix and do not depend on
# the parameter combination, so they are collected once up front.
spectrum_ids = [s.metadata["spectrum_id"] for s in reference_spectra_red]

for index, row in combinations_neutralloss_cosine.iterrows():
    print(index)
    neutralloss_similarity = NeutralLossesCosine(tolerance=row["tolerance"],
            mz_power=row["mz_power"], intensity_power=row["intensity_power"],
            ignore_peaks_above_precursor=row["ignore_peaks_above_precursor"])
    # all-vs-all comparison: the same spectra are passed as references and queries
    neutralloss_scores = calculate_scores(reference_spectra_red, reference_spectra_red, neutralloss_similarity,
                          is_symmetric=True)
    scores = neutralloss_scores.scores
    # Take the first field of every entry (what `scores[i][j][0]` read) for the
    # whole matrix at once instead of a Python-level double loop.
    # NOTE(review): assumes `scores` is a NumPy structured array whose first
    # field is the similarity value -- confirm for the installed matchms version.
    SCORES = np.asarray(scores[scores.dtype.names[0]], dtype=float)

    ## add row- and colnames to data.frame
    SCORES_df = pd.DataFrame(SCORES, index=spectrum_ids, columns=spectrum_ids)
    SCORES_df['rows'] = SCORES_df.index

    ## from wide to long ('rows' is the last column and is excluded from value_vars)
    SCORES_df = pd.melt(SCORES_df, id_vars='rows', value_vars=list(SCORES_df.columns[:-1]))
    new_column = "_".join(["Neutralloss",
                           str(row['tolerance']),
                           str(row['mz_power']),
                           str(row["intensity_power"]),
                           str(row["ignore_peaks_above_precursor"])])
    new_colnames.append(new_column)
    new_cols.append(SCORES_df['value'].values)

    SCORES_df = SCORES_df.rename(columns={'value': new_column})
    

0


In [None]:
# Attach the scores of every parameter combination as its own column. The NumPy
# arrays are assigned directly: wrapping each one in `list(...)` would first
# create one Python object per score, which is slow and memory-hungry.
for colname, values in zip(new_colnames, new_cols):
    SCORES_df[colname] = values

In [None]:
# Show the long-format score table (one row per spectrum pair, one column per parameter combination)
SCORES_df

In [None]:
## only keep the comparisons between same collision energies
# The collision energy is taken to be the text after the last "_" of a spectrum
# id. Each distinct id is split once and the result is mapped back onto the
# columns, which avoids a row-wise `apply(axis=1)` that builds one Series
# object per row of this very long table.
# NOTE(review): ids without "_" are compared as whole strings, so for such ids
# only self-comparisons are flagged as matching -- confirm this is intended.
distinct_ids = set(SCORES_df["rows"].unique()).union(SCORES_df["variable"].unique())
ce_by_id = {spectrum_id: spectrum_id.split("_")[-1] for spectrum_id in distinct_ids}
SCORES_df["matching_CE"] = SCORES_df["rows"].map(ce_by_id) == SCORES_df["variable"].map(ce_by_id)

In [None]:
# Keep only the rows flagged in the boolean "matching_CE" column
SCORES_df = SCORES_df.loc[SCORES_df["matching_CE"]]

In [None]:
# Write the neutral-loss score table to a CSV file (relative path, i.e. the current working directory)
SCORES_df.to_csv("similarities_NeutralLoss_GNPS.csv")