# Similarity score comparison

In [1]:
!pip install matchms
!pip install pubchempy
!pip install rdkit-pypi
!pip install spec2vec --user



In [1]:
import os
import numpy as np
from matchms.importing import load_from_mgf
import matplotlib.pyplot as plt
import pubchempy
import pandas as pd
from itertools import compress
from rdkit import Chem
from rdkit.Chem import Draw

In [2]:
from matchms import calculate_scores
from matchms.similarity import CosineGreedy
from matchms.similarity import ModifiedCosine
from matchms.similarity import NeutralLossesCosine

In [3]:
import gensim
from spec2vec import Spec2Vec
from matchms.filtering import default_filters
from matchms.filtering import normalize_intensities

In [4]:
# Directory holding the input .mgf files. The default is the original local
# checkout; set the SIMILARITY_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
path_data = os.environ.get(
    "SIMILARITY_DATA_DIR",
    "C:/Users/naake/Documents/GitHub/SimilarityHackathon/data/")
file_mgf_reference = os.path.join(path_data, "ex_spectra_stds_NEG_scans_50.mgf")
# Materialise the loader's output as a list: later cells iterate over the
# reference spectra more than once.
reference_spectra = list(load_from_mgf(file_mgf_reference))

In [5]:
# The query spectra are read from the same `path_data` directory as the
# reference library and kept as a list.
file_mgf_query = os.path.join(path_data, "pseudo_query_neg_v1.mgf")
query_spectra = [spectrum for spectrum in load_from_mgf(file_mgf_query)]

# Expand grid function

Define a function that builds every possible combination of the given parameter values (a full parameter grid) as a data frame. The sections below iterate over the rows of these data frames.

In [12]:
from itertools import product
import pandas as pd

def expand_grid(dictionary):
    """Build the full factorial grid of the given parameter values.

    Parameters
    ----------
    dictionary : dict
        Maps each parameter name to an iterable of the values to try.

    Returns
    -------
    pandas.DataFrame
        One column per key (in the dictionary's order) and one row per element
        of the Cartesian product of the values; the last key varies fastest.
    """
    grid_rows = list(product(*dictionary.values()))
    return pd.DataFrame(grid_rows, columns=dictionary.keys())


## Parameter spaces for each similarity measure, expanded straight into
## data frames with one row per parameter combination.
combinations_cosine = expand_grid(dict(
    tolerance=[0.005, 0.01],
    mz_power=[0, 0.5, 1, 2],
    intensity_power=[0, 0.5, 1, 2],
))
combinations_modified_cosine = expand_grid(dict(
    tolerance=[0.005, 0.01],
    mz_power=[0, 0.5, 1, 2],
    intensity_power=[0, 0.5, 1, 2],
))
combinations_neutralloss_cosine = expand_grid(dict(
    tolerance=[0.005, 0.01],
    mz_power=[0, 0.5, 1, 2],
    intensity_power=[0, 0.5, 1, 2],
    ignore_peaks_above_precursor=[True],
))
combinations_spec2vec = expand_grid(dict(
    intensity_weighting_power=[0, 0.5, 1, 2],
    allowed_missing_percentage=[1, 5, 10, 20],
))

## Spec2Vec

In [8]:
# Directory holding the pretrained model file. The default is the original
# local checkout; set the SIMILARITY_MODEL_DIR environment variable to point
# the notebook at another location without editing this cell.
path_model = os.environ.get(
    "SIMILARITY_MODEL_DIR",
    "C:/Users/naake/Documents/GitHub/SimilarityHackathon/data/")
# NOTE(review): the model file name says "AllPositive" while the spectra files
# loaded in this notebook are named "NEG"/"neg" — confirm this is the intended model.
filename_model = "spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model"
filename = os.path.join(path_model, filename_model)
# NOTE(review): gensim model files are pickle-based — only load files from a trusted source.
model = gensim.models.Word2Vec.load(filename)

In [9]:
def peak_processing(spectrum):
    """Prepare a spectrum for Spec2Vec scoring.

    Applies ``default_filters`` to the spectrum, then ``normalize_intensities``
    to that result, and returns whatever the second filter returns.
    """
    return normalize_intensities(default_filters(spectrum))

# Pre-process both sets for Spec2Vec. Query spectra without any peaks are
# dropped before processing; reference spectra are all kept.
reference_s2v_spectra = list(map(peak_processing, reference_spectra))
query_s2v_spectra = [
    peak_processing(spectrum)
    for spectrum in query_spectra
    if len(spectrum.peaks) > 0
]



















































































In [10]:
# Display the Spec2Vec parameter grid (one row per parameter combination).
combinations_spec2vec

Unnamed: 0,intensity_weighting_power,allowed_missing_percentage
0,0.0,1
1,0.0,5
2,0.0,10
3,0.0,20
4,0.5,1
5,0.5,5
6,0.5,10
7,0.5,20
8,1.0,1
9,1.0,5


In [11]:
# Score every reference spectrum against every query spectrum with Spec2Vec,
# once per parameter combination. Results are collected as parallel lists:
# `new_colnames[k]` names the k-th combination and `new_cols[k]` holds its
# scores in long (melted) order.
new_colnames = []
new_cols = []

# The row/column labels depend only on the spectra, not on the scoring
# parameters, so they are built once instead of once per parameter set.
reference_labels = [s.metadata["compound_name"] + "_" + s.metadata["collision_energy"]
                    for s in reference_s2v_spectra]
query_labels = [s.metadata["scanindex"] + "_" + s.metadata["collision_energy"]
                for s in query_s2v_spectra]

for index, row in combinations_spec2vec.iterrows():
    spec2vec_similarity = Spec2Vec(model=model,
                                   intensity_weighting_power=row["intensity_weighting_power"],
                                   allowed_missing_percentage=row["allowed_missing_percentage"])
    spec2vec_scores = calculate_scores(reference_s2v_spectra, query_s2v_spectra, spec2vec_similarity,
                                       is_symmetric=False)
    scores = spec2vec_scores.scores
    # Copy the whole score matrix into a float array in one vectorised step
    # (replaces an element-by-element Python double loop).
    SCORES = np.array(scores, dtype=float)

    ## wide table: references as rows, queries as columns
    SCORES_df = pd.DataFrame(SCORES, index=reference_labels, columns=query_labels)
    SCORES_df['rows'] = SCORES_df.index

    ## from wide to long: one row per (reference, query) pair
    SCORES_df = pd.melt(SCORES_df, id_vars='rows', value_vars=query_labels)
    # The name embeds the parameter values exactly as iterrows() yields them.
    new_column = 'Spec2Vec' + "_" + str(row['intensity_weighting_power']) + "_" + str(row['allowed_missing_percentage'])
    new_colnames.append(new_column)
    new_cols.append(SCORES_df['value'].values)

    # Only the frame built in the final iteration survives the loop; it
    # provides the 'rows'/'variable' skeleton that the next cell fills with
    # the columns collected above.
    SCORES_df = SCORES_df.rename(columns={'value': new_column})
    









































In [12]:
# Attach the scores of every parameter combination to the long table,
# one column per combination.
for colname, column_values in zip(new_colnames, new_cols):
    SCORES_df[colname] = list(column_values)

In [13]:
# Display the combined long-format score table.
SCORES_df

Unnamed: 0,rows,variable,Spec2Vec_2.0_20.0,Spec2Vec_0.0_1.0,Spec2Vec_0.0_5.0,Spec2Vec_0.0_10.0,Spec2Vec_0.0_20.0,Spec2Vec_0.5_1.0,Spec2Vec_0.5_5.0,Spec2Vec_0.5_10.0,Spec2Vec_0.5_20.0,Spec2Vec_1.0_1.0,Spec2Vec_1.0_5.0,Spec2Vec_1.0_10.0,Spec2Vec_1.0_20.0,Spec2Vec_2.0_1.0,Spec2Vec_2.0_5.0,Spec2Vec_2.0_10.0
0,11-hydroxyeicosatetraenoic acid_10,1_10,-0.088317,,,,,-0.148681,-0.148681,-0.148681,-0.148681,-0.129823,-0.129823,-0.129823,-0.129823,-0.088317,-0.088317,-0.088317
1,11-hydroxyeicosatetraenoic acid_20,1_10,-0.173825,,,,,-0.183356,-0.183356,-0.183356,-0.183356,-0.180468,-0.180468,-0.180468,-0.180468,-0.173825,-0.173825,-0.173825
2,11-hydroxyeicosatetraenoic acid_30,1_10,-0.175747,,,,,-0.202205,-0.202205,-0.202205,-0.202205,-0.187190,-0.187190,-0.187190,-0.187190,-0.175747,-0.175747,-0.175747
3,11-hydroxyeicosatetraenoic acid_40,1_10,-0.177001,,,,,-0.148220,-0.148220,-0.148220,-0.148220,-0.162778,-0.162778,-0.162778,-0.162778,-0.177001,-0.177001,-0.177001
4,11-hydroxyeicosatetraenoic acid_60,1_10,-0.016073,,,,,-0.051248,-0.051248,-0.051248,-0.051248,-0.039866,-0.039866,-0.039866,-0.039866,-0.016073,-0.016073,-0.016073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562347,Ursodeoxycholate_10,1614_40,-0.090932,-0.166517,-0.166517,-0.166517,-0.166517,-0.094399,-0.094399,-0.094399,-0.094399,-0.092282,-0.092282,-0.092282,-0.092282,-0.090932,-0.090932,-0.090932
1562348,Ursodeoxycholate_20,1614_40,-0.090627,-0.112742,-0.112742,-0.112742,-0.112742,-0.091469,-0.091469,-0.091469,-0.091469,-0.092155,-0.092155,-0.092155,-0.092155,-0.090627,-0.090627,-0.090627
1562349,Ursodeoxycholate_30,1614_40,-0.091939,-0.089974,-0.089974,-0.089974,-0.089974,-0.082289,-0.082289,-0.082289,-0.082289,-0.093102,-0.093102,-0.093102,-0.093102,-0.091939,-0.091939,-0.091939
1562350,Ursodeoxycholate_40,1614_40,-0.089873,0.002466,0.002466,0.002466,0.002466,-0.073208,-0.073208,-0.073208,-0.073208,-0.091563,-0.091563,-0.091563,-0.091563,-0.089873,-0.089873,-0.089873


In [14]:
## only keep the comparisons between same collision energies
# Both labels end in "_<collision energy>", so compare the text after the last
# underscore. Vectorised string operations replace a row-wise apply(), which
# is slow on a table with one row per reference/query pair.
SCORES_df["matching_CE"] = (
    SCORES_df["rows"].str.rsplit("_", n=1).str[-1]
    == SCORES_df["variable"].str.rsplit("_", n=1).str[-1]
)

In [15]:
# Keep only the rows flagged as having matching collision energies.
SCORES_df = SCORES_df.loc[SCORES_df["matching_CE"]]

In [16]:
# Write the filtered Spec2Vec scores to a CSV file; the path is relative, so
# the file is created in the current working directory.
SCORES_df.to_csv("similarities_Spec2Vec2.csv")

## Neutral loss score

In [13]:
# Display the neutral-loss parameter grid (one row per parameter combination).
combinations_neutralloss_cosine

Unnamed: 0,tolerance,mz_power,intensity_power,ignore_peaks_above_precursor
0,0.005,0.0,0.0,True
1,0.005,0.0,0.5,True
2,0.005,0.0,1.0,True
3,0.005,0.0,2.0,True
4,0.005,0.5,0.0,True
5,0.005,0.5,0.5,True
6,0.005,0.5,1.0,True
7,0.005,0.5,2.0,True
8,0.005,1.0,0.0,True
9,0.005,1.0,0.5,True


In [14]:
# Restrict both sets to spectra that have at least three peaks.
reference_spectra_red = [spectrum for spectrum in reference_spectra if len(spectrum.peaks) >= 3]
query_spectra_red = [spectrum for spectrum in query_spectra if len(spectrum.peaks) >= 3]

In [None]:
# Score every reduced reference spectrum against every reduced query spectrum
# with the neutral-loss cosine, once per parameter combination. Results are
# collected as parallel lists: `new_colnames[k]` names the k-th combination
# and `new_cols[k]` holds its scores in long (melted) order.
new_colnames = []
new_cols = []

# The row/column labels depend only on the spectra, not on the scoring
# parameters, so they are built once instead of once per parameter set.
reference_labels = [s.metadata["compound_name"] + "_" + s.metadata["collision_energy"]
                    for s in reference_spectra_red]
query_labels = [s.metadata["scanindex"] + "_" + s.metadata["collision_energy"]
                for s in query_spectra_red]

for index, row in combinations_neutralloss_cosine.iterrows():
    print(index)
    neutralloss_similarity = NeutralLossesCosine(tolerance=row["tolerance"],
            mz_power=row["mz_power"], intensity_power=row["intensity_power"],
            ignore_peaks_above_precursor=row["ignore_peaks_above_precursor"])
    neutralloss_scores = calculate_scores(reference_spectra_red, query_spectra_red, neutralloss_similarity,
                          is_symmetric=False)
    scores = neutralloss_scores.scores
    # Every entry is a record whose first element is the score. Extract that
    # element for the whole matrix at once instead of with a Python double loop.
    if scores.dtype.names:
        # Structured array: take the first field directly.
        SCORES = scores[scores.dtype.names[0]].astype(float)
    else:
        # Fallback for entries that are plain sequences rather than records.
        SCORES = np.array([[entry[0] for entry in score_row] for score_row in scores],
                          dtype=float).reshape(scores.shape)

    ## wide table: references as rows, queries as columns
    SCORES_df = pd.DataFrame(SCORES, index=reference_labels, columns=query_labels)
    SCORES_df['rows'] = SCORES_df.index

    ## from wide to long: one row per (reference, query) pair
    SCORES_df = pd.melt(SCORES_df, id_vars='rows', value_vars=query_labels)
    new_column = 'Neutralloss' + "_" + str(row['tolerance']) + "_" + str(row['mz_power']) + "_" + str(row["intensity_power"]) + "_" + str(row["ignore_peaks_above_precursor"])
    new_colnames.append(new_column)
    new_cols.append(SCORES_df['value'].values)

    # Only the frame built in the final iteration survives the loop; it
    # provides the 'rows'/'variable' skeleton that the next cell fills with
    # the columns collected above.
    SCORES_df = SCORES_df.rename(columns={'value': new_column})
    

0
1
2
3
4
5
6
7
8
9


In [None]:
# Attach the scores of every parameter combination to the long table,
# one column per combination.
for colname, column_values in zip(new_colnames, new_cols):
    SCORES_df[colname] = list(column_values)

In [None]:
# Display the combined long-format score table.
SCORES_df

In [None]:
## only keep the comparisons between same collision energies
# Both labels end in "_<collision energy>", so compare the text after the last
# underscore. Vectorised string operations replace a row-wise apply(), which
# is slow on a table with one row per reference/query pair.
SCORES_df["matching_CE"] = (
    SCORES_df["rows"].str.rsplit("_", n=1).str[-1]
    == SCORES_df["variable"].str.rsplit("_", n=1).str[-1]
)

In [None]:
# Keep only the rows flagged as having matching collision energies.
SCORES_df = SCORES_df.loc[SCORES_df["matching_CE"]]

In [None]:
# Write the filtered neutral-loss scores to a CSV file; the path is relative,
# so the file is created in the current working directory.
SCORES_df.to_csv("similarities_NeutralLoss.csv")