# Similarity score comparison

In [1]:
!pip install matchms
!pip install pubchempy
!pip install rdkit-pypi
!pip install spec2vec --user



In [1]:
import os
import numpy as np
from matchms.importing import load_from_mgf
import matplotlib.pyplot as plt
import pubchempy
import pandas as pd
from itertools import compress
from rdkit import Chem
from rdkit.Chem import Draw

In [2]:
from matchms import calculate_scores
from matchms.similarity import CosineGreedy
from matchms.similarity import ModifiedCosine
from matchms.similarity import NeutralLossesCosine

In [3]:
import gensim
from spec2vec import Spec2Vec
from matchms.filtering import default_filters
from matchms.filtering import normalize_intensities

In [4]:
# Directory that holds the .mgf input files. The original location remains the
# default; set the SIMILARITY_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
path_data = os.environ.get(
    "SIMILARITY_DATA_DIR",
    "C:/Users/naake/Documents/GitHub/SimilarityHackathon/data/")
file_mgf_reference = os.path.join(path_data, "ex_spectra_stds_POS_scans_50.mgf")
# load_from_mgf is consumed into a list so the spectra can be iterated repeatedly.
reference_spectra = list(load_from_mgf(file_mgf_reference))

In [5]:
# Read the query spectra from the second .mgf file in the same data directory
# and keep them in a list so they can be traversed more than once.
file_mgf_query = os.path.join(path_data, "exp_MS2_POS.mgf")
query_spectra = [spectrum for spectrum in load_from_mgf(file_mgf_query)]

# Expand grid function

Define a function that builds a data frame containing every possible combination of the supplied parameter values. The sections below iterate over the rows of these data frames.

In [6]:
from itertools import product
import pandas as pd

def expand_grid(dictionary):
   """Return a data frame with one row per combination of the given values.

   Parameters
   ----------
   dictionary : dict
       Maps a column name to an iterable of candidate values.

   Returns
   -------
   pandas.DataFrame
       The Cartesian product of all value lists; columns follow the key
       order of ``dictionary`` and the last key varies fastest.
   """
   grid_rows = list(product(*dictionary.values()))
   return pd.DataFrame(grid_rows, columns=dictionary.keys())


## Parameter spaces for the similarity measures: every dictionary maps a
## keyword argument to its candidate values and is expanded immediately into
## a data frame with one row per parameter combination.
combinations_cosine = expand_grid({
    'tolerance': [0.005, 0.01],
    'mz_power': [0, 0.5, 1, 2],
    'intensity_power': [0, 0.5, 1, 2],
})
combinations_modified_cosine = expand_grid({
    'tolerance': [0.005, 0.01],
    'mz_power': [0, 0.5, 1, 2],
    'intensity_power': [0, 0.5, 1, 2],
})
combinations_neutralloss_cosine = expand_grid({
    'tolerance': [0.005, 0.01],
    'mz_power': [0, 0.5, 1, 2],
    'intensity_power': [0, 0.5, 1, 2],
    'ignore_peaks_above_precursor': [True],
})
combinations_spec2vec = expand_grid({
    'intensity_weighting_power': [0, 0.5, 1, 2],
    'allowed_missing_percentage': [1, 5, 10, 20],
})

## Spec2Vec

In [7]:
# Directory that holds the pretrained model file. The original location remains
# the default; set the SIMILARITY_MODEL_DIR environment variable to load the
# model from elsewhere without editing this cell.
path_model = os.environ.get(
    "SIMILARITY_MODEL_DIR",
    "C:/Users/naake/Documents/GitHub/SimilarityHackathon/data/")
filename_model = "spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model"
filename = os.path.join(path_model, filename_model)
# Load the stored gensim Word2Vec model that is passed to Spec2Vec below.
model = gensim.models.Word2Vec.load(filename)

In [8]:
def peak_processing(spectrum):
    """Run ``default_filters`` on a spectrum, then normalize its intensities.

    Returns the result of ``normalize_intensities`` applied to the filtered
    spectrum.
    """
    return normalize_intensities(default_filters(spectrum))

## process all reference spectra; query spectra without any peak are skipped
reference_s2v_spectra = list(map(peak_processing, reference_spectra))
query_s2v_spectra = [
    peak_processing(spectrum)
    for spectrum in query_spectra
    if len(spectrum.peaks) > 0
]

















































































































































In [9]:
## show the expanded Spec2Vec parameter grid (one row per parameter combination)
combinations_spec2vec

Unnamed: 0,intensity_weighting_power,allowed_missing_percentage
0,0.0,1
1,0.0,5
2,0.0,10
3,0.0,20
4,0.5,1
5,0.5,5
6,0.5,10
7,0.5,20
8,1.0,1
9,1.0,5


In [10]:
## Score every reference spectrum against every query spectrum with Spec2Vec,
## once per parameter combination. After the loop:
##   new_colnames  holds one column name per combination,
##   new_cols      holds the matching long-format score vectors,
##   SCORES_df     is the long-format table of the last combination.
new_colnames = []
new_cols = []

## row- and colnames do not depend on the parameters, so build them only once
reference_labels = [s.metadata["compound_name"] + "_" + s.metadata["collision_energy"] for s in reference_s2v_spectra]
query_labels = [s.metadata["scanindex"] + "_" + s.metadata["collision_energy"] for s in query_s2v_spectra]

for _index, row in combinations_spec2vec.iterrows():
    spec2vec_similarity = Spec2Vec(model=model,
                               intensity_weighting_power=row["intensity_weighting_power"],
                               allowed_missing_percentage=row["allowed_missing_percentage"])
    spec2vec_scores = calculate_scores(reference_s2v_spectra, query_s2v_spectra, spec2vec_similarity,
                          is_symmetric=False)
    scores = spec2vec_scores.scores
    ## copy the score matrix into a float array in one vectorised step
    ## (replaces the element-by-element Python double loop)
    SCORES = np.array(scores, dtype=float)

    ## add row- and colnames to data.frame
    SCORES_df = pd.DataFrame(SCORES, index=reference_labels, columns=query_labels)
    SCORES_df['rows'] = SCORES_df.index

    ## from wide to long
    SCORES_df = pd.melt(SCORES_df, id_vars='rows', value_vars=list(SCORES_df.columns[:-1]))
    new_column = 'Spec2Vec' + "_" + str(row['intensity_weighting_power']) + "_" + str(row['allowed_missing_percentage'])
    new_colnames.append(new_column)
    new_cols.append(SCORES_df['value'].values)

    SCORES_df = SCORES_df.rename(columns={'value': new_column})
    





























































































































In [11]:
## attach the score vector of every parameter combination as its own column
for colname, score_values in zip(new_colnames, new_cols):
    SCORES_df[colname] = list(score_values)

In [12]:
## display the long-format score table (one column per Spec2Vec parameter combination)
SCORES_df

Unnamed: 0,rows,variable,Spec2Vec_2.0_20.0,Spec2Vec_0.0_1.0,Spec2Vec_0.0_5.0,Spec2Vec_0.0_10.0,Spec2Vec_0.0_20.0,Spec2Vec_0.5_1.0,Spec2Vec_0.5_5.0,Spec2Vec_0.5_10.0,Spec2Vec_0.5_20.0,Spec2Vec_1.0_1.0,Spec2Vec_1.0_5.0,Spec2Vec_1.0_10.0,Spec2Vec_1.0_20.0,Spec2Vec_2.0_1.0,Spec2Vec_2.0_5.0,Spec2Vec_2.0_10.0
0,(2-Aminoethyl)Phosphonate_10,1094_10,-0.053715,,,,0.046467,,,0.002257,0.002257,,-0.044963,-0.044963,-0.044963,-0.053715,-0.053715,-0.053715
1,(2-Aminoethyl)Phosphonate_20,1094_10,-0.067443,,0.207186,0.207186,0.207186,,0.057912,0.057912,0.057912,,-0.035704,-0.035704,-0.035704,-0.067443,-0.067443,-0.067443
2,(2-Aminoethyl)Phosphonate_30,1094_10,-0.050679,0.050015,0.050015,0.050015,0.050015,0.007709,0.007709,0.007709,0.007709,-0.031435,-0.031435,-0.031435,-0.031435,-0.050679,-0.050679,-0.050679
3,(2-Aminoethyl)Phosphonate_40,1094_10,-0.053735,,0.122500,0.122500,0.122500,,0.026686,0.026686,0.026686,,-0.030693,-0.030693,-0.030693,-0.053735,-0.053735,-0.053735
4,(2-Aminoethyl)Phosphonate_60,1094_10,0.006546,,,,0.218896,,,,0.160352,,0.043698,0.043698,0.043698,0.006546,0.006546,0.006546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4693675,Gamma-Muricholic Acid_10,4762_60,0.126892,0.044537,0.044537,0.044537,0.044537,0.110458,0.110458,0.110458,0.110458,0.143425,0.143425,0.143425,0.143425,0.126892,0.126892,0.126892
4693676,Gamma-Muricholic Acid_20,4762_60,0.075099,0.015294,0.015294,0.015294,0.015294,0.154940,0.154940,0.154940,0.154940,0.137657,0.137657,0.137657,0.137657,0.075099,0.075099,0.075099
4693677,Gamma-Muricholic Acid_30,4762_60,0.027404,0.162223,0.162223,0.162223,0.162223,0.136834,0.136834,0.136834,0.136834,0.084281,0.084281,0.084281,0.084281,0.027404,0.027404,0.027404
4693678,Gamma-Muricholic Acid_40,4762_60,0.083282,0.153978,0.153978,0.153978,0.153978,0.110265,0.110265,0.110265,0.110265,0.105312,0.105312,0.105312,0.105312,0.083282,0.083282,0.083282


In [13]:
## only keep the comparisons between same collision energies:
## the collision energy is the text after the last "_" in both labels.
## Vectorised string operations replace the slow row-wise apply.
SCORES_df["matching_CE"] = (
    SCORES_df["rows"].str.rsplit("_", n=1).str[-1]
    == SCORES_df["variable"].str.rsplit("_", n=1).str[-1]
)

In [14]:
## drop every row whose reference and query collision energies differ
SCORES_df = SCORES_df.loc[SCORES_df["matching_CE"]]

In [15]:
## write the filtered Spec2Vec scores to a CSV file (path is relative to the working directory)
SCORES_df.to_csv("similarities_Spec2Vec2_POS.csv")

## Neutral loss score

In [16]:
## show the expanded neutral loss parameter grid (one row per parameter combination)
combinations_neutralloss_cosine

Unnamed: 0,tolerance,mz_power,intensity_power,ignore_peaks_above_precursor
0,0.005,0.0,0.0,True
1,0.005,0.0,0.5,True
2,0.005,0.0,1.0,True
3,0.005,0.0,2.0,True
4,0.005,0.5,0.0,True
5,0.005,0.5,0.5,True
6,0.005,0.5,1.0,True
7,0.005,0.5,2.0,True
8,0.005,1.0,0.0,True
9,0.005,1.0,0.5,True


In [17]:
## restrict both spectrum lists to spectra that contain at least three peaks
reference_spectra_red = [spectrum for spectrum in reference_spectra if len(spectrum.peaks) >= 3]
query_spectra_red = [spectrum for spectrum in query_spectra if len(spectrum.peaks) >= 3]

In [None]:
## Score every reference spectrum against every query spectrum with the
## neutral loss cosine, once per parameter combination. After the loop:
##   new_colnames  holds one column name per combination,
##   new_cols      holds the matching long-format score vectors,
##   SCORES_df     is the long-format table of the last combination.
new_colnames = []
new_cols = []

## row- and colnames do not depend on the parameters, so build them only once
reference_labels = [s.metadata["compound_name"] + "_" + s.metadata["collision_energy"] for s in reference_spectra_red]
query_labels = [s.metadata["scanindex"] + "_" + s.metadata["collision_energy"] for s in query_spectra_red]

for index, row in combinations_neutralloss_cosine.iterrows():
    print(index)
    neutralloss_similarity = NeutralLossesCosine(tolerance=row["tolerance"],
            mz_power=row["mz_power"], intensity_power=row["intensity_power"],
            ignore_peaks_above_precursor=row["ignore_peaks_above_precursor"])
    neutralloss_scores = calculate_scores(reference_spectra_red, query_spectra_red, neutralloss_similarity,
                          is_symmetric=False)
    scores = np.asarray(neutralloss_scores.scores)
    ## every entry holds several values; keep only element 0 of each entry
    ## (presumably the similarity score itself - verify against the matchms version in use)
    if scores.dtype.names:
        ## structured array: take the first field for all entries at once
        SCORES = scores[scores.dtype.names[0]].astype(float)
    else:
        ## generic fallback: entries are indexable objects such as tuples
        SCORES = np.array([[entry[0] for entry in score_row] for score_row in scores], dtype=float)

    ## add row- and colnames to data.frame
    SCORES_df = pd.DataFrame(SCORES, index=reference_labels, columns=query_labels)
    SCORES_df['rows'] = SCORES_df.index

    ## from wide to long
    SCORES_df = pd.melt(SCORES_df, id_vars='rows', value_vars=list(SCORES_df.columns[:-1]))
    new_column = 'Neutralloss' + "_" + str(row['tolerance']) + "_" + str(row['mz_power']) + "_" + str(row["intensity_power"]) + "_" + str(row["ignore_peaks_above_precursor"])
    new_colnames.append(new_column)
    new_cols.append(SCORES_df['value'].values)

    SCORES_df = SCORES_df.rename(columns={'value': new_column})
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [None]:
## attach the score vector of every parameter combination as its own column
for colname, score_values in zip(new_colnames, new_cols):
    SCORES_df[colname] = list(score_values)

In [None]:
## display the long-format score table (one column per neutral loss parameter combination)
SCORES_df

In [None]:
## only keep the comparisons between same collision energies:
## the collision energy is the text after the last "_" in both labels.
## Vectorised string operations replace the slow row-wise apply.
SCORES_df["matching_CE"] = (
    SCORES_df["rows"].str.rsplit("_", n=1).str[-1]
    == SCORES_df["variable"].str.rsplit("_", n=1).str[-1]
)

In [None]:
## drop every row whose reference and query collision energies differ
SCORES_df = SCORES_df.loc[SCORES_df["matching_CE"]]

In [None]:
## write the filtered neutral loss scores to a CSV file (path is relative to the working directory)
SCORES_df.to_csv("similarities_NeutralLoss_POS.csv")