# Spec2Vec - calculate similarities based on molecular fingerprints

In [1]:
import os
import sys

import numpy as np
from matplotlib import pyplot as plt
import matplotlib

sys.path.insert(0, os.path.dirname(os.getcwd()))

# Locations (import paths from config.py)
from config import ROOT, PATH_SPEC2VEC, PATH_MS_DATA, PATH_SAVE_MODEL, PATH_COMPUTED, PATH_OUTPUT

sys.path.insert(0, PATH_SPEC2VEC)

import matchms.helper_functions as functions
import matchms.MS_functions
# `import matchms.MS_functions` only binds the name `matchms`. The cells below
# call `MS_functions.<...>` directly, so bind that name explicitly as well.
from matchms import MS_functions
import matchms.MS_similarity_classical as MS_sim_classic

## Import unique-inchi subset

In [3]:
# Load the positive-mode uniqueInchikey spectra. Both the MGF source file and
# the pre-processed JSON file are handed to the loader together with the
# peak/loss filter settings.
file_json = os.path.join(
    PATH_MS_DATA,
    "uniqueInchikey_positive_minmax_10_1000_2dec_exp08_191116.json",
)
file_mgf = os.path.join(
    PATH_MS_DATA,
    "allGNPS_positive_uniqueInchikey_191107.mgf",
)

(
    spectra,
    spectra_dict,
    MS_documents,
    MS_documents_intensity,
    spectra_metadata,
) = MS_functions.load_MGF_data(
    file_mgf=file_mgf,
    file_json=file_json,
    num_decimals=2,
    min_frag=0.0,
    max_frag=1000.0,
    min_loss=5.0,
    max_loss=500.0,
    min_intensity_perc=0.0,
    exp_intensity_filter=0.8,
    min_keep_peaks_0=10,
    min_keep_peaks_per_mz=20/200,
    min_peaks=10,
    max_peaks=1000,  # alternatives noted by the author: None, 500
    peak_loss_words=['peak_', 'loss_'],
)

Spectra json file found and loaded.


In [4]:
# Report how many spectra were loaded.
print("Number of imported spectra:", len(spectra))

Number of imported spectra: 10998


In [5]:
# Preview the first rows of the metadata table returned by load_MGF_data.
spectra_metadata.head()

Unnamed: 0,doc_ID,gnps_ID,name,title,precursor_mz,num_peaks_losses,inchi,inchikey,smiles,charge
0,0,CCMSLIB00005435506,cholic acid M+Na,,431.277,35,"""InChI=1S/C24H40O5/c1-13(4-7-21(28)29)16-5-6-1...",BHQCQFFYRZLCQQ-IHELEQLESA-N,C[C@H](CCC(=O)O)[C@H]1CC[C@@H]2[C@@]1([C@H](C[...,1
1,1,CCMSLIB00005435507,deoxycholic acid M-H2O+H,,375.289,157,"""InChI=1S/C24H40O4/c1-14(4-9-22(27)28)18-7-8-1...",KXGVEGMKQFWNSR-BKAWJTANSA-N,C[C@H](CCC(=O)O)[C@H]1CC[C@@H]2[C@@]1([C@H](C[...,1
2,2,CCMSLIB00005435517,glycocholic acid 2M+H,,931.625,91,"""InChI=1S/C26H43NO6/c1-14(4-7-22(31)27-13-23(3...",RFDAIACWWDREDC-VKRKCYKBSA-N,C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@@H]2[C@@]1([...,1
3,3,CCMSLIB00005435523,glycodeoxycholic acid 2M+Na,,921.617,24,"""InChI=1S/C26H43NO5/c1-15(4-9-23(30)27-14-24(3...",WVULKSPCQVQLCU-JKPPYMBGSA-N,C[C@H](CCC(=O)NCC(=O)O)[C@H]1CCC2[C@@]1([C@H](...,1
4,4,CCMSLIB00005435528,glycohyocholic acid 2M+K,,953.606,15,"""InChI=1S/C26H43NO6/c1-14(4-7-20(29)27-13-21(3...",ZQYUKJFJPJDMMR-ZDWCHQGWSA-N,C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@@H]2[C@@]1(C...,1


# Calculate molecular fingerprints
There are many different types of molecular fingerprints, and they all come in different flavors and bit depths.
Here we focus on two common types: circular fingerprints (such as ECFP or Morgan) and Daylight-like fingerprints.

## 1. Circular fingerprint --> "morgan3" from RDKit

In [6]:
# Circular ("morgan3") fingerprints with 2048 bits, one per spectrum.
# The second return value lists the spectra for which this failed.
fingerprints_morgan3_2048, exclude_IDs = MS_functions.get_mol_fingerprints(
    spectra,
    method="morgan3",
    nBits=2048,
)

---- (1) Generating RDkit molecules from inchi or smiles...
No proper molecule generated for spectrum 1435
No proper molecule generated for spectrum 1440
No proper molecule generated for spectrum 2232
No proper molecule generated for spectrum 2233
No proper molecule generated for spectrum 9384
No proper molecule generated for spectrum 10791
No proper molecule generated for spectrum 10991
---- (2) Generating fingerprints from molecules...
Problem with molecule from spectrum 1435
Problem with molecule from spectrum 1440
Problem with molecule from spectrum 2232
Problem with molecule from spectrum 2233
Problem with molecule from spectrum 9384
Problem with molecule from spectrum 10791
Problem with molecule from spectrum 10991


In [7]:
# Spectrum indices returned by get_mol_fingerprints as "to exclude" (morgan3 run).
exclude_IDs

[1435, 1440, 2232, 2233, 9384, 10791, 10991]

In [8]:
# Sanity check: shape of the first fingerprint and total number of fingerprints.
fingerprints_morgan3_2048[0].shape, len(fingerprints_morgan3_2048)

((2048,), 10998)

In [9]:
# First 20 entries of the first morgan3 fingerprint.
fingerprints_morgan3_2048[0][:20]

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## 2. Daylight-like fingerprint --> RDKit

In [10]:
# Daylight-like fingerprints with 2048 bits, one per spectrum.
# Note that `exclude_IDs` from the morgan3 run is overwritten here.
fingerprints_daylight_2048, exclude_IDs = MS_functions.get_mol_fingerprints(
    spectra,
    method="daylight",
    nBits=2048,
)

---- (1) Generating RDkit molecules from inchi or smiles...
No proper molecule generated for spectrum 1435
No proper molecule generated for spectrum 1440
No proper molecule generated for spectrum 2232
No proper molecule generated for spectrum 2233
No proper molecule generated for spectrum 9384
No proper molecule generated for spectrum 10791
No proper molecule generated for spectrum 10991
---- (2) Generating fingerprints from molecules...
Problem with molecule from spectrum 1435
Problem with molecule from spectrum 1440
Problem with molecule from spectrum 2232
Problem with molecule from spectrum 2233
Problem with molecule from spectrum 9384
Problem with molecule from spectrum 10791
Problem with molecule from spectrum 10991


In [11]:
# First 20 entries of the first daylight fingerprint.
fingerprints_daylight_2048[0][:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [12]:
# Sanity check: shape of the first fingerprint and total number of fingerprints.
fingerprints_daylight_2048[0].shape, len(fingerprints_daylight_2048)

((2048,), 10998)

### Note: for 7 spectra, fingerprints could not be generated
These spectra (listed in `exclude_IDs` below) will be ignored for the rest of the analysis.

In [13]:
# Spectrum indices returned by the most recent get_mol_fingerprints call (daylight run).
exclude_IDs

[1435, 1440, 2232, 2233, 9384, 10791, 10991]

In [14]:
# First 20 entries of the morgan3 fingerprint stored at index 1435, one of the
# indices listed in exclude_IDs.
fingerprints_morgan3_2048[1435][:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Calculate all-vs-all molecular similarity scores
### Calculate all-vs-all matrix of molecular similarities (dice score using morgan3 fingerprints)

In [15]:
# All-vs-all molecular similarity (Dice score) between the morgan3 fingerprints.
# `MS_sim_classic` comes from the import cell at the top of the notebook, so
# this cell and the jaccard cell further down do not depend on each other.
#
# `filename` is passed to mol_sim_matrix, which (per its log output) computes
# the matrix from scratch when the file is not found and then saves it there.
#
# NOTE(review): the fingerprint list still contains entries for the spectra in
# `exclude_IDs` (index 1435 shows only zeros in its first 20 bits), so their
# rows and columns are part of `mol_sim`. TODO: confirm how mol_sim_matrix
# scores such fingerprints, and mask them in downstream analysis.
filename = os.path.join(PATH_COMPUTED, 'MS_sim_mol_uniqueInchikey_morgan3_dice2048_191119.npy')
mol_sim = MS_sim_classic.mol_sim_matrix(
    fingerprints_morgan3_2048,
    fingerprints_morgan3_2048,
    method='dice',
    filename=filename,
)

Could not find file  C:\OneDrive - Netherlands eScience Center\Project_Wageningen_iOMEGA\Spec2Vec\computed_results\MS_sim_mol_uniqueInchikey_morgan3_dice2048_191119.npy
Molecular scores will be calculated from scratch.
 Calculated submatrix 121 out of 121----------------------------------------
Succesfully calculated matrix containing all-vs-all molecular similarity values.
Matrix was saved under: C:\OneDrive - Netherlands eScience Center\Project_Wageningen_iOMEGA\Spec2Vec\computed_results\MS_sim_mol_uniqueInchikey_morgan3_dice2048_191119.npy


### Calculate all-vs-all matrix of molecular similarities (jaccard score using daylight fingerprints)

In [29]:
# All-vs-all molecular similarity (Jaccard score) between the daylight
# fingerprints. This rebinds `filename` and `mol_sim` from the Dice cell above.
filename = os.path.join(
    PATH_COMPUTED,
    'MS_sim_mol_uniqueInchikey_rdkit2048_jaccard_191119.npy',
)
mol_sim = MS_sim_classic.mol_sim_matrix(
    fingerprints_daylight_2048,
    fingerprints_daylight_2048,
    method='jaccard',
    filename=filename,
)

Could not find file  C:\OneDrive - Netherlands eScience Center\Project_Wageningen_iOMEGA\Spec2Vec\computed_results\MS_sim_mol_uniqueInchikey_rdkit2048_jaccard_191119.npy
Molecular scores will be calculated from scratch.
 Calculated submatrix 121 out of 121----------------------------------------
Succesfully calculated matrix containing all-vs-all molecular similarity values.
Matrix was saved under: C:\OneDrive - Netherlands eScience Center\Project_Wageningen_iOMEGA\Spec2Vec\computed_results\MS_sim_mol_uniqueInchikey_rdkit2048_jaccard_191119.npy
