# Create molecular networks
This notebooks creates a graphml file for the case study data to enable visualizing mol networks in cytoscape

# Download data from zenodo
The ms2deepscore model, case study data and MS2Query annotations are downloaded from zenodo

In [1]:
import requests
import os

from matchms.exporting import save_as_mgf
from matchms.filtering import require_minimum_number_of_peaks
from matchms.importing import load_from_mgf
from tqdm import tqdm

def download_file(link, file_name):
    response = requests.get(link, stream=True)
    if os.path.exists(file_name):
        print(f"The file {file_name} already exists, the file won't be downloaded")
        return
    total_size = int(response.headers.get('content-length', 0))
    
    with open(file_name, "wb") as f, tqdm(desc="Downloading file", total=total_size, unit='B', unit_scale=True, unit_divisor=1024,) as bar:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                bar.update(len(chunk))  # Update progress bar by the chunk size
    
model_file_name = "ms2deepscore_model.pt"
# case_study_spectra_file_name = "case_study_spectra.mgf"

download_file("https://zenodo.org/records/14290920/files/settings.json?download=1", "ms2deepscore_settings.json")
download_file("https://zenodo.org/records/14290920/files/ms2deepscore_model.pt?download=1", model_file_name)
# download_file("https://zenodo.org/records/14535374/files/cleaned_spectra_pos_neg_with_numbering.mgf?download=1", case_study_spectra_file_name)


Downloading file: 2.18kB [00:00, ?B/s]


FileNotFoundError: [Errno 2] No such file or directory: '../UrineCase_study/ms2deepscore_model.pt'

### Merge the spectra into a single file


In [29]:
import os
from matchms.importing.load_from_mzml import load_from_mzml

neg_folder_path = './mzml/neg'
pos_folder_path = './mzml/pos'

mzml_neg_files = [f for f in os.listdir(neg_folder_path) if f.lower().endswith('.mzml')]
mzml_pos_files = [f for f in os.listdir(pos_folder_path) if f.lower().endswith('.mzml')]

all_spectra = []
for spectrum_file_name in mzml_neg_files:
    file_id = spectrum_file_name[-7:-5]
    spectra = list(load_from_mzml(os.path.join(neg_folder_path, spectrum_file_name)))
    for i, spectrum in enumerate(spectra):
        spectrum.set("identifier", f"neg_{file_id}_{i}")
        spectrum.set("ionmode", "negative")
    all_spectra += spectra
for spectrum_file_name in mzml_pos_files:
    file_id = spectrum_file_name[-7:-5]
    spectra = list(load_from_mzml(os.path.join(pos_folder_path, spectrum_file_name)))
    for i, spectrum in enumerate(spectra):
        spectrum.set("identifier", f"pos_{file_id}_{i}")
        spectrum.set("ionmode", "positive")

    all_spectra += spectra

In [30]:
len(all_spectra)

6105

In [33]:
from matchms.exporting.save_as_mgf import save_as_mgf
save_as_mgf(all_spectra, "blood_case_study_spectra.mgf")

dict_keys(['spectra'])


### Load MS2Deepscore model

In [27]:
from ms2deepscore.models import load_model
model_file_name = "../Urine_case_study/ms2deepscore_model.pt"

model = load_model(model_file_name)

### Create spectral similarity scores
The spectrum file "./cleaned_spectra_pos_neg_with_numbering.mgf" was created in pre_processing_spectra

In [34]:
from matchms.Pipeline import Pipeline, create_workflow
from ms2deepscore import MS2DeepScore

workflow = create_workflow(
    query_filters=[["require_minimum_number_of_peaks", {"n_required": 3}]],
    score_computations=[
        [MS2DeepScore, {"model": model}],
        ],
)
pipeline = Pipeline(workflow)
pipeline.logging_level = "ERROR"  # To define the verbosety of the logging
report = pipeline.run("blood_case_study_spectra.mgf")

Processing spectra: 6105it [00:07, 821.40it/s] 
6100it [02:33, 39.63it/s]


### Create a network
The pipeline.scores contain all the scores. To make a molecular network only some of the similarity scores are stored. They are only stored if the score is at least 0.85 and each node (spectrum) is only connected to the top 5 highest similarity scores and only if it is in the tup 5 of that other spectrum as well. 

This is the common approach for creating molecular networks (exact settings vary) and allows for getting visually pleasing mol networks (preventing giant hairbals).

In [36]:
from matchms.networking import SimilarityNetwork

# Define settings
ms2ds_network = SimilarityNetwork(
    identifier_key="identifier",
    score_cutoff=0.85,  # higher numbers produce more isolated sub-graphs
    max_links=10,  # lower number makes sparser networks
    link_method="mutual",  # mutual means: link is only added if in top list of both nodes
)

# Compute the graph (takes some time)
ms2ds_network.create_network(pipeline.scores, score_name="MS2DeepScore")

### save to graphml

In [37]:
# Export to graphml
ms2ds_network.export_to_graphml("ms2ds_graph_min_0_85_score_10_links.graphml")


# Add annotations

Load MS2Query annotations

In [66]:
import pandas as pd

neg_annotations = pd.read_csv("./mzml/neg/results/combined_ms2query_neg_results.csv")

neg_ids = neg_annotations['query_spectrum_nr'].tolist()
neg_identifiers =[]
file_id = 0
previous_id = 1000
for id in neg_ids:
    if previous_id > id:
        file_id += 1
    previous_id = id

    if file_id < 10:
        spectrum_identifier = f"neg_0{file_id}_{id-1}"
    else:
        spectrum_identifier = f"neg_{file_id}_{id-1}"
    neg_identifiers.append(spectrum_identifier)
neg_annotations.insert(loc=1, column='identifiers', value=neg_identifiers)
neg_annotations


Unnamed: 0,query_spectrum_nr,identifiers,...,npc_superclass_results,npc_pathway_results
0,1,neg_01_0,...,Saccharides,Carbohydrates
1,2,neg_01_1,...,Nucleosides,Carbohydrates
2,3,neg_01_2,...,,Carbohydrates
3,4,neg_01_3,...,,Fatty acids
4,5,neg_01_4,...,Saccharides,Carbohydrates
...,...,...,...,...,...
2887,286,neg_10_285,...,Glycerophospholipids,Fatty acids
2888,287,neg_10_286,...,Tyrosine alkaloids,Alkaloids
2889,288,neg_10_287,...,,Alkaloids
2890,289,neg_10_288,...,Glycerophospholipids,Fatty acids


In [67]:
import pandas as pd

pos_annotations = pd.read_csv("./mzml/pos/results/combined_ms2query_pos_results.csv")

pos_ids = pos_annotations['query_spectrum_nr'].tolist()
pos_identifiers =[]
file_id = 0
previous_id = 1000
for id in pos_ids:
    if previous_id > id:
        file_id += 1
    previous_id = id
    if file_id < 10:
        spectrum_identifier = f"pos_0{file_id}_{id-1}"
    else:
        spectrum_identifier = f"pos_{file_id}_{id-1}"
    pos_identifiers.append(spectrum_identifier)
pos_annotations.insert(loc=1, column='identifiers', value=pos_identifiers)
pos_annotations

Unnamed: 0,query_spectrum_nr,identifiers,...,npc_superclass_results,npc_pathway_results
0,1,pos_01_0,...,,Carbohydrates
1,2,pos_01_1,...,Saccharides,Carbohydrates
2,3,pos_01_2,...,Nicotinic acid alkaloids,Alkaloids
3,4,pos_01_3,...,Polyethers,Polyketides
4,5,pos_01_4,...,Coumarins,Shikimates and Phenylpropanoids
...,...,...,...,...,...
3200,305,pos_10_304,...,Lysine alkaloids,Alkaloids
3201,306,pos_10_305,...,Oligopeptides,Amino acids and Peptides; Polyketides
3202,307,pos_10_306,...,Steroids,Terpenoids
3203,308,pos_10_307,...,Octadecanoids,Fatty acids


In [74]:
# double check that the ids match
for row in tqdm(pos_annotations.itertuples()):
    identifier = row.identifiers
    precursor_mz = row.precursor_mz_query_spectrum
    for spectrum in all_spectra:
        if spectrum.get('identifier') == identifier:
            assert round(float(spectrum.get("precursor_mz")), ndigits=1) == round(precursor_mz, 1), "not matching precursor_mz"


406it [01:06,  6.06it/s]


KeyboardInterrupt: 

In [77]:
combined_ms2query_annotations = pd.concat([pos_annotations, neg_annotations])

In [78]:
combined_ms2query_annotations

Unnamed: 0,query_spectrum_nr,identifiers,...,npc_superclass_results,npc_pathway_results
0,1,pos_01_0,...,,Carbohydrates
1,2,pos_01_1,...,Saccharides,Carbohydrates
2,3,pos_01_2,...,Nicotinic acid alkaloids,Alkaloids
3,4,pos_01_3,...,Polyethers,Polyketides
4,5,pos_01_4,...,Coumarins,Shikimates and Phenylpropanoids
...,...,...,...,...,...
2887,286,neg_10_285,...,Glycerophospholipids,Fatty acids
2888,287,neg_10_286,...,Tyrosine alkaloids,Alkaloids
2889,288,neg_10_287,...,,Alkaloids
2890,289,neg_10_288,...,Glycerophospholipids,Fatty acids


In [79]:
combined_ms2query_annotations.to_csv("combined_ms2query_annotations.csv")

### Load into cytoscape

The graphml file can be loaded into cytoscape: https://cytoscape.org/ This is an open source platform for visualizing graphs.


To recreate the case study results:
- Open cytoscape
- Load in the above created graphml file.
- Set style settings (or load in a style file)
- Set up chemviz to visualize chemical information.
- Explore your data!