# Preprocessing of case study spectra
The case study spectra are first preprocessed using MS-DIAL (not included in this notebook), followed by the processing steps in this notebook.

### Merge fractions
Multiple fractions are used, which are stored in separate files. To make processing easier, the files are combined and the fraction number is added as metadata.

In [2]:
import os
from matchms.importing import load_from_mgf
import re


all_spectra = []
# Collect the spectra of every fraction file from both input folders into one
# list, tagging each spectrum with the fraction number parsed from its file name.
for dir_name in ("./mgf_RNEG_RP_each5", "./mgf_RPOS_RP_each5"):
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # all_spectra (and any numbering derived from it later) reproducible.
    for file_name in sorted(os.listdir(dir_name)):
        file_path = os.path.join(dir_name, file_name)
        if not os.path.isfile(file_path):
            continue

        # The fraction number is encoded in the file name as "Fraction<digits>".
        fraction_nr_match = re.search(r'Fraction(\d+)', file_name)
        if fraction_nr_match is None:
            # Reset explicitly: otherwise the fraction of the previously read
            # file would be reused (or a NameError raised for the first file).
            fraction_number = None
            print(f"No fraction number found in file name, 'fraction' not set: {file_path}")
        else:
            fraction_number = int(fraction_nr_match.group(1))

        spectra = list(load_from_mgf(file_path))
        for spectrum in spectra:
            if fraction_number is not None:
                spectrum.set("fraction", fraction_number)
            all_spectra.append(spectrum)


In [None]:
from matchms.exporting import save_as_mgf
# matchms' save_as_mgf appends when the target file already exists, so remove
# the output of a previous run first; otherwise re-running this cell would
# duplicate every spectrum in the combined file.
combined_spectra_path = "./combined_spectra.mgf"
if os.path.exists(combined_spectra_path):
    os.remove(combined_spectra_path)
save_as_mgf(all_spectra, combined_spectra_path)

### Clean using matchms
The spectra are cleaned and harmonized using the matchms default filters. In addition, spectra with fewer than 5 peaks are removed.

In [3]:
import os
from matchms.Pipeline import Pipeline, create_workflow
from matchms.filtering.default_pipelines import DEFAULT_FILTERS
from matchms.similarity.PrecursorMzMatch import PrecursorMzMatch

# Cleaning recipe: the matchms default filters, followed by a filter that
# discards spectra with fewer than 5 peaks.
cleaning_filters = list(DEFAULT_FILTERS)
cleaning_filters.append(("require_minimum_number_of_peaks", {"n_required": 5}))

workflow = create_workflow(query_filters=cleaning_filters)
pipeline = Pipeline(workflow)
# Run the filters on the combined file; the processed spectra are read from
# pipeline.spectrums_queries in later cells.
report = pipeline.run("./combined_spectra.mgf")



Processing spectrums: 40265it [09:38, 69.57it/s]


In [4]:
# Show the processing report returned by pipeline.run(...) for the combined spectra.
print(report)

----- Spectrum Processing Report -----
Number of spectrums processed: 40265
Number of spectrums removed: 38153
Changes during processing:
                                 removed spectra  changed metadata  changed mass spectrum
filter                                                                                   
require_minimum_number_of_peaks            38153                 0                      0
add_compound_name                              0             40265                      0
add_retention_index                            0             40265                      0
add_retention_time                             0             40265                      0
derive_ionmode                                 0             40265                      0
harmonize_undefined_inchikey                   0             40265                      0
harmonize_undefined_inchi                      0             40265                      0
harmonize_undefined_smiles                     0    

In [None]:
from matchms.exporting import save_as_mgf
# matchms' save_as_mgf appends when the target file already exists, so remove
# the output of a previous run first; otherwise re-running this cell would
# duplicate every cleaned spectrum in the file.
cleaned_spectra_path = "./cleaned_spectra.mgf"
if os.path.exists(cleaned_spectra_path):
    os.remove(cleaned_spectra_path)
save_as_mgf(pipeline.spectrums_queries, cleaned_spectra_path)

### Separately save pos and neg for MS2Query predictions
MS2Query needs to be run separately for positive and negative ionization mode, so the positive and negative spectra are saved to separate files.

In [5]:
# Split the cleaned spectra by ion mode. Both modes are matched explicitly so
# that a spectrum with a missing or undefined ionmode is not silently treated
# as negative (and later numbered and searched as a negative-mode spectrum).
pos_spectra = []
neg_spectra = []
unknown_ionmode_spectra = []
for spectrum in pipeline.spectrums_queries:
    ionmode = spectrum.get("ionmode")
    if ionmode == "positive":
        pos_spectra.append(spectrum)
    elif ionmode == "negative":
        neg_spectra.append(spectrum)
    else:
        unknown_ionmode_spectra.append(spectrum)

# Make excluded spectra visible instead of dropping them silently.
if unknown_ionmode_spectra:
    print(f"Excluded {len(unknown_ionmode_spectra)} spectra whose ionmode is neither 'positive' nor 'negative'")


In [None]:
# Write one file per ion mode. matchms' save_as_mgf appends when the target
# file already exists, so the output of a previous run is removed first;
# otherwise re-running this cell would duplicate spectra in the files that are
# loaded again further down for numbering.
for output_path, ionmode_spectra in (
    ("./cleaned_pos_spectra.mgf", pos_spectra),
    ("./cleaned_neg_spectra.mgf", neg_spectra),
):
    if os.path.exists(output_path):
        os.remove(output_path)
    save_as_mgf(ionmode_spectra, output_path)

# Add identifiers
To be able to cross-link the MS2Query results, molecular networking, and other annotations, the spectra need identifiers. They are simply numbered, with a prefix of pos or neg (e.g. `pos_1`, `neg_1`).

In [None]:
# Read the per-ion-mode files back from disk, positive first, then negative.
spectra_pos, spectra_neg = (
    list(load_from_mgf(mgf_path))
    for mgf_path in ("./cleaned_pos_spectra.mgf", "./cleaned_neg_spectra.mgf")
)

In [None]:
# Give every spectrum an identifier of the form "<prefix><n>", where n counts
# from 1 within each ion mode (pos_1, pos_2, ... and neg_1, neg_2, ...).
for id_prefix, ionmode_spectra in (("pos_", spectra_pos), ("neg_", spectra_neg)):
    for position, spectrum in enumerate(ionmode_spectra, start=1):
        spectrum.set("query_spectrum_nr", f"{id_prefix}{position}")

In [None]:
from matchms.exporting import save_as_mgf
# matchms' save_as_mgf appends when the target file already exists, so remove
# the output of a previous run first; otherwise re-running this cell would
# write every numbered spectrum (and its identifier) twice.
numbered_spectra_path = "./cleaned_spectra_pos_neg_with_numbering.mgf"
if os.path.exists(numbered_spectra_path):
    os.remove(numbered_spectra_path)
save_as_mgf(spectra_pos + spectra_neg, numbered_spectra_path)