# Visualization of MS spectrum

In [1]:
%matplotlib inline

from pathlib import Path

from matplotlib import pyplot as plt,__version__ as plt_version
from pkg_resources import get_distribution  # Comes with setuptools.
from pyteomics import mzml, auxiliary

import ipywidgets as widgets
from IPython.display import clear_output

# Report the library versions used to produce the outputs of this notebook.
print(f"Matplotlib version : {plt_version}")
try:
    print(f"pyteomics version: {get_distribution('pyteomics').version}")
except Exception:
    # Best effort only: failing to find the version must not stop the notebook.
    # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
    # SystemExit propagate.
    print("pyteomics version not found")

Matplotlib version : 3.1.1
pyteomics version: 4.1.2


The file is chosen programmatically because uploading it through a widget may not work
given the size of mzML files.

In [2]:
# Locate the mzML file inside the "data" directory next to the parent folder.
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT / "data"

mzml_file = DATA_DIR / "1937004_Q1_5.mzML"

In [3]:
# Parallel accumulators: position i of each list describes the same spectrum.
# Four distinct list objects are created (no aliasing between the names).
scans, mz_arrays, intensity_arrays, retention_times = [], [], [], []

In [4]:
# Get the MS1 spectra.
# Empty the accumulators first: they are only appended to below, so running
# this cell a second time would otherwise store every MS1 spectrum twice.
for accumulator in (scans, mz_arrays, intensity_arrays, retention_times):
    accumulator.clear()

with mzml.read(str(mzml_file)) as reader:
    for spectrum in reader:
        # Only MS level 1 spectra are kept here.
        if spectrum["ms level"] != 1:
            continue
        scans.append(spectrum["id"])
        mz_arrays.append(spectrum["m/z array"])
        intensity_arrays.append(spectrum["intensity array"])
        # The retention time is read from the first scan entry of the spectrum.
        retention_times.append(spectrum["scanList"]["scan"][0]["scan start time"])

In [5]:
# Widget management.
# Selector offering the retention times collected from the MS1 spectra.
file_box = widgets.Select(
    # "Retention time" written in full is too long for the default label.
    description='RT (minutes):',
    options=retention_times,
    disabled=False,
)

# Button whose click callback is registered further down in this cell.
button = widgets.Button(button_style="info", description='Display')


# Configuring the widgets interactions.

# Output area used as the target of the click callback.
out = widgets.Output()

# Display the image when the button is clicked.  Is rather slow.
def display_image(_):
    """Plot the spectrum matching the retention time selected in ``file_box``.

    Used as the click callback of ``button``; the argument it receives is
    ignored. The previous content of ``out`` is cleared and the bar plot
    (m/z against intensity) is shown inside it.
    """
    with out:
        clear_output()
        selected_rt = file_box.value
        if selected_rt is None:
            # Nothing is selected (presumably only when the selector has no
            # options): ``retention_times.index(None)`` would raise ValueError.
            print("No retention time selected.")
            return
        # Assumes the retention times share their indices with the arrays.
        spectrum_index = retention_times.index(selected_rt)
        # The width needs to be adjusted for some arrays with extreme peaks.
        plt.bar(mz_arrays[spectrum_index], intensity_arrays[spectrum_index])
        plt.xlabel("m/z")
        plt.ylabel("intensity")
        plt.show()

# Draw the selected spectrum each time the button is clicked.
button.on_click(display_image)

In [6]:
# Main widget.
# Group the selector, the button and the output area in a single container;
# leaving ``box`` as the last expression makes the notebook display it.
box = widgets.VBox(children=[file_box, button, out])
box

VBox(children=(Select(description='RT (minutes):', options=(0.0030056662, 0.011141001, 0.017401364, 0.02369308…

## Open an mzML file with pyOpenMS

In [7]:
# pyopenms is needed from this section onwards.
from pyopenms import __version__ as pyopenms_version, MzMLFile, MSExperiment, IdXMLFile
# Show which pyopenms version is in use.
print(pyopenms_version)

2.4.0


In [8]:
# Load the content of the mzML file into a pyopenms experiment object.
exp = MSExperiment()
mzml_loader = MzMLFile()
mzml_loader.load(str(mzml_file), exp)

In [9]:
# The parsing is too slow, so the code below is kept for reference only.
# It is written as comments rather than as a triple-quoted string: a bare
# string is an expression, which the notebook echoes as the cell output.
#
# spectrums = []
# for spec in exp.getSpectra():
#     if spec.getMSLevel() == 1:
#         spectrums.append(spec)
# print(len(spectrums))

## Try using pyOpenMS to parse the idXML file

In [10]:
# The idXML file sits in the same data directory as the mzML file.
idxml_file = DATA_DIR / "1937004_Q1_5.idXML"

In [11]:
from pyopenms import IdXMLFile

In [12]:
# Lists handed to IdXMLFile.load(), which fills them in place.
protein_ids, peptide_ids = [], []

idxml_loader = IdXMLFile()
idxml_loader.load(str(idxml_file), protein_ids, peptide_ids)

In [13]:
# Copy the peptide identifications into a new list and report how many there are.
pep_ids = [*peptide_ids]
print(len(pep_ids))

31920


In [14]:
# Inspect the first peptide identification: its m/z and its spectrum reference.
first_pep_id = pep_ids[0]
print(first_pep_id.getMZ())
print(first_pep_id.getMetaValue("spectrum_reference"))

415.205402217913
b'controllerType=0 controllerNumber=1 scan=1008'


In [15]:
# Print rank, sequence and score of every hit of the first peptide identification.
for hit in pep_ids[0].getHits():
    hit_report = (
        (" - Peptide hit rank:", hit.getRank()),
        (" - Peptide hit sequence:", hit.getSequence().toString()),
        (" - Peptide hit score:", hit.getScore()),
    )
    for label, value in hit_report:
        print(label, value)

 - Peptide hit rank: 0
 - Peptide hit sequence: b'VQPEDNK'
 - Peptide hit score: 0.8249030113220215


In [16]:
# Spectrum reference of every peptide identification, decoded from bytes to str.
idxml_scans = list(
    map(lambda pep_id: pep_id.getMetaValue("spectrum_reference").decode(), pep_ids)
)
print(len(idxml_scans))

31920


In [17]:
# Compare the first scan id read from the mzML with the first idXML reference.
print(scans[0])
first_idxml_reference = pep_ids[0].getMetaValue("spectrum_reference")
print(first_idxml_reference.decode())  # Decode to go from binary to utf8

controllerType=0 controllerNumber=1 scan=1
controllerType=0 controllerNumber=1 scan=1008


## Take both MS1 and MS2

In [None]:
# Parallel accumulators for the MS1 spectra.
ms1_scans, ms1_mz_arrays, ms1_intensity_arrays, ms1_retention_times = [], [], [], []
# Parallel accumulators for the MS2 spectra.
ms2_scans, ms2_mz_arrays, ms2_intensity_arrays, ms2_retention_times = [], [], [], []

# MS level -> (ids, m/z arrays, intensity arrays, retention times) to fill.
accumulators_by_level = {
    1: (ms1_scans, ms1_mz_arrays, ms1_intensity_arrays, ms1_retention_times),
    2: (ms2_scans, ms2_mz_arrays, ms2_intensity_arrays, ms2_retention_times),
}

with mzml.read(str(mzml_file)) as reader:
    for spectrum in reader:
        accumulators = accumulators_by_level.get(spectrum["ms level"])
        if accumulators is None:
            # Spectra of any other MS level are skipped.
            continue
        level_scans, level_mz, level_intensity, level_rt = accumulators
        level_scans.append(spectrum["id"])
        level_mz.append(spectrum["m/z array"])
        level_intensity.append(spectrum["intensity array"])
        level_rt.append(spectrum["scanList"]["scan"][0]["scan start time"])


In [None]:
# Total number of MS1 and MS2 spectra that were collected.
print(sum(map(len, (ms1_scans, ms2_scans))))

In [None]:
# Decode the spectrum reference of each peptide identification and count them.
spectrum_references = (pep_id.getMetaValue("spectrum_reference") for pep_id in pep_ids)
idxml_scans = [reference.decode() for reference in spectrum_references]
print(len(idxml_scans))

In [None]:
# Names exposed by a peptide identification that do not start with an underscore.
list(filter(lambda name: not name.startswith("_"), dir(pep_ids[0])))

In [None]:
# Preview the first five decoded spectrum references.
idxml_scans[:5]

In [None]:
# Plot the first MS2 spectrum.
# Looking the first retention time up in its own list can only yield 0, so the
# index is written directly instead of scanning the list.
spectrum_index = 0
# The width needs to be adjusted for some arrays with extreme peaks.
plt.bar(ms2_mz_arrays[spectrum_index], ms2_intensity_arrays[spectrum_index])
plt.xlabel("m/z")
plt.ylabel("intensity")
plt.show()