Merge pull request #229 from jonasscheid/dev

Add ion annotation feature
nf-core · Nov 14, 2022 · 6350048 · 6350048
2 parents 05bdea2 + 0c2f3dd
commit 6350048
Show file tree

Hide file tree

Showing 4 changed files with 346 additions and 3 deletions.
diff --git a/bin/get_ion_annotations.py b/bin/get_ion_annotations.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python
+__author__ = "Jonas Scheid"
+
+from typing import Tuple
+from pyopenms import *
+import pandas as pd
+import numpy as np
+import argparse
+from pyopenms.Plotting import *
+
+def parse_arguments() -> Tuple[argparse.ArgumentParser, argparse.Namespace]:
+    """
+    Purpose: Parse command line arguments
+    Output: parser
+            args: Contains all parsed command line arguments
+    """
+    parser = argparse.ArgumentParser(
+        description="""Ion annotator \n Returns two tsv files (all_ions, matching_ions)
+    that retrieve all ions with their respective m/z and intensities as well as the ion annotation of ions matching
+    to the peptides theoretical spectrum"""
+    )
+
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        nargs="+",
+        help="List of replicate mzML files of a sample",
+    )
+    parser.add_argument(
+        "-idxml",
+        "--filtered_idXML",
+        required=True,
+        type=str,
+        help="FDR filtered idXML file that contains all peptides of the final results",
+    )
+    parser.add_argument(
+        "-p",
+        "--prefix",
+        required=True,
+        type=str,
+        help="Prefix that will be added to the two output files",
+    )
+    parser.add_argument(
+        "-pc",
+        "--precursor_charge",
+        type=str,
+        default="2:3",
+        help="Precursor charge range",
+    )
+    parser.add_argument(
+        "-fmt",
+        "--fragment_mass_tolerance",
+        type=str,
+        default="0.02",
+        help="The fragment mass tolerance of theoretical and experimental spectrum matching",
+    )
+    parser.add_argument(
+        "-a_ions",
+        "--use_a_ions",
+        action="store_true",
+        help="Add a-ions to the theoretical spectrum generation",
+    )
+    parser.add_argument(
+        "-c_ions",
+        "--use_c_ions",
+        action="store_true",
+        help="Add c-ions to the theoretical spectrum generation",
+    )
+    parser.add_argument(
+        "-x_ions",
+        "--use_x_ions",
+        action="store_true",
+        help="Add x-ions to the theoretical spectrum generation",
+    )
+    parser.add_argument(
+        "-z_ions",
+        "--use_z_ions",
+        action="store_true",
+        help="Add z-ions to the theoretical spectrum generation",
+    )
+    parser.add_argument(
+        "-rpp",
+        "--remove_precursor_peak",
+        type=bool,
+        default=False,
+        help="Do not consider precursor masses in the theoretical spectrum generation",
+    )
+
+    args = parser.parse_args()
+
+    return parser, args
+
+def generate_theoretical_spectrum(
+    peptide: PeptideIdentification, args: argparse.Namespace
+) -> MSSpectrum:
+    """
+    Purpose: Generate theoretical spectrum based on PeptideIdentification
+    Output: Theoretical spectrum of input peptide
+    """
+    sequence = peptide.getHits()[0].getSequence()
+    min_charge = int(args.precursor_charge.split(":")[0])
+    # If precursor charge ranges between 2:5, the fragment charges range from 1:4.
+    # Get the precursor charge information from the idXML file
+    min_fragment_charge, max_fragment_charge = min_charge - 1, max(
+        peptide.getHits()[0].getCharge() - 1, 1
+    )
+
+    # Define parameters for theoretical spectrum generation
+    tsg = TheoreticalSpectrumGenerator()
+    theo_spectrum = MSSpectrum()
+    p = tsg.getParameters()
+    p.setValue("add_y_ions", "true")
+    p.setValue("add_b_ions", "true")
+    if args.use_a_ions:
+        p.setValue("add_a_ions", "true")
+    if args.use_c_ions:
+        p.setValue("add_c_ions", "true")
+    if args.use_x_ions:
+        p.setValue("add_x_ions", "true")
+    if args.use_z_ions:
+        p.setValue("add_z_ions", "true")
+    p.setValue("add_metainfo", "true")
+    p.setValue("add_losses", "true")
+    p.setValue("add_precursor_peaks", "true")
+    p.setValue("add_first_prefix_ion", "true")
+    p.setValue("add_abundant_immonium_ions", "true")
+    if not args.remove_precursor_peak:
+        p.setValue("add_all_precursor_charges", "true")
+
+    # Generate theoretical spectrum
+    tsg.setParameters(p)
+    tsg.getSpectrum(theo_spectrum, sequence, min_fragment_charge, max_fragment_charge)
+
+    return theo_spectrum
+
+def flatten(ls: list) -> list:
+    """
+    Purpose: Flatten list of lists into a list
+    Output: List
+    """
+    return [item for sublist in ls for item in sublist]
+
+def __main__():
+    parser, args = parse_arguments()
+    protein_ids = []
+    peptide_ids = []
+    # Each peptide ID should only contain one hit in the FDR_filtered idXML
+    IdXMLFile().load(args.filtered_idXML, protein_ids, peptide_ids)
+    # Get the list of mzML files that have been merged together by IDmerger previously
+    filenames = [
+        filename.decode("utf-8")
+        for filename in protein_ids[0].getMetaValue("spectra_data")
+    ]
+    # Define empty lists that collect all the necessary information, which is comprised in DataFrames
+    ions = []
+    spectra_mz = []
+    spectra_intensities = []
+    spectra_peptides = []
+    spectra_nativeIDs = []
+    spectra_filename = []
+    is_matching_ion = []
+    # Define spectrum alignment class and set parameters for later use
+    spa = SpectrumAlignment()
+    # Specify parameters for the alignment. Pyopenms offers two parameters here
+    p = spa.getParameters()
+    # Since we look at the MS2 Spectrum we align Da tolerance
+    p.setValue("tolerance", float(args.fragment_mass_tolerance))
+    p.setValue("is_relative_tolerance", "false")  # false == Da, true == ppm
+    spa.setParameters(p)
+
+    for file in args.input:
+        # Load the mzML files into the pyopenms structure
+        exp = MSExperiment()
+        MzMLFile().load(file, exp)
+        # Create lookup object for easy spectrum reference.
+        spectrum_lookup = SpectrumLookup()
+        spectrum_lookup.readSpectra(exp, "scan=(?<SCAN>\\d+)")
+        # Iterate over the FDR filtered peptideIDs
+        for peptide_id in peptide_ids:
+            # Check if the PeptideHit originates from the current mzML file
+            if (
+                file.split("/")[-1]
+                != filenames[peptide_id.getMetaValue("id_merge_index")]
+            ):
+                continue
+            # Access raw spectrum via the spectrum native ID
+            spectrum = exp.getSpectrum(
+                spectrum_lookup.findByNativeID(
+                    peptide_id.getMetaValue("spectrum_reference")
+                )
+            )
+            # Save mz and intensities of all peaks to comprise them later in a DataFrame
+            spectrum_mz, spectrum_intensities = spectrum.get_peaks()
+            spectra_mz.append(spectrum_mz)
+            spectra_intensities.append(spectrum_intensities)
+            sequence = peptide_id.getHits()[0].getSequence()
+            spectra_peptides.append(np.repeat(sequence, len(spectrum_mz)))
+            is_matching_ion_peptide = np.repeat(False, len(spectrum_mz))
+            spectra_nativeIDs.append(
+                np.repeat(
+                    peptide_id.getMetaValue("spectrum_reference"), len(spectrum_mz)
+                )
+            )
+            # Generate theoretical spectrum of peptide hit
+            theo_spectrum = generate_theoretical_spectrum(peptide_id, args)
+            # Align both spectra
+            alignment = []
+            spa.getSpectrumAlignment(alignment, theo_spectrum, spectrum)
+            # Obtain ion annotations from theoretical spectrum if there are at least two matching ions
+            if len(alignment) > 1:
+                for theo_idx, obs_idx in alignment:
+                    ion_name = theo_spectrum.getStringDataArrays()[0][theo_idx].decode()
+                    ion_charge = theo_spectrum.getIntegerDataArrays()[0][theo_idx]
+                    obs_mz = spectrum[obs_idx].getMZ()
+                    obs_int = spectrum[obs_idx].getIntensity()
+                    ions.append(
+                        [
+                            sequence,
+                            ion_name,
+                            ion_charge,
+                            theo_spectrum[theo_idx].getMZ(),
+                            obs_mz,
+                            obs_int,
+                        ]
+                    )
+                    is_matching_ion_peptide[obs_idx] = True
+
+            is_matching_ion.append(is_matching_ion_peptide)
+
+        spectra_filename.append(
+            np.repeat(file, len(flatten(spectra_mz)) - len(flatten(spectra_filename)))
+        )
+
+    # Save information of matching ions
+    matching_ions = pd.DataFrame.from_records(
+        ions,
+        columns=[
+            "Peptide",
+            "Ion_name",
+            "Ion_charge",
+            "Theoretical_mass",
+            "Experimental_mass",
+            "Intensity",
+        ],
+    )
+    # Save information of all peaks (including non-matching peaks)
+    all_peaks_df = pd.DataFrame(
+        [
+            flatten(spectra_peptides),
+            flatten(spectra_mz),
+            flatten(spectra_intensities),
+            flatten(is_matching_ion),
+            flatten(spectra_nativeIDs),
+            flatten(spectra_filename),
+        ],
+        index=[
+            "Peptide",
+            "Experimental_mass",
+            "Intensity",
+            "Is_matching_ion",
+            "nativeID",
+            "filename",
+        ],
+    ).transpose()
+    all_peaks_df.index.name = "Ion_ID"
+    # Numpy magically converts booleans to floats of 1.0 and 0.0, revert that
+    all_peaks_df["Is_matching_ion"] = all_peaks_df["Is_matching_ion"].astype(bool)
+    # Add ion ID column to matching ions table for easy reference
+    matching_ions["Ion_ID"] = all_peaks_df.index[all_peaks_df["Is_matching_ion"]]
+    # Write matching ions table
+    matching_ions.to_csv(f"{args.prefix}_matching_ions.tsv", sep="\t", index=False)
+    all_peaks_df.to_csv(f"{args.prefix}_all_peaks.tsv", sep="\t")
+
+if __name__ == "__main__":
+    __main__()
diff --git a/conf/modules.config b/conf/modules.config
@@ -162,6 +162,20 @@ process {
         ]
     }
 
+    withName: 'PYOPENMS_ION_ANNOTATOR' {
+        ext.prefix = { "${meta.sample}" }
+        ext.args   = [
+                "--precursor_charge ${params.prec_charge}",
+                "--fragment_mass_tolerance ${params.fragment_mass_tolerance}",
+                "--remove_precursor_peak ${params.remove_precursor_peak}"
+            ].join(' ').trim()
+            publishDir  = [
+                path: { "${params.outdir}/intermediate_results/ion_annotations" },
+                mode: params.publish_dir_mode,
+                pattern: '*.tsv'
+            ]
+        }
+
     withName: 'OPENMS_TEXTEXPORTER_UNQUANTIFIED|OPENMS_TEXTEXPORTER_QUANTIFIED' {
             publishDir  = [
                 path: { "${params.outdir}/" },

diff --git a/modules/local/pyopenms_ion_annotator.nf b/modules/local/pyopenms_ion_annotator.nf
@@ -0,0 +1,42 @@
+process PYOPENMS_ION_ANNOTATOR {
+    tag "$sample"
+    label 'process_high'
+
+    conda (params.enable_conda ? "bioconda::pyopenms=2.8.0" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pyopenms:2.8.0--py310h3dc0cdb_1' :
+        'quay.io/biocontainers/pyopenms:2.8.0--py310h3dc0cdb_1' }"
+
+    input:
+        tuple val(sample), path(mzml), path(fdr_filtered_idxml)
+
+    output:
+        tuple val(sample), path("*.tsv"), path("*.tsv"), emit: tsv
+        path "versions.yml"             , emit: versions
+
+    script:
+        def prefix           = task.ext.prefix ?: "${mzml.baseName}"
+        def args             = task.ext.args  ?: ''
+
+        def xions            = params.use_x_ions ? "-use_x_ions" : ""
+        def zions            = params.use_z_ions ? "-use_z_ions" : ""
+        def aions            = params.use_a_ions ? "-use_a_ions" : ""
+        def cions            = params.use_c_ions ? "-use_c_ions" : ""
+
+        """
+        get_ion_annotations.py --input $mzml \\
+            -idxml $fdr_filtered_idxml \\
+            --prefix $sample \\
+            $args \\
+            $xions \\
+            $zions \\
+            $aions \\
+            $cions
+
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            pyopenms: \$(echo \$(FileInfo --help 2>&1) | sed 's/^.*Version: //; s/-.*\$//' | sed 's/ -*//; s/ .*\$//')
+        END_VERSIONS
+        """
+}
diff --git a/workflows/mhcquant.nf b/workflows/mhcquant.nf
@@ -70,6 +70,7 @@ include { OPENMS_IDFILTER as OPENMS_IDFILTER_Q_VALUE }                      from
 include { OPENMS_IDMERGER }                                                 from '../modules/local/openms_idmerger'
 include { OPENMS_PSMFEATUREEXTRACTOR }                                      from '../modules/local/openms_psmfeatureextractor'
 include { OPENMS_PERCOLATORADAPTER }                                        from '../modules/local/openms_percolatoradapter'
+include { PYOPENMS_ION_ANNOTATOR }                                          from '../modules/local/pyopenms_ion_annotator'
 
 include { OPENMS_TEXTEXPORTER as OPENMS_TEXTEXPORTER_FDR }                  from '../modules/local/openms_textexporter'
 include { OPENMS_TEXTEXPORTER as OPENMS_TEXTEXPORTER_UNQUANTIFIED }         from '../modules/local/openms_textexporter'
@@ -221,11 +222,11 @@ workflow MHCQUANT {
     // Prepare for check if file is empty
     OPENMS_TEXTEXPORTER_FDR(OPENMS_IDFILTER_Q_VALUE.out.idxml)
     // Return an error message when there is only a header present in the document
-    OPENMS_TEXTEXPORTER_FDR.out.tsv.map { 
+    OPENMS_TEXTEXPORTER_FDR.out.tsv.map {
         meta, tsv -> if (tsv.size() < 130) {
             log.error "It seems that there were no significant hits found for one or more samples.\nPlease consider incrementing the '--fdr_threshold' after removing the work directory or to exclude this sample."
             exit(0)
-        } 
+        }
     }
 
     //
@@ -302,6 +303,15 @@ workflow MHCQUANT {
         )
     }
 
+    // Annotate spectra with ion fragmentation information
+    ch_filtered_idxml = OPENMS_IDFILTER_Q_VALUE.out.idxml.map { meta, idxml -> [meta.id, idxml] }
+    ch_raw_spectra_and_filtered_peptides = ch_mzml_file.map {
+                                                meta, mzml -> [meta.sample + '_' + meta.condition, mzml[0]] }
+                                            .groupTuple()
+                                            .join(ch_filtered_idxml)
+
+    PYOPENMS_ION_ANNOTATOR( ch_raw_spectra_and_filtered_peptides )
+
     //
     // MODULE: Pipeline reporting
     //
@@ -315,7 +325,7 @@ workflow MHCQUANT {
     if (!params.skip_multiqc) {
         workflow_summary = WorkflowMhcquant.paramsSummaryMultiqc(workflow, summary_params)
         ch_workflow_summary = Channel.value(workflow_summary)
-        
+
         methods_description    = WorkflowMhcquant.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description)
         ch_methods_description = Channel.value(methods_description)
 
@@ -333,6 +343,7 @@ workflow MHCQUANT {
         multiqc_report = MULTIQC.out.report.toList()
         ch_versions    = ch_versions.mix(MULTIQC.out.versions)
     }
+
 }
 
 /*