Skip to content

Commit

Permalink
Merge pull request #229 from jonasscheid/dev
Browse files Browse the repository at this point in the history
Add ion annotation feature
  • Loading branch information
jonasscheid committed Nov 14, 2022
2 parents 05bdea2 + 0c2f3dd commit 6350048
Show file tree
Hide file tree
Showing 4 changed files with 346 additions and 3 deletions.
276 changes: 276 additions & 0 deletions bin/get_ion_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
#!/usr/bin/env python
__author__ = "Jonas Scheid"

from typing import Tuple
from pyopenms import *
import pandas as pd
import numpy as np
import argparse
from pyopenms.Plotting import *

def parse_arguments() -> Tuple[argparse.ArgumentParser, argparse.Namespace]:
"""
Purpose: Parse command line arguments
Output: parser
args: Contains all parsed command line arguments
"""
parser = argparse.ArgumentParser(
description="""Ion annotator \n Returns two tsv files (all_ions, matching_ions)
that retrieve all ions with their respective m/z and intensities as well as the ion annotation of ions matching
to the peptides theoretical spectrum"""
)

parser.add_argument(
"-i",
"--input",
required=True,
nargs="+",
help="List of replicate mzML files of a sample",
)
parser.add_argument(
"-idxml",
"--filtered_idXML",
required=True,
type=str,
help="FDR filtered idXML file that contains all peptides of the final results",
)
parser.add_argument(
"-p",
"--prefix",
required=True,
type=str,
help="Prefix that will be added to the two output files",
)
parser.add_argument(
"-pc",
"--precursor_charge",
type=str,
default="2:3",
help="Precursor charge range",
)
parser.add_argument(
"-fmt",
"--fragment_mass_tolerance",
type=str,
default="0.02",
help="The fragment mass tolerance of theoretical and experimental spectrum matching",
)
parser.add_argument(
"-a_ions",
"--use_a_ions",
action="store_true",
help="Add a-ions to the theoretical spectrum generation",
)
parser.add_argument(
"-c_ions",
"--use_c_ions",
action="store_true",
help="Add c-ions to the theoretical spectrum generation",
)
parser.add_argument(
"-x_ions",
"--use_x_ions",
action="store_true",
help="Add x-ions to the theoretical spectrum generation",
)
parser.add_argument(
"-z_ions",
"--use_z_ions",
action="store_true",
help="Add z-ions to the theoretical spectrum generation",
)
parser.add_argument(
"-rpp",
"--remove_precursor_peak",
type=bool,
default=False,
help="Do not consider precursor masses in the theoretical spectrum generation",
)

args = parser.parse_args()

return parser, args

def generate_theoretical_spectrum(
peptide: PeptideIdentification, args: argparse.Namespace
) -> MSSpectrum:
"""
Purpose: Generate theoretical spectrum based on PeptideIdentification
Output: Theoretical spectrum of input peptide
"""
sequence = peptide.getHits()[0].getSequence()
min_charge = int(args.precursor_charge.split(":")[0])
# If precursor charge ranges between 2:5, the fragment charges range from 1:4.
# Get the precursor charge information from the idXML file
min_fragment_charge, max_fragment_charge = min_charge - 1, max(
peptide.getHits()[0].getCharge() - 1, 1
)

# Define parameters for theoretical spectrum generation
tsg = TheoreticalSpectrumGenerator()
theo_spectrum = MSSpectrum()
p = tsg.getParameters()
p.setValue("add_y_ions", "true")
p.setValue("add_b_ions", "true")
if args.use_a_ions:
p.setValue("add_a_ions", "true")
if args.use_c_ions:
p.setValue("add_c_ions", "true")
if args.use_x_ions:
p.setValue("add_x_ions", "true")
if args.use_z_ions:
p.setValue("add_z_ions", "true")
p.setValue("add_metainfo", "true")
p.setValue("add_losses", "true")
p.setValue("add_precursor_peaks", "true")
p.setValue("add_first_prefix_ion", "true")
p.setValue("add_abundant_immonium_ions", "true")
if not args.remove_precursor_peak:
p.setValue("add_all_precursor_charges", "true")

# Generate theoretical spectrum
tsg.setParameters(p)
tsg.getSpectrum(theo_spectrum, sequence, min_fragment_charge, max_fragment_charge)

return theo_spectrum

def flatten(ls: list) -> list:
"""
Purpose: Flatten list of lists into a list
Output: List
"""
return [item for sublist in ls for item in sublist]

def __main__():
parser, args = parse_arguments()
protein_ids = []
peptide_ids = []
# Each peptide ID should only contain one hit in the FDR_filtered idXML
IdXMLFile().load(args.filtered_idXML, protein_ids, peptide_ids)
# Get the list of mzML files that have been merged together by IDmerger previously
filenames = [
filename.decode("utf-8")
for filename in protein_ids[0].getMetaValue("spectra_data")
]
# Define empty lists that collect all the necessary information, which is comprised in DataFrames
ions = []
spectra_mz = []
spectra_intensities = []
spectra_peptides = []
spectra_nativeIDs = []
spectra_filename = []
is_matching_ion = []
# Define spectrum alignment class and set parameters for later use
spa = SpectrumAlignment()
# Specify parameters for the alignment. Pyopenms offers two parameters here
p = spa.getParameters()
# Since we look at the MS2 Spectrum we align Da tolerance
p.setValue("tolerance", float(args.fragment_mass_tolerance))
p.setValue("is_relative_tolerance", "false") # false == Da, true == ppm
spa.setParameters(p)

for file in args.input:
# Load the mzML files into the pyopenms structure
exp = MSExperiment()
MzMLFile().load(file, exp)
# Create lookup object for easy spectrum reference.
spectrum_lookup = SpectrumLookup()
spectrum_lookup.readSpectra(exp, "scan=(?<SCAN>\\d+)")
# Iterate over the FDR filtered peptideIDs
for peptide_id in peptide_ids:
# Check if the PeptideHit originates from the current mzML file
if (
file.split("/")[-1]
!= filenames[peptide_id.getMetaValue("id_merge_index")]
):
continue
# Access raw spectrum via the spectrum native ID
spectrum = exp.getSpectrum(
spectrum_lookup.findByNativeID(
peptide_id.getMetaValue("spectrum_reference")
)
)
# Save mz and intensities of all peaks to comprise them later in a DataFrame
spectrum_mz, spectrum_intensities = spectrum.get_peaks()
spectra_mz.append(spectrum_mz)
spectra_intensities.append(spectrum_intensities)
sequence = peptide_id.getHits()[0].getSequence()
spectra_peptides.append(np.repeat(sequence, len(spectrum_mz)))
is_matching_ion_peptide = np.repeat(False, len(spectrum_mz))
spectra_nativeIDs.append(
np.repeat(
peptide_id.getMetaValue("spectrum_reference"), len(spectrum_mz)
)
)
# Generate theoretical spectrum of peptide hit
theo_spectrum = generate_theoretical_spectrum(peptide_id, args)
# Align both spectra
alignment = []
spa.getSpectrumAlignment(alignment, theo_spectrum, spectrum)
# Obtain ion annotations from theoretical spectrum if there are at least two matching ions
if len(alignment) > 1:
for theo_idx, obs_idx in alignment:
ion_name = theo_spectrum.getStringDataArrays()[0][theo_idx].decode()
ion_charge = theo_spectrum.getIntegerDataArrays()[0][theo_idx]
obs_mz = spectrum[obs_idx].getMZ()
obs_int = spectrum[obs_idx].getIntensity()
ions.append(
[
sequence,
ion_name,
ion_charge,
theo_spectrum[theo_idx].getMZ(),
obs_mz,
obs_int,
]
)
is_matching_ion_peptide[obs_idx] = True

is_matching_ion.append(is_matching_ion_peptide)

spectra_filename.append(
np.repeat(file, len(flatten(spectra_mz)) - len(flatten(spectra_filename)))
)

# Save information of matching ions
matching_ions = pd.DataFrame.from_records(
ions,
columns=[
"Peptide",
"Ion_name",
"Ion_charge",
"Theoretical_mass",
"Experimental_mass",
"Intensity",
],
)
# Save information of all peaks (including non-matching peaks)
all_peaks_df = pd.DataFrame(
[
flatten(spectra_peptides),
flatten(spectra_mz),
flatten(spectra_intensities),
flatten(is_matching_ion),
flatten(spectra_nativeIDs),
flatten(spectra_filename),
],
index=[
"Peptide",
"Experimental_mass",
"Intensity",
"Is_matching_ion",
"nativeID",
"filename",
],
).transpose()
all_peaks_df.index.name = "Ion_ID"
# Numpy magically converts booleans to floats of 1.0 and 0.0, revert that
all_peaks_df["Is_matching_ion"] = all_peaks_df["Is_matching_ion"].astype(bool)
# Add ion ID column to matching ions table for easy reference
matching_ions["Ion_ID"] = all_peaks_df.index[all_peaks_df["Is_matching_ion"]]
# Write matching ions table
matching_ions.to_csv(f"{args.prefix}_matching_ions.tsv", sep="\t", index=False)
all_peaks_df.to_csv(f"{args.prefix}_all_peaks.tsv", sep="\t")

if __name__ == "__main__":
__main__()
14 changes: 14 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,20 @@ process {
]
}

withName: 'PYOPENMS_ION_ANNOTATOR' {
ext.prefix = { "${meta.sample}" }
ext.args = [
"--precursor_charge ${params.prec_charge}",
"--fragment_mass_tolerance ${params.fragment_mass_tolerance}",
"--remove_precursor_peak ${params.remove_precursor_peak}"
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/intermediate_results/ion_annotations" },
mode: params.publish_dir_mode,
pattern: '*.tsv'
]
}

withName: 'OPENMS_TEXTEXPORTER_UNQUANTIFIED|OPENMS_TEXTEXPORTER_QUANTIFIED' {
publishDir = [
path: { "${params.outdir}/" },
Expand Down
42 changes: 42 additions & 0 deletions modules/local/pyopenms_ion_annotator.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
process PYOPENMS_ION_ANNOTATOR {
tag "$sample"
label 'process_high'

conda (params.enable_conda ? "bioconda::pyopenms=2.8.0" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/pyopenms:2.8.0--py310h3dc0cdb_1' :
'quay.io/biocontainers/pyopenms:2.8.0--py310h3dc0cdb_1' }"

input:
tuple val(sample), path(mzml), path(fdr_filtered_idxml)

output:
tuple val(sample), path("*.tsv"), path("*.tsv"), emit: tsv
path "versions.yml" , emit: versions

script:
def prefix = task.ext.prefix ?: "${mzml.baseName}"
def args = task.ext.args ?: ''

def xions = params.use_x_ions ? "-use_x_ions" : ""
def zions = params.use_z_ions ? "-use_z_ions" : ""
def aions = params.use_a_ions ? "-use_a_ions" : ""
def cions = params.use_c_ions ? "-use_c_ions" : ""

"""
get_ion_annotations.py --input $mzml \\
-idxml $fdr_filtered_idxml \\
--prefix $sample \\
$args \\
$xions \\
$zions \\
$aions \\
$cions
cat <<-END_VERSIONS > versions.yml
"${task.process}":
pyopenms: \$(echo \$(FileInfo --help 2>&1) | sed 's/^.*Version: //; s/-.*\$//' | sed 's/ -*//; s/ .*\$//')
END_VERSIONS
"""
}
17 changes: 14 additions & 3 deletions workflows/mhcquant.nf
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ include { OPENMS_IDFILTER as OPENMS_IDFILTER_Q_VALUE } from
include { OPENMS_IDMERGER } from '../modules/local/openms_idmerger'
include { OPENMS_PSMFEATUREEXTRACTOR } from '../modules/local/openms_psmfeatureextractor'
include { OPENMS_PERCOLATORADAPTER } from '../modules/local/openms_percolatoradapter'
include { PYOPENMS_ION_ANNOTATOR } from '../modules/local/pyopenms_ion_annotator'

include { OPENMS_TEXTEXPORTER as OPENMS_TEXTEXPORTER_FDR } from '../modules/local/openms_textexporter'
include { OPENMS_TEXTEXPORTER as OPENMS_TEXTEXPORTER_UNQUANTIFIED } from '../modules/local/openms_textexporter'
Expand Down Expand Up @@ -221,11 +222,11 @@ workflow MHCQUANT {
// Prepare for check if file is empty
OPENMS_TEXTEXPORTER_FDR(OPENMS_IDFILTER_Q_VALUE.out.idxml)
// Return an error message when there is only a header present in the document
OPENMS_TEXTEXPORTER_FDR.out.tsv.map {
OPENMS_TEXTEXPORTER_FDR.out.tsv.map {
meta, tsv -> if (tsv.size() < 130) {
log.error "It seems that there were no significant hits found for one or more samples.\nPlease consider incrementing the '--fdr_threshold' after removing the work directory or to exclude this sample."
exit(0)
}
}
}

//
Expand Down Expand Up @@ -302,6 +303,15 @@ workflow MHCQUANT {
)
}

// Annotate spectra with ion fragmentation information
ch_filtered_idxml = OPENMS_IDFILTER_Q_VALUE.out.idxml.map { meta, idxml -> [meta.id, idxml] }
ch_raw_spectra_and_filtered_peptides = ch_mzml_file.map {
meta, mzml -> [meta.sample + '_' + meta.condition, mzml[0]] }
.groupTuple()
.join(ch_filtered_idxml)

PYOPENMS_ION_ANNOTATOR( ch_raw_spectra_and_filtered_peptides )

//
// MODULE: Pipeline reporting
//
Expand All @@ -315,7 +325,7 @@ workflow MHCQUANT {
if (!params.skip_multiqc) {
workflow_summary = WorkflowMhcquant.paramsSummaryMultiqc(workflow, summary_params)
ch_workflow_summary = Channel.value(workflow_summary)

methods_description = WorkflowMhcquant.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description)
ch_methods_description = Channel.value(methods_description)

Expand All @@ -333,6 +343,7 @@ workflow MHCQUANT {
multiqc_report = MULTIQC.out.report.toList()
ch_versions = ch_versions.mix(MULTIQC.out.versions)
}

}

/*
Expand Down

0 comments on commit 6350048

Please sign in to comment.