Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ion annotation feature #229

Merged
merged 4 commits into from
Nov 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
276 changes: 276 additions & 0 deletions bin/get_ion_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
#!/usr/bin/env python
__author__ = "Jonas Scheid"

from typing import Tuple
from pyopenms import *
import pandas as pd
import numpy as np
import argparse
from pyopenms.Plotting import *

def parse_arguments() -> Tuple[argparse.ArgumentParser, argparse.Namespace]:
"""
Purpose: Parse command line arguments
Output: parser
args: Contains all parsed command line arguments
"""
parser = argparse.ArgumentParser(
description="""Ion annotator \n Returns two tsv files (all_ions, matching_ions)
that retrieve all ions with their respective m/z and intensities as well as the ion annotation of ions matching
to the peptides theoretical spectrum"""
)

parser.add_argument(
"-i",
"--input",
required=True,
nargs="+",
help="List of replicate mzML files of a sample",
)
parser.add_argument(
"-idxml",
"--filtered_idXML",
required=True,
type=str,
help="FDR filtered idXML file that contains all peptides of the final results",
)
parser.add_argument(
"-p",
"--prefix",
required=True,
type=str,
help="Prefix that will be added to the two output files",
)
parser.add_argument(
"-pc",
"--precursor_charge",
type=str,
default="2:3",
help="Precursor charge range",
)
parser.add_argument(
"-fmt",
"--fragment_mass_tolerance",
type=str,
default="0.02",
help="The fragment mass tolerance of theoretical and experimental spectrum matching",
)
parser.add_argument(
"-a_ions",
"--use_a_ions",
action="store_true",
help="Add a-ions to the theoretical spectrum generation",
)
parser.add_argument(
"-c_ions",
"--use_c_ions",
action="store_true",
help="Add c-ions to the theoretical spectrum generation",
)
parser.add_argument(
"-x_ions",
"--use_x_ions",
action="store_true",
help="Add x-ions to the theoretical spectrum generation",
)
parser.add_argument(
"-z_ions",
"--use_z_ions",
action="store_true",
help="Add z-ions to the theoretical spectrum generation",
)
parser.add_argument(
"-rpp",
"--remove_precursor_peak",
type=bool,
default=False,
help="Do not consider precursor masses in the theoretical spectrum generation",
)

args = parser.parse_args()

return parser, args

def generate_theoretical_spectrum(
peptide: PeptideIdentification, args: argparse.Namespace
) -> MSSpectrum:
"""
Purpose: Generate theoretical spectrum based on PeptideIdentification
Output: Theoretical spectrum of input peptide
"""
sequence = peptide.getHits()[0].getSequence()
min_charge = int(args.precursor_charge.split(":")[0])
# If precursor charge ranges between 2:5, the fragment charges range from 1:4.
# Get the precursor charge information from the idXML file
min_fragment_charge, max_fragment_charge = min_charge - 1, max(
peptide.getHits()[0].getCharge() - 1, 1
)

# Define parameters for theoretical spectrum generation
tsg = TheoreticalSpectrumGenerator()
theo_spectrum = MSSpectrum()
p = tsg.getParameters()
p.setValue("add_y_ions", "true")
p.setValue("add_b_ions", "true")
if args.use_a_ions:
p.setValue("add_a_ions", "true")
if args.use_c_ions:
p.setValue("add_c_ions", "true")
if args.use_x_ions:
p.setValue("add_x_ions", "true")
if args.use_z_ions:
p.setValue("add_z_ions", "true")
p.setValue("add_metainfo", "true")
p.setValue("add_losses", "true")
p.setValue("add_precursor_peaks", "true")
p.setValue("add_first_prefix_ion", "true")
p.setValue("add_abundant_immonium_ions", "true")
if not args.remove_precursor_peak:
p.setValue("add_all_precursor_charges", "true")

# Generate theoretical spectrum
tsg.setParameters(p)
tsg.getSpectrum(theo_spectrum, sequence, min_fragment_charge, max_fragment_charge)

return theo_spectrum

def flatten(ls: list) -> list:
"""
Purpose: Flatten list of lists into a list
Output: List
"""
return [item for sublist in ls for item in sublist]

def __main__():
parser, args = parse_arguments()
protein_ids = []
peptide_ids = []
# Each peptide ID should only contain one hit in the FDR_filtered idXML
IdXMLFile().load(args.filtered_idXML, protein_ids, peptide_ids)
# Get the list of mzML files that have been merged together by IDmerger previously
filenames = [
filename.decode("utf-8")
for filename in protein_ids[0].getMetaValue("spectra_data")
]
# Define empty lists that collect all the necessary information, which is comprised in DataFrames
ions = []
spectra_mz = []
spectra_intensities = []
spectra_peptides = []
spectra_nativeIDs = []
spectra_filename = []
is_matching_ion = []
# Define spectrum alignment class and set parameters for later use
spa = SpectrumAlignment()
# Specify parameters for the alignment. Pyopenms offers two parameters here
p = spa.getParameters()
# Since we look at the MS2 Spectrum we align Da tolerance
p.setValue("tolerance", float(args.fragment_mass_tolerance))
p.setValue("is_relative_tolerance", "false") # false == Da, true == ppm
spa.setParameters(p)

for file in args.input:
# Load the mzML files into the pyopenms structure
exp = MSExperiment()
MzMLFile().load(file, exp)
# Create lookup object for easy spectrum reference.
spectrum_lookup = SpectrumLookup()
spectrum_lookup.readSpectra(exp, "scan=(?<SCAN>\\d+)")
# Iterate over the FDR filtered peptideIDs
for peptide_id in peptide_ids:
# Check if the PeptideHit originates from the current mzML file
if (
file.split("/")[-1]
!= filenames[peptide_id.getMetaValue("id_merge_index")]
):
continue
# Access raw spectrum via the spectrum native ID
spectrum = exp.getSpectrum(
spectrum_lookup.findByNativeID(
peptide_id.getMetaValue("spectrum_reference")
)
)
# Save mz and intensities of all peaks to comprise them later in a DataFrame
spectrum_mz, spectrum_intensities = spectrum.get_peaks()
spectra_mz.append(spectrum_mz)
spectra_intensities.append(spectrum_intensities)
sequence = peptide_id.getHits()[0].getSequence()
spectra_peptides.append(np.repeat(sequence, len(spectrum_mz)))
is_matching_ion_peptide = np.repeat(False, len(spectrum_mz))
spectra_nativeIDs.append(
np.repeat(
peptide_id.getMetaValue("spectrum_reference"), len(spectrum_mz)
)
)
# Generate theoretical spectrum of peptide hit
theo_spectrum = generate_theoretical_spectrum(peptide_id, args)
# Align both spectra
alignment = []
spa.getSpectrumAlignment(alignment, theo_spectrum, spectrum)
# Obtain ion annotations from theoretical spectrum if there are at least two matching ions
if len(alignment) > 1:
for theo_idx, obs_idx in alignment:
ion_name = theo_spectrum.getStringDataArrays()[0][theo_idx].decode()
ion_charge = theo_spectrum.getIntegerDataArrays()[0][theo_idx]
obs_mz = spectrum[obs_idx].getMZ()
obs_int = spectrum[obs_idx].getIntensity()
ions.append(
[
sequence,
ion_name,
ion_charge,
theo_spectrum[theo_idx].getMZ(),
obs_mz,
obs_int,
]
)
is_matching_ion_peptide[obs_idx] = True

is_matching_ion.append(is_matching_ion_peptide)

spectra_filename.append(
np.repeat(file, len(flatten(spectra_mz)) - len(flatten(spectra_filename)))
)

# Save information of matching ions
matching_ions = pd.DataFrame.from_records(
ions,
columns=[
"Peptide",
"Ion_name",
"Ion_charge",
"Theoretical_mass",
"Experimental_mass",
"Intensity",
],
)
# Save information of all peaks (including non-matching peaks)
all_peaks_df = pd.DataFrame(
[
flatten(spectra_peptides),
flatten(spectra_mz),
flatten(spectra_intensities),
flatten(is_matching_ion),
flatten(spectra_nativeIDs),
flatten(spectra_filename),
],
index=[
"Peptide",
"Experimental_mass",
"Intensity",
"Is_matching_ion",
"nativeID",
"filename",
],
).transpose()
all_peaks_df.index.name = "Ion_ID"
# Numpy magically converts booleans to floats of 1.0 and 0.0, revert that
all_peaks_df["Is_matching_ion"] = all_peaks_df["Is_matching_ion"].astype(bool)
# Add ion ID column to matching ions table for easy reference
matching_ions["Ion_ID"] = all_peaks_df.index[all_peaks_df["Is_matching_ion"]]
# Write matching ions table
matching_ions.to_csv(f"{args.prefix}_matching_ions.tsv", sep="\t", index=False)
all_peaks_df.to_csv(f"{args.prefix}_all_peaks.tsv", sep="\t")

if __name__ == "__main__":
__main__()
14 changes: 14 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,20 @@ process {
]
}

withName: 'PYOPENMS_ION_ANNOTATOR' {
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved
ext.prefix = { "${meta.sample}" }
ext.args = [
"--precursor_charge ${params.prec_charge}",
"--fragment_mass_tolerance ${params.fragment_mass_tolerance}",
"--remove_precursor_peak ${params.remove_precursor_peak}"
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/intermediate_results/ion_annotations" },
mode: params.publish_dir_mode,
pattern: '*.tsv'
]
}

withName: 'OPENMS_TEXTEXPORTER_UNQUANTIFIED|OPENMS_TEXTEXPORTER_QUANTIFIED' {
publishDir = [
path: { "${params.outdir}/" },
Expand Down
42 changes: 42 additions & 0 deletions modules/local/pyopenms_ion_annotator.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
process PYOPENMS_ION_ANNOTATOR {
tag "$sample"
label 'process_high'

conda (params.enable_conda ? "bioconda::pyopenms=2.8.0" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/pyopenms:2.8.0--py310h3dc0cdb_1' :
'quay.io/biocontainers/pyopenms:2.8.0--py310h3dc0cdb_1' }"

input:
tuple val(sample), path(mzml), path(fdr_filtered_idxml)

output:
tuple val(sample), path("*.tsv"), path("*.tsv"), emit: tsv
path "versions.yml" , emit: versions

script:
def prefix = task.ext.prefix ?: "${mzml.baseName}"
def args = task.ext.args ?: ''

def xions = params.use_x_ions ? "-use_x_ions" : ""
def zions = params.use_z_ions ? "-use_z_ions" : ""
def aions = params.use_a_ions ? "-use_a_ions" : ""
def cions = params.use_c_ions ? "-use_c_ions" : ""

"""
get_ion_annotations.py --input $mzml \\
-idxml $fdr_filtered_idxml \\
--prefix $sample \\
$args \\
$xions \\
$zions \\
$aions \\
$cions


cat <<-END_VERSIONS > versions.yml
"${task.process}":
pyopenms: \$(echo \$(FileInfo --help 2>&1) | sed 's/^.*Version: //; s/-.*\$//' | sed 's/ -*//; s/ .*\$//')
END_VERSIONS
"""
}
17 changes: 14 additions & 3 deletions workflows/mhcquant.nf
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ include { OPENMS_IDFILTER as OPENMS_IDFILTER_Q_VALUE } from
include { OPENMS_IDMERGER } from '../modules/local/openms_idmerger'
include { OPENMS_PSMFEATUREEXTRACTOR } from '../modules/local/openms_psmfeatureextractor'
include { OPENMS_PERCOLATORADAPTER } from '../modules/local/openms_percolatoradapter'
include { PYOPENMS_ION_ANNOTATOR } from '../modules/local/pyopenms_ion_annotator'

include { OPENMS_TEXTEXPORTER as OPENMS_TEXTEXPORTER_FDR } from '../modules/local/openms_textexporter'
include { OPENMS_TEXTEXPORTER as OPENMS_TEXTEXPORTER_UNQUANTIFIED } from '../modules/local/openms_textexporter'
Expand Down Expand Up @@ -221,11 +222,11 @@ workflow MHCQUANT {
// Prepare for check if file is empty
OPENMS_TEXTEXPORTER_FDR(OPENMS_IDFILTER_Q_VALUE.out.idxml)
// Return an error message when there is only a header present in the document
OPENMS_TEXTEXPORTER_FDR.out.tsv.map {
OPENMS_TEXTEXPORTER_FDR.out.tsv.map {
meta, tsv -> if (tsv.size() < 130) {
log.error "It seems that there were no significant hits found for one or more samples.\nPlease consider incrementing the '--fdr_threshold' after removing the work directory or to exclude this sample."
exit(0)
}
}
}

//
Expand Down Expand Up @@ -302,6 +303,15 @@ workflow MHCQUANT {
)
}

// Annotate spectra with ion fragmentation information
ch_filtered_idxml = OPENMS_IDFILTER_Q_VALUE.out.idxml.map { meta, idxml -> [meta.id, idxml] }
ch_raw_spectra_and_filtered_peptides = ch_mzml_file.map {
meta, mzml -> [meta.sample + '_' + meta.condition, mzml[0]] }
.groupTuple()
.join(ch_filtered_idxml)

PYOPENMS_ION_ANNOTATOR( ch_raw_spectra_and_filtered_peptides )

//
// MODULE: Pipeline reporting
//
Expand All @@ -315,7 +325,7 @@ workflow MHCQUANT {
if (!params.skip_multiqc) {
workflow_summary = WorkflowMhcquant.paramsSummaryMultiqc(workflow, summary_params)
ch_workflow_summary = Channel.value(workflow_summary)

methods_description = WorkflowMhcquant.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description)
ch_methods_description = Channel.value(methods_description)

Expand All @@ -333,6 +343,7 @@ workflow MHCQUANT {
multiqc_report = MULTIQC.out.report.toList()
ch_versions = ch_versions.mix(MULTIQC.out.versions)
}

}

/*
Expand Down