Skip to content

Commit

Permalink
Fix cli tests: (#28)
Browse files Browse the repository at this point in the history
- Remove test_cli_pepxml because XML files don't work with streaming
- Replace old output file names
- Add random generator 'rng' variable to confidence since it is required for proteins
- Remove subset_max_train from PluginModel
- Fix bug: convert targets column after reading in chunks
- Fix peptide column name for confidence
- Fix test cli plugins: replace DecisionTreeClassifier with LinearSVC, because DecisionTreeClassifier returns scores only as 0 or 1
  • Loading branch information
sambenfredj authored and gessulat committed Feb 27, 2024
1 parent 8f417dd commit 2e1723e
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 84 deletions.
41 changes: 33 additions & 8 deletions mokapot/confidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ class GroupedConfidence:
----------
psms : OnDiskPsmDataset
A collection of PSMs.
rng : int or np.random.Generator, optional
A seed or generator used for cross-validation split creation and to
break ties, or ``None`` to use the default random number generator
state.
scores : np.ndarray
A vector containing the score of each PSM.
desc : bool
Expand Down Expand Up @@ -84,6 +88,7 @@ def __init__(
proteins=None,
combine=False,
prefixes=None,
rng=0,
):
"""Initialize a GroupedConfidence object"""
data = read_file(psms.filename, use_cols=list(psms.columns))
Expand Down Expand Up @@ -124,6 +129,7 @@ def __init__(
combine=combine,
prefixes=prefixes,
append_to_output_file=append_to_group,
rng=rng,
)
if combine:
append_to_group = True
Expand Down Expand Up @@ -236,11 +242,12 @@ class Confidence(object):
"peptide_pairs": "Peptide Pairs",
}

def __init__(self, psms, proteins=None):
def __init__(self, psms, proteins=None, rng=0):
"""Initialize a PsmConfidence object."""
self._score_column = "score"
self._target_column = psms.target_column
self._protein_column = "proteinIds"
self._rng = rng
self._group_column = psms.group_column
self._metadata_column = psms.metadata_columns

Expand Down Expand Up @@ -393,6 +400,10 @@ class LinearConfidence(Confidence):
----------
psms : OnDiskPsmDataset
A collection of PSMs.
rng : int or np.random.Generator, optional
A seed or generator used for cross-validation split creation and to
break ties, or ``None`` to use the default random number generator
state.
level_paths : List(Path)
Files with unique psms and unique peptides.
levels : List(str)
Expand Down Expand Up @@ -422,12 +433,12 @@ def __init__(
deduplication=True,
proteins=None,
sep="\t",
rng=0,
):
"""Initialize a a LinearPsmConfidence object"""
super().__init__(psms, proteins)
super().__init__(psms, proteins, rng)
self._target_column = psms.target_column
self._psm_columns = psms.spectrum_columns
self._peptide_column = psms.peptide_column
self._peptide_column = "peptide"
self._protein_column = "proteinIds"
self._eval_fdr = eval_fdr
self.deduplication = deduplication
Expand Down Expand Up @@ -510,7 +521,7 @@ def _assign_confidence(
self._peptide_column,
self._score_column,
self._proteins,
self.rng,
self._rng,
)
proteins_path = "proteins.csv"
proteins.to_csv(proteins_path, index=False, sep=sep)
Expand Down Expand Up @@ -640,8 +651,7 @@ def __init__(
"""Initialize a CrossLinkedConfidence object"""
super().__init__(psms)
self._target_column = psms.target_column
self._psm_columns = psms.spectrum_columns
self._peptide_column = psms.peptide_column
self._peptide_column = "peptide"

self._assign_confidence(
level_paths=level_paths,
Expand Down Expand Up @@ -711,13 +721,18 @@ def assign_confidence(
group_column=None,
combine=False,
append_to_output_file=False,
rng=0,
):
"""Assign confidence to PSMs peptides, and optionally, proteins.
Parameters
----------
psms : OnDiskPsmDataset
A collection of PSMs.
rng : int or np.random.Generator, optional
A seed or generator used for cross-validation split creation and to
break ties, or ``None`` to use the default random number generator
state.
scores : numpy.ndarray
The scores by which to rank the PSMs. The default, :code:`None`,
uses the feature that accepts the most PSMs at an FDR threshold of
Expand Down Expand Up @@ -766,6 +781,7 @@ def assign_confidence(
feat
].values
)

psms_path = "psms.csv"
peptides_path = "peptides.csv"
levels = ["psms"]
Expand Down Expand Up @@ -796,7 +812,10 @@ def assign_confidence(
if _psms.group_column is None:
out_files = []
for level in levels:
dest_dir_prefix = f"{dest_dir}/"
if str(dest_dir)[-1] == ".":
dest_dir_prefix = dest_dir
else:
dest_dir_prefix = f"{dest_dir}/"
if prefix is not None:
dest_dir_prefix = dest_dir_prefix + f"{prefix}."
if group_column is not None and not combine:
Expand Down Expand Up @@ -892,6 +911,7 @@ def assign_confidence(
decoys=decoys,
deduplication=deduplication,
proteins=proteins,
rng=rng,
)
if prefix is None:
append_to_output_file = True
Expand All @@ -908,12 +928,17 @@ def assign_confidence(
proteins=proteins,
combine=combine,
prefixes=[prefix],
rng=rng,
)


def save_sorted_metadata_chunks(
chunk_metadata, score_chunk, psms, deduplication, i, sep
):
chunk_metadata = convert_targets_column(
data=chunk_metadata.apply(pd.to_numeric, errors="ignore"),
target_column=psms.target_column,
)
chunk_metadata["score"] = score_chunk
chunk_metadata.sort_values(by="score", ascending=False, inplace=True)
if deduplication:
Expand Down
1 change: 1 addition & 0 deletions mokapot/mokapot.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def main():
deduplication=deduplication,
proteins=proteins,
prefixes=prefixes,
rng=config.seed,
)

if config.save_models:
Expand Down
8 changes: 6 additions & 2 deletions mokapot/parsers/pin.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def get_column_names_from_file(file):
return perc.readline().rstrip().split("\t")


def get_rows_from_dataframe(idx, chunk, train_psms):
def get_rows_from_dataframe(idx, chunk, train_psms, psms):
"""
extract rows from a chunk of a dataframe
Expand All @@ -311,6 +311,10 @@ def get_rows_from_dataframe(idx, chunk, train_psms):
List
list of list of dataframes
"""
chunk = convert_targets_column(
data=chunk.apply(pd.to_numeric, errors="ignore"),
target_column=psms.target_column,
)
for k, train in enumerate(idx):
idx_ = list(set(train) & set(chunk.index))
train_psms[k].append(
Expand Down Expand Up @@ -345,7 +349,7 @@ def parse_in_chunks(psms, train_idx, chunk_size):
use_cols=_psms.columns,
)
Parallel(n_jobs=-1, require="sharedmem")(
delayed(get_rows_from_dataframe)(idx, chunk, train_psms)
delayed(get_rows_from_dataframe)(idx, chunk, train_psms, _psms)
for chunk in reader
)
return Parallel(n_jobs=-1, require="sharedmem")(
Expand Down
5 changes: 2 additions & 3 deletions tests/system_tests/sample_plugin/mokapot_ctree/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from argparse import _ArgumentGroup

from sklearn import tree
from sklearn import svm

from mokapot.model import Model
from mokapot.plugins import BasePlugin
Expand All @@ -12,7 +12,7 @@
class PluginModel(Model):
def __init__(self, *args, **kwargs):
LOGGER.warning("The ctree model is not production ready")
clf = tree.DecisionTreeClassifier()
clf = svm.LinearSVC()
super().__init__(clf, *args, **kwargs)


Expand All @@ -30,7 +30,6 @@ def get_model(self, config):
max_iter=config.max_iter,
direction=config.direction,
override=config.override,
subset_max_train=config.subset_max_train,
)

def process_data(self, data, config):
Expand Down
92 changes: 21 additions & 71 deletions tests/system_tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import subprocess
from pathlib import Path

import pandas as pd
import pytest

# Warnings are errors for these tests
Expand All @@ -28,19 +27,12 @@ def phospho_files():
return pin, fasta


@pytest.fixture
def pepxml_file():
"""Get the pepxml file"""
pepxml = Path("data", "msfragger.pepXML")
return pepxml


def test_basic_cli(tmp_path, scope_files):
"""Test that basic cli works."""
cmd = ["mokapot", scope_files[0], "--dest_dir", tmp_path]
subprocess.run(cmd, check=True)
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert Path(tmp_path, "mokapot.peptides.txt").exists()
assert Path(tmp_path, "targets.psms").exists()
assert Path(tmp_path, "targets.peptides").exists()


def test_cli_options(tmp_path, scope_files):
Expand Down Expand Up @@ -77,28 +69,16 @@ def test_cli_options(tmp_path, scope_files):
subprocess.run(cmd, check=True)
file_bases = [f.name.split(".")[0] for f in scope_files[0:2]]

assert Path(tmp_path, f"blah.{file_bases[0]}.mokapot.psms.txt").exists()
assert Path(
tmp_path, f"blah.{file_bases[0]}.mokapot.peptides.txt"
).exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.mokapot.psms.txt").exists()
assert Path(
tmp_path, f"blah.{file_bases[1]}.mokapot.peptides.txt"
).exists()
assert Path(tmp_path, f"blah.{file_bases[0]}.targets.psms").exists()
assert Path(tmp_path, f"blah.{file_bases[0]}.targets.peptides").exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.targets.psms").exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.targets.peptides").exists()

# Test keep_decoys:
assert Path(
tmp_path, f"blah.{file_bases[0]}.mokapot.decoy.psms.txt"
).exists()
assert Path(
tmp_path, f"blah.{file_bases[0]}.mokapot.decoy.peptides.txt"
).exists()
assert Path(
tmp_path, f"blah.{file_bases[1]}.mokapot.decoy.psms.txt"
).exists()
assert Path(
tmp_path, f"blah.{file_bases[1]}.mokapot.decoy.peptides.txt"
).exists()
assert Path(tmp_path, f"blah.{file_bases[0]}.decoys.psms").exists()
assert Path(tmp_path, f"blah.{file_bases[0]}.decoys.peptides").exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.decoys.psms").exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.decoys.peptides").exists()


def test_cli_aggregate(tmp_path, scope_files):
Expand All @@ -117,16 +97,16 @@ def test_cli_aggregate(tmp_path, scope_files):
]

subprocess.run(cmd, check=True)
assert Path(tmp_path, "blah.mokapot.psms.txt").exists()
assert Path(tmp_path, "blah.mokapot.peptides.txt").exists()
assert not Path(tmp_path, "blah.mokapot.decoy.psms.txt").exists()
assert not Path(tmp_path, "blah.mokapot.decoy.peptides.txt").exists()
assert Path(tmp_path, "blah.targets.psms").exists()
assert Path(tmp_path, "blah.targets.peptides").exists()
assert not Path(tmp_path, "blah.targets.decoy.psms").exists()
assert not Path(tmp_path, "blah.targets.decoy.peptides").exists()

# Test that decoys are also in the output when --keep_decoys is used
cmd += ["--keep_decoys"]
subprocess.run(cmd, check=True)
assert Path(tmp_path, "blah.mokapot.decoy.psms.txt").exists()
assert Path(tmp_path, "blah.mokapot.decoy.peptides.txt").exists()
assert Path(tmp_path, "blah.decoys.psms").exists()
assert Path(tmp_path, "blah.decoys.peptides").exists()


def test_cli_fasta(tmp_path, phospho_files):
Expand All @@ -143,39 +123,9 @@ def test_cli_fasta(tmp_path, phospho_files):
]

subprocess.run(cmd, check=True)
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert Path(tmp_path, "mokapot.peptides.txt").exists()
assert Path(tmp_path, "mokapot.proteins.txt").exists()


def test_cli_pepxml(tmp_path, pepxml_file):
"""Test that finding the correct parser works"""
cmd = [
"mokapot",
pepxml_file,
"--dest_dir",
tmp_path,
"--max_iter",
"1",
"--decoy_prefix",
"rev_",
]

subprocess.run(cmd, check=True)
unbinned_file = Path(tmp_path, "mokapot.peptides.txt")
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert unbinned_file.exists()

cmd += ["--open_modification_bin_size", "0.01", "--file_root", "binned"]
subprocess.run(cmd, check=True)
binned_file = Path(tmp_path, "binned.mokapot.peptides.txt")
assert Path(tmp_path, "binned.mokapot.psms.txt").exists()
assert binned_file.exists()

# If binning was successful, there should be more distinct peptides:
unbinned = pd.read_csv(unbinned_file, sep="\t")
binned = pd.read_csv(binned_file, sep="\t")
assert len(binned) > len(unbinned)
assert Path(tmp_path, "targets.psms").exists()
assert Path(tmp_path, "targets.peptides").exists()
assert Path(tmp_path, "targets.proteins").exists()


def test_cli_saved_models(tmp_path, phospho_files):
Expand All @@ -193,8 +143,8 @@ def test_cli_saved_models(tmp_path, phospho_files):

cmd += ["--load_models", *list(Path(tmp_path).glob("*.pkl"))]
subprocess.run(cmd, check=True)
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert Path(tmp_path, "mokapot.peptides.txt").exists()
assert Path(tmp_path, "targets.psms").exists()
assert Path(tmp_path, "targets.peptides").exists()


def test_cli_plugins(tmp_path, phospho_files):
Expand Down

0 comments on commit 2e1723e

Please sign in to comment.