Skip to content

Commit

Permalink
Fix cli tests: (#28)
Browse files Browse the repository at this point in the history
- Remove test_cli_pepxml because XML files don't work with streaming
- Replace old output file names
- Add random generator 'rng' variable to confidence since it is required for proteins
- Remove subset_max_train from PluginModel
- Fix bug: convert targets column after reading in chunks
- Fix peptide column name for confidence
- Fix test cli plugins: replace DecisionTreeClassifier with LinearSVC, because DecisionTreeClassifier returns scores only as 0 or 1
  • Loading branch information
sambenfredj authored and gessulat committed Feb 27, 2024
1 parent 8f417dd commit 2e1723e
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 84 deletions.
41 changes: 33 additions & 8 deletions mokapot/confidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ class GroupedConfidence:
----------
psms : OnDiskPsmDataset
A collection of PSMs.
rng : int or np.random.Generator, optional
A seed or generator used for cross-validation split creation and to
break ties, or ``None`` to use the default random number generator
state.
scores : np.ndarray
A vector containing the score of each PSM.
desc : bool
Expand Down Expand Up @@ -84,6 +88,7 @@ def __init__(
proteins=None,
combine=False,
prefixes=None,
rng=0,
):
"""Initialize a GroupedConfidence object"""
data = read_file(psms.filename, use_cols=list(psms.columns))
Expand Down Expand Up @@ -124,6 +129,7 @@ def __init__(
combine=combine,
prefixes=prefixes,
append_to_output_file=append_to_group,
rng=rng,
)
if combine:
append_to_group = True
Expand Down Expand Up @@ -236,11 +242,12 @@ class Confidence(object):
"peptide_pairs": "Peptide Pairs",
}

def __init__(self, psms, proteins=None):
def __init__(self, psms, proteins=None, rng=0):
"""Initialize a PsmConfidence object."""
self._score_column = "score"
self._target_column = psms.target_column
self._protein_column = "proteinIds"
self._rng = rng
self._group_column = psms.group_column
self._metadata_column = psms.metadata_columns

Expand Down Expand Up @@ -393,6 +400,10 @@ class LinearConfidence(Confidence):
----------
psms : OnDiskPsmDataset
A collection of PSMs.
rng : int or np.random.Generator, optional
A seed or generator used for cross-validation split creation and to
break ties, or ``None`` to use the default random number generator
state.
level_paths : List(Path)
Files with unique psms and unique peptides.
levels : List(str)
Expand Down Expand Up @@ -422,12 +433,12 @@ def __init__(
deduplication=True,
proteins=None,
sep="\t",
rng=0,
):
"""Initialize a a LinearPsmConfidence object"""
super().__init__(psms, proteins)
super().__init__(psms, proteins, rng)
self._target_column = psms.target_column
self._psm_columns = psms.spectrum_columns
self._peptide_column = psms.peptide_column
self._peptide_column = "peptide"
self._protein_column = "proteinIds"
self._eval_fdr = eval_fdr
self.deduplication = deduplication
Expand Down Expand Up @@ -510,7 +521,7 @@ def _assign_confidence(
self._peptide_column,
self._score_column,
self._proteins,
self.rng,
self._rng,
)
proteins_path = "proteins.csv"
proteins.to_csv(proteins_path, index=False, sep=sep)
Expand Down Expand Up @@ -640,8 +651,7 @@ def __init__(
"""Initialize a CrossLinkedConfidence object"""
super().__init__(psms)
self._target_column = psms.target_column
self._psm_columns = psms.spectrum_columns
self._peptide_column = psms.peptide_column
self._peptide_column = "peptide"

self._assign_confidence(
level_paths=level_paths,
Expand Down Expand Up @@ -711,13 +721,18 @@ def assign_confidence(
group_column=None,
combine=False,
append_to_output_file=False,
rng=0,
):
"""Assign confidence to PSMs peptides, and optionally, proteins.
Parameters
----------
psms : OnDiskPsmDataset
A collection of PSMs.
rng : int or np.random.Generator, optional
A seed or generator used for cross-validation split creation and to
break ties, or ``None`` to use the default random number generator
state.
scores : numpy.ndarray
The scores by which to rank the PSMs. The default, :code:`None`,
uses the feature that accepts the most PSMs at an FDR threshold of
Expand Down Expand Up @@ -766,6 +781,7 @@ def assign_confidence(
feat
].values
)

psms_path = "psms.csv"
peptides_path = "peptides.csv"
levels = ["psms"]
Expand Down Expand Up @@ -796,7 +812,10 @@ def assign_confidence(
if _psms.group_column is None:
out_files = []
for level in levels:
dest_dir_prefix = f"{dest_dir}/"
if str(dest_dir)[-1] == ".":
dest_dir_prefix = dest_dir
else:
dest_dir_prefix = f"{dest_dir}/"
if prefix is not None:
dest_dir_prefix = dest_dir_prefix + f"{prefix}."
if group_column is not None and not combine:
Expand Down Expand Up @@ -892,6 +911,7 @@ def assign_confidence(
decoys=decoys,
deduplication=deduplication,
proteins=proteins,
rng=rng,
)
if prefix is None:
append_to_output_file = True
Expand All @@ -908,12 +928,17 @@ def assign_confidence(
proteins=proteins,
combine=combine,
prefixes=[prefix],
rng=rng,
)


def save_sorted_metadata_chunks(
chunk_metadata, score_chunk, psms, deduplication, i, sep
):
chunk_metadata = convert_targets_column(
data=chunk_metadata.apply(pd.to_numeric, errors="ignore"),
target_column=psms.target_column,
)
chunk_metadata["score"] = score_chunk
chunk_metadata.sort_values(by="score", ascending=False, inplace=True)
if deduplication:
Expand Down
1 change: 1 addition & 0 deletions mokapot/mokapot.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def main():
deduplication=deduplication,
proteins=proteins,
prefixes=prefixes,
rng=config.seed,
)

if config.save_models:
Expand Down
8 changes: 6 additions & 2 deletions mokapot/parsers/pin.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def get_column_names_from_file(file):
return perc.readline().rstrip().split("\t")


def get_rows_from_dataframe(idx, chunk, train_psms):
def get_rows_from_dataframe(idx, chunk, train_psms, psms):
"""
extract rows from a chunk of a dataframe
Expand All @@ -311,6 +311,10 @@ def get_rows_from_dataframe(idx, chunk, train_psms):
List
list of list of dataframes
"""
chunk = convert_targets_column(
data=chunk.apply(pd.to_numeric, errors="ignore"),
target_column=psms.target_column,
)
for k, train in enumerate(idx):
idx_ = list(set(train) & set(chunk.index))
train_psms[k].append(
Expand Down Expand Up @@ -345,7 +349,7 @@ def parse_in_chunks(psms, train_idx, chunk_size):
use_cols=_psms.columns,
)
Parallel(n_jobs=-1, require="sharedmem")(
delayed(get_rows_from_dataframe)(idx, chunk, train_psms)
delayed(get_rows_from_dataframe)(idx, chunk, train_psms, _psms)
for chunk in reader
)
return Parallel(n_jobs=-1, require="sharedmem")(
Expand Down
5 changes: 2 additions & 3 deletions tests/system_tests/sample_plugin/mokapot_ctree/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from argparse import _ArgumentGroup

from sklearn import tree
from sklearn import svm

from mokapot.model import Model
from mokapot.plugins import BasePlugin
Expand All @@ -12,7 +12,7 @@
class PluginModel(Model):
def __init__(self, *args, **kwargs):
LOGGER.warning("The ctree model is not production ready")
clf = tree.DecisionTreeClassifier()
clf = svm.LinearSVC()
super().__init__(clf, *args, **kwargs)


Expand All @@ -30,7 +30,6 @@ def get_model(self, config):
max_iter=config.max_iter,
direction=config.direction,
override=config.override,
subset_max_train=config.subset_max_train,
)

def process_data(self, data, config):
Expand Down
92 changes: 21 additions & 71 deletions tests/system_tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import subprocess
from pathlib import Path

import pandas as pd
import pytest

# Warnings are errors for these tests
Expand All @@ -28,19 +27,12 @@ def phospho_files():
return pin, fasta


@pytest.fixture
def pepxml_file():
"""Get the pepxml file"""
pepxml = Path("data", "msfragger.pepXML")
return pepxml


def test_basic_cli(tmp_path, scope_files):
"""Test that basic cli works."""
cmd = ["mokapot", scope_files[0], "--dest_dir", tmp_path]
subprocess.run(cmd, check=True)
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert Path(tmp_path, "mokapot.peptides.txt").exists()
assert Path(tmp_path, "targets.psms").exists()
assert Path(tmp_path, "targets.peptides").exists()


def test_cli_options(tmp_path, scope_files):
Expand Down Expand Up @@ -77,28 +69,16 @@ def test_cli_options(tmp_path, scope_files):
subprocess.run(cmd, check=True)
file_bases = [f.name.split(".")[0] for f in scope_files[0:2]]

assert Path(tmp_path, f"blah.{file_bases[0]}.mokapot.psms.txt").exists()
assert Path(
tmp_path, f"blah.{file_bases[0]}.mokapot.peptides.txt"
).exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.mokapot.psms.txt").exists()
assert Path(
tmp_path, f"blah.{file_bases[1]}.mokapot.peptides.txt"
).exists()
assert Path(tmp_path, f"blah.{file_bases[0]}.targets.psms").exists()
assert Path(tmp_path, f"blah.{file_bases[0]}.targets.peptides").exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.targets.psms").exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.targets.peptides").exists()

# Test keep_decoys:
assert Path(
tmp_path, f"blah.{file_bases[0]}.mokapot.decoy.psms.txt"
).exists()
assert Path(
tmp_path, f"blah.{file_bases[0]}.mokapot.decoy.peptides.txt"
).exists()
assert Path(
tmp_path, f"blah.{file_bases[1]}.mokapot.decoy.psms.txt"
).exists()
assert Path(
tmp_path, f"blah.{file_bases[1]}.mokapot.decoy.peptides.txt"
).exists()
assert Path(tmp_path, f"blah.{file_bases[0]}.decoys.psms").exists()
assert Path(tmp_path, f"blah.{file_bases[0]}.decoys.peptides").exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.decoys.psms").exists()
assert Path(tmp_path, f"blah.{file_bases[1]}.decoys.peptides").exists()


def test_cli_aggregate(tmp_path, scope_files):
Expand All @@ -117,16 +97,16 @@ def test_cli_aggregate(tmp_path, scope_files):
]

subprocess.run(cmd, check=True)
assert Path(tmp_path, "blah.mokapot.psms.txt").exists()
assert Path(tmp_path, "blah.mokapot.peptides.txt").exists()
assert not Path(tmp_path, "blah.mokapot.decoy.psms.txt").exists()
assert not Path(tmp_path, "blah.mokapot.decoy.peptides.txt").exists()
assert Path(tmp_path, "blah.targets.psms").exists()
assert Path(tmp_path, "blah.targets.peptides").exists()
assert not Path(tmp_path, "blah.targets.decoy.psms").exists()
assert not Path(tmp_path, "blah.targets.decoy.peptides").exists()

# Test that decoys are also in the output when --keep_decoys is used
cmd += ["--keep_decoys"]
subprocess.run(cmd, check=True)
assert Path(tmp_path, "blah.mokapot.decoy.psms.txt").exists()
assert Path(tmp_path, "blah.mokapot.decoy.peptides.txt").exists()
assert Path(tmp_path, "blah.decoys.psms").exists()
assert Path(tmp_path, "blah.decoys.peptides").exists()


def test_cli_fasta(tmp_path, phospho_files):
Expand All @@ -143,39 +123,9 @@ def test_cli_fasta(tmp_path, phospho_files):
]

subprocess.run(cmd, check=True)
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert Path(tmp_path, "mokapot.peptides.txt").exists()
assert Path(tmp_path, "mokapot.proteins.txt").exists()


def test_cli_pepxml(tmp_path, pepxml_file):
"""Test that finding the correct parser works"""
cmd = [
"mokapot",
pepxml_file,
"--dest_dir",
tmp_path,
"--max_iter",
"1",
"--decoy_prefix",
"rev_",
]

subprocess.run(cmd, check=True)
unbinned_file = Path(tmp_path, "mokapot.peptides.txt")
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert unbinned_file.exists()

cmd += ["--open_modification_bin_size", "0.01", "--file_root", "binned"]
subprocess.run(cmd, check=True)
binned_file = Path(tmp_path, "binned.mokapot.peptides.txt")
assert Path(tmp_path, "binned.mokapot.psms.txt").exists()
assert binned_file.exists()

# If binning was successful, there should be more distinct peptides:
unbinned = pd.read_csv(unbinned_file, sep="\t")
binned = pd.read_csv(binned_file, sep="\t")
assert len(binned) > len(unbinned)
assert Path(tmp_path, "targets.psms").exists()
assert Path(tmp_path, "targets.peptides").exists()
assert Path(tmp_path, "targets.proteins").exists()


def test_cli_saved_models(tmp_path, phospho_files):
Expand All @@ -193,8 +143,8 @@ def test_cli_saved_models(tmp_path, phospho_files):

cmd += ["--load_models", *list(Path(tmp_path).glob("*.pkl"))]
subprocess.run(cmd, check=True)
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert Path(tmp_path, "mokapot.peptides.txt").exists()
assert Path(tmp_path, "targets.psms").exists()
assert Path(tmp_path, "targets.peptides").exists()


def test_cli_plugins(tmp_path, phospho_files):
Expand Down

0 comments on commit 2e1723e

Please sign in to comment.