lsst · villarrealas · Dec 11, 2023 · Nov 15, 2023 · mxk62 · Dec 9, 2023
diff --git a/doc/changes/DM-41543.feature.rst b/doc/changes/DM-41543.feature.rst
@@ -0,0 +1 @@
+Added the ``--return-exit-codes`` flag to ``bps report``; it prints, per job label, the number of payload and infrastructure errors along with the infrastructure exit codes. Currently this is supported only for PanDA.
diff --git a/python/lsst/ctrl/bps/bps_reports.py b/python/lsst/ctrl/bps/bps_reports.py
@@ -28,7 +28,7 @@
 """Classes and functions used in reporting run status.
 """
 
-__all__ = ["BaseRunReport", "DetailedRunReport", "SummaryRunReport"]
+__all__ = ["BaseRunReport", "DetailedRunReport", "SummaryRunReport", "ExitCodesReport"]
 
 import abc
 import logging
@@ -232,6 +232,53 @@ def __str__(self):
         return str("\n".join(lines))
 
 
+class ExitCodesReport(BaseRunReport):
+    """A per-label summary of the exit codes reported by the WMS service.
+
+    Each row holds a job label, the number of payload failures (exit
+    code 1), the number of other non-zero exit codes, and the distinct
+    values of the latter.
+    """
+
+    def add(self, run_report, use_global_id=False):
+        # Docstring inherited from the base class.
+
+        id_ = run_report.global_wms_id if use_global_id else run_report.wms_id
+        if not run_report.run_summary:
+            self._msg = f"WARNING: Job summary for run '{id_}' not available, report maybe incomplete."
+            return
+
+        # The exit code summary defaults to None; warn instead of failing
+        # on the per-label lookups below.
+        exit_code_summary = run_report.exit_code_summary
+        if exit_code_summary is None:
+            self._msg = f"WARNING: Exit codes for run '{id_}' not available, report may be incomplete."
+            return
+
+        # Labels are taken, in order, from the ';'-separated 'label:...'
+        # entries of the run summary.
+        labels = []
+        for part in run_report.run_summary.split(";"):
+            label, _ = part.split(":")
+            labels.append(label)
+
+        for label in labels:
+            # A label without an entry is treated as having no exit codes.
+            exit_codes = exit_code_summary.get(label) or []
+            # Payload errors always return 1 on failure.
+            pipe_error_count = sum(1 for code in exit_codes if code == 1)
+            infra_codes = [code for code in exit_codes if code not in (0, 1)]
+            # Sort numerically before formatting so that 9 precedes 10.
+            unique_codes = sorted(set(infra_codes))
+            infra_error_codes = ", ".join(str(code) for code in unique_codes) or "None"
+            self._table.add_row([label, pipe_error_count, len(infra_codes), infra_error_codes])
+
+    def __str__(self):
+        alignments = ["<"] + [">"] * (len(self._table.colnames) - 1)
+        lines = list(self._table.pformat_all(align=alignments))
+        return str("\n".join(lines))
+
+
 def compile_job_summary(jobs):
     """Compile job summary from information available for individual jobs.
 

diff --git a/python/lsst/ctrl/bps/cli/cmd/commands.py b/python/lsst/ctrl/bps/cli/cmd/commands.py
@@ -106,6 +106,13 @@ def restart(*args, **kwargs):
 @click.option("--user", help="Restrict report to specific user.")
 @click.option("--hist", "hist_days", default=0.0, help="Search WMS history X days for completed info.")
 @click.option("--pass-thru", help="Pass the given string to the WMS service class.")
+@click.option(
+    "--return-exit-codes",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help="Return exit codes from jobs with a non-success status.",
+)
 @click.option(
     "--global/--no-global",
     "is_global",

diff --git a/python/lsst/ctrl/bps/drivers.py b/python/lsst/ctrl/bps/drivers.py
@@ -459,7 +459,7 @@ def restart_driver(wms_service, run_id):
             print("Restart failed: Unknown error")
 
 
-def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
+def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False, return_exit_codes=False):
     """Print out summary of jobs submitted for execution.
 
     Parameters
@@ -481,11 +481,26 @@ def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=Fal
 
         Only applicable in the context of a WMS using distributed job queues
         (e.g., HTCondor).
+    return_exit_codes : `bool`, optional
+        If set, return exit codes related to jobs with a
+        non-success status. Defaults to False, which means that only
+        the summary state is returned.
+
+        Only applicable in the context of a WMS with associated
+        handlers to return exit codes from jobs.
     """
     if wms_service is None:
         default_config = BpsConfig(BPS_DEFAULTS)
         wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
-    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
+    report(
+        wms_service,
+        run_id,
+        user,
+        hist_days,
+        pass_thru,
+        is_global=is_global,
+        return_exit_codes=return_exit_codes,
+    )
 
 
 def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):

diff --git a/python/lsst/ctrl/bps/report.py b/python/lsst/ctrl/bps/report.py
@@ -35,13 +35,13 @@
 
 from lsst.utils import doImport
 
-from .bps_reports import DetailedRunReport, SummaryRunReport
+from .bps_reports import DetailedRunReport, ExitCodesReport, SummaryRunReport
 from .wms_service import WmsStates
 
 _LOG = logging.getLogger(__name__)
 
 
-def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
+def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False, return_exit_codes=False):
     """Print out summary of jobs submitted for execution.
 
     Parameters
@@ -63,6 +63,13 @@ def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
 
         Only applicable in the context of a WMS using distributed job queues
         (e.g., HTCondor).
+    return_exit_codes : `bool`, optional
+        If set, return exit codes related to jobs with a
+        non-success status. Defaults to False, which means that only
+        the summary state is returned.
+
+        Only applicable in the context of a WMS with associated
+        handlers to return exit codes from jobs.
     """
     wms_service_class = doImport(wms_service)
     wms_service = wms_service_class({})
@@ -87,6 +94,7 @@ def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
             ("RUN", "S"),
         ]
     )
+
     if run_id:
         fields = [(" ", "S")] + [(state.name, "i") for state in WmsStates] + [("EXPECTED", "i")]
         run_report = DetailedRunReport(fields)
@@ -103,6 +111,19 @@ def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
             print("\n")
             print(run_report)
 
+            if return_exit_codes:
+                fields = [
+                    (" ", "S"),
+                    ("PAYLOAD ERROR COUNT", "i"),
+                    ("INFRASTRUCTURE ERROR COUNT", "i"),
+                    ("INFRASTRUCTURE ERROR CODES", "S"),
+                ]
+                run_exits_report = ExitCodesReport(fields)
+                run_exits_report.add(run, use_global_id=is_global)
+                print("\n")
+                print(run_exits_report)
+                run_exits_report.clear()
+
             run_brief.clear()
             run_report.clear()
         if not runs and not message:

diff --git a/python/lsst/ctrl/bps/wms_service.py b/python/lsst/ctrl/bps/wms_service.py
@@ -151,7 +151,11 @@ class WmsRunReport:
     """Job counts per state."""
 
     job_summary: dict[str, dict[WmsStates, int]] = None
-    """Job counts per label and per state.
+    """Job counts per label and per state."""
+
+    exit_code_summary: dict[list] = None
+    """Summary of non-zero exit codes per job label
+    available through the WMS.
     """
 
 
@@ -252,7 +256,15 @@ def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thr
         """
         raise NotImplementedError
 
-    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
+    def report(
+        self,
+        wms_workflow_id=None,
+        user=None,
+        hist=0,
+        pass_thru=None,
+        is_global=False,
+        return_exit_codes=False,
+    ):
         """Query WMS for status of submitted WMS workflows.
 
         Parameters
@@ -273,6 +285,13 @@ def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_glo
             Only applicable in the context of a WMS using distributed job
             queues (e.g., HTCondor). A WMS with a centralized job queue
             (e.g. PanDA) can safely ignore it.
+        return_exit_codes : `bool`, optional
+            If set, return exit codes related to jobs with a
+            non-success status. Defaults to False, which means that only
+            the summary state is returned.
+
+            Only applicable in the context of a WMS with associated
+            handlers to return exit codes from jobs.
 
         Returns
         -------

diff --git a/tests/test_report.py b/tests/test_report.py
@@ -34,6 +34,7 @@
 from lsst.ctrl.bps import (
     BaseRunReport,
     DetailedRunReport,
+    ExitCodesReport,
     SummaryRunReport,
     WmsJobReport,
     WmsRunReport,
@@ -287,5 +288,75 @@ def testAddWithoutRunSummary(self):
         self.assertEqual(self.actual, expected)
 
 
+class ExitCodesReportTestCase(unittest.TestCase):
+    """Test an exit code report."""
+
+    def setUp(self):
+        self.fields = [
+            (" ", "S"),
+            ("PAYLOAD ERROR COUNT", "i"),
+            ("INFRASTRUCTURE ERROR COUNT", "i"),
+            ("INFRASTRUCTURE ERROR CODES", "S"),
+        ]
+
+        table = Table(dtype=self.fields)
+        # 'foo' has only zero exit codes; 'bar' has one payload error (1)
+        # and three infrastructure errors (2, 3, 4).
+        table.add_row(["foo"] + [0] + [0] + ["None"])
+        table.add_row(["bar"] + [1] + [3] + ["2, 3, 4"])
+        self.expected = ExitCodesReport.from_table(table)
+
+        self.run = WmsRunReport(
+            wms_id="1.0",
+            global_wms_id="foo#1.0",
+            path="/path/to/run",
+            label="label",
+            run="run",
+            project="dev",
+            campaign="testing",
+            payload="test",
+            operator="tester",
+            run_summary="foo:1;bar:1",
+            state=WmsStates.RUNNING,
+            jobs=[
+                WmsJobReport(wms_id="1.0", name="", label="foo", state=WmsStates.SUCCEEDED),
+                WmsJobReport(wms_id="2.0", name="", label="bar", state=WmsStates.RUNNING),
+            ],
+            total_number_jobs=2,
+            job_state_counts={
+                state: 1 if state in {WmsStates.SUCCEEDED, WmsStates.RUNNING} else 0 for state in WmsStates
+            },
+            job_summary={
+                "foo": {state: 1 if state == WmsStates.SUCCEEDED else 0 for state in WmsStates},
+                "bar": {state: 1 if state == WmsStates.RUNNING else 0 for state in WmsStates},
+            },
+            exit_code_summary={
+                "foo": [0, 0, 0, 0],
+                "bar": [1, 2, 3, 4],
+            },
+        )
+
+        self.actual = ExitCodesReport(self.fields)
+
+    def testAddWithJobSummary(self):
+        """Test adding a run with a job summary."""
+        self.run.jobs = None
+        self.actual.add(self.run)
+        self.assertEqual(self.actual, self.expected)
+
+    def testAddWithJobs(self):
+        """Test adding a run with a job info, but not job summary."""
+        self.run.job_summary = None
+        self.actual.add(self.run)
+
+        self.assertEqual(self.actual, self.expected)
+
+    def testAddWithoutRunSummary(self):
+        """Test adding a run without a run summary."""
+        self.run.run_summary = None
+        self.actual.add(self.run)
+        self.assertRegex(self.actual.message, r"^WARNING.*incomplete")
+
+
 if __name__ == "__main__":
     unittest.main()