Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-41543: Adds Reporting Exit Code Information to bps report #157

Merged
merged 1 commit into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/changes/DM-41543.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Introduced the ``--return-exit-codes`` flag to bps report, which provides a summary of exit code counts and exit codes for non-payload errors. This currently only works for PanDA.
49 changes: 48 additions & 1 deletion python/lsst/ctrl/bps/bps_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"""Classes and functions used in reporting run status.
"""

__all__ = ["BaseRunReport", "DetailedRunReport", "SummaryRunReport"]
__all__ = ["BaseRunReport", "DetailedRunReport", "SummaryRunReport", "ExitCodesReport"]

import abc
import logging
Expand Down Expand Up @@ -232,6 +232,53 @@ def __str__(self):
return str("\n".join(lines))


class ExitCodesReport(BaseRunReport):
"""An extension of run report to give information about
error handling from the wms service.
"""

def add(self, run_report, use_global_id=False):
# Docstring inherited from the base class.

# get labels from things and exit codes

labels = []
if run_report.run_summary:
for part in run_report.run_summary.split(";"):
label, _ = part.split(":")
labels.append(label)
else:
id_ = run_report.global_wms_id if use_global_id else run_report.wms_id
self._msg = f"WARNING: Job summary for run '{id_}' not available, report maybe incomplete."
return
exit_code_summary = run_report.exit_code_summary
for label in labels:
exit_codes = exit_code_summary[label]
if exit_codes:
# payload errors always return 1 on failure
pipe_error_count = sum([code for code in exit_codes if code == 1])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it would be worthwhile adding a comment that we're taking advantage of the fact that payloads always return 1 on failure here (right?).

infra_codes = [code for code in exit_codes if code != 0 and code != 1]
if infra_codes:
infra_error_count = len(infra_codes)
str_infra_codes = [str(code) for code in infra_codes]
infra_error_codes = ", ".join(sorted(set(str_infra_codes)))
else:
infra_error_count = 0
infra_error_codes = "None"
else:
pipe_error_count = 0
infra_error_codes = "None"
infra_error_count = 0
run = [label]
run.extend([pipe_error_count, infra_error_count, infra_error_codes])
self._table.add_row(run)

def __str__(self):
alignments = ["<"] + [">"] * (len(self._table.colnames) - 1)
lines = list(self._table.pformat_all(align=alignments))
return str("\n".join(lines))


def compile_job_summary(jobs):
"""Compile job summary from information available for individual jobs.

Expand Down
7 changes: 7 additions & 0 deletions python/lsst/ctrl/bps/cli/cmd/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ def restart(*args, **kwargs):
@click.option("--user", help="Restrict report to specific user.")
@click.option("--hist", "hist_days", default=0.0, help="Search WMS history X days for completed info.")
@click.option("--pass-thru", help="Pass the given string to the WMS service class.")
@click.option(
"--return-exit-codes",
is_flag=True,
show_default=True,
default=False,
help="Return exit codes from jobs with a non-success status.",
)
@click.option(
"--global/--no-global",
"is_global",
Expand Down
19 changes: 17 additions & 2 deletions python/lsst/ctrl/bps/drivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ def restart_driver(wms_service, run_id):
print("Restart failed: Unknown error")


def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False, return_exit_codes=False):
"""Print out summary of jobs submitted for execution.

Parameters
Expand All @@ -481,11 +481,26 @@ def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=Fal

Only applicable in the context of a WMS using distributed job queues
(e.g., HTCondor).
return_exit_codes : `bool`, optional
If set, return exit codes related to jobs with a
non-success status. Defaults to False, which means that only
the summary state is returned.

Only applicable in the context of a WMS with associated
handlers to return exit codes from jobs.
"""
if wms_service is None:
default_config = BpsConfig(BPS_DEFAULTS)
wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])
report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
report(
wms_service,
run_id,
user,
hist_days,
pass_thru,
is_global=is_global,
return_exit_codes=return_exit_codes,
)


def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
Expand Down
25 changes: 23 additions & 2 deletions python/lsst/ctrl/bps/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@

from lsst.utils import doImport

from .bps_reports import DetailedRunReport, SummaryRunReport
from .bps_reports import DetailedRunReport, ExitCodesReport, SummaryRunReport
from .wms_service import WmsStates

_LOG = logging.getLogger(__name__)


def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False, return_exit_codes=False):
"""Print out summary of jobs submitted for execution.

Parameters
Expand All @@ -63,6 +63,13 @@ def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False):

Only applicable in the context of a WMS using distributed job queues
(e.g., HTCondor).
return_exit_codes : `bool`, optional
If set, return exit codes related to jobs with a
non-success status. Defaults to False, which means that only
the summary state is returned.

Only applicable in the context of a WMS with associated
handlers to return exit codes from jobs.
"""
wms_service_class = doImport(wms_service)
wms_service = wms_service_class({})
Expand All @@ -87,6 +94,7 @@ def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
("RUN", "S"),
]
)

if run_id:
fields = [(" ", "S")] + [(state.name, "i") for state in WmsStates] + [("EXPECTED", "i")]
run_report = DetailedRunReport(fields)
Expand All @@ -103,6 +111,19 @@ def report(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
print("\n")
print(run_report)

if return_exit_codes:
fields = [
(" ", "S"),
("PAYLOAD ERROR COUNT", "i"),
("INFRASTRUCTURE ERROR COUNT", "i"),
("INFRASTRUCTURE ERROR CODES", "S"),
]
run_exits_report = ExitCodesReport(fields)
run_exits_report.add(run, use_global_id=is_global)
print("\n")
print(run_exits_report)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add print("\n") before this line so the job report and the exit code report are separated by a blank line if the latter is displayed?

run_exits_report.clear()

run_brief.clear()
run_report.clear()
if not runs and not message:
Expand Down
23 changes: 21 additions & 2 deletions python/lsst/ctrl/bps/wms_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,11 @@ class WmsRunReport:
"""Job counts per state."""

job_summary: dict[str, dict[WmsStates, int]] = None
"""Job counts per label and per state.
"""Job counts per label and per state."""

exit_code_summary: dict[list] = None
"""Summary of non-zero exit codes per job label
available through the WMS.
"""


Expand Down Expand Up @@ -252,7 +256,15 @@ def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thr
"""
raise NotImplementedError

def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False):
def report(
self,
wms_workflow_id=None,
user=None,
hist=0,
pass_thru=None,
is_global=False,
return_exit_codes=False,
):
"""Query WMS for status of submitted WMS workflows.

Parameters
Expand All @@ -273,6 +285,13 @@ def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_glo
Only applicable in the context of a WMS using distributed job
queues (e.g., HTCondor). A WMS with a centralized job queue
(e.g. PanDA) can safely ignore it.
return_exit_codes : `bool`, optional
If set, return exit codes related to jobs with a
non-success status. Defaults to False, which means that only
the summary state is returned.

Only applicable in the context of a WMS with associated
handlers to return exit codes from jobs.

Returns
-------
Expand Down
71 changes: 71 additions & 0 deletions tests/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from lsst.ctrl.bps import (
BaseRunReport,
DetailedRunReport,
ExitCodesReport,
SummaryRunReport,
WmsJobReport,
WmsRunReport,
Expand Down Expand Up @@ -287,5 +288,75 @@ def testAddWithoutRunSummary(self):
self.assertEqual(self.actual, expected)


class ExitCodesReportTestCase(unittest.TestCase):
"""Test an exit code report."""

def setUp(self):
self.fields = [
(" ", "S"),
("PAYLOAD ERROR COUNT", "i"),
("INFRASTRUCTURE ERROR COUNT", "i"),
("INFRASTRUCTURE ERROR CODES", "S"),
]

table = Table(dtype=self.fields)
table.add_row(["foo"] + [0] + [0] + ["None"])
table.add_row(["bar"] + [1] + [3] + ["2, 3, 4"])
self.expected = ExitCodesReport.from_table(table)

self.run = WmsRunReport(
wms_id="1.0",
global_wms_id="foo#1.0",
path="/path/to/run",
label="label",
run="run",
project="dev",
campaign="testing",
payload="test",
operator="tester",
run_summary="foo:1;bar:1",
state=WmsStates.RUNNING,
jobs=[
WmsJobReport(wms_id="1.0", name="", label="foo", state=WmsStates.SUCCEEDED),
WmsJobReport(wms_id="2.0", name="", label="bar", state=WmsStates.RUNNING),
],
total_number_jobs=2,
job_state_counts={
state: 1 if state in {WmsStates.SUCCEEDED, WmsStates.RUNNING} else 0 for state in WmsStates
},
job_summary={
"foo": {state: 1 if state == WmsStates.SUCCEEDED else 0 for state in WmsStates},
"bar": {state: 1 if state == WmsStates.RUNNING else 0 for state in WmsStates},
},
exit_code_summary={
"foo": [0, 0, 0, 0],
"bar": [1, 2, 3, 4],
},
)

self.actual = ExitCodesReport(self.fields)

def testAddWithJobSummary(self):
"""Test adding a run with a job summary."""
self.run.jobs = None
self.actual.add(self.run)
print(self.actual)
print(self.expected)
self.assertEqual(self.actual, self.expected)

def testAddWithJobs(self):
"""Test adding a run with a job info, but not job summary."""
self.run.job_summary = None
self.actual.add(self.run)

self.assertEqual(self.actual, self.expected)

def testAddWithoutRunSummary(self):
"""Test adding a run without a run summary."""
self.run.run_summary = None
self.actual.add(self.run)
self.assertRegex(self.actual.message, r"^WARNING.*incomplete")


if __name__ == "__main__":
unittest.main()