Skip to content

Commit

Permalink
Merge pull request #86 from lsst/tickets/DM-32968
Browse files Browse the repository at this point in the history
DM-32968: Job put on hold if exit with signal 11
  • Loading branch information
mxk62 committed Jan 13, 2022
2 parents 8ca1b35 + a343525 commit faa8242
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-32968.misc.rst
@@ -0,0 +1 @@
Make HTCondor treat all jobs exiting with a signal as if they ran out of memory.
9 changes: 5 additions & 4 deletions python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py
Expand Up @@ -563,10 +563,11 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
"transfer_executable": "False",
"getenv": "True",

# Exceeding memory sometimes triggering SIGBUS error. Tell htcondor
# to put SIGBUS jobs on hold.
"on_exit_hold": "(ExitBySignal == true) && (ExitSignal == 7)",
"on_exit_hold_reason": '"Job raised a signal 7. Usually means job has gone over memory limit."',
# Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell
# HTCondor to put any job that exits by a signal on hold.
"on_exit_hold": "ExitBySignal == true",
"on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
'"Handling signal as if job has gone over memory limit.")',
"on_exit_hold_subcode": "34"
}

Expand Down

0 comments on commit faa8242

Please sign in to comment.