Skip to content

Commit

Permalink
Improve select ClassAd expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
mxk62 committed Oct 27, 2021
1 parent 184596c commit fadf008
Showing 1 changed file with 24 additions and 16 deletions.
40 changes: 24 additions & 16 deletions python/lsst/ctrl/bps/wms/htcondor/htcondor_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1451,12 +1451,12 @@ def _wms_id_to_cluster(wms_id):
def _create_periodic_release_expr(memory, multiplier, limit):
"""Construct an HTCondorAd expression for releasing held jobs.
The expression instructs HTCondor to release any job from being held
providing it satisfies all conditions below:
The expression instructs HTCondor to release any job which was put on hold
due to exceeding memory requirements back to the job queue, provided it
satisfies all of the conditions below:
* it was put on hold due to exceeding memory requirements,
* number of run attempts did not reach allowable number of retries,
* the memory requirements in the failed run attempt did not reach
* the memory requirements in the last failed run attempt did not reach
the specified memory limit.
Parameters
Expand All @@ -1476,20 +1476,22 @@ def _create_periodic_release_expr(memory, multiplier, limit):
"""
is_retry_allowed = "NumJobStarts <= JobMaxRetries"
was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
was_mem_exceeded = "(HoldReasonCode == 34 || (HoldReasonCode == 3 && HoldReasonSubCode == 34))"
expr = f"JobStatus == 5 && {was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
was_mem_exceeded = "JobStatus == 5 " \
"&& (HoldReasonCode == 34 && HoldReasonSubCode == 0 " \
"|| HoldReasonCode == 3 && HoldReasonSubCode == 34)"
expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
return expr


def _create_periodic_remove_expr(memory, multiplier, limit):
"""Construct an HTCondorAd expression for removing jobs from the queue.
The expression instructs HTCondor to remove any job from the job queue
providing it satisfies all conditions below:
The expression instructs HTCondor to remove any job which was put on hold
due to exceeding memory requirements from the job queue, provided it
satisfies any of the conditions below:
* it was put on hold,
* allowable number of retries was reached,
* the memory requirements during the failed run attempt reached
* the memory requirements during the last failed run attempt reached
the specified memory limit.
Parameters
Expand All @@ -1510,9 +1512,10 @@ def _create_periodic_remove_expr(memory, multiplier, limit):
"""
is_retry_disallowed = "NumJobStarts > JobMaxRetries"
was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
was_mem_exceeded = "" \
"(HoldReasonCode == 34 || (HoldReasonCode == 3 && HoldReasonSubCode == 34))"
expr = f"JobStatus == 5 && {was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
was_mem_exceeded = "JobStatus == 5 " \
"&& (HoldReasonCode == 34 && HoldReasonSubCode == 0 " \
"|| HoldReasonCode == 3 && HoldReasonSubCode == 34)"
expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
return expr


Expand All @@ -1534,9 +1537,14 @@ def _create_request_memory_expr(memory, multiplier, limit):
A string representing an HTCondor ClassAd expression enabling safe
memory scaling between job retries.
"""
# ClassAds 'Last*' are UNDEFINED when a job is put in the job queue.
# The special comparison operators ensure that all comparisons below will
# evaluate to FALSE in this case.
# The check whether the job was held due to exceeding memory requirements
# is made *after* the job has been released back to the job queue (i.e. when
# it is in the IDLE state), hence the need to use `Last*` job ClassAds instead
# of the ones describing the job's current state.
#
# Also, 'Last*' job ClassAds are UNDEFINED when a job is initially
# put in the job queue. The special comparison operator (`=?=`) ensures that
# all comparisons below will evaluate to FALSE in this case.
was_mem_exceeded = "LastJobStatus =?= 5 " \
"&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 " \
"|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
Expand Down

0 comments on commit fadf008

Please sign in to comment.