From 0b1fe312e8c98a814b1c419940f35253f58f958e Mon Sep 17 00:00:00 2001
From: Yaman Qalieh
Date: Sat, 18 Feb 2023 15:03:01 -0500
Subject: [PATCH] fix: batch collect jobs for scancel (#2114)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Description

When using --slurm, exiting snakemake by SIGINT or Ctrl-C helpfully cancels spawned jobs. However, this is quite unreliable: it often hangs for a few minutes and then exits without canceling the jobs (and without indicating that it failed).

Slurm documentation on [scancel](https://slurm.schedmd.com/scancel.html#SECTION_PERFORMANCE) notes that a large number of scancel calls at the same time may result in denial of service. Snakemake runs scancel on each job [individually](https://github.com/snakemake/snakemake/blob/main/snakemake/executors/slurm/slurm_submit.py#L136). Instead, job ids should be collected and cancelled all at once.

fixes #2113

### QC

* [X] The PR contains a test case for the changes or the changes are already covered by an existing test case.
* [X] The documentation (`docs/`) is updated to reflect the changes or this is not necessary (e.g. if the change modifies neither the language nor the behavior or functionality of Snakemake).

Co-authored-by: Johannes Köster
---
 snakemake/executors/slurm/slurm_submit.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/snakemake/executors/slurm/slurm_submit.py b/snakemake/executors/slurm/slurm_submit.py
index ddc4e4b09..b94bf0af0 100644
--- a/snakemake/executors/slurm/slurm_submit.py
+++ b/snakemake/executors/slurm/slurm_submit.py
@@ -134,22 +134,23 @@ def additional_general_args(self):
         return [" --slurm-jobstep", "--jobs 1"]
 
     def cancel(self):
-        for job in self.active_jobs:
-            jobid = job.jobid
+        # Jobs are collected to reduce load on slurmctld
+        jobids = " ".join([job.jobid for job in self.active_jobs])
+        if len(jobids) > 0:
             try:
                 # timeout set to 60, because a scheduler cycle usually is
                 # about 30 sec, but can be longer in extreme cases.
                 # Under 'normal' circumstances, 'scancel' is executed in
                 # virtually no time.
                 subprocess.check_output(
-                    f"scancel {jobid}",
+                    f"scancel {jobids}",
                     text=True,
                     shell=True,
                     timeout=60,
                     stderr=subprocess.PIPE,
                 )
             except subprocess.TimeoutExpired:
-                logger.warning(f"Unable to cancel job {jobid} within a minute.")
+                logger.warning(f"Unable to cancel jobs within a minute.")
         self.shutdown()
 
     def get_account_arg(self, job):