From c68231a18eb513fe0cab6d46d2ef4a60ec31af6f Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Sun, 24 Aug 2025 13:50:15 -0700 Subject: [PATCH 1/2] Add ntasks for older slurm versions. Without this flag, on slurm 23.11.10 we get this error: ``` sbatch: error: Failed to validate job spec. --gpus-per-task or --tres-per-task used without either --gpus or -n/--ntasks is not allowed. sbatch: error: Invalid generic resource (gres) specification ``` --- torchx/schedulers/slurm_scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchx/schedulers/slurm_scheduler.py b/torchx/schedulers/slurm_scheduler.py index f74dd2448..fde0fbf96 100644 --- a/torchx/schedulers/slurm_scheduler.py +++ b/torchx/schedulers/slurm_scheduler.py @@ -210,6 +210,7 @@ def from_role( sbatch_opts.setdefault("gpus-per-node", str(resource.gpu)) else: sbatch_opts.setdefault("gpus-per-task", str(resource.gpu)) + sbatch_opts.setdefault("ntasks", "1") srun_opts = { "output": f"slurm-{macros.app_id}-{name}.out", From 60e80d8ff1fd67dd6df57d53483cd69152320722 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Sun, 24 Aug 2025 15:26:02 -0700 Subject: [PATCH 2/2] Update slurm scheduler tests to expect --ntasks=1 --- torchx/schedulers/test/slurm_scheduler_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchx/schedulers/test/slurm_scheduler_test.py b/torchx/schedulers/test/slurm_scheduler_test.py index 480f02bc8..ef7f3383e 100644 --- a/torchx/schedulers/test/slurm_scheduler_test.py +++ b/torchx/schedulers/test/slurm_scheduler_test.py @@ -128,6 +128,7 @@ def test_replica_request(self, mock_version: MagicMock) -> None: "--cpus-per-task=2", "--mem=10", "--gpus-per-task=3", + "--ntasks=1", ], ) self.assertEqual( @@ -163,6 +164,7 @@ def test_replica_request_nomem(self, mock_version: MagicMock) -> None: "--ntasks-per-node=1", "--cpus-per-task=2", "--gpus-per-task=3", + "--ntasks=1", ], )