From e3266dab9ba545299a7af5eb312773bf7ae41f0f Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 17 Sep 2021 13:08:49 +0800 Subject: [PATCH] Bug - Fix torch.distributed command for single node (#201) Fix `torch.distributed` command for single node. --- superbench/runner/runner.py | 19 ++++++------------- tests/runner/test_runner.py | 2 -- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 7db3fa952..fcaf3f70b 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -123,20 +123,13 @@ def __get_mode_command(self, benchmark_name, mode): elif mode.name == 'torch.distributed': # TODO: replace with torch.distributed.run in v1.9 # TODO: only supports node_num=1 and node_num=all currently + torch_dist_params = '' if mode.node_num == 1 else \ + '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' mode_command = ( - 'python3 -m torch.distributed.launch ' - '--use_env --no_python --nproc_per_node={proc_num} ' - '--nnodes={node_num} --node_rank=$NODE_RANK ' - '--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' - '{command} {torch_distributed_suffix}' - ).format( - proc_num=mode.proc_num, - node_num=1 if mode.node_num == 1 else '$NNODES', - command=exec_command, - torch_distributed_suffix=( - 'superbench.benchmarks.{name}.parameters.distributed_impl=ddp ' - 'superbench.benchmarks.{name}.parameters.distributed_backend=nccl' - ).format(name=benchmark_name), + f'python3 -m torch.distributed.launch' + f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}' + f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp' + f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl' ) elif mode.name == 'mpi': mode_command = ( diff --git a/tests/runner/test_runner.py b/tests/runner/test_runner.py index 20bd267ee..220d6d0d8 100644 --- a/tests/runner/test_runner.py +++ b/tests/runner/test_runner.py @@ -116,8 +116,6 @@ def test_get_mode_command(self): 'expected_command': ( 'python3 -m torch.distributed.launch ' '--use_env --no_python --nproc_per_node=8 ' - '--nnodes=1 --node_rank=$NODE_RANK ' - '--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo ' 'superbench.benchmarks.foo.parameters.distributed_impl=ddp ' 'superbench.benchmarks.foo.parameters.distributed_backend=nccl'