From a86359da9e1c0e4ee16beb7d7ead3df1fba7577b Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 10 Sep 2021 20:19:12 +0800 Subject: [PATCH 1/2] Runner: Code Revision - runner launches a new process for each model in xxx_models benchmark --- superbench/runner/runner.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 7db3fa952..42ad6409e 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -128,7 +128,7 @@ def __get_mode_command(self, benchmark_name, mode): '--use_env --no_python --nproc_per_node={proc_num} ' '--nnodes={node_num} --node_rank=$NODE_RANK ' '--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' - '{command} {torch_distributed_suffix}' + '{command} {torch_distributed_suffix} {model_name}' ).format( proc_num=mode.proc_num, node_num=1 if mode.node_num == 1 else '$NNODES', @@ -137,6 +137,8 @@ def __get_mode_command(self, benchmark_name, mode): 'superbench.benchmarks.{name}.parameters.distributed_impl=ddp ' 'superbench.benchmarks.{name}.parameters.distributed_backend=nccl' ).format(name=benchmark_name), + model_name=('superbench.benchmarks.{name}.models=[{model}]' + ).format(name=benchmark_name, model=mode.model_name) ) elif mode.name == 'mpi': mode_command = ( @@ -341,7 +343,10 @@ def run(self): 'proc_rank': proc_rank }) for proc_rank in range(mode.proc_num) ) - elif mode.name == 'torch.distributed' or mode.name == 'mpi': + elif mode.name == 'torch.distributed': + for m in self._sb_benchmarks[benchmark_name].models: + self._run_proc(benchmark_name, mode, {'proc_rank': 0, 'model_name': m}) + elif mode.name == 'mpi': self._run_proc(benchmark_name, mode, {'proc_rank': 0}) else: logger.warning('Unknown mode %s.', mode.name) From 8a670bd83c21987e55e2aa89ab4691f43d423fe5 Mon Sep 17 00:00:00 2001 From: yukirora Date: Sat, 11 Sep 2021 12:55:11 +0800 Subject: [PATCH 2/2] add torch.distributed.barrier() before destroy_process_group() --- 
superbench/benchmarks/model_benchmarks/pytorch_base.py | 1 + superbench/runner/runner.py | 9 ++------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index d99218460..0497dd750 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -174,6 +174,7 @@ def _postprocess(self): try: if self._args.distributed_impl == DistributedImpl.DDP: + torch.distributed.barrier() torch.distributed.destroy_process_group() except BaseException as e: self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 42ad6409e..7db3fa952 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -128,7 +128,7 @@ def __get_mode_command(self, benchmark_name, mode): '--use_env --no_python --nproc_per_node={proc_num} ' '--nnodes={node_num} --node_rank=$NODE_RANK ' '--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' - '{command} {torch_distributed_suffix} {model_name}' + '{command} {torch_distributed_suffix}' ).format( proc_num=mode.proc_num, node_num=1 if mode.node_num == 1 else '$NNODES', @@ -137,8 +137,6 @@ def __get_mode_command(self, benchmark_name, mode): 'superbench.benchmarks.{name}.parameters.distributed_impl=ddp ' 'superbench.benchmarks.{name}.parameters.distributed_backend=nccl' ).format(name=benchmark_name), - model_name=('superbench.benchmarks.{name}.models=[{model}]' - ).format(name=benchmark_name, model=mode.model_name) ) elif mode.name == 'mpi': mode_command = ( @@ -343,10 +341,7 @@ def run(self): 'proc_rank': proc_rank }) for proc_rank in range(mode.proc_num) ) - elif mode.name == 'torch.distributed': - for m in self._sb_benchmarks[benchmark_name].models: - self._run_proc(benchmark_name, mode, {'proc_rank': 0, 'model_name': m}) - elif mode.name == 'mpi': + elif 
mode.name == 'torch.distributed' or mode.name == 'mpi': self._run_proc(benchmark_name, mode, {'proc_rank': 0}) else: logger.warning('Unknown mode %s.', mode.name)