From 90c637b726e09fe8f2e67075e5d53db5ff34b756 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Tue, 23 May 2023 02:19:21 +0000 Subject: [PATCH 01/11] add runner for sys info --- superbench/cli/_commands.py | 1 + superbench/cli/_handler.py | 41 +++++++++++++++++++++++++++++++++ superbench/cli/_node_handler.py | 13 ++++++++++- superbench/runner/runner.py | 18 +++++++++++++++ 4 files changed, 72 insertions(+), 1 deletion(-) diff --git a/superbench/cli/_commands.py b/superbench/cli/_commands.py index f37bc0f33..246db0b07 100644 --- a/superbench/cli/_commands.py +++ b/superbench/cli/_commands.py @@ -23,6 +23,7 @@ def load_command_table(self, args): g.command('deploy', 'deploy_command_handler') g.command('exec', 'exec_command_handler') g.command('run', 'run_command_handler') + g.command('run-info', 'run_info_command_handler') with CommandGroup(self, 'benchmark', 'superbench.cli._benchmark_handler#{}') as g: g.command('list', 'benchmark_list_command_handler') g.command('list-parameters', 'benchmark_list_params_command_handler') diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py index 3c2d1cbaa..6f46d4761 100644 --- a/superbench/cli/_handler.py +++ b/superbench/cli/_handler.py @@ -319,3 +319,44 @@ def run_command_handler( runner.run() if runner.get_failure_count() != 0: sys.exit(runner.get_failure_count()) + + +def run_info_command_handler( + docker_image='superbench/superbench', + docker_username=None, + docker_password=None, + no_image_pull=False, + host_file=None, + host_list=None, + host_username=None, + host_password=None, + output_dir=None, + private_key=None +): + """Get node hardware info. + + Args: + output_dir (str): Output directory. + + Returns: + dict: node info. + """ + + docker_config, ansible_config, sb_config, sb_output_dir = process_runner_arguments( + docker_image=docker_image, + docker_username=docker_username, + docker_password=docker_password, + no_docker=False, + no_image_pull=no_image_pull, + host_file=host_file, + host_list=host_list, + host_username=host_username, + host_password=host_password, + output_dir=output_dir, + private_key=private_key, + ) + + runner = SuperBenchRunner(sb_config, docker_config, ansible_config, sb_output_dir) + runner.run_sys_info() + if runner.get_failure_count() != 0: + sys.exit(runner.get_failure_count()) diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index 4a57b5b20..1ca28c1cd 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -3,17 +3,28 @@ """SuperBench CLI node subgroup command handler.""" +import json +from pathlib import Path + from superbench.tools import SystemInfo +from superbench.common.utils import create_sb_output_dir -def info_command_handler(): +def info_command_handler(output_dir=None): """Get node hardware info. + Args: + output_dir (str): Output directory. + Returns: dict: node info. """ try: + output_dir = create_sb_output_dir(output_dir) info = SystemInfo().get_all() + output_dir_path = Path(output_dir) + with open(output_dir_path / 'sys_info.json', 'w') as f: + json.dump(info, f) except Exception as ex: raise RuntimeError('Failed to get node info.') from ex return info diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 28b5c7186..c39cd57d2 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -197,6 +197,24 @@ def deploy(self): # pragma: no cover ) self._ansible_client.run(self._ansible_client.get_playbook_config('deploy.yaml', extravars=extravars)) + def run_sys_info(self): + """Run the SuperBench benchmarks distributedly.""" + self.check_env() + + logger.info('Runner is going to run node info.') + + fcmd = "docker exec sb-workspace bash -c '{command}'" + if self._docker_config.skip: + fcmd = "bash -c 'cd $SB_WORKSPACE && {command}'" + ansible_runner_config = self._ansible_client.get_shell_config( + fcmd.format(command='sb node info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) + ) + ansible_rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip)) + + if ansible_rc != 0: + self.cleanup() + self.fetch_results() + def check_env(self): # pragma: no cover """Check SuperBench environment.""" logger.info('Checking SuperBench environment.') From 2fba16cfbbaa45996e28a0e0bc35a24e3b9f2754 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Tue, 23 May 2023 03:01:25 +0000 Subject: [PATCH 02/11] update to remove info in stdout --- superbench/cli/_node_handler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index 1ca28c1cd..1fb9360d7 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -27,4 +27,3 @@ def info_command_handler(output_dir=None): json.dump(info, f) except Exception as ex: raise RuntimeError('Failed to get node info.') from ex - return info From 0861f30fee6bd6e8851b1e584e8b522289e38e29 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Tue, 23 May 2023 03:34:22 +0000 Subject: [PATCH 03/11] update docs --- docs/cli.md | 68 +++++++++++++++++++++++++++++ docs/user-tutorial/system-config.md | 18 +++++++- superbench/cli/_handler.py | 18 +++++--- superbench/cli/_help.py | 10 +++++ superbench/cli/_node_handler.py | 3 -- superbench/runner/runner.py | 2 +- 6 files changed, 109 insertions(+), 10 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index df1c1ca4d..5c56e5b97 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -165,6 +165,26 @@ Execute GPT2 model benchmark in default configuration: sb exec --config-override superbench.enable="['gpt2_models']" ``` +### `sb node info` +Get system info on the local node. + +```bash title="SB CLI" +sb node info [--output-dir] +``` + +#### Optional arguments + +| Name | Default | Description | +|----------------|---------|-----------------------------------------------------------------------------| +| `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. | + +#### Examples + +Get system info on the local node and save it into the `outputs` dir: +```bash title="SB CLI" +sb node info --output-dir outputs +``` + ### `sb result diagnosis` Filter the defective machines automatically from benchmarking results according to rules defined in rule file. @@ -335,6 +355,54 @@ sb run --no-docker --host-list localhost --config-override \ superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark ``` +### `sb run-info` + +```bash title="SB CLI" +sb run-info [--docker-image] + [--docker-password] + [--docker-username] + [--host-file] + [--host-list] + [--host-password] + [--host-username] + [--no-image-pull] + [--output-dir] + [--private-key] +``` + +#### Optional arguments + +| Name | Default | Description | +|-----------------------|-------------------------|-----------------------------------------------------------------------------------| +| `--docker-image` `-i` | `superbench/superbench` | Docker image URI, [here](./user-tutorial/container-images.mdx) listed all images. | +| `--docker-password` | `None` | Docker registry password if authentication is needed. | +| `--docker-username` | `None` | Docker registry username if authentication is needed. | +| `--host-file` `-f` | `None` | Path to Ansible inventory host file. | +| `--host-list` `-l` | `None` | Comma separated host list. | +| `--host-password` | `None` | Host password or key passphase if needed. | +| `--host-username` | `None` | Host username if needed. | +| `--no-image-pull` | `False` | Skip pull and use local Docker image. | +| `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. | +| `--private-key` | `None` | Path to private key if needed. | + +#### Global arguments + +| Name | Default | Description | +|---------------|---------|--------------------| +| `--help` `-h` | N/A | Show help message. | + +#### Examples + +Collect system info on local GPU node: +```bash title="SB CLI" +sb run-info --host-list localhost +``` + +Collect system info on all nodes in `./host.ini`: +```bash title="SB CLI" +sb run-info --host-file ./host.ini +``` + ### `sb version` Print the current SuperBench CLI version. diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index dbde728d3..0d74e047e 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -4,6 +4,8 @@ id: system-config # System Config Info +This tool is to collect the system information automatically on the tested GPU nodes including the following hardware categories: + - [System](#system) - [Memory](#memory) - [CPU](#cpu) @@ -12,7 +14,21 @@ id: system-config - [Accelerator](#accelerator) - [PCIe](#pcie) -## Parameter amd Details +## Usage + +1. [Install SuperBench](../getting-started/installation.mdx) on the local machine. + +2. Prepare the host file of the tested GPU nodes using [Ansible Inventory](../getting-started/configuration.md#ansible-inventory) on the local machine. + +3. After installing the Superbnech and the host file is ready, you can start to collect the sys info automatically using `sb run-info` command. The detailed command can be found from [SuperBench CLI](../cli.md). + + ``` + sb run-info -f host.ini --output-dir ${output-dir} + ``` + +4. After the command finished, you can find the output system info json file `sys-info.json` of each node under \${output_dir}/nodes/${node_name}. + +## Parameter and Details ### System diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py index 6f46d4761..107a803b5 100644 --- a/superbench/cli/_handler.py +++ b/superbench/cli/_handler.py @@ -333,15 +333,23 @@ def run_info_command_handler( output_dir=None, private_key=None ): - """Get node hardware info. + """Collect the system info on all given nodes. Args: - output_dir (str): Output directory. + docker_image (str, optional): Docker image URI. Defaults to superbench/superbench:latest. + docker_username (str, optional): Docker registry username if authentication is needed. Defaults to None. + docker_password (str, optional): Docker registry password if authentication is needed. Defaults to None. + no_image_pull (bool, optional): Skip pull and use local Docker image. Defaults to False. + host_file (str, optional): Path to Ansible inventory host file. Defaults to None. + host_list (str, optional): Comma separated host list. Defaults to None. + host_username (str, optional): Host username if needed. Defaults to None. + host_password (str, optional): Host password or key passphase if needed. Defaults to None. + output_dir (str, optional): Path to output directory. Defaults to None. + private_key (str, optional): Path to private key if needed. Defaults to None. - Returns: - dict: node info. + Raises: + CLIError: If input arguments are invalid. """ - docker_config, ansible_config, sb_config, sb_output_dir = process_runner_arguments( docker_image=docker_image, docker_username=docker_username, diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py index 2c7f507b2..3544f98ea 100644 --- a/superbench/cli/_help.py +++ b/superbench/cli/_help.py @@ -65,6 +65,16 @@ --config-override superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark """.format(cli_name=CLI_NAME) +helps['run-info'] = """ + type: command + short-summary: Collect the system info distributedly. + examples: + - name: Collect system info on local GPU node + text: {cli_name} run-info --host-list localhost + - name: Collect system info on all nodes in ./host.ini" + text: {cli_name} run-info --host-file ./host.ini +""".format(cli_name=CLI_NAME) + helps['benchmark'] = """ type: group short-summary: Commands to manage benchmarks. diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index 1fb9360d7..980ff435a 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -15,9 +15,6 @@ def info_command_handler(output_dir=None): Args: output_dir (str): Output directory. - - Returns: - dict: node info. """ try: output_dir = create_sb_output_dir(output_dir) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index c39cd57d2..29bf14f82 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -198,7 +198,7 @@ def deploy(self): # pragma: no cover self._ansible_client.run(self._ansible_client.get_playbook_config('deploy.yaml', extravars=extravars)) def run_sys_info(self): - """Run the SuperBench benchmarks distributedly.""" + """Run the system info on all nodes.""" self.check_env() logger.info('Runner is going to run node info.') From 6848e308926d8fa553513dab829274d16f88e51c Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 19 Jun 2023 06:08:37 +0000 Subject: [PATCH 04/11] merge local and remote into same command --- docs/cli.md | 28 ++++------------------------ docs/user-tutorial/system-config.md | 2 +- superbench/cli/_commands.py | 2 +- superbench/cli/_handler.py | 17 ++++++++++++++++- superbench/cli/_help.py | 6 +++--- superbench/cli/_node_handler.py | 11 ++--------- superbench/runner/runner.py | 2 +- 7 files changed, 28 insertions(+), 40 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 5c56e5b97..b7c43022a 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -165,26 +165,6 @@ Execute GPT2 model benchmark in default configuration: sb exec --config-override superbench.enable="['gpt2_models']" ``` -### `sb node info` -Get system info on the local node. - -```bash title="SB CLI" -sb node info [--output-dir] -``` - -#### Optional arguments - -| Name | Default | Description | -|----------------|---------|-----------------------------------------------------------------------------| -| `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. | - -#### Examples - -Get system info on the local node and save it into the `outputs` dir: -```bash title="SB CLI" -sb node info --output-dir outputs -``` - ### `sb result diagnosis` Filter the defective machines automatically from benchmarking results according to rules defined in rule file. @@ -355,10 +335,10 @@ sb run --no-docker --host-list localhost --config-override \ superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark ``` -### `sb run-info` +### `sb node-info` ```bash title="SB CLI" -sb run-info [--docker-image] +sb node-info [--docker-image] [--docker-password] [--docker-username] [--host-file] @@ -395,12 +375,12 @@ sb run-info [--docker-image] Collect system info on local GPU node: ```bash title="SB CLI" -sb run-info --host-list localhost +sb node-info ``` Collect system info on all nodes in `./host.ini`: ```bash title="SB CLI" -sb run-info --host-file ./host.ini +sb node-info --host-file ./host.ini ``` ### `sb version` diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index 0d74e047e..19589925c 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -20,7 +20,7 @@ This tool is to collect the system information automatically on the tested GPU n 2. Prepare the host file of the tested GPU nodes using [Ansible Inventory](../getting-started/configuration.md#ansible-inventory) on the local machine. -3. After installing the Superbnech and the host file is ready, you can start to collect the sys info automatically using `sb run-info` command. The detailed command can be found from [SuperBench CLI](../cli.md). +3. After installing the Superbnech and the host file is ready, you can start to collect the sys info automatically using `sb node-info` command. The detailed command can be found from [SuperBench CLI](../cli.md). ``` sb run-info -f host.ini --output-dir ${output-dir} diff --git a/superbench/cli/_commands.py b/superbench/cli/_commands.py index 246db0b07..e70d6539a 100644 --- a/superbench/cli/_commands.py +++ b/superbench/cli/_commands.py @@ -23,7 +23,7 @@ def load_command_table(self, args): g.command('deploy', 'deploy_command_handler') g.command('exec', 'exec_command_handler') g.command('run', 'run_command_handler') - g.command('run-info', 'run_info_command_handler') + g.command('node-info', 'info_command_handler') with CommandGroup(self, 'benchmark', 'superbench.cli._benchmark_handler#{}') as g: g.command('list', 'benchmark_list_command_handler') g.command('list-parameters', 'benchmark_list_params_command_handler') diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py index 107a803b5..60fb420b7 100644 --- a/superbench/cli/_handler.py +++ b/superbench/cli/_handler.py @@ -3,6 +3,7 @@ """SuperBench CLI command handler.""" +import json import sys from pathlib import Path from importlib_metadata import version, PackageNotFoundError @@ -14,6 +15,7 @@ from superbench.runner import SuperBenchRunner from superbench.executor import SuperBenchExecutor from superbench.common.utils import create_sb_output_dir, get_sb_config +from superbench.tools import SystemInfo def check_argument_file(name, file): @@ -321,7 +323,7 @@ def run_command_handler( sys.exit(runner.get_failure_count()) -def run_info_command_handler( +def info_command_handler( docker_image='superbench/superbench', docker_username=None, docker_password=None, @@ -350,6 +352,19 @@ def run_info_command_handler( Raises: CLIError: If input arguments are invalid. """ + # local + if not (host_file or host_list): + try: + output_dir = create_sb_output_dir(output_dir) + info = SystemInfo().get_all() + output_dir_path = Path(output_dir) + with open(output_dir_path / 'sys_info.json', 'w') as f: + json.dump(info, f) + except Exception as ex: + raise RuntimeError('Failed to get node info.') from ex + return + + # remote docker_config, ansible_config, sb_config, sb_output_dir = process_runner_arguments( docker_image=docker_image, docker_username=docker_username, diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py index 3544f98ea..cfb49de36 100644 --- a/superbench/cli/_help.py +++ b/superbench/cli/_help.py @@ -65,14 +65,14 @@ --config-override superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark """.format(cli_name=CLI_NAME) -helps['run-info'] = """ +helps['node-info'] = """ type: command short-summary: Collect the system info distributedly. examples: - name: Collect system info on local GPU node - text: {cli_name} run-info --host-list localhost + text: {cli_name} node-info - name: Collect system info on all nodes in ./host.ini" - text: {cli_name} run-info --host-file ./host.ini + text: {cli_name} node-info --host-file ./host.ini """.format(cli_name=CLI_NAME) helps['benchmark'] = """ diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index 980ff435a..ad1761830 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -3,24 +3,17 @@ """SuperBench CLI node subgroup command handler.""" -import json -from pathlib import Path - from superbench.tools import SystemInfo -from superbench.common.utils import create_sb_output_dir -def info_command_handler(output_dir=None): +def info_command_handler(): """Get node hardware info. Args: output_dir (str): Output directory. """ try: - output_dir = create_sb_output_dir(output_dir) info = SystemInfo().get_all() - output_dir_path = Path(output_dir) - with open(output_dir_path / 'sys_info.json', 'w') as f: - json.dump(info, f) except Exception as ex: raise RuntimeError('Failed to get node info.') from ex + return info diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 29bf14f82..0a4290bfa 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -207,7 +207,7 @@ def run_sys_info(self): if self._docker_config.skip: fcmd = "bash -c 'cd $SB_WORKSPACE && {command}'" ansible_runner_config = self._ansible_client.get_shell_config( - fcmd.format(command='sb node info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) + fcmd.format(command='sb node-info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) ) ansible_rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip)) From dcd851fb40efbae8b5d33ba765a92e49d882c9d3 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 19 Jun 2023 08:52:22 +0000 Subject: [PATCH 05/11] update --- superbench/cli/_node_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index ad1761830..4a57b5b20 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -9,8 +9,8 @@ def info_command_handler(): """Get node hardware info. - Args: - output_dir (str): Output directory. + Returns: + dict: node info. """ try: info = SystemInfo().get_all() From 2d6cab2d30cee30503891e4598f2527e292c6a21 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Tue, 20 Jun 2023 03:12:24 +0000 Subject: [PATCH 06/11] update typo in doc --- docs/user-tutorial/system-config.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index 19589925c..899fb22f8 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -23,7 +23,7 @@ This tool is to collect the system information automatically on the tested GPU n 3. After installing the Superbnech and the host file is ready, you can start to collect the sys info automatically using `sb node-info` command. The detailed command can be found from [SuperBench CLI](../cli.md). ``` - sb run-info -f host.ini --output-dir ${output-dir} + sb node-info -f host.ini --output-dir ${output-dir} ``` 4. After the command finished, you can find the output system info json file `sys-info.json` of each node under \${output_dir}/nodes/${node_name}. From 6db96ede78fd79debf198dcecfec9234d32f0c2c Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Sun, 25 Jun 2023 12:41:29 +0000 Subject: [PATCH 07/11] update --- docs/cli.md | 46 ++---------------- docs/user-tutorial/system-config.md | 16 ++++++- superbench/cli/_commands.py | 4 +- superbench/cli/_handler.py | 72 ++++++----------------------- superbench/cli/_help.py | 10 +--- superbench/runner/runner.py | 2 +- 6 files changed, 38 insertions(+), 112 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index b7c43022a..a32dd37c6 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -335,52 +335,14 @@ sb run --no-docker --host-list localhost --config-override \ superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark ``` -### `sb node-info` - -```bash title="SB CLI" -sb node-info [--docker-image] - [--docker-password] - [--docker-username] - [--host-file] - [--host-list] - [--host-password] - [--host-username] - [--no-image-pull] - [--output-dir] - [--private-key] -``` - -#### Optional arguments - -| Name | Default | Description | -|-----------------------|-------------------------|-----------------------------------------------------------------------------------| -| `--docker-image` `-i` | `superbench/superbench` | Docker image URI, [here](./user-tutorial/container-images.mdx) listed all images. | -| `--docker-password` | `None` | Docker registry password if authentication is needed. | -| `--docker-username` | `None` | Docker registry username if authentication is needed. | -| `--host-file` `-f` | `None` | Path to Ansible inventory host file. | -| `--host-list` `-l` | `None` | Comma separated host list. | -| `--host-password` | `None` | Host password or key passphase if needed. | -| `--host-username` | `None` | Host username if needed. | -| `--no-image-pull` | `False` | Skip pull and use local Docker image. | -| `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. | -| `--private-key` | `None` | Path to private key if needed. | - -#### Global arguments - -| Name | Default | Description | -|---------------|---------|--------------------| -| `--help` `-h` | N/A | Show help message. | - -#### Examples - -Collect system info on local GPU node: +Collect system info on local GPU node only without run benchmarks: ```bash title="SB CLI" -sb node-info +sb run --get-info ``` -Collect system info on all nodes in `./host.ini`: +Collect system info on all nodes in ./host.ini" distributed only without run benchmarks: ```bash title="SB CLI" -sb node-info --host-file ./host.ini +sb run --get-info --host-file ./host.ini ``` ### `sb version` diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index 899fb22f8..7e1249b39 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -16,14 +16,26 @@ This tool is to collect the system information automatically on the tested GPU n ## Usage +### Usage on local machine + +1. [Install SuperBench](../getting-started/installation.mdx) on the local machine using root privilege. + +2. Start to collect the sys info using `sb run --get-info --output-dir ${output-dir}` command using root privilege. + +3. After the command finished, you can find the output system info json file `sys-info.json` of local node under \${output_dir}. + +### Usage on multiple remote machines + 1. [Install SuperBench](../getting-started/installation.mdx) on the local machine. +2. [Deploy SuperBench](../getting-started/run-superbench.md#deploy) onto the remote machines. + 2. Prepare the host file of the tested GPU nodes using [Ansible Inventory](../getting-started/configuration.md#ansible-inventory) on the local machine. -3. After installing the Superbnech and the host file is ready, you can start to collect the sys info automatically using `sb node-info` command. The detailed command can be found from [SuperBench CLI](../cli.md). +3. After installing the Superbnech and the host file is ready, you can start to collect the sys info automatically using `sb run --get-info` command. The detailed command can be found from [SuperBench CLI](../cli.md). ``` - sb node-info -f host.ini --output-dir ${output-dir} + sb run --get-info -f host.ini -c config.yaml --output-dir ${output-dir} ``` 4. After the command finished, you can find the output system info json file `sys-info.json` of each node under \${output_dir}/nodes/${node_name}. diff --git a/superbench/cli/_commands.py b/superbench/cli/_commands.py index e70d6539a..2122034a3 100644 --- a/superbench/cli/_commands.py +++ b/superbench/cli/_commands.py @@ -23,7 +23,6 @@ def load_command_table(self, args): g.command('deploy', 'deploy_command_handler') g.command('exec', 'exec_command_handler') g.command('run', 'run_command_handler') - g.command('node-info', 'info_command_handler') with CommandGroup(self, 'benchmark', 'superbench.cli._benchmark_handler#{}') as g: g.command('list', 'benchmark_list_command_handler') g.command('list-parameters', 'benchmark_list_params_command_handler') @@ -68,6 +67,9 @@ def load_arguments(self, command): nargs='+', help='Extra arguments to override config_file.' ) + ac.argument( + 'get_info', options_list=('--get-info', '-g'), action='store_true', help='Collect node system info.' + ) with ArgumentsContext(self, 'benchmark') as ac: ac.argument('name', options_list=('--name', '-n'), type=str, help='Benchmark name or regular expression.') diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py index 60fb420b7..15b0a8a6b 100644 --- a/superbench/cli/_handler.py +++ b/superbench/cli/_handler.py @@ -277,7 +277,8 @@ def run_command_handler( output_dir=None, private_key=None, config_file=None, - config_override=None + config_override=None, + get_info=False, ): """Run the SuperBench benchmarks distributedly. @@ -297,63 +298,12 @@ def run_command_handler( config_file (str, optional): Path to SuperBench config file. Defaults to None. config_override (str, optional): Extra arguments to override config_file, following [Hydra syntax](https://hydra.cc/docs/advanced/override_grammar/basic). Defaults to None. + get_info (bool, optional): Collect node system info. Defaults to False. Raises: CLIError: If input arguments are invalid. """ - docker_config, ansible_config, sb_config, sb_output_dir = process_runner_arguments( - docker_image=docker_image, - docker_username=docker_username, - docker_password=docker_password, - no_docker=no_docker, - no_image_pull=False, - host_file=host_file, - host_list=host_list, - host_username=host_username, - host_password=host_password, - output_dir=output_dir, - private_key=private_key, - config_file=config_file, - config_override=config_override, - ) - - runner = SuperBenchRunner(sb_config, docker_config, ansible_config, sb_output_dir) - runner.run() - if runner.get_failure_count() != 0: - sys.exit(runner.get_failure_count()) - - -def info_command_handler( - docker_image='superbench/superbench', - docker_username=None, - docker_password=None, - no_image_pull=False, - host_file=None, - host_list=None, - host_username=None, - host_password=None, - output_dir=None, - private_key=None -): - """Collect the system info on all given nodes. - - Args: - docker_image (str, optional): Docker image URI. Defaults to superbench/superbench:latest. - docker_username (str, optional): Docker registry username if authentication is needed. Defaults to None. - docker_password (str, optional): Docker registry password if authentication is needed. Defaults to None. - no_image_pull (bool, optional): Skip pull and use local Docker image. Defaults to False. - host_file (str, optional): Path to Ansible inventory host file. Defaults to None. - host_list (str, optional): Comma separated host list. Defaults to None. - host_username (str, optional): Host username if needed. Defaults to None. - host_password (str, optional): Host password or key passphase if needed. Defaults to None. - output_dir (str, optional): Path to output directory. Defaults to None. - private_key (str, optional): Path to private key if needed. Defaults to None. - - Raises: - CLIError: If input arguments are invalid. - """ - # local - if not (host_file or host_list): + if (get_info and not (host_file or host_list)): try: output_dir = create_sb_output_dir(output_dir) info = SystemInfo().get_all() @@ -364,22 +314,28 @@ def info_command_handler( raise RuntimeError('Failed to get node info.') from ex return - # remote docker_config, ansible_config, sb_config, sb_output_dir = process_runner_arguments( docker_image=docker_image, docker_username=docker_username, docker_password=docker_password, - no_docker=False, - no_image_pull=no_image_pull, + no_docker=no_docker, + no_image_pull=False, host_file=host_file, host_list=host_list, host_username=host_username, host_password=host_password, output_dir=output_dir, private_key=private_key, + config_file=config_file, + config_override=config_override, ) runner = SuperBenchRunner(sb_config, docker_config, ansible_config, sb_output_dir) - runner.run_sys_info() + + if get_info: + runner.run_sys_info() + else: + runner.run() + if runner.get_failure_count() != 0: sys.exit(runner.get_failure_count()) diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py index cfb49de36..118e1180e 100644 --- a/superbench/cli/_help.py +++ b/superbench/cli/_help.py @@ -63,16 +63,10 @@ text: > {cli_name} run --no-docker --host-list localhost --config-override superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark -""".format(cli_name=CLI_NAME) - -helps['node-info'] = """ - type: command - short-summary: Collect the system info distributedly. - examples: - name: Collect system info on local GPU node - text: {cli_name} node-info + text: {cli_name} --get-info - name: Collect system info on all nodes in ./host.ini" - text: {cli_name} node-info --host-file ./host.ini + text: {cli_name} --get-info --host-file ./host.ini """.format(cli_name=CLI_NAME) helps['benchmark'] = """ diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 0a4290bfa..1384baae2 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -207,7 +207,7 @@ def run_sys_info(self): if self._docker_config.skip: fcmd = "bash -c 'cd $SB_WORKSPACE && {command}'" ansible_runner_config = self._ansible_client.get_shell_config( - fcmd.format(command='sb node-info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) + fcmd.format(command='sb run --get-info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) ) ansible_rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip)) From f7b1fb7b4d6617d2e88021ca0b29792bfa67f720 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Wed, 28 Jun 2023 04:02:17 +0000 Subject: [PATCH 08/11] update --- docs/cli.md | 26 +++++++++++++++++++++++--- docs/user-tutorial/system-config.md | 4 ++-- superbench/cli/_handler.py | 16 +--------------- superbench/cli/_help.py | 6 +++--- superbench/cli/_node_handler.py | 14 +++++++++++++- superbench/runner/runner.py | 2 +- 6 files changed, 43 insertions(+), 25 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index a32dd37c6..35d08787e 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -165,6 +165,26 @@ Execute GPT2 model benchmark in default configuration: sb exec --config-override superbench.enable="['gpt2_models']" ``` +### `sb node info` +Get system info on the local node. + +```bash title="SB CLI" +sb node info [--output-dir] +``` + +#### Optional arguments + +| Name | Default | Description | +|----------------|---------|-----------------------------------------------------------------------------| +| `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. | + +#### Examples + +Get system info on the local node and save it into the `outputs` dir: +```bash title="SB CLI" +sb node info --output-dir outputs +``` + ### `sb result diagnosis` Filter the defective machines automatically from benchmarking results according to rules defined in rule file. @@ -335,12 +355,12 @@ sb run --no-docker --host-list localhost --config-override \ superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark ``` -Collect system info on local GPU node only without run benchmarks: +Collect system info on all nodes in ./host.ini" distributed without running benchmarks: ```bash title="SB CLI" -sb run --get-info +sb run --get-info --host-file ./host.ini -C superbench.enable=none ``` -Collect system info on all nodes in ./host.ini" distributed only without run benchmarks: +Collect system info on all nodes in ./host.ini" distributed while running benchmarks: ```bash title="SB CLI" sb run --get-info --host-file ./host.ini ``` diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index 7e1249b39..d28e5df16 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -20,7 +20,7 @@ This tool is to collect the system information automatically on the tested GPU n 1. [Install SuperBench](../getting-started/installation.mdx) on the local machine using root privilege. -2. Start to collect the sys info using `sb run --get-info --output-dir ${output-dir}` command using root privilege. +2. Start to collect the sys info using `sb node info --output-dir ${output-dir}` command using root privilege. 3. After the command finished, you can find the output system info json file `sys-info.json` of local node under \${output_dir}. @@ -35,7 +35,7 @@ This tool is to collect the system information automatically on the tested GPU n 3. After installing the Superbnech and the host file is ready, you can start to collect the sys info automatically using `sb run --get-info` command. The detailed command can be found from [SuperBench CLI](../cli.md). ``` - sb run --get-info -f host.ini -c config.yaml --output-dir ${output-dir} + sb run --get-info -f host.ini -c config.yaml --output-dir ${output-dir} -C superbench.enable=none ``` 4. After the command finished, you can find the output system info json file `sys-info.json` of each node under \${output_dir}/nodes/${node_name}. diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py index 15b0a8a6b..41c9f3741 100644 --- a/superbench/cli/_handler.py +++ b/superbench/cli/_handler.py @@ -3,7 +3,6 @@ """SuperBench CLI command handler.""" -import json import sys from pathlib import Path from importlib_metadata import version, PackageNotFoundError @@ -15,7 +14,6 @@ from superbench.runner import SuperBenchRunner from superbench.executor import SuperBenchExecutor from superbench.common.utils import create_sb_output_dir, get_sb_config -from superbench.tools import SystemInfo def check_argument_file(name, file): @@ -303,17 +301,6 @@ def run_command_handler( Raises: CLIError: If input arguments are invalid. """ - if (get_info and not (host_file or host_list)): - try: - output_dir = create_sb_output_dir(output_dir) - info = SystemInfo().get_all() - output_dir_path = Path(output_dir) - with open(output_dir_path / 'sys_info.json', 'w') as f: - json.dump(info, f) - except Exception as ex: - raise RuntimeError('Failed to get node info.') from ex - return - docker_config, ansible_config, sb_config, sb_output_dir = process_runner_arguments( docker_image=docker_image, docker_username=docker_username, @@ -332,10 +319,9 @@ def run_command_handler( runner = SuperBenchRunner(sb_config, docker_config, ansible_config, sb_output_dir) + runner.run() if get_info: runner.run_sys_info() - else: - runner.run() if runner.get_failure_count() != 0: sys.exit(runner.get_failure_count()) diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py index 118e1180e..a8b59b881 100644 --- a/superbench/cli/_help.py +++ b/superbench/cli/_help.py @@ -63,9 +63,9 @@ text: > {cli_name} run --no-docker --host-list localhost --config-override superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark - - name: Collect system info on local GPU node - text: {cli_name} --get-info - - name: Collect system info on all nodes in ./host.ini" + - name: Collect system info on all nodes in ./host.ini" without running benchmarks + text: {cli_name} --get-info --host-file ./host.ini -C superbench.enable=none + - name: Collect system info on all nodes in ./host.ini" while running benchmarks text: {cli_name} --get-info --host-file ./host.ini """.format(cli_name=CLI_NAME) diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index 4a57b5b20..9615b540d 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -3,17 +3,29 @@ """SuperBench CLI node subgroup command handler.""" +from pathlib import Path +import json + from superbench.tools import SystemInfo +from superbench.common.utils import create_sb_output_dir -def info_command_handler(): +def info_command_handler(output_dir=None): """Get node hardware info. + Args: + output_dir (str): Output directory. + Returns: dict: node info. """ try: info = SystemInfo().get_all() + output_dir = create_sb_output_dir(output_dir) + info = SystemInfo().get_all() + output_dir_path = Path(output_dir) + with open(output_dir_path / 'sys_info.json', 'w') as f: + json.dump(info, f) except Exception as ex: raise RuntimeError('Failed to get node info.') from ex return info diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 1955e1765..4558d8179 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -209,7 +209,7 @@ def run_sys_info(self): if self._docker_config.skip: fcmd = "bash -c 'cd $SB_WORKSPACE && {command}'" ansible_runner_config = self._ansible_client.get_shell_config( - fcmd.format(command='sb run --get-info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) + fcmd.format(command='sb node info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) ) ansible_rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip)) From 51aa199902fdf9aa223355864adcba1e64c0ec6a Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Wed, 28 Jun 2023 11:33:58 +0000 Subject: [PATCH 09/11] updata --- docs/cli.md | 2 ++ docs/user-tutorial/system-config.md | 2 +- superbench/cli/_help.py | 4 ++-- superbench/cli/_node_handler.py | 1 - superbench/runner/runner.py | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 35d08787e..d96d17e21 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -311,6 +311,7 @@ sb run [--config-file] [--no-docker] [--output-dir] [--private-key] + [--get-info] ``` #### Optional arguments @@ -329,6 +330,7 @@ sb run [--config-file] | `--no-docker` | `False` | Run on host directly without Docker. | | `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. | | `--private-key` | `None` | Path to private key if needed. | +| `--get-info` | `False` | Collect system info. | #### Global arguments diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index d28e5df16..2a749ba52 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -35,7 +35,7 @@ This tool is to collect the system information automatically on the tested GPU n 3. After installing the Superbnech and the host file is ready, you can start to collect the sys info automatically using `sb run --get-info` command. The detailed command can be found from [SuperBench CLI](../cli.md). ``` - sb run --get-info -f host.ini -c config.yaml --output-dir ${output-dir} -C superbench.enable=none + sb run --get-info -f host.ini --output-dir ${output-dir} -C superbench.enable=none ``` 4. After the command finished, you can find the output system info json file `sys-info.json` of each node under \${output_dir}/nodes/${node_name}. diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py index a8b59b881..fb7f87973 100644 --- a/superbench/cli/_help.py +++ b/superbench/cli/_help.py @@ -64,9 +64,9 @@ {cli_name} run --no-docker --host-list localhost --config-override superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark - name: Collect system info on all nodes in ./host.ini" without running benchmarks - text: {cli_name} --get-info --host-file ./host.ini -C superbench.enable=none + text: {cli_name} run --get-info --host-file ./host.ini -C superbench.enable=none - name: Collect system info on all nodes in ./host.ini" while running benchmarks - text: {cli_name} --get-info --host-file ./host.ini + text: {cli_name} run --get-info --host-file ./host.ini """.format(cli_name=CLI_NAME) helps['benchmark'] = """ diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index 9615b540d..d59ed8b85 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -22,7 +22,6 @@ def info_command_handler(output_dir=None): try: info = SystemInfo().get_all() output_dir = create_sb_output_dir(output_dir) - info = SystemInfo().get_all() output_dir_path = Path(output_dir) with open(output_dir_path / 'sys_info.json', 'w') as f: json.dump(info, f) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 4558d8179..dd2270067 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -203,7 +203,7 @@ def run_sys_info(self): """Run the system info on all nodes.""" self.check_env() - logger.info('Runner is going to run node info.') + logger.info('Runner is going to get info.') fcmd = "docker exec sb-workspace bash -c '{command}'" if self._docker_config.skip: From b5ec5b0efa645ab1cb0fdf6eef4ce96aa99acd01 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 29 Jun 2023 02:08:12 +0000 Subject: [PATCH 10/11] update --- superbench/runner/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index dd2270067..bd8cc9c83 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -203,7 +203,7 @@ def run_sys_info(self): """Run the system info on all nodes.""" self.check_env() - logger.info('Runner is going to get info.') + logger.info('Runner is going to get node system info.') fcmd = "docker exec sb-workspace bash -c '{command}'" if self._docker_config.skip: From ab09dfaa0c3d794be9a6f8a3e2ebd42160a5b66e Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 29 Jun 2023 03:05:38 +0000 Subject: [PATCH 11/11] update doc --- docs/cli.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index d96d17e21..1f6b13a7a 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -304,6 +304,7 @@ sb run [--config-file] [--docker-image] [--docker-password] [--docker-username] + [--get-info] [--host-file] [--host-list] [--host-password] @@ -311,7 +312,6 @@ sb run [--config-file] [--no-docker] [--output-dir] [--private-key] - [--get-info] ``` #### Optional arguments @@ -323,6 +323,7 @@ sb run [--config-file] | `--docker-image` `-i` | `superbench/superbench` | Docker image URI. | | `--docker-password` | `None` | Docker registry password if authentication is needed. | | `--docker-username` | `None` | Docker registry username if authentication is needed. | +| `--get-info` | `False` | Collect system info. | | `--host-file` `-f` | `None` | Path to Ansible inventory host file. | | `--host-list` `-l` | `None` | Comma separated host list. | | `--host-password` | `None` | Host password or key passphase if needed. | @@ -330,7 +331,6 @@ sb run [--config-file] | `--no-docker` | `False` | Run on host directly without Docker. | | `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. | | `--private-key` | `None` | Path to private key if needed. | -| `--get-info` | `False` | Collect system info. | #### Global arguments