diff --git a/docs/cli.md b/docs/cli.md index df1c1ca4d..1f6b13a7a 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -165,6 +165,26 @@ Execute GPT2 model benchmark in default configuration: sb exec --config-override superbench.enable="['gpt2_models']" ``` +### `sb node info` +Get system info on the local node. + +```bash title="SB CLI" +sb node info [--output-dir] +``` + +#### Optional arguments + +| Name | Default | Description | +|----------------|---------|-----------------------------------------------------------------------------| +| `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. | + +#### Examples + +Get system info on the local node and save it into the `outputs` dir: +```bash title="SB CLI" +sb node info --output-dir outputs +``` + ### `sb result diagnosis` Filter the defective machines automatically from benchmarking results according to rules defined in rule file. @@ -284,6 +304,7 @@ sb run [--config-file] [--docker-image] [--docker-password] [--docker-username] + [--get-info] [--host-file] [--host-list] [--host-password] @@ -302,6 +323,7 @@ sb run [--config-file] | `--docker-image` `-i` | `superbench/superbench` | Docker image URI. | | `--docker-password` | `None` | Docker registry password if authentication is needed. | | `--docker-username` | `None` | Docker registry username if authentication is needed. | +| `--get-info` | `False` | Collect system info. | | `--host-file` `-f` | `None` | Path to Ansible inventory host file. | | `--host-list` `-l` | `None` | Comma separated host list. | | `--host-password` | `None` | Host password or key passphase if needed. 
| @@ -335,6 +357,16 @@ sb run --no-docker --host-list localhost --config-override \ superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark ``` +Collect system info on all nodes in `./host.ini` distributedly without running benchmarks: +```bash title="SB CLI" +sb run --get-info --host-file ./host.ini -C superbench.enable=none +``` + +Collect system info on all nodes in `./host.ini` distributedly while running benchmarks: +```bash title="SB CLI" +sb run --get-info --host-file ./host.ini +``` + ### `sb version` Print the current SuperBench CLI version. diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index dbde728d3..2a749ba52 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -4,6 +4,8 @@ id: system-config # System Config Info +This tool automatically collects system information on the tested GPU nodes, covering the following hardware categories: + - [System](#system) - [Memory](#memory) - [CPU](#cpu) @@ -12,7 +14,33 @@ id: system-config - [Accelerator](#accelerator) - [PCIe](#pcie) -## Parameter amd Details +## Usage + +### Usage on local machine + +1. [Install SuperBench](../getting-started/installation.mdx) on the local machine using root privilege. + +2. Start to collect the sys info using the `sb node info --output-dir ${output_dir}` command with root privilege. + +3. After the command finishes, you can find the output system info json file `sys_info.json` of the local node under \${output_dir}. + +### Usage on multiple remote machines + +1. [Install SuperBench](../getting-started/installation.mdx) on the local machine. + +2. [Deploy SuperBench](../getting-started/run-superbench.md#deploy) onto the remote machines. + +2. Prepare the host file of the tested GPU nodes using [Ansible Inventory](../getting-started/configuration.md#ansible-inventory) on the local machine. + +3. 
After SuperBench is installed and the host file is ready, you can start to collect the sys info automatically using the `sb run --get-info` command. The detailed command can be found in [SuperBench CLI](../cli.md). + + ``` + sb run --get-info -f host.ini --output-dir ${output_dir} -C superbench.enable=none + ``` + +4. After the command finishes, you can find the output system info json file `sys_info.json` of each node under \${output_dir}/nodes/${node_name}. + +## Parameter and Details ### System diff --git a/superbench/cli/_commands.py b/superbench/cli/_commands.py index f37bc0f33..2122034a3 100644 --- a/superbench/cli/_commands.py +++ b/superbench/cli/_commands.py @@ -67,6 +67,9 @@ def load_arguments(self, command): nargs='+', help='Extra arguments to override config_file.' ) + ac.argument( + 'get_info', options_list=('--get-info', '-g'), action='store_true', help='Collect node system info.' + ) with ArgumentsContext(self, 'benchmark') as ac: ac.argument('name', options_list=('--name', '-n'), type=str, help='Benchmark name or regular expression.') diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py index 3c2d1cbaa..41c9f3741 100644 --- a/superbench/cli/_handler.py +++ b/superbench/cli/_handler.py @@ -275,7 +275,8 @@ def run_command_handler( output_dir=None, private_key=None, config_file=None, - config_override=None + config_override=None, + get_info=False, ): """Run the SuperBench benchmarks distributedly. @@ -295,6 +296,7 @@ def run_command_handler( config_file (str, optional): Path to SuperBench config file. Defaults to None. config_override (str, optional): Extra arguments to override config_file, following [Hydra syntax](https://hydra.cc/docs/advanced/override_grammar/basic). Defaults to None. + get_info (bool, optional): Collect node system info. Defaults to False. Raises: CLIError: If input arguments are invalid. 
@@ -316,6 +318,10 @@ def run_command_handler( ) runner = SuperBenchRunner(sb_config, docker_config, ansible_config, sb_output_dir) + runner.run() + if get_info: + runner.run_sys_info() + if runner.get_failure_count() != 0: sys.exit(runner.get_failure_count()) diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py index 2c7f507b2..fb7f87973 100644 --- a/superbench/cli/_help.py +++ b/superbench/cli/_help.py @@ -63,6 +63,10 @@ text: > {cli_name} run --no-docker --host-list localhost --config-override superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark + - name: Collect system info on all nodes in ./host.ini without running benchmarks + text: {cli_name} run --get-info --host-file ./host.ini -C superbench.enable=none + - name: Collect system info on all nodes in ./host.ini while running benchmarks + text: {cli_name} run --get-info --host-file ./host.ini """.format(cli_name=CLI_NAME) helps['benchmark'] = """ diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index 4a57b5b20..d59ed8b85 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -3,17 +3,28 @@ """SuperBench CLI node subgroup command handler.""" +from pathlib import Path +import json + from superbench.tools import SystemInfo +from superbench.common.utils import create_sb_output_dir -def info_command_handler(): +def info_command_handler(output_dir=None): """Get node hardware info. + Args: + output_dir (str): Output directory. + Returns: dict: node info. 
""" try: info = SystemInfo().get_all() + output_dir = create_sb_output_dir(output_dir) + output_dir_path = Path(output_dir) + with open(output_dir_path / 'sys_info.json', 'w') as f: + json.dump(info, f) except Exception as ex: raise RuntimeError('Failed to get node info.') from ex return info diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index d91020bfb..bd8cc9c83 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -199,6 +199,24 @@ def deploy(self): # pragma: no cover ) self._ansible_client.run(self._ansible_client.get_playbook_config('deploy.yaml', extravars=extravars)) + def run_sys_info(self): + """Run the system info on all nodes.""" + self.check_env() + + logger.info('Runner is going to get node system info.') + + fcmd = "docker exec sb-workspace bash -c '{command}'" + if self._docker_config.skip: + fcmd = "bash -c 'cd $SB_WORKSPACE && {command}'" + ansible_runner_config = self._ansible_client.get_shell_config( + fcmd.format(command='sb node info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) + ) + ansible_rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip)) + + if ansible_rc != 0: + self.cleanup() + self.fetch_results() + def check_env(self): # pragma: no cover """Check SuperBench environment.""" logger.info('Checking SuperBench environment.')