Adding HPL benchmark (#482)

**Description** - Adding HPL benchmark --------- Co-authored-by: Ubuntu <azureuser@sbtestvm.jzlku1oskncengjiado35wf1hd.ax.internal.cloudapp.net> Co-authored-by: Peng Cheng <chengpeng5555@outlook.com>
microsoft · Mar 21, 2023 · 655bd0a · 655bd0a
1 parent 644b539
commit 655bd0a
Show file tree

Hide file tree

Showing 17 changed files with 1,595 additions and 5 deletions.
diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
@@ -128,6 +128,13 @@ RUN cd /tmp && \
     apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
     rm -rf aocc-compiler-4.0.0_1_amd64.deb
 
+# Install AMD BLIS
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/
 

diff --git a/dockerfile/cuda11.8.dockerfile b/dockerfile/cuda11.8.dockerfile
@@ -108,6 +108,13 @@ RUN cd /tmp && \
     apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
     rm -rf aocc-compiler-4.0.0_1_amd64.deb
 
+# Install AMD BLIS
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/
 

diff --git a/dockerfile/rocm5.0.x.dockerfile b/dockerfile/rocm5.0.x.dockerfile
@@ -108,6 +108,13 @@ RUN cd /tmp && \
     apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
     rm -rf aocc-compiler-4.0.0_1_amd64.deb
 
+# Install AMD BLIS
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+
 # Install rccl-rdma-sharp-plugins
 ENV SHARP_VERSION=5.0
 RUN cd /opt/rocm && \

diff --git a/dockerfile/rocm5.1.x.dockerfile b/dockerfile/rocm5.1.x.dockerfile
@@ -120,6 +120,13 @@ RUN cd /tmp && \
     apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
     rm -rf aocc-compiler-4.0.0_1_amd64.deb
 
+# Install AMD BLIS
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz
+
 ENV PATH="${PATH}:/opt/rocm/hip/bin/" \
     LD_LIBRARY_PATH="/usr/local/lib/:${LD_LIBRARY_PATH}" \
     SB_HOME=/opt/superbench \

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -171,6 +171,21 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu-burn/gpu_[0-9]_pass | yes/no   | The result of the gpu-burn test for each GPU (1: yes, 0: no).                      |
 | gpu-burn/abort          | yes/no   | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
 
+### `cpu-hpl`
+
+#### Introduction
+
+HPL (High-Performance Linpack) evaluates compute bandwidth by solving a dense linear system in double-precision arithmetic.
+It is performed by the [High-Performance Linpack Benchmark for Distributed-Memory Computers](https://netlib.org/benchmark/hpl/).
+
+#### Metrics
+
+| Name                | Unit               | Description                                                                |
+|---------------------|--------------------|----------------------------------------------------------------------------|
+| cpu-hpl/tests_pass  |                    | HPL completed running and correctness test has passed (1: pass, 0: fail).  |
+| cpu-hpl/throughput  | bandwidth (GFlops) | Compute bandwidth.                                                         |
+| cpu-hpl/time        | time (s)           | Time elapsed during HPL run.                                               |
+
 ### `cpu-stream`
 
 #### Introduction

diff --git a/examples/benchmarks/cpu_hpl_performance.py b/examples/benchmarks/cpu_hpl_performance.py
@@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for CPU HPL performance.
+
+Commands to run:
+  python3 examples/benchmarks/cpu_hpl_performance.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'cpu-hpl',
+        parameters='--cpu_arch zen3 \
+        --blockSize 224 --coreCount 60 --blocks 1 --problemSize 224000'
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -17,6 +17,7 @@
 from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
 from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
 from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
+from superbench.benchmarks.micro_benchmarks.cpu_hpl_performance import CpuHplBenchmark
 from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
 from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
 from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
@@ -33,6 +34,7 @@
 __all__ = [
     'ComputationCommunicationOverlap',
     'CpuMemBwLatencyBenchmark',
+    'CpuHplBenchmark',
     'CpuStreamBenchmark',
     'CublasBenchmark',
     'CublasLtBenchmark',

diff --git a/superbench/benchmarks/micro_benchmarks/cpu_hpl_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_hpl_performance.py
@@ -0,0 +1,152 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module for running the HPL benchmark."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class CpuHplBenchmark(MicroBenchmarkWithInvoke):
+    """The HPL benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'hpl_run.sh'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self.__cpu_arch = ['zen3', 'zen4']
+
+        self._parser.add_argument(
+            '--cpu_arch',
+            type=str,
+            default='zen4',
+            required=False,
+            help='The targeted cpu architectures to run \
+                HPL. Default is zen4. Possible values are {}.'.format(' '.join(self.__cpu_arch))
+        )
+        self._parser.add_argument(
+            '--blockSize',
+            type=int,
+            default=384,
+            required=False,
+            help='Size of blocks. This parameter is an HPL input. Default 384.'
+        )
+        self._parser.add_argument(
+            '--coreCount',
+            type=int,
+            default=88,    # for HBv4 total number of cores is 176 => 88 per cpu
+            required=False,
+            help='Number of cores per CPU. Used for MPI and HPL configuration. \
+            Default 88 (HBv4 has a total of 176 w/ 2 cpus therefore 88 per cpu)'
+        )
+        self._parser.add_argument(
+            '--blocks',
+            type=int,
+            default=1,
+            required=False,
+            help='Number of blocks. This parameter is an HPL input. Default 1.'
+        )
+        self._parser.add_argument(
+            '--problemSize',
+            type=int,
+            default=384000,
+            required=False,
+            help='This is the problem size designated by "N" notation. \
+            This parameter is an HPL input. Default is 384000'
+        )
+
+    def _preprocess(self, hpl_template):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        if not self._set_binary_path():
+            logger.error(
+                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
+            )
+            return False
+
+        # xhpl type
+        xhpl = 'xhpl_z4'
+        if self._args.cpu_arch == 'zen3':
+            xhpl = 'xhpl_z3'
+
+        # command
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+        command = command + ' ' + xhpl + ' ' + str(self._args.coreCount)
+
+        # modify HPL.dat
+        if hpl_template:
+            hpl_input_file = hpl_template
+        else:
+            hpl_input_file = os.path.join(self._args.bin_dir, 'template_hpl.dat')
+        search_string = ['problemSize', 'blockCount', 'blockSize']
+        with open(hpl_input_file, 'r') as hplfile:
+            lines = hplfile.readlines()
+        hpl_input_file = os.path.join(os.getcwd(), 'HPL.dat')
+        with open(hpl_input_file, 'w') as hplfile:
+            for line in lines:
+                if search_string[0] in line:
+                    line = line.replace(search_string[0], str(self._args.problemSize))
+                elif search_string[1] in line:
+                    line = line.replace(search_string[1], str(self._args.blocks))
+                elif search_string[2] in line:
+                    line = line.replace(search_string[2], str(self._args.blockSize))
+                hplfile.write(line)
+
+        self._commands.append(command)
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        content = raw_output.splitlines()
+
+        for idx, line in enumerate(content):
+            if 'T/V' in line and 'Gflops' in line:
+                break
+
+        results = content[idx + 2].split()
+
+        for line in content[idx + 2:]:
+            if '1 tests completed and passed residual checks' in line:
+                self._result.add_result('tests_pass', 1)
+            elif '0 tests completed and passed residual checks' in line:
+                self._result.add_result('tests_pass', 0)
+
+        self._result.add_result('time', float(results[5]))
+        self._result.add_result('throughput', float(results[6]))
+
+        # raw output
+        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('cpu-hpl', CpuHplBenchmark)
diff --git a/tests/benchmarks/micro_benchmarks/test_cpu_hpl_performance.py b/tests/benchmarks/micro_benchmarks/test_cpu_hpl_performance.py
@@ -0,0 +1,66 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for HPL benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+class CpuHplBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
+    """Test class for HPL benchmark."""
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/hpl_run.sh'])
+        return True
+
+    @decorator.load_data('tests/data/hpl_results.log')
+    def test_hpl(self, results):
+        """Test HPL benchmark command generation."""
+        benchmark_name = 'cpu-hpl'
+        (benchmark_class,
+         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
+        assert (benchmark_class)
+
+        parameters = '--cpu_arch zen3 \
+        --blockSize 224 --coreCount 60 --blocks 1 --problemSize 224000'
+
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+
+        # Check basic information
+        assert (benchmark)
+        ret = benchmark._preprocess(hpl_template='third_party/hpl-tests/template_hpl.dat')
+        assert (ret is True)
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+        assert (benchmark.name == benchmark_name)
+        assert (benchmark.type == BenchmarkType.MICRO)
+
+        # Check parameters specified in BenchmarkContext.
+
+        assert (benchmark._args.cpu_arch == 'zen3')
+        assert (benchmark._args.blockSize == 224)
+        assert (benchmark._args.coreCount == 60)
+        assert (benchmark._args.blocks == 1)
+        assert (benchmark._args.problemSize == 224000)
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('60' in benchmark._commands[0])
+        assert ('hpl_run.sh' in benchmark._commands[0])
+        assert ('xhpl_z3' in benchmark._commands[0])
+
+        # Check results
+        assert (benchmark._process_raw_result(0, results))
+        assert (benchmark.result['return_code'][0] == 0)
+        assert (float(benchmark.result['time'][0]) == 4645.37)
+        assert (float(benchmark.result['throughput'][0]) == 8126.1)
+
+
+if __name__ == '__main__':
+    unittest.main()