Skip to content

Commit

Permalink
Adding HPL benchmark (#482)
Browse files Browse the repository at this point in the history
**Description**

- Adding HPL benchmark

---------

Co-authored-by: Ubuntu <azureuser@sbtestvm.jzlku1oskncengjiado35wf1hd.ax.internal.cloudapp.net>
Co-authored-by: Peng Cheng <chengpeng5555@outlook.com>
  • Loading branch information
3 people authored Mar 21, 2023
1 parent 644b539 commit 655bd0a
Show file tree
Hide file tree
Showing 17 changed files with 1,595 additions and 5 deletions.
7 changes: 7 additions & 0 deletions dockerfile/cuda11.1.1.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,13 @@ RUN cd /tmp && \
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
rm -rf aocc-compiler-4.0.0_1_amd64.deb

# Install AMD BLIS
RUN cd /tmp && \
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz

# Add config files
ADD dockerfile/etc /opt/microsoft/

Expand Down
7 changes: 7 additions & 0 deletions dockerfile/cuda11.8.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ RUN cd /tmp && \
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
rm -rf aocc-compiler-4.0.0_1_amd64.deb

# Install AMD BLIS
RUN cd /tmp && \
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz

# Add config files
ADD dockerfile/etc /opt/microsoft/

Expand Down
7 changes: 7 additions & 0 deletions dockerfile/rocm5.0.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ RUN cd /tmp && \
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
rm -rf aocc-compiler-4.0.0_1_amd64.deb

# Install AMD BLIS
RUN cd /tmp && \
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz

# Install rccl-rdma-sharp-plugins
ENV SHARP_VERSION=5.0
RUN cd /opt/rocm && \
Expand Down
7 changes: 7 additions & 0 deletions dockerfile/rocm5.1.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ RUN cd /tmp && \
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
rm -rf aocc-compiler-4.0.0_1_amd64.deb

# Install AMD BLIS
RUN cd /tmp && \
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz

ENV PATH="${PATH}:/opt/rocm/hip/bin/" \
LD_LIBRARY_PATH="/usr/local/lib/:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
Expand Down
15 changes: 15 additions & 0 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,21 @@ Supports the use of double unit types and the use of tensor cores.
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |

### `cpu-hpl`

#### Introduction

HPL (High-Performance Linpack) evaluates compute throughput by solving a dense linear system in double-precision arithmetic.
Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computers](https://netlib.org/benchmark/hpl/).

#### Metrics

| Name | Unit | Description |
|---------------------|--------------------|----------------------------------------------------------------------------|
| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). |
| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. |
| cpu-hpl/time | time (s) | Time elapsed during HPL run. |

### `cpu-stream`

#### Introduction
Expand Down
26 changes: 26 additions & 0 deletions examples/benchmarks/cpu_hpl_performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for CPU HPL performance.
Commands to run:
python3 examples/benchmarks/cpu_hpl_performance.py
"""

from superbench.benchmarks import BenchmarkRegistry
from superbench.common.utils import logger

if __name__ == '__main__':
    # Command-line style options handed to the 'cpu-hpl' benchmark for this example run.
    hpl_parameters = '--cpu_arch zen3 --blockSize 224 --coreCount 60 --blocks 1 --problemSize 224000'
    hpl_context = BenchmarkRegistry.create_benchmark_context('cpu-hpl', parameters=hpl_parameters)

    benchmark = BenchmarkRegistry.launch_benchmark(hpl_context)
    if benchmark:
        # Report the name, return code and collected result of the finished benchmark.
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)
2 changes: 2 additions & 0 deletions superbench/benchmarks/micro_benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_hpl_performance import CpuHplBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
Expand All @@ -33,6 +34,7 @@
__all__ = [
'ComputationCommunicationOverlap',
'CpuMemBwLatencyBenchmark',
'CpuHplBenchmark',
'CpuStreamBenchmark',
'CublasBenchmark',
'CublasLtBenchmark',
Expand Down
152 changes: 152 additions & 0 deletions superbench/benchmarks/micro_benchmarks/cpu_hpl_performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module for running the HPL benchmark."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class CpuHplBenchmark(MicroBenchmarkWithInvoke):
    """The HPL benchmark class.

    Builds the command line for the ``hpl_run.sh`` launcher, renders an ``HPL.dat``
    input file from a template, and parses the elapsed time, Gflops and residual-check
    status out of the launcher's output.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        # Launcher script looked up in bin_dir by _set_binary_path().
        self._bin_name = 'hpl_run.sh'

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self.__cpu_arch = ['zen3', 'zen4']

        self._parser.add_argument(
            '--cpu_arch',
            type=str,
            default='zen4',
            required=False,
            help=(
                'The targeted cpu architectures to run '
                'HPL. Default is zen4. Possible values are {}.'
            ).format(' '.join(self.__cpu_arch))
        )
        self._parser.add_argument(
            '--blockSize',
            type=int,
            default=384,
            required=False,
            help='Size of blocks. This parameter is an HPL input. Default 384.'
        )
        self._parser.add_argument(
            '--coreCount',
            type=int,
            default=88,    # for HBv4 total number of cores is 176 => 88 per cpu
            required=False,
            help=(
                'Number of cores per CPU. Used for MPI and HPL configuration. '
                'Default 88 (HBv4 has a total of 176 w/ 2 cpus therefore 88 per cpu)'
            )
        )
        self._parser.add_argument(
            '--blocks',
            type=int,
            default=1,
            required=False,
            help='Number of blocks. This parameter is an HPL input. Default 1.'
        )
        self._parser.add_argument(
            '--problemSize',
            type=int,
            default=384000,
            required=False,
            help=(
                'This is the problem size designated by "N" notation. '
                'This parameter is an HPL input. Default is 384000'
            )
        )

    def _preprocess(self, hpl_template=None):
        """Preprocess/preparation operations before the benchmarking.

        Builds the launcher command and writes ``HPL.dat`` into the current working
        directory by substituting the ``problemSize``, ``blockCount`` and ``blockSize``
        placeholders of the template with the parsed arguments.

        Args:
            hpl_template (str): path to the HPL.dat template. When empty or None,
                ``template_hpl.dat`` inside bin_dir is used. The parameter is optional
                so the method can also be called with the argument-less signature that
                the parent ``_preprocess()`` is invoked with below.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if not self._set_binary_path():
            logger.error(
                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
            )
            return False

        # xhpl type: anything other than 'zen3' selects the zen4 binary.
        xhpl = 'xhpl_z3' if self._args.cpu_arch == 'zen3' else 'xhpl_z4'

        # command: <bin_dir>/hpl_run.sh <xhpl binary> <core count>
        command = os.path.join(self._args.bin_dir, self._bin_name)
        command = command + ' ' + xhpl + ' ' + str(self._args.coreCount)

        # modify HPL.dat
        if not hpl_template:
            hpl_template = os.path.join(self._args.bin_dir, 'template_hpl.dat')
        replacements = {
            'problemSize': str(self._args.problemSize),
            'blockCount': str(self._args.blocks),
            'blockSize': str(self._args.blockSize),
        }
        with open(hpl_template, 'r') as template_file:
            lines = template_file.readlines()
        with open(os.path.join(os.getcwd(), 'HPL.dat'), 'w') as hpl_file:
            for line in lines:
                # Substitute every placeholder, even if several share one line.
                for placeholder, value in replacements.items():
                    line = line.replace(placeholder, value)
                hpl_file.write(line)

        self._commands.append(command)
        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        # Save the raw output first so it is kept even when parsing fails.
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

        content = raw_output.splitlines()

        # Locate the result table header; the values sit two lines below it.
        header_idx = next(
            (idx for idx, line in enumerate(content) if 'T/V' in line and 'Gflops' in line),
            None,
        )
        if header_idx is None or header_idx + 2 >= len(content):
            logger.error('HPL result table not found in raw output of command {}.'.format(cmd_idx))
            return False

        results = content[header_idx + 2].split()
        try:
            # Columns 5 and 6 of the result row are read as time and Gflops.
            elapsed = float(results[5])
            throughput = float(results[6])
        except (IndexError, ValueError):
            logger.error('HPL result row is malformed in raw output of command {}.'.format(cmd_idx))
            return False

        for line in content[header_idx + 2:]:
            if '1 tests completed and passed residual checks' in line:
                self._result.add_result('tests_pass', 1)
            elif '0 tests completed and passed residual checks' in line:
                self._result.add_result('tests_pass', 0)

        self._result.add_result('time', elapsed)
        self._result.add_result('throughput', throughput)

        return True


# Register CpuHplBenchmark with the registry under the benchmark name 'cpu-hpl'.
BenchmarkRegistry.register_benchmark('cpu-hpl', CpuHplBenchmark)
66 changes: 66 additions & 0 deletions tests/benchmarks/micro_benchmarks/test_cpu_hpl_performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for HPL benchmark."""

import unittest

from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform


class CpuHplBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Unit tests for the cpu-hpl micro benchmark."""
    @classmethod
    def setUpClass(cls):
        """Set up the mocked environment and the mocked launcher script for the class."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/hpl_run.sh'])
        return True

    @decorator.load_data('tests/data/hpl_results.log')
    def test_hpl(self, results):
        """Check argument parsing, command generation and result parsing of cpu-hpl."""
        name = 'cpu-hpl'
        selected = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CPU)
        benchmark_class, predefine_params = selected
        assert benchmark_class

        parameters = '--cpu_arch zen3 --blockSize 224 --coreCount 60 --blocks 1 --problemSize 224000'
        benchmark = benchmark_class(name, parameters=parameters)

        # Basic information after preprocessing with the repository template.
        assert benchmark
        preprocessed = benchmark._preprocess(hpl_template='third_party/hpl-tests/template_hpl.dat')
        assert preprocessed is True
        assert benchmark.return_code == ReturnCode.SUCCESS
        assert benchmark.name == name
        assert benchmark.type == BenchmarkType.MICRO

        # Every parameter given on the command line must be reflected in the parsed args.
        expected_args = {
            'cpu_arch': 'zen3',
            'blockSize': 224,
            'coreCount': 60,
            'blocks': 1,
            'problemSize': 224000,
        }
        for arg_name, expected in expected_args.items():
            assert getattr(benchmark._args, arg_name) == expected

        # Exactly one command, containing the core count, the launcher and the zen3 binary.
        assert len(benchmark._commands) == 1
        command = benchmark._commands[0]
        for token in ('60', 'hpl_run.sh', 'xhpl_z3'):
            assert token in command

        # Parsed results from the recorded HPL log.
        assert benchmark._process_raw_result(0, results)
        parsed = benchmark.result
        assert parsed['return_code'][0] == 0
        assert float(parsed['time'][0]) == 4645.37
        assert float(parsed['throughput'][0]) == 8126.1


if __name__ == '__main__':
unittest.main()
Loading

0 comments on commit 655bd0a

Please sign in to comment.