Benchmark - Support batch/shape range in cublaslt gemm #494

Merged: 4 commits, Mar 22, 2023
20 changes: 10 additions & 10 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -66,9 +66,9 @@ Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/

#### Metrics

| Name | Unit | Description |
|------------------------------------------------|----------------|---------------------------------|
| cublaslt-gemm/${dtype}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |
| Name | Unit | Description |
|----------------------------------------------------------|----------------|---------------------------------|
| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |
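For example, an fp16 run with the default batch size of 0 and shape 2048,2048,2048 now reports the metric `cublaslt-gemm/fp16_0_2048_2048_2048_flops`.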

### `cublas-function`

@@ -195,13 +195,13 @@ performed by [University of Virginia STREAM benchmark](https://www.cs.virginia.e

#### Metrics

| Name                                                       | Unit             | Description                                                           |
|------------------------------------------------------------|------------------|-----------------------------------------------------------------------|
| cpu-stream/threads                                         |                  | Number of threads used for the test. Determined by core count.       |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput   | bandwidth (MB/s) | Memory throughput of the designated kernel operation.                |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg     | time (s)         | Average elapsed time over all iterations.                            |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min     | time (s)         | Minimum elapsed time over all iterations.                            |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max     | time (s)         | Maximum elapsed time over all iterations.                            |
| Name                                                      | Unit             | Description                                                     |
|-----------------------------------------------------------|------------------|-----------------------------------------------------------------|
| cpu-stream/threads                                        |                  | Number of threads used for the test. Determined by core count.  |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput  | bandwidth (MB/s) | Memory throughput of the designated kernel operation.           |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg    | time (s)         | Average elapsed time over all iterations.                       |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min    | time (s)         | Minimum elapsed time over all iterations.                       |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max    | time (s)         | Maximum elapsed time over all iterations.                       |

## Communication Benchmarks

82 changes: 64 additions & 18 deletions superbench/benchmarks/micro_benchmarks/cublaslt_function.py
@@ -4,6 +4,7 @@
"""Module of the cuBLASLt GEMM benchmark."""

import os
import itertools

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
@@ -24,6 +25,37 @@ def __init__(self, name, parameters=''):
self._bin_name = 'cublaslt_gemm'
self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2']

def mrange(self, start, stop=-1, multiplication_factor=2):
"""Range constructor with multiplication factor.

Args:
start (int): Start number.
stop (int, optional): Stop number, inclusive. Defaults to -1, in which case only start is yielded.
multiplication_factor (int, optional): Multiplication factor; values less than 2 yield only start. Defaults to 2.

Yields:
int: Numbers in the geometric sequence.
"""
while True:
yield start
start *= multiplication_factor
if start > stop or start == 0 or multiplication_factor < 2:
break
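# Example behavior (a sketch; values mirror the unit tests added below):
#   list(self.mrange(16, 128, 2)) -> [16, 32, 64, 128]  # stop is inclusive
#   list(self.mrange(2, 31, 2))   -> [2, 4, 8, 16]      # stops before exceeding 31
#   list(self.mrange(8))          -> [8]                 # default stop of -1 yields only start
#   list(self.mrange(2, 64, 1))   -> [2]                 # factor < 2 yields only start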

def validate_mrange(self, string):
"""Validate mrange string in format start[[:stop]:multiplication_factor].

Args:
string (str): mrange string.

Returns:
bool: Whether the string is a valid mrange.
"""
nums = string.split(':')
if len(nums) > 3:
return False
return all(x.isdigit() for x in nums)
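# Accepted and rejected examples (per the unit tests added below):
#   validate_mrange('2:32:2')   -> True   # start:stop:multiplication_factor
#   validate_mrange('4:32')     -> True   # start:stop
#   validate_mrange('8')        -> True   # bare start
#   validate_mrange('2:32:2:4') -> False  # too many fields
#   validate_mrange('2.5:32')   -> False  # fields must be non-negative integers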

def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
@@ -33,14 +65,17 @@ def add_parser_arguments(self):
type=str,
nargs='+',
default=[f'{x},{x},{x}' for x in [2048, 4096, 8192]],
help='Shapes in m,n,k format.',
help='Shapes in m,n,k format. Each dimension supports the format start:stop:multiplication_factor, e.g., 16:128:2.',
)
self._parser.add_argument(
'--batch',
type=int,
default=0,
type=str,
default='0',
required=False,
help='Batch size for strided batch GEMM, set 0 to disable.',
help=(
'Batch size for strided batch GEMM; set 0 to disable.'
' Supports the format start:stop:multiplication_factor, e.g., 16:128:2.'
),
)
self._parser.add_argument(
'--num_warmup',
@@ -57,11 +92,12 @@
help='Number of steps to measure.',
)
self._parser.add_argument(
'--in_type',
'--in_types',
type=str,
default='fp8e4m3',
nargs='+',
default=['fp8e4m3'],
required=False,
help='Input data type, supports {}.'.format(' '.join(self._in_types)),
help='List of input data types, supports {}.'.format(' '.join(self._in_types)),
)
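# Taken together, a hypothetical parameters string can sweep all three axes, e.g.
# '--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64'
# expands to 4 batch sizes x (2*2*3 + 2) shape combinations x 3 dtypes = 168 runs.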

def _preprocess(self):
@@ -75,20 +111,28 @@ def _preprocess(self):

self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

if self._args.in_type not in self._in_types:
logger.error(f'Invalid input type {self._args.in_type}.')
if not self.validate_mrange(self._args.batch):
logger.error(f'Invalid batch size {self._args.batch}.')
return False

self._commands = []
for shape in self._args.shapes:
shape_list = shape.replace(',', ' ').split()
if len(shape_list) != 3 or not all(x.isdigit() for x in shape_list):
logger.error(f'Invalid shape {shape}.')
for _in_type in self._args.in_types:
if _in_type not in self._in_types:
logger.error(f'Invalid input type {_in_type}.')
return False
self._commands.append(
f'{self.__bin_path} -m {shape_list[0]} -n {shape_list[1]} -k {shape_list[2]} '
f'-b {self._args.batch} -w {self._args.num_warmup} -i {self._args.num_steps} -t {self._args.in_type}'
)
for _b in self.mrange(*map(int, self._args.batch.split(':'))):
for shape in self._args.shapes:
shape_list = shape.replace(',', ' ').split()
if len(shape_list) != 3 or not all(self.validate_mrange(x) for x in shape_list):
logger.error(f'Invalid shape {shape}.')
return False
for _m, _n, _k in itertools.product(
*map(lambda shape: self.mrange(*map(int, shape.split(':'))), shape_list)
):
self._commands.append(
f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -b {_b} '
f'-w {self._args.num_warmup} -i {self._args.num_steps} -t {_in_type}'
)

return True
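# Sketch of the expansion (bin path illustrative; default warmup/steps assumed):
# '--batch 0 --shapes 16:32,16,16 --in_types fp16' yields two commands:
#   <bin_dir>/cublaslt_gemm -m 16 -n 16 -k 16 -b 0 -w 20 -i 50 -t fp16
#   <bin_dir>/cublaslt_gemm -m 32 -n 16 -k 16 -b 0 -w 20 -i 50 -t fp16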

@@ -110,7 +154,9 @@ def _process_raw_result(self, cmd_idx, raw_output):
fields = raw_output.strip().split()
if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]):
raise ValueError('Invalid result.')
self._result.add_result(f'{self._args.in_type}_{"_".join(fields[:3])}_flops', float(fields[-1]))
self._result.add_result(
f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops', float(fields[-1])
)
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
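Note on the new metric key: the dtype is recovered from the last token of the stored command (`-t <dtype>` is emitted last), `fields[3]` supplies the batch size, and `fields[:3]` the m, n, k. So a command ending in `-t fp16` with raw output `16 16 16 0 ... 2.222` produces `fp16_0_16_16_16_flops = 2.222`; the fifth output field is not used by the parser.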
62 changes: 56 additions & 6 deletions tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
@@ -4,7 +4,7 @@
"""Tests for cublaslt-gemm benchmark."""

import unittest
from types import SimpleNamespace
from types import GeneratorType, SimpleNamespace

from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
@@ -19,7 +19,12 @@ def setUpClass(cls):
super().setUpClass()
cls.benchmark_name = 'cublaslt-gemm'
cls.createMockEnvs(cls)
cls.createMockFiles(cls, ['bin/cublaslt_fp8_gemm'])
cls.createMockFiles(cls, ['bin/cublaslt_gemm'])

def get_benchmark(self):
"""Get Benchmark."""
(benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
return benchmark_cls(self.benchmark_name, parameters='')

def test_cublaslt_gemm_cls(self):
"""Test cublaslt-gemm benchmark class."""
@@ -30,11 +35,56 @@ def test_cublaslt_gemm_cls(self):
else:
self.assertIsNone(benchmark_cls)

def test_mrange(self):
"""Test mrange generation."""
benchmark = self.get_benchmark()
self.assertIsInstance(benchmark.mrange(1), GeneratorType)
self.assertListEqual([4, 8, 16, 32], list(benchmark.mrange(4, 32, 2)))
self.assertListEqual([2, 4, 8, 16], list(benchmark.mrange(2, 31, 2)))
self.assertListEqual([2, 4, 8], list(benchmark.mrange(2, 8)))
self.assertListEqual([2], list(benchmark.mrange(2, 0, 2)))
self.assertListEqual([2], list(benchmark.mrange(2)))
self.assertListEqual([2], list(benchmark.mrange(2, 4, 1)))
self.assertListEqual([2], list(benchmark.mrange(2, 4, 0)))
self.assertListEqual([0], list(benchmark.mrange(0, 0)))
self.assertListEqual([0], list(benchmark.mrange(0)))

def test_validate_mrange(self):
"""Test mrange validation."""
benchmark = self.get_benchmark()
self.assertTrue(benchmark.validate_mrange('2:32:2'))
self.assertTrue(benchmark.validate_mrange('4:32'))
self.assertTrue(benchmark.validate_mrange('8'))
self.assertFalse(benchmark.validate_mrange('2:32:2:4'))
self.assertFalse(benchmark.validate_mrange('2.5:32'))

def test_cublaslt_gemm_command_generation(self):
"""Test cublaslt-gemm benchmark command generation."""
(benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
benchmark = benchmark_cls(
self.benchmark_name,
parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64',
)
self.assertTrue(benchmark._preprocess())
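# 4 batch sizes (2,4,8,16) x (2*2*3 + 2) shape combinations x 3 input types = 168 commands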
self.assertEqual(4 * (2 * 2 * 3 + 2) * 3, len(benchmark._commands))

def cmd(t, b, m, n, k):
return f'{benchmark._CublasLtBenchmark__bin_path} -m {m} -n {n} -k {k} -b {b} -w 20 -i 50 -t {t}'

for _t in ['fp16', 'fp32', 'fp64']:
for _b in [2, 4, 8, 16]:
for _m in [2, 4]:
for _n in [4, 8]:
for _k in [8, 16, 32]:
self.assertIn(cmd(_t, _b, _m, _n, _k), benchmark._commands)
for _m in [32, 128]:
self.assertIn(cmd(_t, _b, _m, 128, 128), benchmark._commands)

def test_cublaslt_gemm_result_parsing(self):
"""Test cublaslt-gemm benchmark result parsing."""
(benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
benchmark = benchmark_cls(self.benchmark_name, parameters='')
benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_type='fp8e4m3', log_raw_data=False)
benchmark = self.get_benchmark()
self.assertTrue(benchmark._preprocess())
benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_types=['fp8e4m3'], log_raw_data=False)
benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)

# Positive case - valid raw output
@@ -44,7 +94,7 @@ def test_cublaslt_gemm_result_parsing(self):

self.assertEqual(3, len(benchmark.result))
for shape in benchmark._args.shapes:
self.assertEqual(2.222, benchmark.result[f'fp8e4m3_{shape.replace(",", "_")}_flops'][0])
self.assertEqual(2.222, benchmark.result[f'fp8e4m3_0_{shape.replace(",", "_")}_flops'][0])

# Negative case - invalid raw output
self.assertFalse(benchmark._process_raw_result(1, 'cuBLAS API failed'))