diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index b44e0de70..0b294f136 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -66,9 +66,9 @@ Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/
 
 #### Metrics
 
-| Name                                           | Unit           | Description                     |
-|------------------------------------------------|----------------|---------------------------------|
-| cublaslt-gemm/${dtype}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |
+| Name                                                     | Unit           | Description                     |
+|----------------------------------------------------------|----------------|---------------------------------|
+| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |
 
 ### `cublas-function`
 
@@ -195,13 +195,13 @@ performed by [University of Virginia STREAM benchmark](https://www.cs.virginia.e
 
 #### Metrics
 
-| Name                                                      | Unit             | Description                                                          |
-|----------------------------------------------------------|------------------|---------------------------------------------------------------------|
-| cpu-stream/threads                                        |                  | Number of threads used for the test. Determined by core count.       |
-| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput  | bandwidth (MB/s) | Memory throughput of designated kerel operation.                     |
-| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg    | time (s)         | Average elapsed times over all iterations.                            |
-| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min    | time (s)         | Minimum elapsed times over all iterations.                            |
-| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max    | time (s)         | Maximum elapsed times over all iterations.                            |
+| Name                                                      | Unit             | Description                                                     |
+|----------------------------------------------------------|------------------|----------------------------------------------------------------|
+| cpu-stream/threads                                        |                  | Number of threads used for the test. Determined by core count. |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput  | bandwidth (MB/s) | Memory throughput of designated kernel operation.              |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg    | time (s)         | Average elapsed times over all iterations.                      |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min    | time (s)         | Minimum elapsed times over all iterations.                      |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max    | time (s)         | Maximum elapsed times over all iterations.                      |
 
 ## Communication Benchmarks
 
diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_function.py b/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
index 2ce93abe8..59733ea10 100644
--- a/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
@@ -4,6 +4,7 @@
 """Module of the cuBLASLt GEMM benchmark."""
 
 import os
+import itertools
 
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
@@ -24,6 +25,37 @@ def __init__(self, name, parameters=''):
         self._bin_name = 'cublaslt_gemm'
         self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2']
 
+    def mrange(self, start, stop=-1, multiplication_factor=2):
+        """Range constructor with multiplication factor.
+
+        Args:
+            start (int): Start number.
+            stop (int, optional): Stop number. Defaults to -1.
+            multiplication_factor (int, optional): Multiplication factor. Defaults to 2.
+
+        Yields:
+            int: number in the range.
+ """ + while True: + yield start + start *= multiplication_factor + if start > stop or start == 0 or multiplication_factor < 2: + break + + def validate_mrange(self, string): + """Validate mrange string in format start[[:stop]:multiplication_factor]. + + Args: + string (str): mrange string. + + Returns: + bool: whether the mrange is expected. + """ + nums = string.split(':') + if len(nums) > 3: + return False + return bool(all(x.isdigit() for x in nums)) + def add_parser_arguments(self): """Add the specified arguments.""" super().add_parser_arguments() @@ -33,14 +65,17 @@ def add_parser_arguments(self): type=str, nargs='+', default=[f'{x},{x},{x}' for x in [2048, 4096, 8192]], - help='Shapes in m,n,k format.', + help='Shapes in m,n,k format. Support format start:stop:multiplication_factor, e.g., 16:128:2.', ) self._parser.add_argument( '--batch', - type=int, - default=0, + type=str, + default='0', required=False, - help='Batch size for strided batch GEMM, set 0 to disable.', + help=( + 'Batch size for strided batch GEMM, set 0 to disable.' + ' Support format start:stop:multiplication_factor, e.g., 16:128:2.' + ), ) self._parser.add_argument( '--num_warmup', @@ -57,11 +92,12 @@ def add_parser_arguments(self): help='Number of steps to measure.', ) self._parser.add_argument( - '--in_type', + '--in_types', type=str, - default='fp8e4m3', + nargs='+', + default=['fp8e4m3'], required=False, - help='Input data type, supports {}.'.format(' '.join(self._in_types)), + help='List of input data types, support {}.'.format(' '.join(self._in_types)), ) def _preprocess(self): @@ -75,20 +111,28 @@ def _preprocess(self): self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name) - if self._args.in_type not in self._in_types: - logger.error(f'Invalid input type {self._args.in_type}.') + if not self.validate_mrange(self._args.batch): + logger.error(f'Invalid batch size {self._args.batch}.') return False self._commands = [] - for shape in self._args.shapes: - shape_list = shape.replace(',', ' ').split() - if len(shape_list) != 3 or not all(x.isdigit() for x in shape_list): - logger.error(f'Invalid shape {shape}.') + for _in_type in self._args.in_types: + if _in_type not in self._in_types: + logger.error(f'Invalid input type {_in_type}.') return False - self._commands.append( - f'{self.__bin_path} -m {shape_list[0]} -n {shape_list[1]} -k {shape_list[2]} ' - f'-b {self._args.batch} -w {self._args.num_warmup} -i {self._args.num_steps} -t {self._args.in_type}' - ) + for _b in self.mrange(*map(int, self._args.batch.split(':'))): + for shape in self._args.shapes: + shape_list = shape.replace(',', ' ').split() + if len(shape_list) != 3 or not all(self.validate_mrange(x) for x in shape_list): + logger.error(f'Invalid shape {shape}.') + return False + for _m, _n, _k in itertools.product( + *map(lambda shape: self.mrange(*map(int, shape.split(':'))), shape_list) + ): + self._commands.append( + f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -b {_b} ' + f'-w {self._args.num_warmup} -i {self._args.num_steps} -t {_in_type}' + ) return True @@ -110,7 +154,9 @@ def _process_raw_result(self, cmd_idx, raw_output): fields = raw_output.strip().split() if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]): raise ValueError('Invalid result.') - self._result.add_result(f'{self._args.in_type}_{"_".join(fields[:3])}_flops', float(fields[-1])) + self._result.add_result( + f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops', float(fields[-1]) + ) except BaseException as e: 
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
             logger.error(
diff --git a/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py b/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
index b4bd89f52..b504062a2 100644
--- a/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
+++ b/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
@@ -4,7 +4,7 @@
 """Tests for cublaslt-gemm benchmark."""
 
 import unittest
-from types import SimpleNamespace
+from types import GeneratorType, SimpleNamespace
 
 from tests.helper.testcase import BenchmarkTestCase
 from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
@@ -19,7 +19,12 @@ def setUpClass(cls):
         super().setUpClass()
         cls.benchmark_name = 'cublaslt-gemm'
         cls.createMockEnvs(cls)
-        cls.createMockFiles(cls, ['bin/cublaslt_fp8_gemm'])
+        cls.createMockFiles(cls, ['bin/cublaslt_gemm'])
+
+    def get_benchmark(self):
+        """Get Benchmark."""
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
+        return benchmark_cls(self.benchmark_name, parameters='')
 
     def test_cublaslt_gemm_cls(self):
         """Test cublaslt-gemm benchmark class."""
@@ -30,11 +35,56 @@
         else:
             self.assertIsNone(benchmark_cls)
 
+    def test_mrange(self):
+        """Test mrange generation."""
+        benchmark = self.get_benchmark()
+        self.assertIsInstance(benchmark.mrange(1), GeneratorType)
+        self.assertListEqual([4, 8, 16, 32], list(benchmark.mrange(4, 32, 2)))
+        self.assertListEqual([2, 4, 8, 16], list(benchmark.mrange(2, 31, 2)))
+        self.assertListEqual([2, 4, 8], list(benchmark.mrange(2, 8)))
+        self.assertListEqual([2], list(benchmark.mrange(2, 0, 2)))
+        self.assertListEqual([2], list(benchmark.mrange(2)))
+        self.assertListEqual([2], list(benchmark.mrange(2, 4, 1)))
+        self.assertListEqual([2], list(benchmark.mrange(2, 4, 0)))
+        self.assertListEqual([0], list(benchmark.mrange(0, 0)))
+        self.assertListEqual([0], list(benchmark.mrange(0)))
+
+    def test_validate_mrange(self):
+        """Test mrange validation."""
+        benchmark = self.get_benchmark()
+        self.assertTrue(benchmark.validate_mrange('2:32:2'))
+        self.assertTrue(benchmark.validate_mrange('4:32'))
+        self.assertTrue(benchmark.validate_mrange('8'))
+        self.assertFalse(benchmark.validate_mrange('2:32:2:4'))
+        self.assertFalse(benchmark.validate_mrange('2.5:32'))
+
+    def test_cublaslt_gemm_command_generation(self):
+        """Test cublaslt-gemm benchmark command generation."""
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64',
+        )
+        self.assertTrue(benchmark._preprocess())
+        self.assertEqual(4 * (2 * 2 * 3 + 2) * 3, len(benchmark._commands))
+
+        def cmd(t, b, m, n, k):
+            return f'{benchmark._CublasLtBenchmark__bin_path} -m {m} -n {n} -k {k} -b {b} -w 20 -i 50 -t {t}'
+
+        for _t in ['fp16', 'fp32', 'fp64']:
+            for _b in [2, 4, 8, 16]:
+                for _m in [2, 4]:
+                    for _n in [4, 8]:
+                        for _k in [8, 16, 32]:
+                            self.assertIn(cmd(_t, _b, _m, _n, _k), benchmark._commands)
+                for _m in [32, 128]:
+                    self.assertIn(cmd(_t, _b, _m, 128, 128), benchmark._commands)
+
     def test_cublaslt_gemm_result_parsing(self):
         """Test cublaslt-gemm benchmark result parsing."""
-        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
-        benchmark = benchmark_cls(self.benchmark_name, parameters='')
-        benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_type='fp8e4m3', log_raw_data=False)
+        benchmark = self.get_benchmark()
+        self.assertTrue(benchmark._preprocess())
+        benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_types=['fp8e4m3'], log_raw_data=False)
         benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
 
         # Positive case - valid raw output
@@ -44,7 +94,7 @@
 
         self.assertEqual(3, len(benchmark.result))
         for shape in benchmark._args.shapes:
-            self.assertEqual(2.222, benchmark.result[f'fp8e4m3_{shape.replace(",", "_")}_flops'][0])
+            self.assertEqual(2.222, benchmark.result[f'fp8e4m3_0_{shape.replace(",", "_")}_flops'][0])
 
         # Negative case - invalid raw output
         self.assertFalse(benchmark._process_raw_result(1, 'cuBLAS API failed'))
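For reference, a minimal sketch (not part of the patch) of how the new range syntax expands: it mirrors the `mrange` generator added in `cublaslt_function.py` above, and the example `--shapes`/`--batch` values plus the `-w 20 -i 50` defaults are assumptions taken from the new test rather than required settings.

```python
# Illustrative sketch only: shows how range-style --shapes/--batch values
# expand into individual cublaslt_gemm commands.
import itertools


def mrange(start, stop=-1, multiplication_factor=2):
    """Yield start, start * factor, ... while the value stays within stop."""
    while True:
        yield start
        start *= multiplication_factor
        if start > stop or start == 0 or multiplication_factor < 2:
            break


# Assumed example: --shapes 16:64,4096,4096 --batch 1:4 --in_types fp16
shapes, batch, in_type = '16:64,4096,4096', '1:4', 'fp16'
for b in mrange(*map(int, batch.split(':'))):
    dims = (mrange(*map(int, d.split(':'))) for d in shapes.split(','))
    for m, n, k in itertools.product(*dims):
        # -w/-i mirror the num_warmup/num_steps defaults (20/50) seen in the test.
        print(f'cublaslt_gemm -m {m} -n {n} -k {k} -b {b} -w 20 -i 50 -t {in_type}')
```

Running the sketch prints one command per (batch, m) combination, e.g. nine commands for the values above, which is the same expansion `_preprocess` performs when building `self._commands`.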