Benchmark - Support batch/shape range in cublaslt gemm (#494)
Support batch and shape ranges with multiplication factors in the cublaslt
gemm benchmark.
abuccts committed Mar 22, 2023
1 parent 655bd0a commit dbeba80
Showing 3 changed files with 130 additions and 34 deletions.
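The new range syntax can be exercised directly through the benchmark parameters. Below is a minimal sketch, mirroring the unit test added in this commit and assuming a CUDA environment with the SuperBench binaries built; the parameter values are illustrative:

```python
# Sketch: expand ranged --batch/--shapes values into concrete GEMM commands.
from superbench.benchmarks import BenchmarkRegistry, Platform

(benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark('cublaslt-gemm', Platform.CUDA)
benchmark = benchmark_cls(
    'cublaslt-gemm',
    parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 --in_types fp16 fp32',
)
if benchmark._preprocess():
    # One command per (in_type, batch, m, n, k) combination.
    for command in benchmark._commands:
        print(command)
```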
20 changes: 10 additions & 10 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -66,9 +66,9 @@ Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/

#### Metrics

| Name | Unit | Description |
|------------------------------------------------|----------------|---------------------------------|
| cublaslt-gemm/${dtype}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |
| Name | Unit | Description |
|----------------------------------------------------------|----------------|---------------------------------|
| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |

### `cublas-function`

@@ -195,13 +195,13 @@ performed by [University of Virginia STREAM benchmark](https://www.cs.virginia.e

#### Metrics

| Name                                                      | Unit             | Description                                                          |
|-----------------------------------------------------------|------------------|-----------------------------------------------------------------------|
| cpu-stream/threads                                        |                  | Number of threads used for the test. Determined by core count.       |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput  | bandwidth (MB/s) | Memory throughput of the designated kernel operation.                |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg    | time (s)         | Average elapsed time over all iterations.                             |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min    | time (s)         | Minimum elapsed time over all iterations.                             |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max    | time (s)         | Maximum elapsed time over all iterations.                             |
| Name                                                      | Unit             | Description                                                     |
|-----------------------------------------------------------|------------------|------------------------------------------------------------------|
| cpu-stream/threads                                        |                  | Number of threads used for the test. Determined by core count.  |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput  | bandwidth (MB/s) | Memory throughput of the designated kernel operation.           |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg    | time (s)         | Average elapsed time over all iterations.                        |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min    | time (s)         | Minimum elapsed time over all iterations.                        |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max    | time (s)         | Maximum elapsed time over all iterations.                        |

## Communication Benchmarks

82 changes: 64 additions & 18 deletions superbench/benchmarks/micro_benchmarks/cublaslt_function.py
@@ -4,6 +4,7 @@
"""Module of the cuBLASLt GEMM benchmark."""

import os
import itertools

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
@@ -24,6 +25,37 @@ def __init__(self, name, parameters=''):
        self._bin_name = 'cublaslt_gemm'
        self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2']

    def mrange(self, start, stop=-1, multiplication_factor=2):
        """Range constructor with multiplication factor.

        Args:
            start (int): Start number.
            stop (int, optional): Stop number. Defaults to -1.
            multiplication_factor (int, optional): Multiplication factor. Defaults to 2.

        Yields:
            int: Next number in the range.
        """
        while True:
            yield start
            start *= multiplication_factor
            if start > stop or start == 0 or multiplication_factor < 2:
                break

    def validate_mrange(self, string):
        """Validate mrange string in format start[[:stop]:multiplication_factor].

        Args:
            string (str): mrange string.

        Returns:
            bool: Whether the mrange string is valid.
        """
        nums = string.split(':')
        if len(nums) > 3:
            return False
        return all(x.isdigit() for x in nums)

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()
@@ -33,14 +65,17 @@ def add_parser_arguments(self):
            type=str,
            nargs='+',
            default=[f'{x},{x},{x}' for x in [2048, 4096, 8192]],
            help='Shapes in m,n,k format.',
            help='Shapes in m,n,k format. Supports format start:stop:multiplication_factor, e.g., 16:128:2.',
        )
        self._parser.add_argument(
            '--batch',
            type=int,
            default=0,
            type=str,
            default='0',
            required=False,
            help='Batch size for strided batch GEMM, set 0 to disable.',
            help=(
                'Batch size for strided batch GEMM, set 0 to disable.'
                ' Supports format start:stop:multiplication_factor, e.g., 16:128:2.'
            ),
        )
        self._parser.add_argument(
            '--num_warmup',
@@ -57,11 +92,12 @@ def add_parser_arguments(self):
            help='Number of steps to measure.',
        )
        self._parser.add_argument(
            '--in_type',
            '--in_types',
            type=str,
            default='fp8e4m3',
            nargs='+',
            default=['fp8e4m3'],
            required=False,
            help='Input data type, supports {}.'.format(' '.join(self._in_types)),
            help='List of input data types, supports {}.'.format(' '.join(self._in_types)),
        )

    def _preprocess(self):
@@ -75,20 +111,28 @@ def _preprocess(self):

        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

        if self._args.in_type not in self._in_types:
            logger.error(f'Invalid input type {self._args.in_type}.')
        if not self.validate_mrange(self._args.batch):
            logger.error(f'Invalid batch size {self._args.batch}.')
            return False

        self._commands = []
        for shape in self._args.shapes:
            shape_list = shape.replace(',', ' ').split()
            if len(shape_list) != 3 or not all(x.isdigit() for x in shape_list):
                logger.error(f'Invalid shape {shape}.')
        for _in_type in self._args.in_types:
            if _in_type not in self._in_types:
                logger.error(f'Invalid input type {_in_type}.')
                return False
            self._commands.append(
                f'{self.__bin_path} -m {shape_list[0]} -n {shape_list[1]} -k {shape_list[2]} '
                f'-b {self._args.batch} -w {self._args.num_warmup} -i {self._args.num_steps} -t {self._args.in_type}'
            )
            for _b in self.mrange(*map(int, self._args.batch.split(':'))):
                for shape in self._args.shapes:
                    shape_list = shape.replace(',', ' ').split()
                    if len(shape_list) != 3 or not all(self.validate_mrange(x) for x in shape_list):
                        logger.error(f'Invalid shape {shape}.')
                        return False
                    for _m, _n, _k in itertools.product(
                        *map(lambda shape: self.mrange(*map(int, shape.split(':'))), shape_list)
                    ):
                        self._commands.append(
                            f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -b {_b} '
                            f'-w {self._args.num_warmup} -i {self._args.num_steps} -t {_in_type}'
                        )

        return True

@@ -110,7 +154,9 @@ def _process_raw_result(self, cmd_idx, raw_output):
            fields = raw_output.strip().split()
            if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]):
                raise ValueError('Invalid result.')
            self._result.add_result(f'{self._args.in_type}_{"_".join(fields[:3])}_flops', float(fields[-1]))
            self._result.add_result(
                f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops', float(fields[-1])
            )
        except BaseException as e:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
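To make the expansion logic above concrete, here is a self-contained sketch with `mrange` copied out of the class; the batch and shape values are illustrative, and the printed expansions match the expectations in the unit tests below:

```python
# Standalone sketch of the range expansion performed in _preprocess.
import itertools


def mrange(start, stop=-1, multiplication_factor=2):
    """Copy of CublasLtBenchmark.mrange without the class wrapper."""
    while True:
        yield start
        start *= multiplication_factor
        if start > stop or start == 0 or multiplication_factor < 2:
            break


print(list(mrange(4, 32, 2)))    # [4, 8, 16, 32]
print(list(mrange(2, 31, 2)))    # [2, 4, 8, 16]
print(list(mrange(8)))           # [8] -- stop defaults to -1

# Nested expansion as in _preprocess: every (batch, m, n, k) combination.
batch, shape = '2:16:2', '2:4,4:8,8:32'
for _b in mrange(*map(int, batch.split(':'))):
    for _m, _n, _k in itertools.product(*(mrange(*map(int, d.split(':'))) for d in shape.split(','))):
        print(_b, _m, _n, _k)
```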
62 changes: 56 additions & 6 deletions tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
@@ -4,7 +4,7 @@
"""Tests for cublaslt-gemm benchmark."""

import unittest
from types import SimpleNamespace
from types import GeneratorType, SimpleNamespace

from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
@@ -19,7 +19,12 @@ def setUpClass(cls):
        super().setUpClass()
        cls.benchmark_name = 'cublaslt-gemm'
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/cublaslt_fp8_gemm'])
        cls.createMockFiles(cls, ['bin/cublaslt_gemm'])

    def get_benchmark(self):
        """Get benchmark instance."""
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        return benchmark_cls(self.benchmark_name, parameters='')

    def test_cublaslt_gemm_cls(self):
        """Test cublaslt-gemm benchmark class."""
@@ -30,11 +35,56 @@ def test_cublaslt_gemm_cls(self):
        else:
            self.assertIsNone(benchmark_cls)

    def test_mrange(self):
        """Test mrange generation."""
        benchmark = self.get_benchmark()
        self.assertIsInstance(benchmark.mrange(1), GeneratorType)
        self.assertListEqual([4, 8, 16, 32], list(benchmark.mrange(4, 32, 2)))
        self.assertListEqual([2, 4, 8, 16], list(benchmark.mrange(2, 31, 2)))
        self.assertListEqual([2, 4, 8], list(benchmark.mrange(2, 8)))
        self.assertListEqual([2], list(benchmark.mrange(2, 0, 2)))
        self.assertListEqual([2], list(benchmark.mrange(2)))
        self.assertListEqual([2], list(benchmark.mrange(2, 4, 1)))
        self.assertListEqual([2], list(benchmark.mrange(2, 4, 0)))
        self.assertListEqual([0], list(benchmark.mrange(0, 0)))
        self.assertListEqual([0], list(benchmark.mrange(0)))

    def test_validate_mrange(self):
        """Test mrange validation."""
        benchmark = self.get_benchmark()
        self.assertTrue(benchmark.validate_mrange('2:32:2'))
        self.assertTrue(benchmark.validate_mrange('4:32'))
        self.assertTrue(benchmark.validate_mrange('8'))
        self.assertFalse(benchmark.validate_mrange('2:32:2:4'))
        self.assertFalse(benchmark.validate_mrange('2.5:32'))

    def test_cublaslt_gemm_command_generation(self):
        """Test cublaslt-gemm benchmark command generation."""
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64',
        )
        self.assertTrue(benchmark._preprocess())
        self.assertEqual(4 * (2 * 2 * 3 + 2) * 3, len(benchmark._commands))

        def cmd(t, b, m, n, k):
            return f'{benchmark._CublasLtBenchmark__bin_path} -m {m} -n {n} -k {k} -b {b} -w 20 -i 50 -t {t}'

        for _t in ['fp16', 'fp32', 'fp64']:
            for _b in [2, 4, 8, 16]:
                for _m in [2, 4]:
                    for _n in [4, 8]:
                        for _k in [8, 16, 32]:
                            self.assertIn(cmd(_t, _b, _m, _n, _k), benchmark._commands)
                for _m in [32, 128]:
                    self.assertIn(cmd(_t, _b, _m, 128, 128), benchmark._commands)

    def test_cublaslt_gemm_result_parsing(self):
        """Test cublaslt-gemm benchmark result parsing."""
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        benchmark = benchmark_cls(self.benchmark_name, parameters='')
        benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_type='fp8e4m3', log_raw_data=False)
        benchmark = self.get_benchmark()
        self.assertTrue(benchmark._preprocess())
        benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_types=['fp8e4m3'], log_raw_data=False)
        benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)

        # Positive case - valid raw output
@@ -44,7 +94,7 @@ def test_cublaslt_gemm_result_parsing(self):

        self.assertEqual(3, len(benchmark.result))
        for shape in benchmark._args.shapes:
            self.assertEqual(2.222, benchmark.result[f'fp8e4m3_{shape.replace(",", "_")}_flops'][0])
            self.assertEqual(2.222, benchmark.result[f'fp8e4m3_0_{shape.replace(",", "_")}_flops'][0])

        # Negative case - invalid raw output
        self.assertFalse(benchmark._process_raw_result(1, 'cuBLAS API failed'))
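
For reference, a sketch of how a raw output line maps to the new metric name; the six-field layout (m, n, k, batch, then two measured values ending in TFLOPS) is inferred from `_process_raw_result` and the test expectations above, and the sample values are made up:

```python
# Hypothetical raw output line and the metric name derived from it.
raw_output = '16 16 16 0 1.111 2.222'
in_type = 'fp8e4m3'    # in the real code: last token of the launched command

fields = raw_output.strip().split()
assert len(fields) == 6 and all(x.isdigit() for x in fields[:4])
metric = f'{in_type}_{fields[3]}_{"_".join(fields[:3])}_flops'
print(metric, float(fields[-1]))    # fp8e4m3_0_16_16_16_flops 2.222
```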
