Add microbenchmark for composable kernel gemm

yukirora · yukirora · commit 0e86db8467c6 · 2024-04-29T05:54:01.000Z
Add microbenchmark for composable kernel gemm, automatically generated
by GPT-4 based on existing code.

---------

Co-authored-by: GPT-4
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -28,6 +28,7 @@
 from superbench.benchmarks.micro_benchmarks.ib_validation_performance import IBBenchmark
 from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
 from superbench.benchmarks.micro_benchmarks.ort_inference_performance import ORTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.rocm_composable_kernel_performance import RocmComposableKernelBenchmark
 from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
 from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
@@ -64,6 +65,7 @@
     'MicroBenchmark',
     'MicroBenchmarkWithInvoke',
     'ORTInferenceBenchmark',
+    'RocmComposableKernelBenchmark',
     'RocmGemmFlopsBenchmark',
     'RocmMemBwBenchmark',
     'ShardingMatmul',
diff --git a/superbench/benchmarks/micro_benchmarks/rocm_composable_kernel_performance.py b/superbench/benchmarks/micro_benchmarks/rocm_composable_kernel_performance.py
@@ -0,0 +1,211 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the ROCm composable kernel GEMM benchmark."""
+
+import os
+import re
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks import BlasLtBaseBenchmark
+
+
+class RocmComposableKernelBenchmark(BlasLtBaseBenchmark):
+    """The composable kernel GEMM benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'ckProfiler'
+        self._in_types = ['fp32', 'fp16', 'bf16', 'fp8', 'int8']
+        self._in_type_map = {
+            'fp16': '1',
+            'fp32': '0',
+            'bf16': '2',
+            'fp8': '4',
+            'int8': '3',
+        }
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--in_types',
+            type=str,
+            nargs='+',
+            default=['fp16'],
+            required=False,
+            help='List of input data types, support {}.'.format(' '.join(self._in_types)),
+        )
+        self._parser.add_argument(
+            '--initialization',
+            type=str,
+            default='int',
+            choices=['float', 'int'],
+            required=False,
+            help='Initialize matrix data.',
+        )
+        self._parser.add_argument(
+            '--matrixA_layout',
+            type=str,
+            default='row',
+            choices=['row', 'col'],
+            required=False,
+            help='Matrix A Layout. RowMajor or ColMajor.',
+        )
+        self._parser.add_argument(
+            '--matrixB_layout',
+            type=str,
+            default='row',
+            choices=['row', 'col'],
+            required=False,
+            help='Matrix B Layout. RowMajor or ColMajor.',
+        )
+        self._parser.add_argument(
+            '--check_data',
+            action='store_true',
+            required=False,
+            help='Whether check data correctness.',
+        )
+        self._parser.add_argument(
+            '--splitk',
+            type=int,
+            default=None,
+            required=False,
+            nargs='+',
+            help='Split K dimension.',
+        )
+        self._parser.add_argument(
+            '--streamk',
+            type=int,
+            default=None,
+            required=False,
+            nargs='+',
+            help='Stream K blocks.',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
+
+        self._commands = []
+        self._precision_in_commands = []
+        matrix_layout = '0'
+        if self._args.matrixA_layout == 'row' and self._args.matrixB_layout == 'row':
+            matrix_layout = '0'
+        elif self._args.matrixA_layout == 'row' and self._args.matrixB_layout == 'col':
+            matrix_layout = '1'
+        elif self._args.matrixA_layout == 'col' and self._args.matrixB_layout == 'row':
+            matrix_layout = '2'
+        elif self._args.matrixA_layout == 'col' and self._args.matrixB_layout == 'col':
+            matrix_layout = '3'
+        if self._args.check_data:
+            self._args.check_data = '1'
+        else:
+            self._args.check_data = '0'
+        init = 1 if self._args.initialization == 'int' else 2
+        for (_m, _n, _k, _b, _in_type) in self._shapes_to_run:
+            params = f'{self._in_type_map[_in_type]}' + \
+                f' {matrix_layout} {self._args.check_data} {init} 0 1' + \
+                f' {_m} {_n} {_k} -1 -1 -1'
+            command = f'{self.__bin_path} gemm {params} {self._args.num_warmup} {self._args.num_steps}'
+            self._commands.append(command)
+            logger.info(command)
+            if self._args.splitk:
+                if not isinstance(self._args.splitk, list):
+                    self._args.splitk = [self._args.splitk]
+                for splitk in self._args.splitk:
+                    command = f'{self.__bin_path} gemm_splitk {params} {splitk}' + \
+                        f' {self._args.num_warmup} {self._args.num_steps}'
+                    self._commands.append(command)
+                logger.info(command)
+            if self._args.streamk:
+                if not isinstance(self._args.streamk, list):
+                    self._args.streamk = [self._args.streamk]
+                for streamk in self._args.streamk:
+                    command = f'{self.__bin_path} gemm_streamk {params} {streamk}' + \
+                        f' {self._args.num_warmup} {self._args.num_steps}'
+                    self._commands.append(command)
+                    logger.info(command)
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+
+        try:
+            lines = raw_output.splitlines()
+            index = None
+
+            # Find the line containing 'hipblaslt-Gflops'
+            for i, line in enumerate(lines):
+                if 'Best Perf' in line:
+                    index = i
+                    break
+
+            if index is not None:
+                # Search the text for each pattern
+                datatype_match = re.search(r"datatype = (\w+)", line)
+                m_match = re.search(r"M = (\d+)", line)
+                n_match = re.search(r"N = (\d+)", line)
+                k_match = re.search(r"K = (\d+)", line)
+                flops_match = re.search(r"(\d+\.?\d*) TFlops", line)
+
+                # Extract the matched groups
+                datatype = datatype_match.group(1) if datatype_match else None
+                m = int(m_match.group(1)) if m_match else None
+                n = int(n_match.group(1)) if n_match else None
+                k = int(k_match.group(1)) if k_match else None
+                flops = float(flops_match.group(1)) if flops_match else None
+
+                metric = f'{datatype}_{m}_{n}_{k}_flops'
+                self._result.add_result(metric, flops)
+            else:
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+                logger.error(
+                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                        self._curr_run_index, self._name, raw_output
+                    )
+                )
+                return False
+
+        except BaseException as e:
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
+                    self._curr_run_index, self._name, raw_output, str(e)
+                )
+            )
+            return False
+        finally:
+            if cmd_idx == len(self._commands) - 1:
+                for metric in self.results:
+                    self.results[metric] = [max(self.results[metric])]
+        return True
+
+
+BenchmarkRegistry.register_benchmark('composable-kernel-gemm', RocmComposableKernelBenchmark, platform=Platform.ROCM)
diff --git a/tests/benchmarks/micro_benchmarks/test_rocm_composable_kernel.py b/tests/benchmarks/micro_benchmarks/test_rocm_composable_kernel.py
@@ -0,0 +1,96 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for ROCm composable kernel benchmark."""
+
+import unittest
+from types import SimpleNamespace
+
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+from superbench.benchmarks.result import BenchmarkResult
+
+
+class composable_kernelBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
+    """Class for composable kernel benchmark test cases."""
+    @classmethod
+    def setUpClass(cls):
+        """Set the benchmark name and call the mock env/file helpers (for 'bin/ckProfiler') once per class."""
+        super().setUpClass()
+        cls.benchmark_name = 'composable-kernel-gemm'
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/ckProfiler'])
+
+    def get_benchmark(self):
+        """Get Benchmark."""
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.ROCM)
+        return benchmark_cls(self.benchmark_name, parameters='')
+
+    def test_composable_kernel_gemm_cls(self):
+        """Test composable-kernel-gemm benchmark class."""
+        for platform in Platform:
+            (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, platform)
+            if platform is Platform.ROCM:
+                self.assertIsNotNone(benchmark_cls)
+            else:
+                self.assertIsNone(benchmark_cls)
+
+    def test_composable_kernel_gemm_command_generation(self):
+        """Test composable-kernel-gemm benchmark command generation."""
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.ROCM)
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=' --shapes 2,4,8 --in_types fp16 fp32',
+        )
+
+        self.assertTrue(benchmark._preprocess())
+        self.assertEqual(len(benchmark._args.in_types), len(benchmark._commands))
+
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=' --shapes 2,4,8 --in_types fp16 fp32 --splitk 2 4 --streamk -1',
+        )
+
+        self.assertTrue(benchmark._preprocess())
+        self.assertEqual(4 * len(benchmark._args.in_types), len(benchmark._commands))
+        for _t in ['fp16', 'fp32']:
+            params = f'{benchmark._in_type_map[_t]} 0 0 1 0 1 2 4 8 -1 -1 -1'
+            command = f'{benchmark._RocmComposableKernelBenchmark__bin_path} gemm {params} {benchmark._args.num_warmup} {benchmark._args.num_steps}'
+            assert (command in benchmark._commands)
+
+            for splitk in [2, 4]:
+                command = f'{benchmark._RocmComposableKernelBenchmark__bin_path} gemm_splitk {params} {splitk} {benchmark._args.num_warmup} {benchmark._args.num_steps}'
+                assert (command in benchmark._commands)
+
+            command = f'{benchmark._RocmComposableKernelBenchmark__bin_path} gemm_streamk {params} -1 {benchmark._args.num_warmup} {benchmark._args.num_steps}'
+            assert (command in benchmark._commands)
+
+    def test_composable_kernel_gemm_result_parsing(self):
+        """Test that the 'Best Perf' line of the sample output is parsed into the expected flops metric."""
+        benchmark = self.get_benchmark()
+        self.assertTrue(benchmark._preprocess())
+        benchmark._args = SimpleNamespace(shapes=['8192,8192,8192'], in_types=['fp16'], log_raw_data=False)
+        benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
+        # Sample output: per-kernel 'Perf:' lines followed by the 'Best Perf' summary line.
+        example_raw_output = """
Perf:    17.0853 ms, 64.3544 TFlops, 23.5673 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B256_Vec8x1x4_512x16x4x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    51.8717 ms, 21.1967 TFlops, 7.76248 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B64_Vec8x1x4_16x16x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    51.2179 ms, 21.4673 TFlops, 7.86157 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B64_Vec8x1x4_16x16x16x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    24.4389 ms, 44.9902 TFlops, 16.4759 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B128_Vec8x1x4_16x32x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    12.0388 ms, 91.331 TFlops, 33.4464 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B128_Vec8x2x4_16x64x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    12.8774 ms, 85.3828 TFlops, 31.2681 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B128_Vec8x4x4_16x128x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    14.7506 ms, 74.54 TFlops, 27.2974 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B128_Vec8x8x4_16x256x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    12.0325 ms, 91.3782 TFlops, 33.4637 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B256_Vec8x4x4_16x256x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:     26.055 ms, 42.1996 TFlops, 15.4539 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B128_Vec8x1x4_32x16x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    13.9292 ms, 78.9358 TFlops, 28.9072 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B128_Vec8x1x4_64x16x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:     8.0511 ms, 136.567 TFlops, 50.0122 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B128_Vec8x1x4_128x16x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    18.9246 ms, 58.0995 TFlops, 21.2767 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B128_Vec8x1x4_256x16x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Perf:    15.0647 ms, 72.986 TFlops, 26.7283 GB/s, GemmXdlSplitKCShuffle_MNKPadding_RRR_B256_Vec8x1x4_256x16x8x8 LoopScheduler: Default, PipelineVersion: v2, KBatch 2
Best Perf for datatype = f16 ALayout =  RowMajor BLayout =  RowMajor M = 8192 N = 8192 K = 8192 StrideA = 8192 StrideB = 8192 StrideC = 8192 KBatch = 2 : 2.17246 ms, 506.113 TFlops, 185.344 GB/s, GemmXdlSplitKCShuffle_Default_RRR_B256_Vec8x2x8_256x128x4x8 LoopScheduler: Default, PipelineVersion: v1
+"""
+        # Positive case - valid raw output; presumably 2 entries = return code + the flops metric (confirm).
+        self.assertTrue(benchmark._process_raw_result(0, example_raw_output))
+        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
+
+        self.assertEqual(2, len(benchmark.result))
+        self.assertEqual(506.113, benchmark.result['f16_8192_8192_8192_flops'][0])
diff --git a/third_party/Makefile b/third_party/Makefile
@@ -12,15 +12,16 @@ ROCM_PATH ?= /opt/rocm
 CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
 ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
 HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
+COMPOSABLEKERNEL_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
 ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
 
-.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
+.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm rocm_composable_kernel
 
 # Build all targets.
 all: cuda rocm
 cuda_with_msccl: cuda cuda_msccl
 cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
-rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
+rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_composable_kernel
 cpu: common cpu_perftest
 common: cpu_hpl cpu_stream fio
 directx_amd: directx_amf_encoding_latency
@@ -120,6 +121,17 @@ rocm_hipblaslt: sb_micro_path
 		cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/;  \
 	fi
 
+# Build composable_kernel.
+# Composable Kernel is released with ROCm, like rocm-6.0 and so on; COMPOSABLEKERNEL_BRANCH selects the tag to clone.
+rocm_composable_kernel: sb_micro_path
+	@if [ ! -e $(SB_MICRO_PATH)/bin/ckProfiler ] && [ -z "`which ckProfiler`" ]; then \
+		if [ -d composable_kernel ]; then rm -rf composable_kernel; fi; \
+		git clone -b ${COMPOSABLEKERNEL_BRANCH} https://github.com/ROCm/composable_kernel && \
+		cd composable_kernel && mkdir build && cd build && \
+		cmake -D CMAKE_PREFIX_PATH=$(ROCM_PATH) -D CMAKE_CXX_COMPILER=$(ROCM_PATH)/bin/hipcc -D CMAKE_BUILD_TYPE=Release -D DTYPES="fp64;fp32;fp16;fp8;bf16;int8" .. && \
+		make -j ckProfiler install; \
+	fi
+
 # Build hipBusBandwidth.
 # HIP is released with rocm, like rocm-4.2.0 and so on.
 # The version we use is the released tag which is consistent with the rocm version in the environment or docker.