
Benchmarks: Microbenchmark - Add distributed inference benchmark cpp implementation #586

Merged 48 commits into main from ziyue/add-dist-inf-cpp on Dec 10, 2023. The diff below shows changes from 27 of the 48 commits.

Commits:
835b9ef  add dist inference cpp (yzygitzh, Dec 6, 2023)
16bb728  fix cmake (yzygitzh, Dec 6, 2023)
b01043d  fix mpi cmake (yzygitzh, Dec 6, 2023)
7ef9df5  revise cmake (yzygitzh, Dec 6, 2023)
1de8864  fix cmake (yzygitzh, Dec 6, 2023)
e361aa0  fix cpp lint (yzygitzh, Dec 6, 2023)
611dccc  add python wrapper, revise config, add tests, add example, add doc (yzygitzh, Dec 6, 2023)
1f230dc  fix lint (yzygitzh, Dec 6, 2023)
d81d7af  address lint issues (yzygitzh, Dec 7, 2023)
56bc282  address lint (yzygitzh, Dec 7, 2023)
f386641  fix lint (yzygitzh, Dec 7, 2023)
ac08f84  add test data (yzygitzh, Dec 7, 2023)
49d4600  fix test (yzygitzh, Dec 7, 2023)
06a1cfc  fix lint (yzygitzh, Dec 7, 2023)
a4338f0  fix import (yzygitzh, Dec 7, 2023)
a84b135  fix test (yzygitzh, Dec 7, 2023)
e3e3b3a  fix test (yzygitzh, Dec 7, 2023)
d498a92  Merge branch 'main' into ziyue/add-dist-inf-cpp (yzygitzh, Dec 7, 2023)
bbba691  fix lint (yzygitzh, Dec 7, 2023)
92f1c6b  Merge branch 'ziyue/add-dist-inf-cpp' of https://github.com/yzygitzh/… (yzygitzh, Dec 7, 2023)
639ea27  address comments (yzygitzh, Dec 8, 2023)
254c499  recover file (yzygitzh, Dec 8, 2023)
e4643fa  remove files (yzygitzh, Dec 8, 2023)
c008324  fix lint (yzygitzh, Dec 8, 2023)
520c753  revert benchmark name (yzygitzh, Dec 8, 2023)
c913bfe  fix test (yzygitzh, Dec 8, 2023)
e393332  fix lint (yzygitzh, Dec 8, 2023)
efb2cf1  address comment (yzygitzh, Dec 8, 2023)
5b77341  fix bug (yzygitzh, Dec 8, 2023)
45f0d27  revise doc (yzygitzh, Dec 9, 2023)
9241e84  Merge branch 'main' into ziyue/add-dist-inf-cpp (yzygitzh, Dec 10, 2023)
491542b  fix cmakefile (yzygitzh, Dec 10, 2023)
b379db3  fix dockerfile (yzygitzh, Dec 10, 2023)
2c13689  adapt to rocm hipblaslt (yzygitzh, Dec 10, 2023)
dbf0c89  fix test (yzygitzh, Dec 10, 2023)
ebbfc1d  fix test 2 (yzygitzh, Dec 10, 2023)
fc33676  fix test (yzygitzh, Dec 10, 2023)
4cb0351  fix name (yzygitzh, Dec 10, 2023)
9bed36b  fix lint (yzygitzh, Dec 10, 2023)
443ccbf  fix lint (yzygitzh, Dec 10, 2023)
0f6f21a  fix test (yzygitzh, Dec 10, 2023)
a7e8fba  fix test (yzygitzh, Dec 10, 2023)
5720d9d  fix test (yzygitzh, Dec 10, 2023)
998e089  fix test (yzygitzh, Dec 10, 2023)
cb2ff64  fix test (yzygitzh, Dec 10, 2023)
db12f51  try fix coverage (yzygitzh, Dec 10, 2023)
19cc31f  cover logging in test (yzygitzh, Dec 10, 2023)
c875a81  fix test (yzygitzh, Dec 10, 2023)
204 changes: 145 additions & 59 deletions superbench/benchmarks/micro_benchmarks/dist_inference.py
@@ -12,7 +12,7 @@

from superbench.common.utils import logger
from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode, Precision
from superbench.benchmarks.micro_benchmarks import MicroBenchmark
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
from superbench.benchmarks.context import Enum
from superbench.benchmarks.reducer import ReduceType

@@ -168,7 +168,7 @@ def forward(self, x):
return activation_out


class DistInference(MicroBenchmark):
class DistInference(MicroBenchmarkWithInvoke):
"""The base class of micro-benchmarks."""
def __init__(self, name, parameters=''):
"""Constructor.
@@ -184,6 +184,9 @@ def __init__(self, name, parameters=''):
self.__device = None
self.__cuda_available = False

# For cpp impl path
self._bin_name = 'dist_inference'

def __timer(self):
"""Returns the current time which ensures all previous CUDA events have been finished.

@@ -201,6 +204,12 @@ def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()

self._parser.add_argument(
'--use_cpp_impl',
action='store_true',
required=False,
help='Whether to use cpp-based implementation.',
)
self._parser.add_argument(
'--batch_size',
type=int,
@@ -222,6 +231,20 @@ def add_parser_arguments(self):
required=False,
help='Hidden size.',
)
self._parser.add_argument(
'--alpha',
type=float,
default=1.0,
required=False,
help='Coefficient alpha in D = alpha*(A*B) + beta*(C).',
)
self._parser.add_argument(
'--beta',
type=float,
default=1.0,
required=False,
help='Coefficient beta in D = alpha*(A*B) + beta*(C).',
)
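# Note: alpha/beta follow the standard GEMM epilogue D = alpha * (A @ B) + beta * C,
# the same convention used by cuBLASLt/hipBLASLt, which the cpp binary links against.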
self._parser.add_argument(
'--num_layers',
type=int,
@@ -285,6 +308,12 @@ def add_parser_arguments(self):
required=False,
help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
self._parser.add_argument(
'--use_cuda_graph',
action='store_true',
required=False,
help='Whether to launch kernels in CUDA graph mode.',
)
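# (CUDA graph mode records the per-iteration kernel sequence once and replays it,
# which removes most kernel-launch overhead from the measured step time.)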

def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
@@ -295,32 +324,47 @@ def _preprocess(self):
if not super()._preprocess():
return False

if self._args.distributed_impl != DistributedImpl.DDP:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
logger.error(
'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
self._name, self._args.distributed_impl
if self._args.use_cpp_impl:
# Assemble commands if cpp impl path
self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

args = '-m %d -n %d -k %d' % (self._args.input_size, self._args.batch_size, self._args.hidden_size)
args += ' --alpha %g --beta %g' % (self._args.alpha, self._args.beta)
args += ' --num_layers %d --num_warmups %d --num_iters %d' % \
(self._args.num_layers, self._args.num_warmup, self._args.num_steps)
if self._args.use_cuda_graph:
args += ' --use_cuda_graph'
self._commands = ['%s %s' % (self.__bin_path, args)]
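# For example (illustrative values only), with input_size=1024, batch_size=64,
# hidden_size=1024, num_layers=50, num_warmup=20, num_steps=100, the assembled
# command is roughly:
#   <bin_dir>/dist_inference -m 1024 -n 64 -k 1024 --alpha 1 --beta 1 \
#       --num_layers 50 --num_warmups 20 --num_iters 100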
else:
# Initialize PyTorch if pytorch impl path
if self._args.distributed_impl != DistributedImpl.DDP:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
logger.error(
'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
self._name, self._args.distributed_impl
)
)
)
return False
return False

try:
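# WORLD_SIZE and LOCAL_RANK are expected to be injected by the distributed
# launcher (e.g. torchrun / torch.distributed.launch).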
torch.distributed.init_process_group(backend=self._args.distributed_backend.value)
self.__world_size = int(os.environ['WORLD_SIZE'])
self.__local_rank = int(os.environ['LOCAL_RANK'])
except BaseException as e:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
torch.distributed.destroy_process_group()
logger.error(
'Initialize distributed env failed - benchmark: {}, message: {}.'.format(self._name, str(e))
)
return False

try:
torch.distributed.init_process_group(backend=self._args.distributed_backend.value)
self.__world_size = int(os.environ['WORLD_SIZE'])
self.__local_rank = int(os.environ['LOCAL_RANK'])
except BaseException as e:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
torch.distributed.destroy_process_group()
logger.error('Initialize distributed env failed - benchmark: {}, message: {}.'.format(self._name, str(e)))
return False

if torch.cuda.is_available():
torch.cuda.set_device(self.__local_rank)
self.__device = torch.device('cuda:{}'.format(self.__local_rank))
self.__cuda_available = True
else:
self.__device = torch.device('cpu:{}'.format(self.__local_rank))
self.__cuda_available = False
if torch.cuda.is_available():
torch.cuda.set_device(self.__local_rank)
self.__device = torch.device('cuda:{}'.format(self.__local_rank))
self.__cuda_available = True
else:
self.__device = torch.device('cpu:{}'.format(self.__local_rank))
self.__cuda_available = False

return True

@@ -401,38 +445,79 @@ def _benchmark(self):
Return:
True if _benchmark succeeds.
"""
batch_size = self._args.batch_size
input_size = self._args.input_size
hidden_size = self._args.hidden_size
num_layers = self._args.num_layers
computation = self._args.computation_kernel
communication = self._args.communication_kernel
activation = self._args.activation_kernel
precision = self._args.precision
num_warmup = self._args.num_warmup
num_steps = self._args.num_steps

if self.__local_rank == 0:
logger.info(
'Distributed Inference - using {} GPUs: '
'batch_size={}, input_size={}, hidden_size={}, num_layers={}, '
'computation_kernel={}, communication_kernel={}, activation_kernel={}, precision={}, '
'num_warmup={} num_steps={}'.format(
self.__world_size, batch_size, input_size, hidden_size, num_layers, computation, communication,
activation, precision, num_warmup, num_steps
if self._args.use_cpp_impl:
# Execute commands if cpp impl path
if not super()._benchmark():
return False
return True
else:
# Execute PyTorch model if pytorch impl path
batch_size = self._args.batch_size
input_size = self._args.input_size
hidden_size = self._args.hidden_size
num_layers = self._args.num_layers
computation = self._args.computation_kernel
communication = self._args.communication_kernel
activation = self._args.activation_kernel
precision = self._args.precision
num_warmup = self._args.num_warmup
num_steps = self._args.num_steps

if self.__local_rank == 0:
logger.info(
'Distributed Inference - using {} GPUs: '
'batch_size={}, input_size={}, hidden_size={}, num_layers={}, '
'computation_kernel={}, communication_kernel={}, activation_kernel={}, precision={}, '
'num_warmup={} num_steps={}'.format(
self.__world_size, batch_size, input_size, hidden_size, num_layers, computation, communication,
activation, precision, num_warmup, num_steps
)
)

# Prepare model
model = self._prepare_model(
input_size, hidden_size, num_layers, computation, communication, activation, precision,
self.__world_size
)

# Prepare model
model = self._prepare_model(
input_size, hidden_size, num_layers, computation, communication, activation, precision, self.__world_size
)
# Run model
step_times = self._run_model(model, batch_size, input_size, precision, self.__device, num_warmup, num_steps)

# Process data and return
return self._process_data(step_times)

# Run model
step_times = self._run_model(model, batch_size, input_size, precision, self.__device, num_warmup, num_steps)
def _process_raw_result(self, cmd_idx, raw_output):
"""Function to parse raw results and save the summarized results.

# Process data and return
return self._process_data(step_times)
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

Args:
cmd_idx (int): the index of command corresponding with the raw_output.
raw_output (str): raw output string of the micro-benchmark.

Return:
True if the raw output string is valid and result can be extracted.
"""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

try:
output_lines = [x.strip() for x in raw_output.strip().splitlines()]
step_time = None
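# A (hypothetical) output line such as 'step time: 1.234 ms per iteration'
# yields step_time = 1.234 via the split below.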
for output_line in output_lines:
if ' ms per iteration' in output_line:
step_time = float(output_line.split(' ms per iteration')[0].split()[-1])
break
return self._process_numeric_result(
'step_times', [step_time], reduce_type=ReduceType.MAX, cal_percentile=True
)
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
self._curr_run_index, self._name, raw_output, str(e)
)
)
return False

def _postprocess(self):
"""Postprocess/cleanup operations after the benchmarking.
@@ -443,12 +528,13 @@ def _postprocess(self):
if not super()._postprocess():
return False

try:
torch.distributed.destroy_process_group()
except BaseException as e:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
logger.error('Post process failed - benchmark: {}, message: {}.'.format(self._name, str(e)))
return False
if not self._args.use_cpp_impl:
try:
torch.distributed.destroy_process_group()
except BaseException as e:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
logger.error('Post process failed - benchmark: {}, message: {}.'.format(self._name, str(e)))
return False

return True
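
For orientation, a minimal sketch of how the new cpp path could be driven end to end, assuming the benchmark is registered under the name 'dist-inference' (the registration call sits outside the hunks shown) and following the pattern of SuperBench's other example scripts:

from superbench.benchmarks import BenchmarkRegistry, Platform

# Hypothetical invocation; the benchmark name and parameter values are assumptions.
context = BenchmarkRegistry.create_benchmark_context(
    'dist-inference',
    platform=Platform.CUDA,
    parameters='--use_cpp_impl --batch_size 64 --num_layers 50 --use_cuda_graph',
)
benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
    print(benchmark.name, benchmark.return_code, benchmark.result)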

New file: CMakeLists.txt for the dist_inference binary (40 additions; full path not shown in this view)
@@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

cmake_minimum_required(VERSION 3.18)

project(dist_inference LANGUAGES CXX)

find_package(MPI REQUIRED)
include_directories(SYSTEM ${MPI_INCLUDE_PATH})

find_package(CUDAToolkit QUIET)

# Cuda environment
if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})

include(../cuda_common.cmake)
add_executable(dist_inference dist_inference.cu)
set_property(TARGET dist_inference PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(dist_inference MPI::MPI_CXX nccl cublasLt)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code inplace
execute_process(COMMAND hipify-perl -print-stats -o dist_inference.cpp dist_inference.cu
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)

# Add HIP targets
add_executable(dist_inference dist_inference.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1")
target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()

install(TARGETS dist_inference RUNTIME DESTINATION bin)
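
For reference, a standalone configure/build/run of this target might look like the following (a sketch: MPI plus a CUDA or ROCm toolchain are assumed to be installed, and all flag values are illustrative):

cmake -S . -B build
cmake --build build
mpirun -np 8 ./build/dist_inference -m 1024 -n 64 -k 1024 --num_layers 50 --num_warmups 20 --num_iters 100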