[HIP] Optimize parallel_reduce (kokkos#6229)
* always use lds

Change-Id: I6dd7b347b74c197257160ab8657f57c7c0489cb2

* fix half and int8_t reductions

Change-Id: I1ddb1e65768b4df0f556000a3b9fcf7ee4c00e28

* Use __syncthreads_or, implemented since ROCm 4.5

Change-Id: Ibf147742743fa03d97c2d77d65855b21a58db1d9

* add heuristic

Change-Id: I9141ddaac84e5d8c590756122d5763407642afcd

* tune heuristic for RHODO

Change-Id: Id6b8a29aaaa5613d6bca1ae3031025996bfb8304

* apply code style patch

Change-Id: I2f1e2f4fcfc563d27f1c6ca31fbd32f81953ac76

* tweak min block-size

Change-Id: I4b3c016f14621bc23fd2262e5201498d3ec9e8a4

* Update core/src/HIP/Kokkos_HIP_Parallel_Range.hpp

Co-authored-by: Daniel Arndt <arndtd@ornl.gov>

* remove unused variable

Change-Id: I41e6819d04fce43d017eec02920d3f6bdc40b52b

---------

Co-authored-by: Nicholas Curtis <nicurtis@amd.com>
Co-authored-by: Daniel Arndt <arndtd@ornl.gov>
3 people committed Jul 27, 2023
1 parent 8b08415 commit 6cffdb4
Showing 2 changed files with 71 additions and 57 deletions.
core/src/HIP/Kokkos_HIP_Parallel_Range.hpp (94 changes: 63 additions & 31 deletions)
@@ -123,6 +123,21 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
using functor_type = FunctorType;
using size_type = Kokkos::HIP::size_type;
using index_type = typename Policy::index_type;
+ // Conditionally set word_size_type to int16_t or int8_t if value_type is
+ // smaller than int32_t (Kokkos::HIP::size_type)
+ // word_size_type is used to determine the word count, shared memory buffer
+ // size, and global memory buffer size before the scan is performed.
+ // Within the scan, the word count is recomputed based on word_size_type
+ // and when calculating indexes into the shared/global memory buffers for
+ // performing the scan, word_size_type is used again.
+ // For scalars > 4 bytes in size, indexing into shared/global memory relies
+ // on the block and grid dimensions to ensure that we index at the correct
+ // offset rather than at every 4 byte word; such that, when the join is
+ // performed, we have the correct data that was copied over in chunks of 4
+ // bytes.
+ using word_size_type = std::conditional_t<
+     sizeof(value_type) < sizeof(size_type),
+     std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1
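
The added comment block above describes how the reduction word type is picked. Below is a standalone C++17 sketch of that selection logic, not part of the patch; it assumes size_type is a 4-byte unsigned integer, as Kokkos::HIP::size_type is.

#include <cstdint>
#include <type_traits>

using size_type = std::uint32_t;  // assumption: Kokkos::HIP::size_type is 32-bit

template <class value_type>
using word_size_type = std::conditional_t<
    sizeof(value_type) < sizeof(size_type),
    std::conditional_t<sizeof(value_type) == 2, std::int16_t, std::int8_t>,
    size_type>;

// 8-byte and 4-byte scalars keep the default 4-byte word:
static_assert(std::is_same_v<word_size_type<double>, size_type>);
static_assert(std::is_same_v<word_size_type<int>, size_type>);
// 2-byte scalars (e.g. a 16-bit half type) reduce in 2-byte words:
static_assert(std::is_same_v<word_size_type<std::int16_t>, std::int16_t>);
// 1-byte scalars (e.g. int8_t, one of the cases fixed by this PR) use 1-byte words:
static_assert(std::is_same_v<word_size_type<std::int8_t>, std::int8_t>);

int main() { return 0; }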
@@ -132,11 +147,10 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
const bool m_result_ptr_host_accessible;
- size_type* m_scratch_space = nullptr;
- size_type* m_scratch_flags = nullptr;
+ word_size_type* m_scratch_space = nullptr;
+ size_type* m_scratch_flags = nullptr;

- static bool constexpr UseShflReduction =
-     static_cast<bool>(ReducerType::static_value_size());
+ static bool constexpr UseShflReduction = false;

private:
struct ShflReductionTag {};
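
UseShflReduction is now hard-wired to false, which is what the "always use lds" bullet in the commit message refers to: every range reduction takes the shared-memory (LDS) path. The toy below is an assumed, simplified sketch of the tag-dispatch pattern such a flag typically drives; the tag names mirror the ones visible in the diff, but the operator() wiring and bodies are illustrative, not quoted from the file.

#include <cstdio>
#include <type_traits>

struct ShflReductionTag {};
struct SHMEMReductionTag {};

struct ToyParallelReduce {
  static constexpr bool UseShflReduction = false;  // as set by this patch

  void run(SHMEMReductionTag) const { std::puts("shared-memory (LDS) path"); }
  void run(ShflReductionTag) const { std::puts("warp-shuffle path"); }

  void operator()() const {
    // Compile-time selection of the reduction strategy.
    using Tag = std::conditional_t<UseShflReduction, ShflReductionTag,
                                   SHMEMReductionTag>;
    run(Tag{});
  }
};

int main() {
  ToyParallelReduce{}();  // prints "shared-memory (LDS) path"
  return 0;
}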
@@ -164,13 +178,14 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,

__device__ inline void run(SHMEMReductionTag) const {
const ReducerType& reducer = m_functor_reducer.get_reducer();
- const integral_nonzero_constant<
-     size_type, ReducerType::static_value_size() / sizeof(size_type)>
-     word_count(reducer.value_size() / sizeof(size_type));
+ const integral_nonzero_constant<word_size_type,
+     ReducerType::static_value_size() /
+     sizeof(word_size_type)>
+     word_count(reducer.value_size() / sizeof(word_size_type));

{
reference_type value = reducer.init(reinterpret_cast<pointer_type>(
- ::Kokkos::kokkos_impl_hip_shared_memory<size_type>() +
+ ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() +
threadIdx.y * word_count.value));

// Number of blocks is bounded so that the reduction can be limited to two
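
The word count is now measured in units of word_size_type instead of fixed 4-byte words. A small worked example, not from the patch, of why that matters for the sub-4-byte scalars the commit message says were broken ("fix half and int8_t reductions"); the zero-word-count failure mode shown here is the likely explanation, stated as an assumption.

#include <cstdint>
#include <cstdio>

int main() {
  constexpr std::size_t size_type_bytes = 4;  // old word size (Kokkos::HIP::size_type)

  // 2-byte value_type (e.g. a 16-bit half): the old integer division yields 0
  // words, so presumably no shared-memory slots were reserved for the value.
  constexpr std::size_t half_bytes = 2;
  std::printf("old word count for 2-byte value: %zu\n",
              half_bytes / size_type_bytes);       // 0  (broken)
  std::printf("new word count for 2-byte value: %zu\n",
              half_bytes / sizeof(std::int16_t));  // 1  (correct)

  // 8-byte value_type (double) is unaffected: 2 four-byte words either way.
  std::printf("word count for 8-byte value: %zu\n", 8 / size_type_bytes);  // 2
  return 0;
}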
@@ -193,18 +208,19 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
if (!do_final_reduction)
do_final_reduction = hip_single_inter_block_reduce_scan<false>(
reducer, blockIdx.x, gridDim.x,
- ::Kokkos::kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space,
- m_scratch_flags);
+ ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>(),
+ m_scratch_space, m_scratch_flags);
if (do_final_reduction) {
// This is the final block with the final result at the final threads'
// location

- size_type* const shared =
-     ::Kokkos::kokkos_impl_hip_shared_memory<size_type>() +
+ word_size_type* const shared =
+     ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() +
(blockDim.y - 1) * word_count.value;
- size_type* const global = m_result_ptr_device_accessible
-     ? reinterpret_cast<size_type*>(m_result_ptr)
-     : m_scratch_space;
+ word_size_type* const global =
+     m_result_ptr_device_accessible
+         ? reinterpret_cast<word_size_type*>(m_result_ptr)
+         : m_scratch_space;

if (threadIdx.y == 0) {
reducer.final(reinterpret_cast<value_type*>(shared));
@@ -300,24 +316,40 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
// use a slightly less constrained, but still well bounded limit for
// scratch
int nblocks = (nwork + block.y - 1) / block.y;
- // Heuristic deciding the value of nblocks. The values for the light
- // weight case have been chosen using a vector product benchmark on MI250.
- constexpr auto light_weight =
-     Kokkos::Experimental::WorkItemProperty::HintLightWeight;
- constexpr typename Policy::work_item_property property;
- if ((property & light_weight) == light_weight) {
-   if (nblocks < block_size) {
-     // Keep nblocks as is
-   } else if (nblocks < 16 * block_size) {
-     nblocks = block_size;
-   } else {
-     nblocks = 4 * block_size;
-   }
+ // Heuristic deciding the value of nblocks.
+ // The general idea here is we want to:
+ // 1. Not undersubscribe the device (i.e., we want at least
+ //    preferred_block_min blocks)
+ // 2. Have each thread reduce > 1 value to minimize overheads
+ // 3. Limit the total # of blocks, to avoid unbounded scratch space
+ constexpr int block_max = 4096;
+ constexpr int preferred_block_min = 1024;
+
+ if (nblocks < preferred_block_min) {
+   // keep blocks as is, already have low parallelism
+ } else if (nblocks > block_max) {
+   // "large dispatch" -> already have lots of parallelism
+   nblocks = block_max;
} else {
-   nblocks = std::min(nblocks, 4096);
+   // in the intermediate range, try to have each thread process multiple
+   // items to offset the cost of the reduction (with not enough
+   // parallelism to hide it)
+   int items_per_thread =
+       (nwork + nblocks * block_size - 1) / (nblocks * block_size);
+   if (items_per_thread < 4) {
+     int ratio = std::min(
+         (nblocks + preferred_block_min - 1) / preferred_block_min,
+         (4 + items_per_thread - 1) / items_per_thread);
+     nblocks /= ratio;
+   }
}
- m_scratch_space = ::Kokkos::Impl::hip_internal_scratch_space(
-     m_policy.space(), reducer.value_size() * nblocks);
+
+ // TODO: down casting these uses more space than required?
+ m_scratch_space =
+     (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space(
+         m_policy.space(), reducer.value_size() * nblocks);
+ // Intentionally do not downcast to word_size_type since we use HIP
+ // atomics in Kokkos_HIP_ReduceScan.hpp
m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags(
m_policy.space(), sizeof(size_type));
// Required grid.x <= block.y
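
To make the new block-count heuristic above easier to follow, here is a standalone, host-only restatement of it. This is an illustrative sketch under the assumption that nwork (iteration count) and block_size (threads per block) are the only inputs; the real code also deals with the execution policy and the scratch allocation shown above.

#include <algorithm>
#include <cstdio>

int choose_nblocks(long long nwork, int block_size) {
  constexpr int block_max           = 4096;
  constexpr int preferred_block_min = 1024;

  int nblocks = static_cast<int>((nwork + block_size - 1) / block_size);

  if (nblocks < preferred_block_min) {
    // small dispatch: keep every block we can get
  } else if (nblocks > block_max) {
    nblocks = block_max;  // large dispatch: cap scratch space, parallelism is ample
  } else {
    // intermediate range: shrink the grid until each thread reduces ~4 items
    int items_per_thread = static_cast<int>(
        (nwork + static_cast<long long>(nblocks) * block_size - 1) /
        (static_cast<long long>(nblocks) * block_size));
    if (items_per_thread < 4) {
      int ratio = std::min(
          (nblocks + preferred_block_min - 1) / preferred_block_min,
          (4 + items_per_thread - 1) / items_per_thread);
      nblocks /= ratio;
    }
  }
  return nblocks;
}

int main() {
  std::printf("%d\n", choose_nblocks(100'000, 256));      // small: 391 blocks kept
  std::printf("%d\n", choose_nblocks(500'000, 256));      // intermediate: grid shrunk to 977
  std::printf("%d\n", choose_nblocks(100'000'000, 256));  // large: capped at 4096
  return 0;
}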
core/src/HIP/Kokkos_HIP_ReduceScan.hpp (34 changes: 8 additions & 26 deletions)
@@ -116,16 +112,12 @@ struct HIPReductionsFunctor<FunctorType, true> {

// Use the last block that is done to do the do the reduction across the
// block
- __shared__ unsigned int num_teams_done;
+ unsigned int num_teams_done = 0;
if (threadIdx.x + threadIdx.y == 0) {
num_teams_done = Kokkos::atomic_fetch_add(global_flags, 1) + 1;
}
bool is_last_block = false;
- // FIXME_HIP HIP does not support syncthreads_or. That's why we need to make
- // num_teams_done __shared__
- // if (__syncthreads_or(num_teams_done == gridDim.x)) {*/
- __syncthreads();
- if (num_teams_done == gridDim.x) {
+ if (__syncthreads_or(num_teams_done == gridDim.x)) {
is_last_block = true;
*global_flags = 0;
functor.init(&value);
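
Both HIPReductionsFunctor specializations drop the __shared__ counter plus explicit __syncthreads() workaround in favor of __syncthreads_or, which (per the commit message) HIP has implemented since ROCm 4.5. Below is a minimal HIP sketch of that pattern, not the actual Kokkos kernel: the kernel and buffer names are made up for illustration, and plain atomicAdd stands in for Kokkos::atomic_fetch_add.

#include <hip/hip_runtime.h>

__global__ void last_block_flag(unsigned int* global_flags, int* out) {
  unsigned int num_teams_done = 0;  // per-thread register, no __shared__ needed
  if (threadIdx.x + threadIdx.y == 0) {
    num_teams_done = atomicAdd(global_flags, 1u) + 1;
  }
  // __syncthreads_or both synchronizes the block and broadcasts the
  // "are we the last block?" answer to every thread, even though only one
  // thread incremented the counter.
  if (__syncthreads_or(num_teams_done == gridDim.x)) {
    if (threadIdx.x == 0) out[blockIdx.x] = 1;  // this block finished last
  }
}

int main() {
  const int nblocks = 4;
  unsigned int* flags = nullptr;
  int* out            = nullptr;
  (void)hipMalloc(reinterpret_cast<void**>(&flags), sizeof(unsigned int));
  (void)hipMalloc(reinterpret_cast<void**>(&out), nblocks * sizeof(int));
  (void)hipMemset(flags, 0, sizeof(unsigned int));
  (void)hipMemset(out, 0, nblocks * sizeof(int));
  last_block_flag<<<nblocks, 64>>>(flags, out);
  (void)hipDeviceSynchronize();
  (void)hipFree(flags);
  (void)hipFree(out);
  return 0;
}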
@@ -214,16 +210,12 @@ struct HIPReductionsFunctor<FunctorType, false> {

// Use the last block that is done to do the do the reduction across the
// block
- __shared__ unsigned int num_teams_done;
+ unsigned int num_teams_done = 0;
if (threadIdx.x + threadIdx.y == 0) {
num_teams_done = Kokkos::atomic_fetch_add(global_flags, 1) + 1;
}
bool is_last_block = false;
- // FIXME_HIP HIP does not support syncthreads_or. That's why we need to make
- // num_teams_done __shared__
- // if (__syncthreads_or(num_teams_done == gridDim.x)) {*/
- __syncthreads();
- if (num_teams_done == gridDim.x) {
+ if (__syncthreads_or(num_teams_done == gridDim.x)) {
is_last_block = true;
*global_flags = 0;
functor.init(&value);
@@ -395,20 +387,10 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
// Contributing blocks note that their contribution has been completed via an
// atomic-increment flag If this block is not the last block to contribute to
// this group then the block is done.
- // FIXME_HIP __syncthreads_or is not supported by HIP yet.
- // const bool is_last_block = !__syncthreads_or(
- //     threadIdx.y
- //         ? 0
- //         : (1 + atomicInc(global_flags, block_count - 1) < block_count));
- __shared__ int n_done;
- n_done = 0;
- __syncthreads();
- if (threadIdx.y == 0) {
-   n_done = 1 + atomicInc(global_flags, block_count - 1);
- }
- __syncthreads();
- bool const is_last_block = (n_done == static_cast<int>(block_count));
-
+ const bool is_last_block = !__syncthreads_or(
+     threadIdx.y
+         ? 0
+         : (1 + atomicInc(global_flags, block_count - 1) < block_count));
if (is_last_block) {
size_type const b = (static_cast<long long int>(block_count) *
static_cast<long long int>(threadIdx.y)) >>
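
The same simplification lands in hip_single_inter_block_reduce_scan_impl, where the last-block test folds the atomicInc bookkeeping into a single !__syncthreads_or(...) expression. The single-threaded host walk-through below is a simplified illustration, not Kokkos code: atomic_inc mimics HIP's atomicInc wrap-around semantics (return the old value; store old + 1, or 0 once the limit is reached), which is why exactly one block sees is_last_block == true and why the flag resets itself for the next launch.

#include <cstdio>

unsigned int atomic_inc(unsigned int* addr, unsigned int limit) {
  unsigned int old = *addr;
  *addr = (old >= limit) ? 0 : old + 1;
  return old;
}

int main() {
  const unsigned int block_count = 4;
  unsigned int global_flag       = 0;

  for (unsigned int block = 0; block < block_count; ++block) {
    // Predicate contributed by the thread with threadIdx.y == 0 of each block
    // in the real device code.
    bool not_last =
        (1 + atomic_inc(&global_flag, block_count - 1) < block_count);
    std::printf("block %u: is_last_block = %d, flag after = %u\n", block,
                !not_last, global_flag);
  }
  // Only the final block reports is_last_block = 1, and the flag ends at 0,
  // ready for the next launch.
  return 0;
}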
