Improve SYCL reduction performance: workgroup_reduction (kokkos#6270)
* Improve SYCL workgroup reduction

* Choose correct reduction algorithm

* Run SYCL CI on A100 instead

* Change sign of n_active_subgroups
masterleinad committed Jul 17, 2023
1 parent 933d23b commit 11ce288
Showing 2 changed files with 35 additions and 30 deletions.
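
The diff below revolves around a compile-time switch, use_shuffle_based_algorithm, which selects between a register/shuffle-based and a local-memory-based workgroup_reduction overload via std::enable_if_t. The following is a rough sketch of that dispatch mechanism only, not the Kokkos sources: ScalarReducer, ArrayReducer, and reduce_dispatch are made-up names, and the trait is shown in its is_reference_v form, one of the two forms visible in the diff.

// Illustrative only: the compile-time dispatch pattern the diff relies on.
#include <iostream>
#include <type_traits>

template <class ReducerType>
inline constexpr bool use_shuffle_based_algorithm =
    std::is_reference_v<typename ReducerType::reference_type>;

struct ScalarReducer { using reference_type = double&; };  // single reduction value
struct ArrayReducer  { using reference_type = double*; };  // value_count reduction values

template <class ReducerType>
void reduce_dispatch() {
  if constexpr (use_shuffle_based_algorithm<ReducerType>)
    std::cout << "register/shuffle path\n";  // running value kept in a register
  else
    std::cout << "local-memory path\n";      // running value kept in the scratch array
}

int main() {
  reduce_dispatch<ScalarReducer>();  // shuffle-based
  reduce_dispatch<ArrayReducer>();   // local-memory-based
}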
4 changes: 2 additions & 2 deletions .jenkins
@@ -96,7 +96,7 @@ pipeline {
dockerfile {
filename 'Dockerfile.sycl'
dir 'scripts/docker'
label 'nvidia-docker && volta'
label 'nvidia-docker && ampere'
args '-v /tmp/ccache.kokkos:/tmp/ccache'
}
}
@@ -111,7 +111,7 @@ pipeline {
-DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \
-DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ARCH_VOLTA70=ON \
-DKokkos_ARCH_AMPERE80=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
61 changes: 33 additions & 28 deletions core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
@@ -31,61 +31,63 @@ namespace Kokkos {

namespace Impl {

// FIXME_SYCL It appears that using shuffles is slower than going through local
// memory.
template <class ReducerType>
inline constexpr bool use_shuffle_based_algorithm =
std::is_reference_v<typename ReducerType::reference_type>;
inline constexpr bool use_shuffle_based_algorithm = false;
// std::is_reference_v<typename ReducerType::reference_type>;

namespace SYCLReduction {
template <typename ValueType, typename ReducerType, int dim>
std::enable_if_t<!use_shuffle_based_algorithm<ReducerType>> workgroup_reduction(
sycl::nd_item<dim>& item, sycl::local_accessor<ValueType> local_mem,
sycl::device_ptr<ValueType> results_ptr,
sycl::global_ptr<ValueType> device_accessible_result_ptr,
const unsigned int value_count, const ReducerType& final_reducer,
const unsigned int value_count_, const ReducerType& final_reducer,
bool final, unsigned int max_size) {
const auto local_id = item.get_local_linear_id();
const unsigned int value_count =
std::is_reference_v<typename ReducerType::reference_type> ? 1
: value_count_;
const int local_id = item.get_local_linear_id();

// Perform the actual workgroup reduction in each subgroup
// separately.
auto sg = item.get_sub_group();
auto* result = &local_mem[local_id * value_count];
const auto id_in_sg = sg.get_local_id()[0];
auto sg = item.get_sub_group();
auto* result = &local_mem[local_id * value_count];
const int id_in_sg = sg.get_local_id()[0];
const auto local_range =
std::min<unsigned int>(sg.get_local_range()[0], max_size);
const auto upper_stride_bound =
std::min(local_range - id_in_sg, max_size - local_id);
std::min<unsigned int>(local_range - id_in_sg, max_size - local_id);
for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
if (stride < upper_stride_bound)
final_reducer.join(result, &local_mem[(local_id + stride) * value_count]);
sycl::group_barrier(sg);
}
sycl::group_barrier(item.get_group());

// Copy the subgroup results into the first positions of the
// reduction array.
if (id_in_sg == 0)
final_reducer.copy(&local_mem[sg.get_group_id()[0] * value_count], result);
sycl::group_barrier(item.get_group());

// Do the final reduction only using the first subgroup.
if (sg.get_group_id()[0] == 0) {
const auto n_subgroups = sg.get_group_range()[0];
auto* result_ = &local_mem[id_in_sg * value_count];
const unsigned int n_subgroups = sg.get_group_range()[0];
const int max_subgroup_size = sg.get_max_local_range()[0];
auto* result_ = &local_mem[id_in_sg * max_subgroup_size * value_count];
// In case the number of subgroups is larger than the range of
// the first subgroup, we first combine the items with a higher
// index.
for (unsigned int offset = local_range; offset < n_subgroups;
offset += local_range)
if (id_in_sg + offset < n_subgroups)
final_reducer.join(result_,
&local_mem[(id_in_sg + offset) * value_count]);
final_reducer.join(
result_,
&local_mem[(id_in_sg + offset) * max_subgroup_size * value_count]);
sycl::group_barrier(sg);

// Then, we proceed as before.
for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
if (id_in_sg + stride < n_subgroups)
final_reducer.join(result_,
&local_mem[(id_in_sg + stride) * value_count]);
final_reducer.join(
result_,
&local_mem[(id_in_sg + stride) * max_subgroup_size * value_count]);
sycl::group_barrier(sg);
}
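
To make the shape of this local-memory path easier to follow, here is a minimal, self-contained SYCL sketch of the same two-phase idea: each subgroup tree-reduces its slice of the scratch array, the per-subgroup partials are copied to the front, and the first subgroup folds them together. It is a plain float sum over one work-group of an assumed size of 256, it assumes work-items map to subgroup lanes in linear order (as DPC++ does), and it deliberately ignores value_count, the final_reducer interface, and the exact indexing this patch changes.

// Sketch only: two-phase local-memory reduction, reduced to a float sum.
#include <sycl/sycl.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  constexpr size_t wg_size = 256;          // assumed work-group size
  std::vector<float> host(wg_size, 1.0f);  // expected sum: 256

  sycl::queue q;
  float* in  = sycl::malloc_device<float>(wg_size, q);
  float* out = sycl::malloc_device<float>(1, q);
  q.memcpy(in, host.data(), wg_size * sizeof(float)).wait();

  q.submit([&](sycl::handler& cgh) {
     sycl::local_accessor<float, 1> scratch(sycl::range<1>(wg_size), cgh);
     cgh.parallel_for(
         sycl::nd_range<1>(wg_size, wg_size), [=](sycl::nd_item<1> item) {
           const int lid = item.get_local_linear_id();
           scratch[lid]  = in[lid];
           sycl::group_barrier(item.get_group());

           auto sg = item.get_sub_group();
           const int id_in_sg = sg.get_local_id()[0];
           const int sg_range = sg.get_local_range()[0];

           // Phase 1: tree reduction inside each subgroup, through local memory.
           for (int stride = 1; stride < sg_range; stride <<= 1) {
             if (id_in_sg % (2 * stride) == 0 && id_in_sg + stride < sg_range)
               scratch[lid] += scratch[lid + stride];
             sycl::group_barrier(sg);
           }
           sycl::group_barrier(item.get_group());

           // Copy each subgroup's partial result to the front of the scratch array.
           if (id_in_sg == 0) scratch[sg.get_group_id()[0]] = scratch[lid];
           sycl::group_barrier(item.get_group());

           // Phase 2: the first subgroup combines the per-subgroup partials.
           if (sg.get_group_id()[0] == 0) {
             const int n_subgroups = sg.get_group_range()[0];
             // Fold in partials beyond this subgroup's lane range first,
             // mirroring the offset loop the patch touches.
             for (int offset = sg_range; offset < n_subgroups; offset += sg_range)
               if (id_in_sg + offset < n_subgroups)
                 scratch[id_in_sg] += scratch[id_in_sg + offset];
             sycl::group_barrier(sg);

             const int remaining = std::min(n_subgroups, sg_range);
             for (int stride = 1; stride < remaining; stride <<= 1) {
               if (id_in_sg % (2 * stride) == 0 && id_in_sg + stride < remaining)
                 scratch[id_in_sg] += scratch[id_in_sg + stride];
               sycl::group_barrier(sg);
             }
             if (id_in_sg == 0) out[0] = scratch[0];
           }
         });
   }).wait();

  float result = 0.0f;
  q.memcpy(&result, out, sizeof(float)).wait();
  std::cout << "sum = " << result << '\n';
  sycl::free(in, q);
  sycl::free(out, q);
}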

@@ -118,24 +120,27 @@ std::enable_if_t<use_shuffle_based_algorithm<ReducerType>> workgroup_reduction(

// Perform the actual workgroup reduction in each subgroup
// separately.
auto sg = item.get_sub_group();
const auto id_in_sg = sg.get_local_id()[0];
auto sg = item.get_sub_group();
const int id_in_sg = sg.get_local_id()[0];
const auto local_range =
std::min<unsigned int>(sg.get_local_range()[0], max_size);

const auto upper_stride_bound =
std::min(local_range - id_in_sg, max_size - local_id);
std::min<unsigned int>(local_range - id_in_sg, max_size - local_id);
for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
auto tmp = sg.shuffle_down(local_value, stride);
if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp);
}

// Copy the subgroup results into the first positions of the
// reduction array.
const auto max_subgroup_size = sg.get_max_local_range()[0];
const auto n_active_subgroups =
const int max_subgroup_size = sg.get_max_local_range()[0];
const int n_active_subgroups =
(max_size + max_subgroup_size - 1) / max_subgroup_size;
if (id_in_sg == 0 && sg.get_group_id()[0] <= n_active_subgroups)
local_mem[sg.get_group_id()[0]] = local_value;
const int sg_group_id = sg.get_group_id()[0];
if (id_in_sg == 0 && sg_group_id <= n_active_subgroups)
local_mem[sg_group_id] = local_value;

item.barrier(sycl::access::fence_space::local_space);

// Do the final reduction only using the first subgroup.
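
For contrast with the local-memory path, here is a compact, self-contained sketch of the shuffle-based idea used by this overload: the running value stays in a register and lanes are combined with subgroup shuffles. It sums one float per work-item within a single subgroup, uses the SYCL 2020 free function sycl::shift_group_left rather than the shuffle_down member call seen above, and assumes a power-of-two subgroup size.

// Sketch only: shuffle-style subgroup sum, no local memory until the final store.
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;
  float* out = sycl::malloc_device<float>(1, q);

  // One work-group of 64 items; we read back the first subgroup's result.
  q.parallel_for(sycl::nd_range<1>(64, 64), [=](sycl::nd_item<1> item) {
     auto sg = item.get_sub_group();
     float value = 1.0f;  // each work-item contributes 1
     // Halving tree: lane i accumulates the value held by lane i + stride.
     for (unsigned int stride = sg.get_local_range()[0] / 2; stride > 0; stride >>= 1)
       value += sycl::shift_group_left(sg, value, stride);
     if (sg.get_group_id()[0] == 0 && sg.get_local_id()[0] == 0)
       out[0] = value;  // lane 0 now holds the subgroup sum
   }).wait();

  float result = 0.0f;
  q.memcpy(&result, out, sizeof(float)).wait();
  std::cout << "subgroup sum = " << result << '\n';  // equals the subgroup size
  sycl::free(out, q);
}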
@@ -684,7 +689,7 @@ class ParallelReduce<CombinedFunctorReducerType,
const index_type n_global_y = 1;
const index_type n_global_z = 1;

if constexpr (ReducerType::static_value_size() == 0) {
if constexpr (!use_shuffle_based_algorithm<ReducerType>) {
reference_type update =
reducer.init(&local_mem[local_id * value_count]);
