SYCL: Improve team_reduce implementation (kokkos#6562)
* SYCL: Improve team_reduce implementation

* Manually unroll loop to propagate shuffle destinations at compile-time

* Update step_width

* Fix sign comparison warnings

* Comment and choosing step_width=16

* KOKKOS_ASSERT that the subgroup range doesn't exceed 32

* Remove unattainable cases

* Add a barrier guarding the reduction array upon exiting team_reduce

* Update comment to better reflect intent of barrier

---------

Co-authored-by: Christian Trott <crtrott@sandia.gov>
masterleinad and crtrott committed Apr 5, 2024
1 parent 1256f69 commit 98b1a38
Showing 1 changed file with 37 additions and 40 deletions.
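For context on the "manually unroll loop" item: replacing the runtime shuffle loop with calls that take literal offsets lets the backend resolve each shuffle destination at compile time instead of at run time. Below is a minimal sketch of that pattern using SYCL 2020 group algorithms; the helper name subgroup_reduce and the free-function calls are illustrative assumptions, not the Kokkos code shown in the diff.

#include <sycl/sycl.hpp>

// Hypothetical device-side helper illustrating the unrolled pattern; it is
// meant to be called from inside a kernel by every work-item of the subgroup.
template <typename T, typename Join>
T subgroup_reduce(sycl::sub_group sg, T value, Join join) {
  const unsigned range = sg.get_local_range()[0];
  const unsigned lane  = sg.get_local_id()[0];
  auto shuffle_combine = [&](unsigned shift) {
    if (shift < range) {  // uniform over the subgroup, so convergence is kept
      const T tmp = sycl::shift_group_left(sg, value, shift);
      // Only lanes with a valid partner fold the shuffled value in.
      if (lane + shift < range) join(value, tmp);
    }
  };
  // Literal offsets instead of a runtime loop variable: the backend can emit
  // shuffles with compile-time destinations. 1..16 covers subgroup sizes <= 32.
  shuffle_combine(1);
  shuffle_combine(2);
  shuffle_combine(4);
  shuffle_combine(8);
  shuffle_combine(16);
  // Broadcast lane 0's result so every lane returns the reduced value.
  return sycl::select_from_group(sg, value, 0);
}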
77 changes: 37 additions & 40 deletions core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -133,72 +133,69 @@ class SYCLTeamMember {
     const unsigned int team_rank_ = team_rank();
 
     // First combine the values in the same subgroup
+#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
+    auto shuffle_combine = [&](int shift) {
+      if (vector_range * shift < sub_group_range) {
+        const value_type tmp = sg.shuffle_down(value, vector_range * shift);
+        if (team_rank_ + shift < team_size_) reducer.join(value, tmp);
+      }
+    };
+    shuffle_combine(1);
+    shuffle_combine(2);
+    shuffle_combine(4);
+    shuffle_combine(8);
+    shuffle_combine(16);
+    KOKKOS_ASSERT(sub_group_range <= 32);
+#else
     for (unsigned int shift = 1; vector_range * shift < sub_group_range;
          shift <<= 1) {
       const value_type tmp = sg.shuffle_down(value, vector_range * shift);
       if (team_rank_ + shift < team_size_) reducer.join(value, tmp);
     }
+#endif
     value = sg.shuffle(value, 0);
 
-    const auto n_subgroups = sg.get_group_range()[0];
+    const int n_subgroups = sg.get_group_range()[0];
     if (n_subgroups == 1) {
       reducer.reference() = value;
       return;
     }
 
-    // We need to chunk up the whole reduction because we might not have
-    // allocated enough memory.
-    const unsigned int maximum_work_range =
-        std::min<int>(m_team_reduce_size / sizeof(value_type), n_subgroups);
+    // It was found experimentally that 16 is a good value for Intel PVC.
+    // Since there is a maximum number of 1024 threads with subgroup size 16,
+    // we have a maximum of 64 subgroups per workgroup which means 64/16=4
+    // rounds for loading values into the reduction_array, and 16 redundant
+    // reduction steps executed by every thread.
+    constexpr int step_width = 16;
+    auto tmp_alloc = sycl::ext::oneapi::group_local_memory_for_overwrite<
+        value_type[step_width]>(m_item.get_group());
+    auto& reduction_array = *tmp_alloc;
 
     const auto id_in_sg = sg.get_local_id()[0];
-    auto reduction_array =
-        static_cast<sycl::local_ptr<value_type>>(m_team_reduce);
 
-    // Load values into the first maximum_work_range values of the reduction
+    // Load values into the first step_width values of the reduction
     // array in chunks. This means that only sub groups with an id in the
     // corresponding chunk load values.
-    const auto group_id = sg.get_group_id()[0];
-    if (id_in_sg == 0 && group_id < maximum_work_range)
+    const int group_id = sg.get_group_id()[0];
+    if (id_in_sg == 0 && group_id < step_width)
       reduction_array[group_id] = value;
     sycl::group_barrier(m_item.get_group());
 
-    for (unsigned int start = maximum_work_range; start < n_subgroups;
-         start += maximum_work_range) {
+    for (int start = step_width; start < n_subgroups; start += step_width) {
       if (id_in_sg == 0 && group_id >= start &&
-          group_id <
-              std::min<unsigned int>(start + maximum_work_range, n_subgroups))
+          group_id < std::min(start + step_width, n_subgroups))
         reducer.join(reduction_array[group_id - start], value);
       sycl::group_barrier(m_item.get_group());
     }
 
-    // Let the first subgroup do the final reduction
-    if (group_id == 0) {
-      const auto local_range = sg.get_local_range()[0];
-      auto result =
-          reduction_array[id_in_sg < maximum_work_range ? id_in_sg : 0];
-      // In case the maximum_work_range is larger than the range of the first
-      // subgroup, we first combine the items with a higher index.
-      for (unsigned int offset = local_range; offset < maximum_work_range;
-           offset += local_range)
-        if (id_in_sg + offset < maximum_work_range)
-          reducer.join(result, reduction_array[id_in_sg + offset]);
-      sycl::group_barrier(sg);
-
-      // Now do the actual subgroup reduction.
-      const auto min_range =
-          std::min<unsigned int>(maximum_work_range, local_range);
-      for (unsigned int stride = 1; stride < min_range; stride <<= 1) {
-        const auto tmp = sg.shuffle_down(result, stride);
-        if (id_in_sg + stride < min_range) reducer.join(result, tmp);
-      }
-      if (id_in_sg == 0) reduction_array[0] = result;
-    }
-    sycl::group_barrier(m_item.get_group());
+    // Do the final reduction for all threads redundantly
+    value = reduction_array[0];
+    for (int i = 1; i < std::min(step_width, n_subgroups); ++i)
+      reducer.join(value, reduction_array[i]);
 
-    reducer.reference() = reduction_array[0];
-    // Make sure that the reduction array hasn't been modified in the meantime.
-    m_item.barrier(sycl::access::fence_space::local_space);
+    reducer.reference() = value;
+    // Make sure that every thread is done using the reduction array.
+    sycl::group_barrier(m_item.get_group());
   }
 
   //--------------------------------------------------------------------------

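For anyone wanting to exercise the code path touched by this commit: a nested TeamThreadRange parallel_reduce is what ends up calling SYCLTeamMember::team_reduce when the SYCL backend is enabled. A minimal usage sketch follows (not part of this commit; the league size and per-team work count are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using policy_type = Kokkos::TeamPolicy<>;
    using member_type = policy_type::member_type;

    double total = 0.0;
    Kokkos::parallel_reduce(
        "nested_team_reduce", policy_type(128, Kokkos::AUTO),
        KOKKOS_LAMBDA(const member_type& team, double& update) {
          double team_sum = 0.0;
          // The team-level reduction below goes through team_reduce.
          Kokkos::parallel_reduce(
              Kokkos::TeamThreadRange(team, 256),
              [&](const int i, double& sum) { sum += i; }, team_sum);
          // team_sum holds the per-team result on every thread of the team.
          Kokkos::single(Kokkos::PerTeam(team), [&]() { update += team_sum; });
        },
        total);
    // Each team sums 0..255, so total == 128 * (255 * 256 / 2).
  }
  Kokkos::finalize();
}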