SYCL: Improve team_reduce implementation (kokkos#6562)
* SYCL: Improve team_reduce implementation

* Manually unroll loop to propagate shuffle destinations at compile-time

* Update step_width

* Fix sign comparison warnings

* Comment and choosing step_width=16

* KOKKOS_ASSERT that the subgroup range doesn't exceed 32

* Remove unattainable cases

* Add a barrier guarding the reduction array upon exiting team_reduce

* Update comment to better reflect intent of barrier

---------

Co-authored-by: Christian Trott <crtrott@sandia.gov>
masterleinad and crtrott committed Apr 5, 2024
1 parent 1256f69 commit 98b1a38
Showing 1 changed file with 37 additions and 40 deletions.
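For context on the "manually unroll loop" item: replacing the runtime shuffle loop with calls that take literal offsets lets the backend resolve each shuffle destination at compile time instead of at run time. Below is a minimal sketch of that pattern using SYCL 2020 group algorithms; the helper name subgroup_reduce and the free-function calls are illustrative assumptions, not the Kokkos code shown in the diff.

#include <sycl/sycl.hpp>

// Hypothetical device-side helper illustrating the unrolled pattern; it is
// meant to be called from inside a kernel by every work-item of the subgroup.
template <typename T, typename Join>
T subgroup_reduce(sycl::sub_group sg, T value, Join join) {
  const unsigned range = sg.get_local_range()[0];
  const unsigned lane  = sg.get_local_id()[0];
  auto shuffle_combine = [&](unsigned shift) {
    if (shift < range) {  // uniform over the subgroup, so convergence is kept
      const T tmp = sycl::shift_group_left(sg, value, shift);
      // Only lanes with a valid partner fold the shuffled value in.
      if (lane + shift < range) join(value, tmp);
    }
  };
  // Literal offsets instead of a runtime loop variable: the backend can emit
  // shuffles with compile-time destinations. 1..16 covers subgroup sizes <= 32.
  shuffle_combine(1);
  shuffle_combine(2);
  shuffle_combine(4);
  shuffle_combine(8);
  shuffle_combine(16);
  // Broadcast lane 0's result so every lane returns the reduced value.
  return sycl::select_from_group(sg, value, 0);
}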
77 changes: 37 additions & 40 deletions core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -133,72 +133,69 @@ class SYCLTeamMember {
     const unsigned int team_rank_ = team_rank();
 
     // First combine the values in the same subgroup
+#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
+    auto shuffle_combine = [&](int shift) {
+      if (vector_range * shift < sub_group_range) {
+        const value_type tmp = sg.shuffle_down(value, vector_range * shift);
+        if (team_rank_ + shift < team_size_) reducer.join(value, tmp);
+      }
+    };
+    shuffle_combine(1);
+    shuffle_combine(2);
+    shuffle_combine(4);
+    shuffle_combine(8);
+    shuffle_combine(16);
+    KOKKOS_ASSERT(sub_group_range <= 32);
+#else
     for (unsigned int shift = 1; vector_range * shift < sub_group_range;
          shift <<= 1) {
       const value_type tmp = sg.shuffle_down(value, vector_range * shift);
       if (team_rank_ + shift < team_size_) reducer.join(value, tmp);
     }
+#endif
     value = sg.shuffle(value, 0);
 
-    const auto n_subgroups = sg.get_group_range()[0];
+    const int n_subgroups = sg.get_group_range()[0];
     if (n_subgroups == 1) {
       reducer.reference() = value;
       return;
     }
 
-    // We need to chunk up the whole reduction because we might not have
-    // allocated enough memory.
-    const unsigned int maximum_work_range =
-        std::min<int>(m_team_reduce_size / sizeof(value_type), n_subgroups);
+    // It was found experimentally that 16 is a good value for Intel PVC.
+    // Since there is a maximum number of 1024 threads with subgroup size 16,
+    // we have a maximum of 64 subgroups per workgroup which means 64/16=4
+    // rounds for loading values into the reduction_array, and 16 redundant
+    // reduction steps executed by every thread.
+    constexpr int step_width = 16;
+    auto tmp_alloc = sycl::ext::oneapi::group_local_memory_for_overwrite<
+        value_type[step_width]>(m_item.get_group());
+    auto& reduction_array = *tmp_alloc;
 
     const auto id_in_sg = sg.get_local_id()[0];
-    auto reduction_array =
-        static_cast<sycl::local_ptr<value_type>>(m_team_reduce);
 
-    // Load values into the first maximum_work_range values of the reduction
+    // Load values into the first step_width values of the reduction
     // array in chunks. This means that only sub groups with an id in the
     // corresponding chunk load values.
-    const auto group_id = sg.get_group_id()[0];
-    if (id_in_sg == 0 && group_id < maximum_work_range)
+    const int group_id = sg.get_group_id()[0];
+    if (id_in_sg == 0 && group_id < step_width)
       reduction_array[group_id] = value;
     sycl::group_barrier(m_item.get_group());
 
-    for (unsigned int start = maximum_work_range; start < n_subgroups;
-         start += maximum_work_range) {
+    for (int start = step_width; start < n_subgroups; start += step_width) {
       if (id_in_sg == 0 && group_id >= start &&
-          group_id <
-              std::min<unsigned int>(start + maximum_work_range, n_subgroups))
+          group_id < std::min(start + step_width, n_subgroups))
         reducer.join(reduction_array[group_id - start], value);
       sycl::group_barrier(m_item.get_group());
     }
 
-    // Let the first subgroup do the final reduction
-    if (group_id == 0) {
-      const auto local_range = sg.get_local_range()[0];
-      auto result =
-          reduction_array[id_in_sg < maximum_work_range ? id_in_sg : 0];
-      // In case the maximum_work_range is larger than the range of the first
-      // subgroup, we first combine the items with a higher index.
-      for (unsigned int offset = local_range; offset < maximum_work_range;
-           offset += local_range)
-        if (id_in_sg + offset < maximum_work_range)
-          reducer.join(result, reduction_array[id_in_sg + offset]);
-      sycl::group_barrier(sg);
-
-      // Now do the actual subgroup reduction.
-      const auto min_range =
-          std::min<unsigned int>(maximum_work_range, local_range);
-      for (unsigned int stride = 1; stride < min_range; stride <<= 1) {
-        const auto tmp = sg.shuffle_down(result, stride);
-        if (id_in_sg + stride < min_range) reducer.join(result, tmp);
-      }
-      if (id_in_sg == 0) reduction_array[0] = result;
-    }
-    sycl::group_barrier(m_item.get_group());
+    // Do the final reduction for all threads redundantly
+    value = reduction_array[0];
+    for (int i = 1; i < std::min(step_width, n_subgroups); ++i)
+      reducer.join(value, reduction_array[i]);
 
-    reducer.reference() = reduction_array[0];
-    // Make sure that the reduction array hasn't been modified in the meantime.
-    m_item.barrier(sycl::access::fence_space::local_space);
+    reducer.reference() = value;
+    // Make sure that every thread is done using the reduction array.
+    sycl::group_barrier(m_item.get_group());
   }
 
   //--------------------------------------------------------------------------

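For anyone wanting to exercise the code path touched by this commit: a nested TeamThreadRange parallel_reduce is what ends up calling SYCLTeamMember::team_reduce when the SYCL backend is enabled. A minimal usage sketch follows (not part of this commit; the league size and per-team work count are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using policy_type = Kokkos::TeamPolicy<>;
    using member_type = policy_type::member_type;

    double total = 0.0;
    Kokkos::parallel_reduce(
        "nested_team_reduce", policy_type(128, Kokkos::AUTO),
        KOKKOS_LAMBDA(const member_type& team, double& update) {
          double team_sum = 0.0;
          // The team-level reduction below goes through team_reduce.
          Kokkos::parallel_reduce(
              Kokkos::TeamThreadRange(team, 256),
              [&](const int i, double& sum) { sum += i; }, team_sum);
          // team_sum holds the per-team result on every thread of the team.
          Kokkos::single(Kokkos::PerTeam(team), [&]() { update += team_sum; });
        },
        total);
    // Each team sums 0..255, so total == 128 * (255 * 256 / 2).
  }
  Kokkos::finalize();
}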