[HIP] Optimize parallel_reduce (kokkos#6229)
* always use lds

Change-Id: I6dd7b347b74c197257160ab8657f57c7c0489cb2

* fix half and int8_t reductions

Change-Id: I1ddb1e65768b4df0f556000a3b9fcf7ee4c00e28

* Use __syncthreads_or, implemented since ROCm 4.5

Change-Id: Ibf147742743fa03d97c2d77d65855b21a58db1d9

* add heuristic

Change-Id: I9141ddaac84e5d8c590756122d5763407642afcd

* tune heuristic for RHODO

Change-Id: Id6b8a29aaaa5613d6bca1ae3031025996bfb8304

* apply code style patch

Change-Id: I2f1e2f4fcfc563d27f1c6ca31fbd32f81953ac76

* tweak min block-size

Change-Id: I4b3c016f14621bc23fd2262e5201498d3ec9e8a4

* Update core/src/HIP/Kokkos_HIP_Parallel_Range.hpp

Co-authored-by: Daniel Arndt <arndtd@ornl.gov>

* remove unused variable

Change-Id: I41e6819d04fce43d017eec02920d3f6bdc40b52b

---------

Co-authored-by: Nicholas Curtis <nicurtis@amd.com>
Co-authored-by: Daniel Arndt <arndtd@ornl.gov>
3 people committed Jul 27, 2023
1 parent 8b08415 commit 6cffdb4
Showing 2 changed files with 71 additions and 57 deletions.
core/src/HIP/Kokkos_HIP_Parallel_Range.hpp (94 changes: 63 additions & 31 deletions)
@@ -123,6 +123,21 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
using functor_type = FunctorType;
using size_type = Kokkos::HIP::size_type;
using index_type = typename Policy::index_type;
+ // Conditionally set word_size_type to int16_t or int8_t if value_type is
+ // smaller than int32_t (Kokkos::HIP::size_type)
+ // word_size_type is used to determine the word count, shared memory buffer
+ // size, and global memory buffer size before the scan is performed.
+ // Within the scan, the word count is recomputed based on word_size_type
+ // and when calculating indexes into the shared/global memory buffers for
+ // performing the scan, word_size_type is used again.
+ // For scalars > 4 bytes in size, indexing into shared/global memory relies
+ // on the block and grid dimensions to ensure that we index at the correct
+ // offset rather than at every 4 byte word; such that, when the join is
+ // performed, we have the correct data that was copied over in chunks of 4
+ // bytes.
+ using word_size_type = std::conditional_t<
+     sizeof(value_type) < sizeof(size_type),
+     std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1
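
The added comment block above describes how the reduction word type is picked. Below is a standalone C++17 sketch of that selection logic, not part of the patch; it assumes size_type is a 4-byte unsigned integer, as Kokkos::HIP::size_type is.

#include <cstdint>
#include <type_traits>

using size_type = std::uint32_t;  // assumption: Kokkos::HIP::size_type is 32-bit

template <class value_type>
using word_size_type = std::conditional_t<
    sizeof(value_type) < sizeof(size_type),
    std::conditional_t<sizeof(value_type) == 2, std::int16_t, std::int8_t>,
    size_type>;

// 8-byte and 4-byte scalars keep the default 4-byte word:
static_assert(std::is_same_v<word_size_type<double>, size_type>);
static_assert(std::is_same_v<word_size_type<int>, size_type>);
// 2-byte scalars (e.g. a 16-bit half type) reduce in 2-byte words:
static_assert(std::is_same_v<word_size_type<std::int16_t>, std::int16_t>);
// 1-byte scalars (e.g. int8_t, one of the cases fixed by this PR) use 1-byte words:
static_assert(std::is_same_v<word_size_type<std::int8_t>, std::int8_t>);

int main() { return 0; }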
@@ -132,11 +147,10 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
const bool m_result_ptr_host_accessible;
- size_type* m_scratch_space = nullptr;
- size_type* m_scratch_flags = nullptr;
+ word_size_type* m_scratch_space = nullptr;
+ size_type* m_scratch_flags = nullptr;

- static bool constexpr UseShflReduction =
-     static_cast<bool>(ReducerType::static_value_size());
+ static bool constexpr UseShflReduction = false;

private:
struct ShflReductionTag {};
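
UseShflReduction is now hard-wired to false, which is what the "always use lds" bullet in the commit message refers to: every range reduction takes the shared-memory (LDS) path. The toy below is an assumed, simplified sketch of the tag-dispatch pattern such a flag typically drives; the tag names mirror the ones visible in the diff, but the operator() wiring and bodies are illustrative, not quoted from the file.

#include <cstdio>
#include <type_traits>

struct ShflReductionTag {};
struct SHMEMReductionTag {};

struct ToyParallelReduce {
  static constexpr bool UseShflReduction = false;  // as set by this patch

  void run(SHMEMReductionTag) const { std::puts("shared-memory (LDS) path"); }
  void run(ShflReductionTag) const { std::puts("warp-shuffle path"); }

  void operator()() const {
    // Compile-time selection of the reduction strategy.
    using Tag = std::conditional_t<UseShflReduction, ShflReductionTag,
                                   SHMEMReductionTag>;
    run(Tag{});
  }
};

int main() {
  ToyParallelReduce{}();  // prints "shared-memory (LDS) path"
  return 0;
}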
@@ -164,13 +178,14 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,

__device__ inline void run(SHMEMReductionTag) const {
const ReducerType& reducer = m_functor_reducer.get_reducer();
- const integral_nonzero_constant<
-     size_type, ReducerType::static_value_size() / sizeof(size_type)>
-     word_count(reducer.value_size() / sizeof(size_type));
+ const integral_nonzero_constant<word_size_type,
+     ReducerType::static_value_size() /
+     sizeof(word_size_type)>
+     word_count(reducer.value_size() / sizeof(word_size_type));

{
reference_type value = reducer.init(reinterpret_cast<pointer_type>(
- ::Kokkos::kokkos_impl_hip_shared_memory<size_type>() +
+ ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() +
threadIdx.y * word_count.value));

// Number of blocks is bounded so that the reduction can be limited to two
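
The word count is now measured in units of word_size_type instead of fixed 4-byte words. A small worked example, not from the patch, of why that matters for the sub-4-byte scalars the commit message says were broken ("fix half and int8_t reductions"); the zero-word-count failure mode shown here is the likely explanation, stated as an assumption.

#include <cstdint>
#include <cstdio>

int main() {
  constexpr std::size_t size_type_bytes = 4;  // old word size (Kokkos::HIP::size_type)

  // 2-byte value_type (e.g. a 16-bit half): the old integer division yields 0
  // words, so presumably no shared-memory slots were reserved for the value.
  constexpr std::size_t half_bytes = 2;
  std::printf("old word count for 2-byte value: %zu\n",
              half_bytes / size_type_bytes);       // 0  (broken)
  std::printf("new word count for 2-byte value: %zu\n",
              half_bytes / sizeof(std::int16_t));  // 1  (correct)

  // 8-byte value_type (double) is unaffected: 2 four-byte words either way.
  std::printf("word count for 8-byte value: %zu\n", 8 / size_type_bytes);  // 2
  return 0;
}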
@@ -193,18 +208,19 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
if (!do_final_reduction)
do_final_reduction = hip_single_inter_block_reduce_scan<false>(
reducer, blockIdx.x, gridDim.x,
- ::Kokkos::kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space,
- m_scratch_flags);
+ ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>(),
+ m_scratch_space, m_scratch_flags);
if (do_final_reduction) {
// This is the final block with the final result at the final threads'
// location

- size_type* const shared =
-     ::Kokkos::kokkos_impl_hip_shared_memory<size_type>() +
+ word_size_type* const shared =
+     ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() +
(blockDim.y - 1) * word_count.value;
- size_type* const global = m_result_ptr_device_accessible
-     ? reinterpret_cast<size_type*>(m_result_ptr)
-     : m_scratch_space;
+ word_size_type* const global =
+     m_result_ptr_device_accessible
+         ? reinterpret_cast<word_size_type*>(m_result_ptr)
+         : m_scratch_space;

if (threadIdx.y == 0) {
reducer.final(reinterpret_cast<value_type*>(shared));
@@ -300,24 +316,40 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
// use a slightly less constrained, but still well bounded limit for
// scratch
int nblocks = (nwork + block.y - 1) / block.y;
- // Heuristic deciding the value of nblocks. The values for the light
- // weight case have been chosen using a vector product benchmark on MI250.
- constexpr auto light_weight =
-     Kokkos::Experimental::WorkItemProperty::HintLightWeight;
- constexpr typename Policy::work_item_property property;
- if ((property & light_weight) == light_weight) {
-   if (nblocks < block_size) {
-     // Keep nblocks as is
-   } else if (nblocks < 16 * block_size) {
-     nblocks = block_size;
-   } else {
-     nblocks = 4 * block_size;
-   }
+ // Heuristic deciding the value of nblocks.
+ // The general idea here is we want to:
+ // 1. Not undersubscribe the device (i.e., we want at least
+ //    preferred_block_min blocks)
+ // 2. Have each thread reduce > 1 value to minimize overheads
+ // 3. Limit the total # of blocks, to avoid unbounded scratch space
+ constexpr int block_max = 4096;
+ constexpr int preferred_block_min = 1024;
+
+ if (nblocks < preferred_block_min) {
+   // keep blocks as is, already have low parallelism
+ } else if (nblocks > block_max) {
+   // "large dispatch" -> already have lots of parallelism
+   nblocks = block_max;
} else {
-   nblocks = std::min(nblocks, 4096);
+   // in the intermediate range, try to have each thread process multiple
+   // items to offset the cost of the reduction (with not enough
+   // parallelism to hide it)
+   int items_per_thread =
+       (nwork + nblocks * block_size - 1) / (nblocks * block_size);
+   if (items_per_thread < 4) {
+     int ratio = std::min(
+         (nblocks + preferred_block_min - 1) / preferred_block_min,
+         (4 + items_per_thread - 1) / items_per_thread);
+     nblocks /= ratio;
+   }
}
- m_scratch_space = ::Kokkos::Impl::hip_internal_scratch_space(
-     m_policy.space(), reducer.value_size() * nblocks);
+
+ // TODO: down casting these uses more space than required?
+ m_scratch_space =
+     (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space(
+         m_policy.space(), reducer.value_size() * nblocks);
+ // Intentionally do not downcast to word_size_type since we use HIP
+ // atomics in Kokkos_HIP_ReduceScan.hpp
m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags(
m_policy.space(), sizeof(size_type));
// Required grid.x <= block.y
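
To make the new block-count heuristic above easier to follow, here is a standalone, host-only restatement of it. This is an illustrative sketch under the assumption that nwork (iteration count) and block_size (threads per block) are the only inputs; the real code also deals with the execution policy and the scratch allocation shown above.

#include <algorithm>
#include <cstdio>

int choose_nblocks(long long nwork, int block_size) {
  constexpr int block_max           = 4096;
  constexpr int preferred_block_min = 1024;

  int nblocks = static_cast<int>((nwork + block_size - 1) / block_size);

  if (nblocks < preferred_block_min) {
    // small dispatch: keep every block we can get
  } else if (nblocks > block_max) {
    nblocks = block_max;  // large dispatch: cap scratch space, parallelism is ample
  } else {
    // intermediate range: shrink the grid until each thread reduces ~4 items
    int items_per_thread = static_cast<int>(
        (nwork + static_cast<long long>(nblocks) * block_size - 1) /
        (static_cast<long long>(nblocks) * block_size));
    if (items_per_thread < 4) {
      int ratio = std::min(
          (nblocks + preferred_block_min - 1) / preferred_block_min,
          (4 + items_per_thread - 1) / items_per_thread);
      nblocks /= ratio;
    }
  }
  return nblocks;
}

int main() {
  std::printf("%d\n", choose_nblocks(100'000, 256));      // small: 391 blocks kept
  std::printf("%d\n", choose_nblocks(500'000, 256));      // intermediate: grid shrunk to 977
  std::printf("%d\n", choose_nblocks(100'000'000, 256));  // large: capped at 4096
  return 0;
}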
core/src/HIP/Kokkos_HIP_ReduceScan.hpp (34 changes: 8 additions & 26 deletions)
@@ -116,16 +112,12 @@ struct HIPReductionsFunctor<FunctorType, true> {

// Use the last block that is done to do the do the reduction across the
// block
- __shared__ unsigned int num_teams_done;
+ unsigned int num_teams_done = 0;
if (threadIdx.x + threadIdx.y == 0) {
num_teams_done = Kokkos::atomic_fetch_add(global_flags, 1) + 1;
}
bool is_last_block = false;
- // FIXME_HIP HIP does not support syncthreads_or. That's why we need to make
- // num_teams_done __shared__
- // if (__syncthreads_or(num_teams_done == gridDim.x)) {*/
- __syncthreads();
- if (num_teams_done == gridDim.x) {
+ if (__syncthreads_or(num_teams_done == gridDim.x)) {
is_last_block = true;
*global_flags = 0;
functor.init(&value);
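
Both HIPReductionsFunctor specializations drop the __shared__ counter plus explicit __syncthreads() workaround in favor of __syncthreads_or, which (per the commit message) HIP has implemented since ROCm 4.5. Below is a minimal HIP sketch of that pattern, not the actual Kokkos kernel: the kernel and buffer names are made up for illustration, and plain atomicAdd stands in for Kokkos::atomic_fetch_add.

#include <hip/hip_runtime.h>

__global__ void last_block_flag(unsigned int* global_flags, int* out) {
  unsigned int num_teams_done = 0;  // per-thread register, no __shared__ needed
  if (threadIdx.x + threadIdx.y == 0) {
    num_teams_done = atomicAdd(global_flags, 1u) + 1;
  }
  // __syncthreads_or both synchronizes the block and broadcasts the
  // "are we the last block?" answer to every thread, even though only one
  // thread incremented the counter.
  if (__syncthreads_or(num_teams_done == gridDim.x)) {
    if (threadIdx.x == 0) out[blockIdx.x] = 1;  // this block finished last
  }
}

int main() {
  const int nblocks = 4;
  unsigned int* flags = nullptr;
  int* out            = nullptr;
  (void)hipMalloc(reinterpret_cast<void**>(&flags), sizeof(unsigned int));
  (void)hipMalloc(reinterpret_cast<void**>(&out), nblocks * sizeof(int));
  (void)hipMemset(flags, 0, sizeof(unsigned int));
  (void)hipMemset(out, 0, nblocks * sizeof(int));
  last_block_flag<<<nblocks, 64>>>(flags, out);
  (void)hipDeviceSynchronize();
  (void)hipFree(flags);
  (void)hipFree(out);
  return 0;
}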
@@ -214,16 +210,12 @@ struct HIPReductionsFunctor<FunctorType, false> {

// Use the last block that is done to do the do the reduction across the
// block
- __shared__ unsigned int num_teams_done;
+ unsigned int num_teams_done = 0;
if (threadIdx.x + threadIdx.y == 0) {
num_teams_done = Kokkos::atomic_fetch_add(global_flags, 1) + 1;
}
bool is_last_block = false;
- // FIXME_HIP HIP does not support syncthreads_or. That's why we need to make
- // num_teams_done __shared__
- // if (__syncthreads_or(num_teams_done == gridDim.x)) {*/
- __syncthreads();
- if (num_teams_done == gridDim.x) {
+ if (__syncthreads_or(num_teams_done == gridDim.x)) {
is_last_block = true;
*global_flags = 0;
functor.init(&value);
@@ -395,20 +387,10 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
// Contributing blocks note that their contribution has been completed via an
// atomic-increment flag If this block is not the last block to contribute to
// this group then the block is done.
- // FIXME_HIP __syncthreads_or is not supported by HIP yet.
- // const bool is_last_block = !__syncthreads_or(
- //     threadIdx.y
- //         ? 0
- //         : (1 + atomicInc(global_flags, block_count - 1) < block_count));
- __shared__ int n_done;
- n_done = 0;
- __syncthreads();
- if (threadIdx.y == 0) {
-   n_done = 1 + atomicInc(global_flags, block_count - 1);
- }
- __syncthreads();
- bool const is_last_block = (n_done == static_cast<int>(block_count));
-
+ const bool is_last_block = !__syncthreads_or(
+     threadIdx.y
+         ? 0
+         : (1 + atomicInc(global_flags, block_count - 1) < block_count));
if (is_last_block) {
size_type const b = (static_cast<long long int>(block_count) *
static_cast<long long int>(threadIdx.y)) >>
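
The same simplification lands in hip_single_inter_block_reduce_scan_impl, where the last-block test folds the atomicInc bookkeeping into a single !__syncthreads_or(...) expression. The single-threaded host walk-through below is a simplified illustration, not Kokkos code: atomic_inc mimics HIP's atomicInc wrap-around semantics (return the old value; store old + 1, or 0 once the limit is reached), which is why exactly one block sees is_last_block == true and why the flag resets itself for the next launch.

#include <cstdio>

unsigned int atomic_inc(unsigned int* addr, unsigned int limit) {
  unsigned int old = *addr;
  *addr = (old >= limit) ? 0 : old + 1;
  return old;
}

int main() {
  const unsigned int block_count = 4;
  unsigned int global_flag       = 0;

  for (unsigned int block = 0; block < block_count; ++block) {
    // Predicate contributed by the thread with threadIdx.y == 0 of each block
    // in the real device code.
    bool not_last =
        (1 + atomic_inc(&global_flag, block_count - 1) < block_count);
    std::printf("block %u: is_last_block = %d, flag after = %u\n", block,
                !not_last, global_flag);
  }
  // Only the final block reports is_last_block = 1, and the flag ends at 0,
  // ready for the next launch.
  return 0;
}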
