Skip to content

Commit

Permalink
Update to HIP TeamPolicy Block number heuristic (kokkos#6284)
Browse files Browse the repository at this point in the history
* Update to TeamPolicy Block number heuristic to improve performance in mid-range regimes

* Commented out UseShflReduction=false and related changes, each one has a nearby FIXME_HIP.
Tests are passing, build succeeds.

* Applied formatting patch

* Removed commented out code for smaller size types

* Added check for HintLightWeight_t, to allow more consistent performance on light weight kernels

* Applied formatting patch

* Removed outdated comment block

* Another format patch

* Move computation of block size to a function

* Make compute_block_count const

---------

Co-authored-by: Bruno Turcksin <bruno.turcksin@gmail.com>
  • Loading branch information
IanBogle and Rombur committed Oct 10, 2023
1 parent 4e69e40 commit 8420c2f
Showing 1 changed file with 33 additions and 5 deletions.
38 changes: 33 additions & 5 deletions core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,9 @@ class ParallelReduce<CombinedFunctorReducerType,
using functor_type = FunctorType;
using size_type = HIP::size_type;

// static int constexpr UseShflReduction = false;
// FIXME_HIP This should be disabled unconditionally for best performance, but
// it currently causes tests to fail.
static constexpr int UseShflReduction =
(ReducerType::static_value_size() != 0);

Expand Down Expand Up @@ -654,6 +657,35 @@ class ParallelReduce<CombinedFunctorReducerType,
}
}

int compute_block_count() const {
constexpr auto light_weight =
Kokkos::Experimental::WorkItemProperty::HintLightWeight;
constexpr typename Policy::work_item_property property;
// Numbers were tuned on MI210 using dot product and yAx benchmarks
constexpr int block_max =
(property & light_weight) == light_weight ? 2097152 : 65536;
constexpr int preferred_block_min = 1024;
int block_count = m_league_size;
if (block_count < preferred_block_min) {
// keep blocks as is, already low parallelism
} else if (block_count >= block_max) {
block_count = block_max;

} else {
int nwork = m_league_size * m_team_size;
int items_per_thread =
(nwork + block_count * m_team_size - 1) / (block_count * m_team_size);
if (items_per_thread < 4) {
int ratio = std::min(
(block_count + preferred_block_min - 1) / preferred_block_min,
(4 + items_per_thread - 1) / items_per_thread);
block_count /= ratio;
}
}

return block_count;
}

public:
__device__ inline void operator()() const {
int64_t threadid = 0;
Expand Down Expand Up @@ -681,7 +713,6 @@ class ParallelReduce<CombinedFunctorReducerType,
reference_type value =
reducer.init(kokkos_impl_hip_shared_memory<size_type>() +
threadIdx.y * word_count.value);

// Iterate this block through the league
iterate_through_league(threadid, value);

Expand Down Expand Up @@ -757,10 +788,7 @@ class ParallelReduce<CombinedFunctorReducerType,
Policy::is_graph_kernel::value ||
!std::is_same<ReducerType, InvalidType>::value;
if (!is_empty_range || need_device_set) {
const int block_count =
UseShflReduction
? std::min(m_league_size, size_type(1024 * HIPTraits::WarpSize))
: std::min(static_cast<int>(m_league_size), m_team_size);
int const block_count = compute_block_count();

m_scratch_space = hip_internal_scratch_space(
m_policy.space(), reducer.value_size() * block_count);
Expand Down

0 comments on commit 8420c2f

Please sign in to comment.