Allow templated functors in parallel_for, parallel_reduce and parallel_scan (kokkos#5976)

* Allow templated functors in parallel_for, parallel_reduce and parallel_scan

* Reorder template arguments for cuda_single_inter_block_reduce_scan_shmem

* Add another test to TestFunctorAnalysis.hpp

* Document OverrideValueType some more

* Document that reducer functor is templated on purpose

* GenericScanFunctor->GenericExclusiveScanFunctor

* SizeType->IndexType

* Revert unnecessary changes in Test16_ParallelScan.hpp
masterleinad committed May 10, 2023
1 parent fb0c1b8 commit c62a42e
Showing 35 changed files with 339 additions and 194 deletions.
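
Before the file-by-file diff, a minimal sketch of the kind of functor this change enables (illustration only, not code from this commit; the functor name and the reduction over a long are made up). The call operator is templated on both the index and the accumulator type, so the value type can no longer be read off the operator() signature and is taken from the reduction target instead:

#include <Kokkos_Core.hpp>

// Hypothetical functor in the spirit of the change: operator() is templated
// on purpose instead of naming concrete index and value types.
struct TemplatedSumFunctor {
  template <typename IndexType, typename ValueType>
  KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update) const {
    update += static_cast<ValueType>(i);
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    long result = 0;
    // The value type (long) comes from the reduction argument, not from the
    // templated operator().
    Kokkos::parallel_reduce("templated_sum", Kokkos::RangePolicy<>(0, 100),
                            TemplatedSumFunctor{}, result);
    // result == 0 + 1 + ... + 99 == 4950
  }
  Kokkos::finalize();
}
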
15 changes: 8 additions & 7 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -321,7 +321,7 @@ class ParallelReduce<CombinedFunctorReducerType,
inline unsigned local_block_size(const FunctorType& f) {
unsigned n = CudaTraits::WarpSize * 8;
int shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
using closure_type =
Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
@@ -339,8 +339,9 @@ class ParallelReduce<CombinedFunctorReducerType,
m_policy.space().impl_internal_space_instance(), attr, f, 1,
shmem_size, 0)))) {
n >>= 1;
shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(f, n);
shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
}
return n;
}
@@ -381,8 +382,8 @@ class ParallelReduce<CombinedFunctorReducerType,
const int shmem =
UseShflReduction
? 0
: cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(
: cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
value_type>(
m_functor_reducer.get_functor(), block.y);

CudaParallelLaunch<ParallelReduce, LaunchBounds>(
@@ -428,8 +429,8 @@ class ParallelReduce<CombinedFunctorReducerType,
m_scratch_space(nullptr),
m_scratch_flags(nullptr),
m_unified_space(nullptr) {
check_reduced_view_shmem_size<WorkTag>(m_policy,
m_functor_reducer.get_functor());
check_reduced_view_shmem_size<WorkTag, value_type>(
m_policy, m_functor_reducer.get_functor());
}
};
} // namespace Impl
44 changes: 24 additions & 20 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -255,7 +255,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
inline unsigned local_block_size(const FunctorType& f) {
unsigned n = CudaTraits::WarpSize * 8;
int shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
using closure_type =
Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
@@ -273,8 +273,9 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
m_policy.space().impl_internal_space_instance(), attr, f, 1,
shmem_size, 0)))) {
n >>= 1;
shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(f, n);
shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
}
return n;
}
@@ -314,8 +315,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
const int shmem =
UseShflReduction
? 0
: cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(
: cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
value_type>(
m_functor_reducer.get_functor(), block.y);

if ((nwork == 0)
@@ -373,8 +374,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
m_scratch_space(nullptr),
m_scratch_flags(nullptr),
m_unified_space(nullptr) {
check_reduced_view_shmem_size<WorkTag>(m_policy,
m_functor_reducer.get_functor());
check_reduced_view_shmem_size<WorkTag, value_type>(
m_policy, m_functor_reducer.get_functor());
}
};

@@ -390,7 +391,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
using LaunchBounds = typename Policy::launch_bounds;

using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
Policy, FunctorType>;
Policy, FunctorType, void>;

public:
using pointer_type = typename Analysis::pointer_type;
@@ -609,11 +610,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// testing

unsigned n = CudaTraits::WarpSize * 4;
while (n && unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, FunctorType,
WorkTag>(f, n)) {
while (n &&
unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
value_type>(f, n)) {
n >>= 1;
}
return n;
@@ -703,8 +705,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
using WorkRange = typename Policy::WorkRange;
using LaunchBounds = typename Policy::launch_bounds;

using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
Policy, FunctorType>;
using Analysis =
Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN, Policy,
FunctorType, ReturnType>;

public:
using value_type = typename Analysis::value_type;
@@ -931,11 +934,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// testing

unsigned n = CudaTraits::WarpSize * 4;
while (n && unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, FunctorType,
WorkTag>(f, n)) {
while (n &&
unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
value_type>(f, n)) {
n >>= 1;
}
return n;
10 changes: 5 additions & 5 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -114,7 +114,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
const ParallelReduceTag&) const {
using functor_analysis_type =
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
TeamPolicyInternal, FunctorType>;
TeamPolicyInternal, FunctorType, void>;
using closure_type = Impl::ParallelReduce<
CombinedFunctorReducer<FunctorType,
typename functor_analysis_type::Reducer>,
@@ -153,7 +153,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
const ParallelReduceTag&) const {
using functor_analysis_type =
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
TeamPolicyInternal, FunctorType>;
TeamPolicyInternal, FunctorType, void>;
using closure_type = Impl::ParallelReduce<
CombinedFunctorReducer<FunctorType,
typename functor_analysis_type::Reducer>,
@@ -365,7 +365,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
typename Impl::DeduceFunctorPatternInterface<ClosureType>::type;
using Analysis =
Impl::FunctorAnalysis<Interface, typename ClosureType::Policy,
FunctorType>;
FunctorType, void>;

cudaFuncAttributes attr =
CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
@@ -893,8 +893,8 @@ class ParallelReduce<CombinedFunctorReducerType,
m_team_begin =
UseShflReduction
? 0
: cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(
: cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
value_type>(
arg_functor_reducer.get_functor(), m_team_size);
m_shmem_begin = sizeof(double) * (m_team_size + 2);
m_shmem_size = m_policy.scratch_size(0, m_team_size) +
13 changes: 7 additions & 6 deletions core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -672,34 +672,35 @@ __device__ bool cuda_single_inter_block_reduce_scan(
}

// Size in bytes required for inter block reduce or scan
template <bool DoScan, class FunctorType, class ArgTag>
template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
inline std::enable_if_t<DoScan, unsigned>
cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
const unsigned BlockSize) {
using Analysis =
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
RangePolicy<Cuda, ArgTag>, FunctorType>;
RangePolicy<Cuda, ArgTag>, FunctorType, ValueType>;

return (BlockSize + 2) * Analysis::value_size(functor);
}

template <bool DoScan, class FunctorType, class ArgTag>
template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
inline std::enable_if_t<!DoScan, unsigned>
cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
const unsigned BlockSize) {
using Analysis =
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
RangePolicy<Cuda, ArgTag>, FunctorType>;
RangePolicy<Cuda, ArgTag>, FunctorType, ValueType>;

return (BlockSize + 2) * Analysis::value_size(functor);
}

template <typename WorkTag, typename Policy, typename FunctorType>
template <typename WorkTag, typename ValueType, typename Policy,
typename FunctorType>
inline void check_reduced_view_shmem_size(const Policy& policy,
const FunctorType& functor) {
size_t minBlockSize = CudaTraits::WarpSize * 1;
unsigned reqShmemSize =
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, ValueType>(
functor, minBlockSize);
size_t maxShmemPerBlock =
policy.space().impl_internal_space_instance()->m_maxShmemPerBlock;
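
A side note on the reordered template parameters of cuda_single_inter_block_reduce_scan_shmem above: FunctorType moves to the last position so it can be deduced from the function argument, leaving only the non-deducible DoScan, ArgTag, and the now-explicit ValueType at the call sites. A small standalone sketch of that pattern follows (hypothetical names, not Kokkos API; sizeof(ValueType) stands in for Analysis::value_size):

#include <cstdio>

// Stand-in for cuda_single_inter_block_reduce_scan_shmem: the deducible
// FunctorType comes last, and the body mirrors the (BlockSize + 2) * value
// size formula from the diff above.
template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
unsigned shmem_bytes(const FunctorType& /*functor*/, unsigned block_size) {
  return (block_size + 2) * static_cast<unsigned>(sizeof(ValueType));
}

struct SomeFunctor {};

int main() {
  SomeFunctor f;
  // FunctorType is deduced from f; the explicit arguments match the shape of
  // the updated call sites, e.g.
  //   cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(f, n);
  unsigned bytes = shmem_bytes<false, void, double>(f, 64);
  std::printf("%u\n", bytes);  // (64 + 2) * sizeof(double) == 528
}
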
6 changes: 4 additions & 2 deletions core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -1042,7 +1042,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
// Extract value_type from closure

using value_type = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
void>::value_type;

if (1 < loop_boundaries.thread.team_size()) {
// make sure all threads perform all loop iterations
@@ -1107,7 +1108,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
// Extract value_type from closure

using value_type = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
void>::value_type;

if (1 < loop_boundaries.thread.team_size()) {
// make sure all threads perform all loop iterations
15 changes: 9 additions & 6 deletions core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -196,8 +196,9 @@ class CudaTeamMember {
(void)reducer;
(void)value;
KOKKOS_IF_ON_DEVICE(
(typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
TeamPolicy<Cuda>, ReducerType>::Reducer
(typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>,
ReducerType, typename ReducerType::value_type>::Reducer
wrapped_reducer(reducer);
cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);
reducer.reference() = value;))
@@ -228,7 +229,8 @@ class CudaTeamMember {
Impl::CudaJoinFunctor<Type> cuda_join_functor;
typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::SCAN, TeamPolicy<Cuda>,
Impl::CudaJoinFunctor<Type>>::Reducer reducer(cuda_join_functor);
Impl::CudaJoinFunctor<Type>, Type>::Reducer
reducer(cuda_join_functor);
Impl::cuda_intra_block_reduce_scan<true>(reducer, base_data + 1);

if (global_accum) {
@@ -688,8 +690,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
const FunctorType& lambda) {
// Extract value_type from lambda
using value_type = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void,
FunctorType>::value_type;
Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
void>::value_type;

const auto start = loop_bounds.start;
const auto end = loop_bounds.end;
@@ -825,7 +827,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
loop_boundaries,
const Closure& closure) {
using value_type = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
void>::value_type;
value_type dummy;
parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy));
}
8 changes: 4 additions & 4 deletions core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -281,8 +281,8 @@ class ParallelReduce<CombinedFunctorReducerType,
inline unsigned local_block_size(const FunctorType& f) {
const auto& instance = m_policy.space().impl_internal_space_instance();
auto shmem_functor = [&f](unsigned n) {
return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(f, n);
return hip_single_inter_block_reduce_scan_shmem<false, WorkTag,
value_type>(f, n);
};

unsigned block_size =
@@ -331,8 +331,8 @@ class ParallelReduce<CombinedFunctorReducerType,

const int shmem =
::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
false, FunctorType, WorkTag>(m_functor_reducer.get_functor(),
block.y);
false, WorkTag, value_type>(m_functor_reducer.get_functor(),
block.y);

hip_parallel_launch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
