Skip to content

Commit

Permalink
kokkos#5635: Add parallel_scan with value for CUDA and ThreadVectorRange
Browse files Browse the repository at this point in the history
  • Loading branch information
thearusable authored and cz4rs committed Sep 19, 2023
1 parent 6a95b5f commit 9632055
Showing 1 changed file with 26 additions and 0 deletions.
26 changes: 26 additions & 0 deletions core/src/Cuda/Kokkos_Cuda_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,32 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy));
}

/** \brief  Intra-thread vector parallel exclusive prefix sum that also
 *          hands the accumulated value back to the caller.
 *
 *  Invokes closure(iType i, ValueType& val, bool final) for every i in
 *  [0..N). The range [0..N) is distributed over all vector lanes of the
 *  thread and a scan operation is carried out.
 *  final == true on the last invocation of closure.
 *
 *  \param loop_boundaries  ThreadVectorRange boundaries of a CUDA team member.
 *  \param closure          Scan body; its value type must be exactly
 *                          ValueType (enforced at compile time).
 *  \param return_val       Overwritten with the value that the reducer-based
 *                          parallel_scan overload stores through the
 *                          Kokkos::Sum reducer.
 */
template <typename iType, class Closure, typename ValueType>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>&
        loop_boundaries,
    const Closure& closure, ValueType& return_val) {
  // Ask FunctorAnalysis which value type the closure actually works on ...
  using scan_analysis = Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, ValueType>;
  using functor_value_type = typename scan_analysis::value_type;

  // ... and reject any mismatch with the caller-supplied result type.
  static_assert(std::is_same<ValueType, functor_value_type>::value,
                "Non-matching value types of closure and return type");

  // Scratch target for the Sum reducer; the result is copied out afterwards.
  ValueType scan_total;

  // The reducer is deliberately passed as a temporary: a named non-const
  // reducer object could bind to this overload's `ValueType&` parameter,
  // whereas an rvalue cannot, so the call is forwarded to another overload.
  parallel_scan(loop_boundaries, closure, Kokkos::Sum<ValueType>(scan_total));

  return_val = scan_total;
}

} // namespace Kokkos

namespace Kokkos {
Expand Down

0 comments on commit 9632055

Please sign in to comment.