Allow templated functors in parallel_for, parallel_reduce and parallel_scan (kokkos#5976)

* Allow templated functors in parallel_for, parallel_reduce and parallel_scan

* Reorder template arguments for cuda_single_inter_block_reduce_scan_shmem

* Add another test to TestFunctorAnalysis.hpp

* Document OverrideValueType some more

* Document that reducer functor is templated on purpose

* GenericScanFunctor->GenericExclusiveScanFunctor

* SizeType->IndexType

* Revert unnecessary changes in Test16_ParallelScan.hpp
masterleinad committed May 10, 2023
1 parent fb0c1b8 commit c62a42e
Showing 35 changed files with 339 additions and 194 deletions.
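
Before the file-by-file diff, a minimal sketch of the kind of functor this change enables (illustration only, not code from this commit; the functor name and the reduction over a long are made up). The call operator is templated on both the index and the accumulator type, so the value type can no longer be read off the operator() signature and is taken from the reduction target instead:

#include <Kokkos_Core.hpp>

// Hypothetical functor in the spirit of the change: operator() is templated
// on purpose instead of naming concrete index and value types.
struct TemplatedSumFunctor {
  template <typename IndexType, typename ValueType>
  KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update) const {
    update += static_cast<ValueType>(i);
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    long result = 0;
    // The value type (long) comes from the reduction argument, not from the
    // templated operator().
    Kokkos::parallel_reduce("templated_sum", Kokkos::RangePolicy<>(0, 100),
                            TemplatedSumFunctor{}, result);
    // result == 0 + 1 + ... + 99 == 4950
  }
  Kokkos::finalize();
}
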
15 changes: 8 additions & 7 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -321,7 +321,7 @@ class ParallelReduce<CombinedFunctorReducerType,
inline unsigned local_block_size(const FunctorType& f) {
unsigned n = CudaTraits::WarpSize * 8;
int shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
using closure_type =
Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
@@ -339,8 +339,9 @@ class ParallelReduce<CombinedFunctorReducerType,
m_policy.space().impl_internal_space_instance(), attr, f, 1,
shmem_size, 0)))) {
n >>= 1;
shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(f, n);
shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
}
return n;
}
@@ -381,8 +382,8 @@ class ParallelReduce<CombinedFunctorReducerType,
const int shmem =
UseShflReduction
? 0
: cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(
: cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
value_type>(
m_functor_reducer.get_functor(), block.y);

CudaParallelLaunch<ParallelReduce, LaunchBounds>(
@@ -428,8 +429,8 @@ class ParallelReduce<CombinedFunctorReducerType,
m_scratch_space(nullptr),
m_scratch_flags(nullptr),
m_unified_space(nullptr) {
check_reduced_view_shmem_size<WorkTag>(m_policy,
m_functor_reducer.get_functor());
check_reduced_view_shmem_size<WorkTag, value_type>(
m_policy, m_functor_reducer.get_functor());
}
};
} // namespace Impl
44 changes: 24 additions & 20 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -255,7 +255,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
inline unsigned local_block_size(const FunctorType& f) {
unsigned n = CudaTraits::WarpSize * 8;
int shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
using closure_type =
Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
@@ -273,8 +273,9 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
m_policy.space().impl_internal_space_instance(), attr, f, 1,
shmem_size, 0)))) {
n >>= 1;
shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(f, n);
shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
}
return n;
}
@@ -314,8 +315,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
const int shmem =
UseShflReduction
? 0
: cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(
: cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
value_type>(
m_functor_reducer.get_functor(), block.y);

if ((nwork == 0)
@@ -373,8 +374,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
m_scratch_space(nullptr),
m_scratch_flags(nullptr),
m_unified_space(nullptr) {
check_reduced_view_shmem_size<WorkTag>(m_policy,
m_functor_reducer.get_functor());
check_reduced_view_shmem_size<WorkTag, value_type>(
m_policy, m_functor_reducer.get_functor());
}
};

@@ -390,7 +391,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
using LaunchBounds = typename Policy::launch_bounds;

using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
Policy, FunctorType>;
Policy, FunctorType, void>;

public:
using pointer_type = typename Analysis::pointer_type;
@@ -609,11 +610,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// testing

unsigned n = CudaTraits::WarpSize * 4;
while (n && unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, FunctorType,
WorkTag>(f, n)) {
while (n &&
unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
value_type>(f, n)) {
n >>= 1;
}
return n;
@@ -703,8 +705,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
using WorkRange = typename Policy::WorkRange;
using LaunchBounds = typename Policy::launch_bounds;

using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
Policy, FunctorType>;
using Analysis =
Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN, Policy,
FunctorType, ReturnType>;

public:
using value_type = typename Analysis::value_type;
@@ -931,11 +934,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// testing

unsigned n = CudaTraits::WarpSize * 4;
while (n && unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, FunctorType,
WorkTag>(f, n)) {
while (n &&
unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
value_type>(f, n)) {
n >>= 1;
}
return n;
10 changes: 5 additions & 5 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -114,7 +114,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
const ParallelReduceTag&) const {
using functor_analysis_type =
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
TeamPolicyInternal, FunctorType>;
TeamPolicyInternal, FunctorType, void>;
using closure_type = Impl::ParallelReduce<
CombinedFunctorReducer<FunctorType,
typename functor_analysis_type::Reducer>,
@@ -153,7 +153,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
const ParallelReduceTag&) const {
using functor_analysis_type =
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
TeamPolicyInternal, FunctorType>;
TeamPolicyInternal, FunctorType, void>;
using closure_type = Impl::ParallelReduce<
CombinedFunctorReducer<FunctorType,
typename functor_analysis_type::Reducer>,
@@ -365,7 +365,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
typename Impl::DeduceFunctorPatternInterface<ClosureType>::type;
using Analysis =
Impl::FunctorAnalysis<Interface, typename ClosureType::Policy,
FunctorType>;
FunctorType, void>;

cudaFuncAttributes attr =
CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
@@ -893,8 +893,8 @@ class ParallelReduce<CombinedFunctorReducerType,
m_team_begin =
UseShflReduction
? 0
: cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(
: cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
value_type>(
arg_functor_reducer.get_functor(), m_team_size);
m_shmem_begin = sizeof(double) * (m_team_size + 2);
m_shmem_size = m_policy.scratch_size(0, m_team_size) +
13 changes: 7 additions & 6 deletions core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -672,34 +672,35 @@ __device__ bool cuda_single_inter_block_reduce_scan(
}

// Size in bytes required for inter block reduce or scan
template <bool DoScan, class FunctorType, class ArgTag>
template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
inline std::enable_if_t<DoScan, unsigned>
cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
const unsigned BlockSize) {
using Analysis =
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
RangePolicy<Cuda, ArgTag>, FunctorType>;
RangePolicy<Cuda, ArgTag>, FunctorType, ValueType>;

return (BlockSize + 2) * Analysis::value_size(functor);
}

template <bool DoScan, class FunctorType, class ArgTag>
template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
inline std::enable_if_t<!DoScan, unsigned>
cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
const unsigned BlockSize) {
using Analysis =
Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
RangePolicy<Cuda, ArgTag>, FunctorType>;
RangePolicy<Cuda, ArgTag>, FunctorType, ValueType>;

return (BlockSize + 2) * Analysis::value_size(functor);
}

template <typename WorkTag, typename Policy, typename FunctorType>
template <typename WorkTag, typename ValueType, typename Policy,
typename FunctorType>
inline void check_reduced_view_shmem_size(const Policy& policy,
const FunctorType& functor) {
size_t minBlockSize = CudaTraits::WarpSize * 1;
unsigned reqShmemSize =
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, ValueType>(
functor, minBlockSize);
size_t maxShmemPerBlock =
policy.space().impl_internal_space_instance()->m_maxShmemPerBlock;
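
A side note on the reordered template parameters of cuda_single_inter_block_reduce_scan_shmem above: FunctorType moves to the last position so it can be deduced from the function argument, leaving only the non-deducible DoScan, ArgTag, and the now-explicit ValueType at the call sites. A small standalone sketch of that pattern follows (hypothetical names, not Kokkos API; sizeof(ValueType) stands in for Analysis::value_size):

#include <cstdio>

// Stand-in for cuda_single_inter_block_reduce_scan_shmem: the deducible
// FunctorType comes last, and the body mirrors the (BlockSize + 2) * value
// size formula from the diff above.
template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
unsigned shmem_bytes(const FunctorType& /*functor*/, unsigned block_size) {
  return (block_size + 2) * static_cast<unsigned>(sizeof(ValueType));
}

struct SomeFunctor {};

int main() {
  SomeFunctor f;
  // FunctorType is deduced from f; the explicit arguments match the shape of
  // the updated call sites, e.g.
  //   cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(f, n);
  unsigned bytes = shmem_bytes<false, void, double>(f, 64);
  std::printf("%u\n", bytes);  // (64 + 2) * sizeof(double) == 528
}
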
6 changes: 4 additions & 2 deletions core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -1042,7 +1042,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
// Extract value_type from closure

using value_type = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
void>::value_type;

if (1 < loop_boundaries.thread.team_size()) {
// make sure all threads perform all loop iterations
@@ -1107,7 +1108,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
// Extract value_type from closure

using value_type = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
void>::value_type;

if (1 < loop_boundaries.thread.team_size()) {
// make sure all threads perform all loop iterations
15 changes: 9 additions & 6 deletions core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -196,8 +196,9 @@ class CudaTeamMember {
(void)reducer;
(void)value;
KOKKOS_IF_ON_DEVICE(
(typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
TeamPolicy<Cuda>, ReducerType>::Reducer
(typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>,
ReducerType, typename ReducerType::value_type>::Reducer
wrapped_reducer(reducer);
cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);
reducer.reference() = value;))
@@ -228,7 +229,8 @@ class CudaTeamMember {
Impl::CudaJoinFunctor<Type> cuda_join_functor;
typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::SCAN, TeamPolicy<Cuda>,
Impl::CudaJoinFunctor<Type>>::Reducer reducer(cuda_join_functor);
Impl::CudaJoinFunctor<Type>, Type>::Reducer
reducer(cuda_join_functor);
Impl::cuda_intra_block_reduce_scan<true>(reducer, base_data + 1);

if (global_accum) {
@@ -688,8 +690,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
const FunctorType& lambda) {
// Extract value_type from lambda
using value_type = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void,
FunctorType>::value_type;
Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
void>::value_type;

const auto start = loop_bounds.start;
const auto end = loop_bounds.end;
@@ -825,7 +827,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
loop_boundaries,
const Closure& closure) {
using value_type = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
void>::value_type;
value_type dummy;
parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy));
}
8 changes: 4 additions & 4 deletions core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -281,8 +281,8 @@ class ParallelReduce<CombinedFunctorReducerType,
inline unsigned local_block_size(const FunctorType& f) {
const auto& instance = m_policy.space().impl_internal_space_instance();
auto shmem_functor = [&f](unsigned n) {
return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
WorkTag>(f, n);
return hip_single_inter_block_reduce_scan_shmem<false, WorkTag,
value_type>(f, n);
};

unsigned block_size =
@@ -331,8 +331,8 @@ class ParallelReduce<CombinedFunctorReducerType,

const int shmem =
::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
false, FunctorType, WorkTag>(m_functor_reducer.get_functor(),
block.y);
false, WorkTag, value_type>(m_functor_reducer.get_functor(),
block.y);

hip_parallel_launch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
