Trim some fat in CudaInternal (towards multiple GPUs support) (kokkos#6544)

* Remove unused Impl::cuda_internal_maximum_concurrent_block_count function

* Drop unused CudaInternal::m_shmemPerSM

* Drop unused CudaInternal::m_maxBlocksPerSM

* Drop unused CudaInternal::m_maxThreadsPerBlock

* Drop (unused) Impl::cuda_internal_maximum_warp_count()

* Drop CudaInternal::m_maxWarpCount data member

* Drop CudaInternal::m_multiProcCount data member

* Drop CudaInternal::m_maxThreadsPerSM data member

* Drop CudaInternal::m_maxBlock data member

* Get rid of Impl::cuda_internal_maximum_grid_count()

* Drop CudaInternal::m_maxShmemPerBlock static data member

* Drop Impl::cuda_internal_multiprocessor_count()

* Drop (unused) CudaTraits::warp_{count,align} static member functions

* Fixup exception msg

* Fixup tasking get exec space from the scheduler

* Try to get to the device properties some other way in cuda tasking
dalg24 committed Oct 30, 2023
1 parent 201d1de commit 400dd1d
Showing 10 changed files with 53 additions and 118 deletions.
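
The common thread across these files: instead of mirroring individual device properties in static CudaInternal data members (a dead end once different execution space instances may sit on different GPUs), callers now query the cudaDeviceProp that CudaInternal already caches, exposed via cuda_device_prop(). A minimal standalone sketch of the underlying CUDA runtime query — illustrative only, not the Kokkos code itself; device 0 is an assumption:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, /*device=*/0);  // assumption: device 0

  // Fields the dropped CudaInternal members used to mirror:
  std::printf("multiProcessorCount         = %d\n", prop.multiProcessorCount);
  std::printf("maxThreadsPerBlock          = %d\n", prop.maxThreadsPerBlock);
  std::printf("maxThreadsPerMultiProcessor = %d\n", prop.maxThreadsPerMultiProcessor);
  std::printf("sharedMemPerBlock           = %zu\n", prop.sharedMemPerBlock);
  std::printf("sharedMemPerMultiprocessor  = %zu\n", prop.sharedMemPerMultiprocessor);
  std::printf("maxGridSize                 = %d x %d x %d\n",
              prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
}
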
65 changes: 7 additions & 58 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -383,8 +383,13 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) {
// Allocate some initial space. This will grow as needed.

{
const unsigned reduce_block_count =
m_maxWarpCount * Impl::CudaTraits::WarpSize;
// Maximum number of warps,
// at most one warp per thread in a warp for reduction.
auto const maxWarpCount = std::min<unsigned>(
m_deviceProp.maxThreadsPerBlock / CudaTraits::WarpSize,
CudaTraits::WarpSize);
unsigned const reduce_block_count =
maxWarpCount * Impl::CudaTraits::WarpSize;

(void)scratch_unified(16 * sizeof(size_type));
(void)scratch_flags(reduce_block_count * 2 * sizeof(size_type));
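
A quick worked example of the computation that replaces m_maxWarpCount, assuming a device reporting maxThreadsPerBlock = 1024 and the usual WarpSize of 32:

#include <algorithm>
#include <cstdio>

int main() {
  unsigned const warp_size = 32;                // CudaTraits::WarpSize
  unsigned const max_threads_per_block = 1024;  // assumed device value
  auto const maxWarpCount =
      std::min(max_threads_per_block / warp_size, warp_size);    // = 32
  unsigned const reduce_block_count = maxWarpCount * warp_size;  // = 1024
  // initial scratch_flags allocation: 1024 * 2 * sizeof(size_type) bytes
  std::printf("reduce_block_count = %u\n", reduce_block_count);
}
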
@@ -624,30 +629,6 @@ void CudaInternal::finalize() {

//----------------------------------------------------------------------------

Cuda::size_type cuda_internal_multiprocessor_count() {
return CudaInternal::singleton().m_multiProcCount;
}

CudaSpace::size_type cuda_internal_maximum_concurrent_block_count() {
#if defined(KOKKOS_ARCH_KEPLER)
// Compute capability 3.0 through 3.7
enum : int { max_resident_blocks_per_multiprocessor = 16 };
#else
// Compute capability 5.0 through 6.2
enum : int { max_resident_blocks_per_multiprocessor = 32 };
#endif
return CudaInternal::singleton().m_multiProcCount *
max_resident_blocks_per_multiprocessor;
};

Cuda::size_type cuda_internal_maximum_warp_count() {
return CudaInternal::singleton().m_maxWarpCount;
}

std::array<Cuda::size_type, 3> cuda_internal_maximum_grid_count() {
return CudaInternal::singleton().m_maxBlock;
}

Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance,
const std::size_t size) {
return instance.impl_internal_space_instance()->scratch_space(size);
@@ -761,38 +742,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif

//----------------------------------
// number of multiprocessors
Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount;

//----------------------------------
// Maximum number of warps,
// at most one warp per thread in a warp for reduction.
Impl::CudaInternal::m_maxWarpCount =
cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize;

if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) {
Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize;
}

//----------------------------------
// Maximum number of blocks:

Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0];
Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1];
Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2];

Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor;
Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock;
Impl::CudaInternal::m_maxBlocksPerSM =
Impl::CudaInternal::m_cudaArch < 500
? 16
: (Impl::CudaInternal::m_cudaArch < 750
? 32
: (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32));
Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor;
Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock;

//----------------------------------

cudaStream_t singleton_stream;
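
The m_maxBlocksPerSM table dropped above hard-coded maximum resident blocks per SM by architecture (16 below CC 5.0, 32 from CC 5.0 on, with CC 7.5 back at 16). As an aside — this is not what the commit does, the member was simply unused — the CUDA runtime can compute a per-kernel limit directly, which also accounts for register and shared-memory pressure. A hedged sketch; the kernel, block size, and dynamic shmem size are arbitrary assumptions:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummy_kernel() {}

int main() {
  int blocks_per_sm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, dummy_kernel,
                                                /*blockSize=*/128,
                                                /*dynamicSMemSize=*/0);
  std::printf("max resident blocks per SM: %d\n", blocks_per_sm);
}
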
27 changes: 1 addition & 26 deletions core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -55,27 +55,10 @@ struct CudaTraits {
unsigned long[ConstantMemoryUsage / sizeof(unsigned long)];

static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */;

KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count(
CudaSpace::size_type i) {
return (i + WarpIndexMask) >> WarpIndexShift;
}

KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align(
CudaSpace::size_type i) {
constexpr CudaSpace::size_type Mask = ~WarpIndexMask;
return (i + WarpIndexMask) & Mask;
}
};

//----------------------------------------------------------------------------

CudaSpace::size_type cuda_internal_multiprocessor_count();
CudaSpace::size_type cuda_internal_maximum_warp_count();
std::array<CudaSpace::size_type, 3> cuda_internal_maximum_grid_count();

CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();

CudaSpace::size_type* cuda_internal_scratch_flags(const Cuda&,
const std::size_t size);
CudaSpace::size_type* cuda_internal_scratch_space(const Cuda&,
@@ -104,15 +87,7 @@ class CudaInternal {
inline static int m_cudaDev = -1;

// Device Properties
inline static int m_cudaArch = -1;
inline static unsigned m_multiProcCount = 0;
inline static unsigned m_maxWarpCount = 0;
inline static std::array<size_type, 3> m_maxBlock = {0, 0, 0};
inline static int m_shmemPerSM = 0;
inline static int m_maxShmemPerBlock = 0;
inline static int m_maxBlocksPerSM = 0;
inline static int m_maxThreadsPerSM = 0;
inline static int m_maxThreadsPerBlock = 0;
inline static int m_cudaArch = -1;
static int concurrency();

inline static cudaDeviceProp m_deviceProp;
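
The removed warp_count/warp_align helpers rounded a thread count up to whole warps with the usual mask-and-shift tricks (in CudaTraits, WarpIndexMask = WarpSize - 1 = 31 and WarpIndexShift = log2(32) = 5). A standalone demo of what they computed:

#include <cstdio>

constexpr unsigned WarpIndexMask  = 31;  // WarpSize - 1
constexpr unsigned WarpIndexShift = 5;   // log2(WarpSize)

constexpr unsigned warp_count(unsigned i) {
  return (i + WarpIndexMask) >> WarpIndexShift;  // warps needed for i threads
}
constexpr unsigned warp_align(unsigned i) {
  return (i + WarpIndexMask) & ~WarpIndexMask;   // i rounded up to whole warps
}

int main() {
  std::printf("warp_count(100) = %u\n", warp_count(100));  // 4 warps
  std::printf("warp_align(100) = %u\n", warp_align(100));  // 128 threads
}
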
8 changes: 4 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -21,7 +21,6 @@
#ifdef KOKKOS_ENABLE_CUDA

#include <mutex>
#include <string>
#include <cstdint>
#include <cmath>
#include <Kokkos_Parallel.hpp>
@@ -118,10 +117,11 @@ inline bool is_empty_launch(dim3 const& grid, dim3 const& block) {
}

inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
if (cuda_instance->m_maxShmemPerBlock < shmem) {
int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock;
if (maxShmemPerBlock < shmem) {
Kokkos::Impl::throw_runtime_exception(
std::string("CudaParallelLaunch (or graph node creation) FAILED: shared"
" memory request is too large"));
"CudaParallelLaunch (or graph node creation) FAILED: shared memory "
"request is too large");
}
}

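
The guard's logic, restated as a self-contained sketch — not the Kokkos function itself; cuda_instance and the Kokkos exception machinery are replaced by a plain device query and std::runtime_error. Many devices report 49152 bytes (48 KB) for sharedMemPerBlock by default:

#include <cuda_runtime.h>
#include <stdexcept>

void check_shmem_request_sketch(int shmem) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, /*device=*/0);  // assumption: device 0
  int const maxShmemPerBlock = prop.sharedMemPerBlock;
  if (maxShmemPerBlock < shmem)
    throw std::runtime_error(
        "CudaParallelLaunch (or graph node creation) FAILED: shared memory "
        "request is too large");
}
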
4 changes: 2 additions & 2 deletions core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp
@@ -40,8 +40,8 @@ template <>
inline TileSizeProperties get_tile_size_properties<Kokkos::Cuda>(
const Kokkos::Cuda& space) {
TileSizeProperties properties;
properties.max_threads =
space.impl_internal_space_instance()->m_maxThreadsPerSM;
properties.max_threads = space.impl_internal_space_instance()
->m_deviceProp.maxThreadsPerMultiProcessor;
properties.default_largest_tile_size = 16;
properties.default_tile_size = 2;
properties.max_total_tile_size = 512;
8 changes: 4 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -96,7 +96,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {

inline void execute() const {
if (m_rp.m_num_tiles == 0) return;
const auto maxblocks = cuda_internal_maximum_grid_count();
const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize;
if (RP::rank == 2) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
KOKKOS_ASSERT(block.x > 0);
@@ -320,6 +320,8 @@ class ParallelReduce<CombinedFunctorReducerType,
// Determine block size constrained by shared memory:
inline unsigned local_block_size(const FunctorType& f) {
unsigned n = CudaTraits::WarpSize * 8;
int const maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
int shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
@@ -330,9 +332,7 @@
CudaParallelLaunch<closure_type,
LaunchBounds>::get_cuda_func_attributes();
while (
(n &&
(m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
shmem_size)) ||
(n && (maxShmemPerBlock < shmem_size)) ||
(n >
static_cast<unsigned>(
Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
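
The local_block_size pattern visible in this hunk and its MDRange twin above: start at WarpSize * 8 = 256 threads and halve until the inter-block reduce/scan shared-memory requirement fits under the per-block limit. A simplified sketch — the real loop also consults cuda_get_max_block_size and kernel attributes, and shmem_for here stands in for cuda_single_inter_block_reduce_scan_shmem:

#include <cstddef>

unsigned local_block_size_sketch(std::size_t maxShmemPerBlock,
                                 std::size_t (*shmem_for)(unsigned)) {
  unsigned n = 32 * 8;  // WarpSize * 8 = 256 threads to start
  while (n && maxShmemPerBlock < shmem_for(n)) {
    n >>= 1;  // halve the candidate block size until its shmem fits
  }
  return n;   // n == 0 means no feasible block size was found
}
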
24 changes: 12 additions & 12 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -94,10 +94,10 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
0, 0);
KOKKOS_ASSERT(block_size > 0);
dim3 block(1, block_size, 1);
const int maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0];
dim3 grid(
std::min(
typename Policy::index_type((nwork + block.y - 1) / block.y),
typename Policy::index_type(cuda_internal_maximum_grid_count()[0])),
std::min(typename Policy::index_type((nwork + block.y - 1) / block.y),
typename Policy::index_type(maxGridSizeX)),
1, 1);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
@@ -254,6 +254,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
// Determine block size constrained by shared memory:
inline unsigned local_block_size(const FunctorType& f) {
unsigned n = CudaTraits::WarpSize * 8;
const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
int shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
@@ -264,9 +266,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
CudaParallelLaunch<closure_type,
LaunchBounds>::get_cuda_func_attributes();
while (
(n &&
(m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
shmem_size)) ||
(n && (maxShmemPerBlock < shmem_size)) ||
(n >
static_cast<unsigned>(
Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
@@ -609,11 +609,11 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
// testing

const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
unsigned n = CudaTraits::WarpSize * 4;
while (n &&
unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
unsigned(maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
value_type>(f, n)) {
n >>= 1;
@@ -933,11 +933,11 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
// testing

const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
unsigned n = CudaTraits::WarpSize * 4;
while (n &&
unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
unsigned(maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
value_type>(f, n)) {
n >>= 1;
14 changes: 9 additions & 5 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -262,7 +262,8 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
m_tune_team(bool(team_size_request <= 0)),
m_tune_vector(bool(vector_length_request <= 0)) {
// Make sure league size is permissible
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0]))
const int maxGridSizeX = m_space.cuda_device_prop().maxGridSize[0];
if (league_size_ >= maxGridSizeX)
Impl::throw_runtime_exception(
"Requested too large league_size for TeamPolicy on Cuda execution "
"space.");
@@ -575,10 +576,11 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
static_cast<std::int64_t>(m_league_size))));
}

const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
const int shmem_size_total = m_shmem_begin + m_shmem_size;
if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) {
printf("%i %i\n", internal_space_instance->m_maxShmemPerBlock,
shmem_size_total);
if (maxShmemPerBlock < shmem_size_total) {
printf("%i %i\n", maxShmemPerBlock, shmem_size_total);
Kokkos::Impl::throw_runtime_exception(std::string(
"Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
}
@@ -935,6 +937,8 @@ class ParallelReduce<CombinedFunctorReducerType,
// Functor's reduce memory, team scan memory, and team shared memory depend
// upon team size.

const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;

if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
@@ -943,7 +947,7 @@
std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
}

if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) {
if (maxShmemPerBlock < shmem_size_total) {
Kokkos::Impl::throw_runtime_exception(
std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much "
"L0 scratch memory"));
3 changes: 1 addition & 2 deletions core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -702,8 +702,7 @@ inline void check_reduced_view_shmem_size(const Policy& policy,
unsigned reqShmemSize =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, ValueType>(
functor, minBlockSize);
size_t maxShmemPerBlock =
policy.space().impl_internal_space_instance()->m_maxShmemPerBlock;
size_t maxShmemPerBlock = policy.space().cuda_device_prop().sharedMemPerBlock;

if (reqShmemSize > maxShmemPerBlock) {
Kokkos::Impl::throw_runtime_exception(
14 changes: 10 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -84,8 +84,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
KOKKOS_INLINE_FUNCTION
static void iff_single_thread_recursive_execute(scheduler_type const&) {}

static int get_max_team_count(execution_space const&) {
return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block;
static int get_max_team_count(execution_space const& space) {
return space.cuda_device_prop().multiProcessorCount * warps_per_block;
}

__device__ static void driver(scheduler_type scheduler,
@@ -225,7 +225,9 @@
// FIXME_CUDA_MULTIPLE_DEVICES
static void execute(scheduler_type const& scheduler) {
const int shared_per_warp = 2048;
const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
const int multi_processor_count =
scheduler.get_execution_space().cuda_device_prop().multiProcessorCount;
const dim3 grid(multi_processor_count, 1, 1);
const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
const int shared_total = shared_per_warp * warps_per_block;
const cudaStream_t stream = nullptr;
@@ -466,7 +468,11 @@ class TaskQueueSpecializationConstrained<
static void execute(scheduler_type const& scheduler) {
const int shared_per_warp = 2048;
const int warps_per_block = 4;
const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
const int multi_processor_count =
// FIXME not sure why this didn't work
// scheduler.get_execution_space().cuda_device_prop().multiProcessorCount;
CudaInternal::singleton().m_deviceProp.multiProcessorCount;
const dim3 grid(multi_processor_count, 1, 1);
// const dim3 grid( 1 , 1 , 1 );
const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
const int shared_total = shared_per_warp * warps_per_block;
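
The tasking and work-graph backends all launch the same shape: one block per multiprocessor, each block holding warps_per_block warps laid out as (1, WarpSize, warps_per_block), with a fixed shared-memory budget per warp. A sketch mirroring the values in the diffs above (not the Kokkos launch code itself):

#include <cuda_runtime.h>

void task_launch_shape_sketch(cudaDeviceProp const& prop, dim3& grid,
                              dim3& block, int& shared_total) {
  int const warps_per_block = 4;
  int const shared_per_warp = 2048;             // bytes of shmem per warp
  grid = dim3(prop.multiProcessorCount, 1, 1);  // one block per SM
  block = dim3(1, /*WarpSize=*/32, warps_per_block);
  shared_total = shared_per_warp * warps_per_block;
}
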
4 changes: 3 additions & 1 deletion core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -77,7 +77,9 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,

inline void execute() {
const int warps_per_block = 4;
const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
const int multi_processor_count =
m_policy.space().cuda_device_prop().multiProcessorCount;
const dim3 grid(multi_processor_count, 1, 1);
const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
const int shared = 0;

