Cuda multi-GPU support: Pass the correct device id to get_cuda_kernel_func_attributes (kokkos#6767)

* Pass the correct device id to get_cuda_kernel_func_attributes
* Address review comments
* Fix another occurrence
* Fix configure_shmem_preference
masterleinad · Feb 9, 2024 · 3b515c9 · 3b515c9
1 parent 7ff87a5
commit 3b515c9
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 38 deletions.
diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -128,32 +128,32 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
 // These functions need to be templated on DriverType and LaunchBounds
 // so that the static bool is unique for each type combo
 // KernelFuncPtr does not necessarily contain that type information.
-// FIXME_CUDA_MULTIPLE_DEVICES
 template <class DriverType, class LaunchBounds, class KernelFuncPtr>
 const cudaFuncAttributes& get_cuda_kernel_func_attributes(
-    const KernelFuncPtr& func) {
+    int cuda_device, const KernelFuncPtr& func) {
   // Only call cudaFuncGetAttributes once for each unique kernel
   // by leveraging static variable initialization rules
-  auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+  static std::map<int, cudaFuncAttributes> func_attr;
+  if (func_attr.find(cuda_device) == func_attr.end()) {
     cudaFuncAttributes attr;
-    KOKKOS_IMPL_CUDA_SAFE_CALL(
-        (CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr,
-                                                                    func)));
-    return attr;
-  };
-  static cudaFuncAttributes func_attr = wrap_get_attributes();
-  return func_attr;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+    func_attr.emplace(cuda_device, attr);
+  }
+  return func_attr[cuda_device];
 }
 
 template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(const KernelFuncPtr& func,
+inline void configure_shmem_preference(const int cuda_device,
+                                       const KernelFuncPtr& func,
                                        const cudaDeviceProp& device_props,
                                        const size_t block_size, int& shmem,
                                        const size_t occupancy) {
 #ifndef KOKKOS_ARCH_KEPLER
 
   const auto& func_attr =
-      get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+      get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(cuda_device,
+                                                                func);
 
   // Compute limits for number of blocks due to registers/SM
   const size_t regs_per_sm     = device_props.regsPerMultiprocessor;
@@ -387,8 +387,8 @@ struct CudaParallelLaunchKernelInvoker<
             driver.get_policy().impl_get_desired_occupancy().value();
         size_t block_size = block.x * block.y * block.z;
         Impl::configure_shmem_preference<DriverType, LaunchBounds>(
-            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
-            shmem, desired_occupancy);
+            cuda_instance->m_cudaDev, base_t::get_kernel_func(),
+            cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy);
       }
 
       void const* args[] = {&driver};
@@ -487,8 +487,8 @@ struct CudaParallelLaunchKernelInvoker<
             driver.get_policy().impl_get_desired_occupancy().value();
         size_t block_size = block.x * block.y * block.z;
         Impl::configure_shmem_preference<DriverType, LaunchBounds>(
-            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
-            shmem, desired_occupancy);
+            cuda_instance->m_cudaDev, base_t::get_kernel_func(),
+            cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy);
       }
 
       auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -668,8 +668,8 @@ struct CudaParallelLaunchImpl<
         Impl::configure_shmem_preference<
             DriverType,
             Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
-            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
-            shmem, desired_occupancy);
+            cuda_instance->m_cudaDev, base_t::get_kernel_func(),
+            cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy);
       }
 
       desul::ensure_cuda_lock_arrays_on_device();
@@ -685,10 +685,10 @@ struct CudaParallelLaunchImpl<
     }
   }
 
-  static cudaFuncAttributes get_cuda_func_attributes() {
+  static cudaFuncAttributes get_cuda_func_attributes(int cuda_device) {
     return get_cuda_kernel_func_attributes<
         DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
-        base_t::get_kernel_func());
+        cuda_device, base_t::get_kernel_func());
   }
 };
 

diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -41,8 +41,8 @@ namespace Impl {
 template <typename ParallelType, typename Policy, typename LaunchBounds>
 int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
   cudaFuncAttributes attr =
-      CudaParallelLaunch<ParallelType,
-                         LaunchBounds>::get_cuda_func_attributes();
+      CudaParallelLaunch<ParallelType, LaunchBounds>::get_cuda_func_attributes(
+          pol.space().cuda_device());
   auto const& prop = pol.space().cuda_device_prop();
 
   // Limits due to registers/SM, MDRange doesn't have
@@ -332,9 +332,8 @@ class ParallelReduce<CombinedFunctorReducerType,
     using closure_type =
         Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
                              Policy, Kokkos::Cuda>;
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<closure_type,
-                           LaunchBounds>::get_cuda_func_attributes();
+    cudaFuncAttributes attr = CudaParallelLaunch<closure_type, LaunchBounds>::
+        get_cuda_func_attributes(m_policy.space().cuda_device());
     while (
         (n && (maxShmemPerBlock < shmem_size)) ||
         (n >

diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -85,8 +85,8 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
     const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
 
     cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelFor,
-                           LaunchBounds>::get_cuda_func_attributes();
+        CudaParallelLaunch<ParallelFor, LaunchBounds>::get_cuda_func_attributes(
+            m_policy.space().cuda_device());
     const int block_size =
         Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
             m_policy.space().impl_internal_space_instance(), attr, m_functor, 1,
@@ -267,9 +267,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
     using closure_type =
         Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
                              Policy, Kokkos::Cuda>;
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<closure_type,
-                           LaunchBounds>::get_cuda_func_attributes();
+    cudaFuncAttributes attr = CudaParallelLaunch<closure_type, LaunchBounds>::
+        get_cuda_func_attributes(m_policy.space().cuda_device());
     while (
         (n && (maxShmemPerBlock < shmem_size)) ||
         (n >

diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -98,7 +98,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
         Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
     cudaFuncAttributes attr =
         CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
-            get_cuda_func_attributes();
+            get_cuda_func_attributes(space().cuda_device());
     int block_size =
         Kokkos::Impl::cuda_get_max_block_size<FunctorType,
                                               typename traits::launch_bounds>(
@@ -137,7 +137,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
         Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
     cudaFuncAttributes attr =
         CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
-            get_cuda_func_attributes();
+            get_cuda_func_attributes(space().cuda_device());
     const int block_size =
         Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
                                               typename traits::launch_bounds>(
@@ -370,7 +370,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
 
     cudaFuncAttributes attr =
         CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
-            get_cuda_func_attributes();
+            get_cuda_func_attributes(space().cuda_device());
     const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
         space().impl_internal_space_instance(), attr, f,
         (size_t)impl_vector_length(),
@@ -540,8 +540,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     auto internal_space_instance =
         m_policy.space().impl_internal_space_instance();
     cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelFor,
-                           LaunchBounds>::get_cuda_func_attributes();
+        CudaParallelLaunch<ParallelFor, LaunchBounds>::get_cuda_func_attributes(
+            internal_space_instance->m_cudaDev);
     m_team_size =
         m_team_size >= 0
             ? m_team_size
@@ -909,9 +909,8 @@ class ParallelReduce<CombinedFunctorReducerType,
         m_vector_size(arg_policy.impl_vector_length()) {
     auto internal_space_instance =
         m_policy.space().impl_internal_space_instance();
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelReduce,
-                           LaunchBounds>::get_cuda_func_attributes();
+    cudaFuncAttributes attr = CudaParallelLaunch<ParallelReduce, LaunchBounds>::
+        get_cuda_func_attributes(internal_space_instance->m_cudaDev);
     m_team_size =
         m_team_size >= 0
             ? m_team_size