Skip to content

Commit

Permalink
Cuda multi-GPU support: Pass the correct device id to get_cuda_kernel…
Browse files Browse the repository at this point in the history
…_func_attributes (kokkos#6767)

* Pass the correct device id to get_cuda_kernel_func_attributes

* Address review comments

* Fix another occurrence

* Fix configure_shmem_preference
  • Loading branch information
masterleinad committed Feb 9, 2024
1 parent 7ff87a5 commit 3b515c9
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 38 deletions.
40 changes: 20 additions & 20 deletions core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,32 +128,32 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
// These functions need to be templated on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
// FIXME_CUDA_MULTIPLE_DEVICES
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
const cudaFuncAttributes& get_cuda_kernel_func_attributes(
const KernelFuncPtr& func) {
int cuda_device, const KernelFuncPtr& func) {
// Only call cudaFuncGetAttributes once for each unique kernel
// by leveraging static variable initialization rules
auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
static std::map<int, cudaFuncAttributes> func_attr;
if (func_attr.find(cuda_device) == func_attr.end()) {
cudaFuncAttributes attr;
KOKKOS_IMPL_CUDA_SAFE_CALL(
(CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr,
func)));
return attr;
};
static cudaFuncAttributes func_attr = wrap_get_attributes();
return func_attr;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
func_attr.emplace(cuda_device, attr);
}
return func_attr[cuda_device];
}

template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(const KernelFuncPtr& func,
inline void configure_shmem_preference(const int cuda_device,
const KernelFuncPtr& func,
const cudaDeviceProp& device_props,
const size_t block_size, int& shmem,
const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER

const auto& func_attr =
get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(cuda_device,
func);

// Compute limits for number of blocks due to registers/SM
const size_t regs_per_sm = device_props.regsPerMultiprocessor;
Expand Down Expand Up @@ -387,8 +387,8 @@ struct CudaParallelLaunchKernelInvoker<
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);
cuda_instance->m_cudaDev, base_t::get_kernel_func(),
cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy);
}

void const* args[] = {&driver};
Expand Down Expand Up @@ -487,8 +487,8 @@ struct CudaParallelLaunchKernelInvoker<
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);
cuda_instance->m_cudaDev, base_t::get_kernel_func(),
cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy);
}

auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
Expand Down Expand Up @@ -668,8 +668,8 @@ struct CudaParallelLaunchImpl<
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);
cuda_instance->m_cudaDev, base_t::get_kernel_func(),
cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy);
}

desul::ensure_cuda_lock_arrays_on_device();
Expand All @@ -685,10 +685,10 @@ struct CudaParallelLaunchImpl<
}
}

static cudaFuncAttributes get_cuda_func_attributes() {
static cudaFuncAttributes get_cuda_func_attributes(int cuda_device) {
return get_cuda_kernel_func_attributes<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func());
cuda_device, base_t::get_kernel_func());
}
};

Expand Down
9 changes: 4 additions & 5 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ namespace Impl {
template <typename ParallelType, typename Policy, typename LaunchBounds>
int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelType,
LaunchBounds>::get_cuda_func_attributes();
CudaParallelLaunch<ParallelType, LaunchBounds>::get_cuda_func_attributes(
pol.space().cuda_device());
auto const& prop = pol.space().cuda_device_prop();

// Limits due to registers/SM, MDRange doesn't have
Expand Down Expand Up @@ -332,9 +332,8 @@ class ParallelReduce<CombinedFunctorReducerType,
using closure_type =
Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
Policy, Kokkos::Cuda>;
cudaFuncAttributes attr =
CudaParallelLaunch<closure_type,
LaunchBounds>::get_cuda_func_attributes();
cudaFuncAttributes attr = CudaParallelLaunch<closure_type, LaunchBounds>::
get_cuda_func_attributes(m_policy.space().cuda_device());
while (
(n && (maxShmemPerBlock < shmem_size)) ||
(n >
Expand Down
9 changes: 4 additions & 5 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();

cudaFuncAttributes attr =
CudaParallelLaunch<ParallelFor,
LaunchBounds>::get_cuda_func_attributes();
CudaParallelLaunch<ParallelFor, LaunchBounds>::get_cuda_func_attributes(
m_policy.space().cuda_device());
const int block_size =
Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
m_policy.space().impl_internal_space_instance(), attr, m_functor, 1,
Expand Down Expand Up @@ -267,9 +267,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
using closure_type =
Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
Policy, Kokkos::Cuda>;
cudaFuncAttributes attr =
CudaParallelLaunch<closure_type,
LaunchBounds>::get_cuda_func_attributes();
cudaFuncAttributes attr = CudaParallelLaunch<closure_type, LaunchBounds>::
get_cuda_func_attributes(m_policy.space().cuda_device());
while (
(n && (maxShmemPerBlock < shmem_size)) ||
(n >
Expand Down
15 changes: 7 additions & 8 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
cudaFuncAttributes attr =
CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
get_cuda_func_attributes();
get_cuda_func_attributes(space().cuda_device());
int block_size =
Kokkos::Impl::cuda_get_max_block_size<FunctorType,
typename traits::launch_bounds>(
Expand Down Expand Up @@ -137,7 +137,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
cudaFuncAttributes attr =
CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
get_cuda_func_attributes();
get_cuda_func_attributes(space().cuda_device());
const int block_size =
Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
typename traits::launch_bounds>(
Expand Down Expand Up @@ -370,7 +370,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>

cudaFuncAttributes attr =
CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
get_cuda_func_attributes();
get_cuda_func_attributes(space().cuda_device());
const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
space().impl_internal_space_instance(), attr, f,
(size_t)impl_vector_length(),
Expand Down Expand Up @@ -540,8 +540,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
auto internal_space_instance =
m_policy.space().impl_internal_space_instance();
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelFor,
LaunchBounds>::get_cuda_func_attributes();
CudaParallelLaunch<ParallelFor, LaunchBounds>::get_cuda_func_attributes(
internal_space_instance->m_cudaDev);
m_team_size =
m_team_size >= 0
? m_team_size
Expand Down Expand Up @@ -909,9 +909,8 @@ class ParallelReduce<CombinedFunctorReducerType,
m_vector_size(arg_policy.impl_vector_length()) {
auto internal_space_instance =
m_policy.space().impl_internal_space_instance();
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelReduce,
LaunchBounds>::get_cuda_func_attributes();
cudaFuncAttributes attr = CudaParallelLaunch<ParallelReduce, LaunchBounds>::
get_cuda_func_attributes(internal_space_instance->m_cudaDev);
m_team_size =
m_team_size >= 0
? m_team_size
Expand Down

0 comments on commit 3b515c9

Please sign in to comment.