Trim some fat in CudaInternal (towards multiple GPUs support) (kokkos#6544)

* Remove unused Impl::cuda_internal_maximum_concurrent_block_count function

* Drop unused CudaInternal::m_shmemPerSM

* Drop unused CudaInternal::m_maxBlocksPerSM

* Drop unused CudaInternal::m_maxThreadsPerBlock

* Drop (unused) Impl::cuda_internal_maximum_warp_count()

* Drop CudaInternal::m_maxWarpCount data member

* Drop CudaInternal::m_multiProcCount data member

* Drop CudaInternal::m_maxThreadsPerSM data member

* Drop CudaInternal::m_maxBlock data member

* Get rid of Impl::cuda_internal_maximum_grid_count()

* Drop CudaInternal::m_maxShmemPerBlock static data member

* Drop Impl::cuda_internal_multiprocessor_count()

* Drop (unused) CudaTraits::warp_{count,align} static member functions

* Fixup exception msg

* Fixup tasking get exec space from the scheduler

* Try to get to the device properties some other way in cuda tasking
dalg24 committed Oct 30, 2023
1 parent 201d1de commit 400dd1d
Showing 10 changed files with 53 additions and 118 deletions.
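
The common thread across these files: instead of mirroring individual device properties in static CudaInternal data members (a dead end once different execution space instances may sit on different GPUs), callers now query the cudaDeviceProp that CudaInternal already caches, exposed via cuda_device_prop(). A minimal standalone sketch of the underlying CUDA runtime query — illustrative only, not the Kokkos code itself; device 0 is an assumption:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, /*device=*/0);  // assumption: device 0

  // Fields the dropped CudaInternal members used to mirror:
  std::printf("multiProcessorCount         = %d\n", prop.multiProcessorCount);
  std::printf("maxThreadsPerBlock          = %d\n", prop.maxThreadsPerBlock);
  std::printf("maxThreadsPerMultiProcessor = %d\n", prop.maxThreadsPerMultiProcessor);
  std::printf("sharedMemPerBlock           = %zu\n", prop.sharedMemPerBlock);
  std::printf("sharedMemPerMultiprocessor  = %zu\n", prop.sharedMemPerMultiprocessor);
  std::printf("maxGridSize                 = %d x %d x %d\n",
              prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
}
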
65 changes: 7 additions & 58 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -383,8 +383,13 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) {
// Allocate some initial space. This will grow as needed.

{
const unsigned reduce_block_count =
m_maxWarpCount * Impl::CudaTraits::WarpSize;
// Maximum number of warps,
// at most one warp per thread in a warp for reduction.
auto const maxWarpCount = std::min<unsigned>(
m_deviceProp.maxThreadsPerBlock / CudaTraits::WarpSize,
CudaTraits::WarpSize);
unsigned const reduce_block_count =
maxWarpCount * Impl::CudaTraits::WarpSize;

(void)scratch_unified(16 * sizeof(size_type));
(void)scratch_flags(reduce_block_count * 2 * sizeof(size_type));
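
A quick worked example of the computation that replaces m_maxWarpCount, assuming a device reporting maxThreadsPerBlock = 1024 and the usual WarpSize of 32:

#include <algorithm>
#include <cstdio>

int main() {
  unsigned const warp_size = 32;                // CudaTraits::WarpSize
  unsigned const max_threads_per_block = 1024;  // assumed device value
  auto const maxWarpCount =
      std::min(max_threads_per_block / warp_size, warp_size);    // = 32
  unsigned const reduce_block_count = maxWarpCount * warp_size;  // = 1024
  // initial scratch_flags allocation: 1024 * 2 * sizeof(size_type) bytes
  std::printf("reduce_block_count = %u\n", reduce_block_count);
}
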
@@ -624,30 +629,6 @@ void CudaInternal::finalize() {

//----------------------------------------------------------------------------

Cuda::size_type cuda_internal_multiprocessor_count() {
return CudaInternal::singleton().m_multiProcCount;
}

CudaSpace::size_type cuda_internal_maximum_concurrent_block_count() {
#if defined(KOKKOS_ARCH_KEPLER)
// Compute capability 3.0 through 3.7
enum : int { max_resident_blocks_per_multiprocessor = 16 };
#else
// Compute capability 5.0 through 6.2
enum : int { max_resident_blocks_per_multiprocessor = 32 };
#endif
return CudaInternal::singleton().m_multiProcCount *
max_resident_blocks_per_multiprocessor;
};

Cuda::size_type cuda_internal_maximum_warp_count() {
return CudaInternal::singleton().m_maxWarpCount;
}

std::array<Cuda::size_type, 3> cuda_internal_maximum_grid_count() {
return CudaInternal::singleton().m_maxBlock;
}

Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance,
const std::size_t size) {
return instance.impl_internal_space_instance()->scratch_space(size);
@@ -761,38 +742,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif

//----------------------------------
// number of multiprocessors
Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount;

//----------------------------------
// Maximum number of warps,
// at most one warp per thread in a warp for reduction.
Impl::CudaInternal::m_maxWarpCount =
cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize;

if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) {
Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize;
}

//----------------------------------
// Maximum number of blocks:

Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0];
Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1];
Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2];

Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor;
Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock;
Impl::CudaInternal::m_maxBlocksPerSM =
Impl::CudaInternal::m_cudaArch < 500
? 16
: (Impl::CudaInternal::m_cudaArch < 750
? 32
: (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32));
Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor;
Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock;

//----------------------------------

cudaStream_t singleton_stream;
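
The m_maxBlocksPerSM table dropped above hard-coded maximum resident blocks per SM by architecture (16 below CC 5.0, 32 from CC 5.0 on, with CC 7.5 back at 16). As an aside — this is not what the commit does, the member was simply unused — the CUDA runtime can compute a per-kernel limit directly, which also accounts for register and shared-memory pressure. A hedged sketch; the kernel, block size, and dynamic shmem size are arbitrary assumptions:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummy_kernel() {}

int main() {
  int blocks_per_sm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, dummy_kernel,
                                                /*blockSize=*/128,
                                                /*dynamicSMemSize=*/0);
  std::printf("max resident blocks per SM: %d\n", blocks_per_sm);
}
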
27 changes: 1 addition & 26 deletions core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -55,27 +55,10 @@ struct CudaTraits {
unsigned long[ConstantMemoryUsage / sizeof(unsigned long)];

static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */;

KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count(
CudaSpace::size_type i) {
return (i + WarpIndexMask) >> WarpIndexShift;
}

KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align(
CudaSpace::size_type i) {
constexpr CudaSpace::size_type Mask = ~WarpIndexMask;
return (i + WarpIndexMask) & Mask;
}
};

//----------------------------------------------------------------------------

CudaSpace::size_type cuda_internal_multiprocessor_count();
CudaSpace::size_type cuda_internal_maximum_warp_count();
std::array<CudaSpace::size_type, 3> cuda_internal_maximum_grid_count();

CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();

CudaSpace::size_type* cuda_internal_scratch_flags(const Cuda&,
const std::size_t size);
CudaSpace::size_type* cuda_internal_scratch_space(const Cuda&,
@@ -104,15 +87,7 @@ class CudaInternal {
inline static int m_cudaDev = -1;

// Device Properties
inline static int m_cudaArch = -1;
inline static unsigned m_multiProcCount = 0;
inline static unsigned m_maxWarpCount = 0;
inline static std::array<size_type, 3> m_maxBlock = {0, 0, 0};
inline static int m_shmemPerSM = 0;
inline static int m_maxShmemPerBlock = 0;
inline static int m_maxBlocksPerSM = 0;
inline static int m_maxThreadsPerSM = 0;
inline static int m_maxThreadsPerBlock = 0;
inline static int m_cudaArch = -1;
static int concurrency();

inline static cudaDeviceProp m_deviceProp;
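
The removed warp_count/warp_align helpers rounded a thread count up to whole warps with the usual mask-and-shift tricks (in CudaTraits, WarpIndexMask = WarpSize - 1 = 31 and WarpIndexShift = log2(32) = 5). A standalone demo of what they computed:

#include <cstdio>

constexpr unsigned WarpIndexMask  = 31;  // WarpSize - 1
constexpr unsigned WarpIndexShift = 5;   // log2(WarpSize)

constexpr unsigned warp_count(unsigned i) {
  return (i + WarpIndexMask) >> WarpIndexShift;  // warps needed for i threads
}
constexpr unsigned warp_align(unsigned i) {
  return (i + WarpIndexMask) & ~WarpIndexMask;   // i rounded up to whole warps
}

int main() {
  std::printf("warp_count(100) = %u\n", warp_count(100));  // 4 warps
  std::printf("warp_align(100) = %u\n", warp_align(100));  // 128 threads
}
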
8 changes: 4 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -21,7 +21,6 @@
#ifdef KOKKOS_ENABLE_CUDA

#include <mutex>
#include <string>
#include <cstdint>
#include <cmath>
#include <Kokkos_Parallel.hpp>
@@ -118,10 +117,11 @@ inline bool is_empty_launch(dim3 const& grid, dim3 const& block) {
}

inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
if (cuda_instance->m_maxShmemPerBlock < shmem) {
int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock;
if (maxShmemPerBlock < shmem) {
Kokkos::Impl::throw_runtime_exception(
std::string("CudaParallelLaunch (or graph node creation) FAILED: shared"
" memory request is too large"));
"CudaParallelLaunch (or graph node creation) FAILED: shared memory "
"request is too large");
}
}

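
The guard's logic, restated as a self-contained sketch — not the Kokkos function itself; cuda_instance and the Kokkos exception machinery are replaced by a plain device query and std::runtime_error. Many devices report 49152 bytes (48 KB) for sharedMemPerBlock by default:

#include <cuda_runtime.h>
#include <stdexcept>

void check_shmem_request_sketch(int shmem) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, /*device=*/0);  // assumption: device 0
  int const maxShmemPerBlock = prop.sharedMemPerBlock;
  if (maxShmemPerBlock < shmem)
    throw std::runtime_error(
        "CudaParallelLaunch (or graph node creation) FAILED: shared memory "
        "request is too large");
}
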
4 changes: 2 additions & 2 deletions core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp
@@ -40,8 +40,8 @@ template <>
inline TileSizeProperties get_tile_size_properties<Kokkos::Cuda>(
const Kokkos::Cuda& space) {
TileSizeProperties properties;
properties.max_threads =
space.impl_internal_space_instance()->m_maxThreadsPerSM;
properties.max_threads = space.impl_internal_space_instance()
->m_deviceProp.maxThreadsPerMultiProcessor;
properties.default_largest_tile_size = 16;
properties.default_tile_size = 2;
properties.max_total_tile_size = 512;
8 changes: 4 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -96,7 +96,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {

inline void execute() const {
if (m_rp.m_num_tiles == 0) return;
const auto maxblocks = cuda_internal_maximum_grid_count();
const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize;
if (RP::rank == 2) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
KOKKOS_ASSERT(block.x > 0);
@@ -320,6 +320,8 @@ class ParallelReduce<CombinedFunctorReducerType,
// Determine block size constrained by shared memory:
inline unsigned local_block_size(const FunctorType& f) {
unsigned n = CudaTraits::WarpSize * 8;
int const maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
int shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
@@ -330,9 +332,7 @@
CudaParallelLaunch<closure_type,
LaunchBounds>::get_cuda_func_attributes();
while (
(n &&
(m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
shmem_size)) ||
(n && (maxShmemPerBlock < shmem_size)) ||
(n >
static_cast<unsigned>(
Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
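
The local_block_size pattern visible in this hunk and its MDRange twin above: start at WarpSize * 8 = 256 threads and halve until the inter-block reduce/scan shared-memory requirement fits under the per-block limit. A simplified sketch — the real loop also consults cuda_get_max_block_size and kernel attributes, and shmem_for here stands in for cuda_single_inter_block_reduce_scan_shmem:

#include <cstddef>

unsigned local_block_size_sketch(std::size_t maxShmemPerBlock,
                                 std::size_t (*shmem_for)(unsigned)) {
  unsigned n = 32 * 8;  // WarpSize * 8 = 256 threads to start
  while (n && maxShmemPerBlock < shmem_for(n)) {
    n >>= 1;  // halve the candidate block size until its shmem fits
  }
  return n;   // n == 0 means no feasible block size was found
}
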
24 changes: 12 additions & 12 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -94,10 +94,10 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
0, 0);
KOKKOS_ASSERT(block_size > 0);
dim3 block(1, block_size, 1);
const int maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0];
dim3 grid(
std::min(
typename Policy::index_type((nwork + block.y - 1) / block.y),
typename Policy::index_type(cuda_internal_maximum_grid_count()[0])),
std::min(typename Policy::index_type((nwork + block.y - 1) / block.y),
typename Policy::index_type(maxGridSizeX)),
1, 1);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
@@ -254,6 +254,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
// Determine block size constrained by shared memory:
inline unsigned local_block_size(const FunctorType& f) {
unsigned n = CudaTraits::WarpSize * 8;
const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
int shmem_size =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
f, n);
@@ -264,9 +266,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
CudaParallelLaunch<closure_type,
LaunchBounds>::get_cuda_func_attributes();
while (
(n &&
(m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
shmem_size)) ||
(n && (maxShmemPerBlock < shmem_size)) ||
(n >
static_cast<unsigned>(
Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
@@ -609,11 +609,11 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
// testing

const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
unsigned n = CudaTraits::WarpSize * 4;
while (n &&
unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
unsigned(maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
value_type>(f, n)) {
n >>= 1;
@@ -933,11 +933,11 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
// testing

const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
unsigned n = CudaTraits::WarpSize * 4;
while (n &&
unsigned(m_policy.space()
.impl_internal_space_instance()
->m_maxShmemPerBlock) <
unsigned(maxShmemPerBlock) <
cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
value_type>(f, n)) {
n >>= 1;
14 changes: 9 additions & 5 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -262,7 +262,8 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
m_tune_team(bool(team_size_request <= 0)),
m_tune_vector(bool(vector_length_request <= 0)) {
// Make sure league size is permissible
if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0]))
const int maxGridSizeX = m_space.cuda_device_prop().maxGridSize[0];
if (league_size_ >= maxGridSizeX)
Impl::throw_runtime_exception(
"Requested too large league_size for TeamPolicy on Cuda execution "
"space.");
@@ -575,10 +576,11 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
static_cast<std::int64_t>(m_league_size))));
}

const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
const int shmem_size_total = m_shmem_begin + m_shmem_size;
if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) {
printf("%i %i\n", internal_space_instance->m_maxShmemPerBlock,
shmem_size_total);
if (maxShmemPerBlock < shmem_size_total) {
printf("%i %i\n", maxShmemPerBlock, shmem_size_total);
Kokkos::Impl::throw_runtime_exception(std::string(
"Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
}
@@ -935,6 +937,8 @@ class ParallelReduce<CombinedFunctorReducerType,
// Functor's reduce memory, team scan memory, and team shared memory depend
// upon team size.

const int maxShmemPerBlock =
m_policy.space().cuda_device_prop().sharedMemPerBlock;
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;

if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
@@ -943,7 +947,7 @@
std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
}

if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) {
if (maxShmemPerBlock < shmem_size_total) {
Kokkos::Impl::throw_runtime_exception(
std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much "
"L0 scratch memory"));
3 changes: 1 addition & 2 deletions core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -702,8 +702,7 @@ inline void check_reduced_view_shmem_size(const Policy& policy,
unsigned reqShmemSize =
cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, ValueType>(
functor, minBlockSize);
size_t maxShmemPerBlock =
policy.space().impl_internal_space_instance()->m_maxShmemPerBlock;
size_t maxShmemPerBlock = policy.space().cuda_device_prop().sharedMemPerBlock;

if (reqShmemSize > maxShmemPerBlock) {
Kokkos::Impl::throw_runtime_exception(
14 changes: 10 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -84,8 +84,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
KOKKOS_INLINE_FUNCTION
static void iff_single_thread_recursive_execute(scheduler_type const&) {}

static int get_max_team_count(execution_space const&) {
return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block;
static int get_max_team_count(execution_space const& space) {
return space.cuda_device_prop().multiProcessorCount * warps_per_block;
}

__device__ static void driver(scheduler_type scheduler,
@@ -225,7 +225,9 @@
// FIXME_CUDA_MULTIPLE_DEVICES
static void execute(scheduler_type const& scheduler) {
const int shared_per_warp = 2048;
const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
const int multi_processor_count =
scheduler.get_execution_space().cuda_device_prop().multiProcessorCount;
const dim3 grid(multi_processor_count, 1, 1);
const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
const int shared_total = shared_per_warp * warps_per_block;
const cudaStream_t stream = nullptr;
@@ -466,7 +468,11 @@ class TaskQueueSpecializationConstrained<
static void execute(scheduler_type const& scheduler) {
const int shared_per_warp = 2048;
const int warps_per_block = 4;
const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
const int multi_processor_count =
// FIXME not sure why this didn't work
// scheduler.get_execution_space().cuda_device_prop().multiProcessorCount;
CudaInternal::singleton().m_deviceProp.multiProcessorCount;
const dim3 grid(multi_processor_count, 1, 1);
// const dim3 grid( 1 , 1 , 1 );
const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
const int shared_total = shared_per_warp * warps_per_block;
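
The tasking and work-graph backends all launch the same shape: one block per multiprocessor, each block holding warps_per_block warps laid out as (1, WarpSize, warps_per_block), with a fixed shared-memory budget per warp. A sketch mirroring the values in the diffs above (not the Kokkos launch code itself):

#include <cuda_runtime.h>

void task_launch_shape_sketch(cudaDeviceProp const& prop, dim3& grid,
                              dim3& block, int& shared_total) {
  int const warps_per_block = 4;
  int const shared_per_warp = 2048;             // bytes of shmem per warp
  grid = dim3(prop.multiProcessorCount, 1, 1);  // one block per SM
  block = dim3(1, /*WarpSize=*/32, warps_per_block);
  shared_total = shared_per_warp * warps_per_block;
}
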
4 changes: 3 additions & 1 deletion core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -77,7 +77,9 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,

inline void execute() {
const int warps_per_block = 4;
const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
const int multi_processor_count =
m_policy.space().cuda_device_prop().multiProcessorCount;
const dim3 grid(multi_processor_count, 1, 1);
const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
const int shared = 0;

