Skip to content

Commit

Permalink
Drop pre-Kepler logic in Cuda::impl_initialize
Browse files (browse the repository at this point in the history)
  • Loading branch information
dalg24 committed Sep 6, 2023
1 parent 7b4d0a6 commit 28061e8
Showing 1 changed file with 76 additions and 91 deletions.
167 changes: 76 additions & 91 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -723,105 +723,90 @@ void Cuda::impl_initialize(InitializationSettings const &settings) {
const int cuda_device_id = Impl::get_gpu(settings);
const auto &dev_info = Impl::CudaInternalDevices::singleton();

// Need device capability 3.0 or better
const bool ok_dev = 3 <= dev_info.m_cudaProp[cuda_device_id].major &&
0 <= dev_info.m_cudaProp[cuda_device_id].minor;
if (ok_dev) {
const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id];

Impl::CudaInternal::m_cudaDev = cuda_device_id;
Impl::CudaInternal::m_deviceProp = cudaProp;

Kokkos::Impl::cuda_device_synchronize(
"Kokkos::CudaInternal::initialize: Fence on space initialization");

// Query what compute capability architecture a kernel executes:
Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch();

if (Impl::CudaInternal::m_cudaArch == 0) {
std::stringstream ss;
ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n";
std::string msg = ss.str();
Kokkos::abort(msg.c_str());
}
const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id];

int compiled_major = Impl::CudaInternal::m_cudaArch / 100;
int compiled_minor = (Impl::CudaInternal::m_cudaArch % 100) / 10;

if ((compiled_major > cudaProp.major) ||
((compiled_major == cudaProp.major) &&
(compiled_minor > cudaProp.minor))) {
std::stringstream ss;
ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for "
"compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability " << cudaProp.major << "."
<< cudaProp.minor << " is not supported by CUDA!\n";
std::string msg = ss.str();
Kokkos::abort(msg.c_str());
}
if (Kokkos::show_warnings() && (compiled_major != cudaProp.major ||
compiled_minor != cudaProp.minor)) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled "
"for compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability " << cudaProp.major
<< "." << cudaProp.minor
<< " , this will likely reduce potential performance."
<< std::endl;
}
Impl::CudaInternal::m_cudaDev = cuda_device_id;
Impl::CudaInternal::m_deviceProp = cudaProp;

// number of multiprocessors
Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount;
Kokkos::Impl::cuda_device_synchronize(
"Kokkos::CudaInternal::initialize: Fence on space initialization");

//----------------------------------
// Maximum number of warps,
// at most one warp per thread in a warp for reduction.
Impl::CudaInternal::m_maxWarpCount =
cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize;
// Query what compute capability architecture a kernel executes:
Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch();

if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) {
Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize;
}
if (Impl::CudaInternal::m_cudaArch == 0) {
std::stringstream ss;
ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n";
std::string msg = ss.str();
Kokkos::abort(msg.c_str());
}

//----------------------------------
// Maximum number of blocks:

Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0];
Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1];
Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2];

Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor;
Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock;
Impl::CudaInternal::m_maxBlocksPerSM =
Impl::CudaInternal::m_cudaArch < 500
? 16
: (Impl::CudaInternal::m_cudaArch < 750
? 32
: (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32));
Impl::CudaInternal::m_maxThreadsPerSM =
cudaProp.maxThreadsPerMultiProcessor;
Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock;
int compiled_major = Impl::CudaInternal::m_cudaArch / 100;
int compiled_minor = (Impl::CudaInternal::m_cudaArch % 100) / 10;

if ((compiled_major > cudaProp.major) ||
((compiled_major == cudaProp.major) &&
(compiled_minor > cudaProp.minor))) {
std::stringstream ss;
ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for "
"compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability " << cudaProp.major << "."
<< cudaProp.minor << " is not supported by CUDA!\n";
std::string msg = ss.str();
Kokkos::abort(msg.c_str());
}
if (Kokkos::show_warnings() &&
(compiled_major != cudaProp.major || compiled_minor != cudaProp.minor)) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled "
"for compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability " << cudaProp.major << "."
<< cudaProp.minor
<< " , this will likely reduce potential performance."
<< std::endl;
}

//----------------------------------
// number of multiprocessors
Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount;

Impl::CudaInternal::m_scratchUnifiedSupported = cudaProp.unifiedAddressing;
//----------------------------------
// Maximum number of warps,
// at most one warp per thread in a warp for reduction.
Impl::CudaInternal::m_maxWarpCount =
cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize;

if (Kokkos::show_warnings() &&
!Impl::CudaInternal::m_scratchUnifiedSupported) {
std::cerr << "Kokkos::Cuda device " << cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
<< " does not support unified virtual address space"
<< std::endl;
}
} else {
std::ostringstream msg;
msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED: Device ";
msg << dev_info.m_cudaProp[cuda_device_id].major;
msg << ".";
msg << dev_info.m_cudaProp[cuda_device_id].minor;
msg << " has insufficient capability, required 3.0 or better";
Kokkos::Impl::throw_runtime_exception(msg.str());
if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) {
Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize;
}

//----------------------------------
// Maximum number of blocks:

Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0];
Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1];
Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2];

Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor;
Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock;
Impl::CudaInternal::m_maxBlocksPerSM =
Impl::CudaInternal::m_cudaArch < 500
? 16
: (Impl::CudaInternal::m_cudaArch < 750
? 32
: (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32));
Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor;
Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock;

//----------------------------------

Impl::CudaInternal::m_scratchUnifiedSupported = cudaProp.unifiedAddressing;

if (Kokkos::show_warnings() &&
!Impl::CudaInternal::m_scratchUnifiedSupported) {
std::cerr << "Kokkos::Cuda device " << cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
<< " does not support unified virtual address space" << std::endl;
}

cudaStream_t singleton_stream;
Expand Down

0 comments on commit 28061e8

Please sign in to comment.