Drop pre-Kepler logic in Cuda::impl_initialize

masterleinad · Sep 6, 2023 · 28061e8
1 parent 7b4d0a6
commit 28061e8
Showing 1 changed file with 76 additions and 91 deletions.
diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -723,105 +723,90 @@ void Cuda::impl_initialize(InitializationSettings const &settings) {
   const int cuda_device_id = Impl::get_gpu(settings);
   const auto &dev_info     = Impl::CudaInternalDevices::singleton();
 
-  // Need device capability 3.0 or better
-  const bool ok_dev = 3 <= dev_info.m_cudaProp[cuda_device_id].major &&
-                      0 <= dev_info.m_cudaProp[cuda_device_id].minor;
-  if (ok_dev) {
-    const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id];
-
-    Impl::CudaInternal::m_cudaDev    = cuda_device_id;
-    Impl::CudaInternal::m_deviceProp = cudaProp;
-
-    Kokkos::Impl::cuda_device_synchronize(
-        "Kokkos::CudaInternal::initialize: Fence on space initialization");
-
-    // Query what compute capability architecture a kernel executes:
-    Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch();
-
-    if (Impl::CudaInternal::m_cudaArch == 0) {
-      std::stringstream ss;
-      ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n";
-      std::string msg = ss.str();
-      Kokkos::abort(msg.c_str());
-    }
+  const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id];
 
-    int compiled_major = Impl::CudaInternal::m_cudaArch / 100;
-    int compiled_minor = (Impl::CudaInternal::m_cudaArch % 100) / 10;
-
-    if ((compiled_major > cudaProp.major) ||
-        ((compiled_major == cudaProp.major) &&
-         (compiled_minor > cudaProp.minor))) {
-      std::stringstream ss;
-      ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for "
-            "compute capability "
-         << compiled_major << "." << compiled_minor
-         << " on device with compute capability " << cudaProp.major << "."
-         << cudaProp.minor << " is not supported by CUDA!\n";
-      std::string msg = ss.str();
-      Kokkos::abort(msg.c_str());
-    }
-    if (Kokkos::show_warnings() && (compiled_major != cudaProp.major ||
-                                    compiled_minor != cudaProp.minor)) {
-      std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled "
-                   "for compute capability "
-                << compiled_major << "." << compiled_minor
-                << " on device with compute capability " << cudaProp.major
-                << "." << cudaProp.minor
-                << " , this will likely reduce potential performance."
-                << std::endl;
-    }
+  Impl::CudaInternal::m_cudaDev    = cuda_device_id;
+  Impl::CudaInternal::m_deviceProp = cudaProp;
 
-    // number of multiprocessors
-    Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount;
+  Kokkos::Impl::cuda_device_synchronize(
+      "Kokkos::CudaInternal::initialize: Fence on space initialization");
 
-    //----------------------------------
-    // Maximum number of warps,
-    // at most one warp per thread in a warp for reduction.
-    Impl::CudaInternal::m_maxWarpCount =
-        cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize;
+  // Query what compute capability architecture a kernel executes:
+  Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch();
 
-    if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) {
-      Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize;
-    }
+  if (Impl::CudaInternal::m_cudaArch == 0) {
+    std::stringstream ss;
+    ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n";
+    std::string msg = ss.str();
+    Kokkos::abort(msg.c_str());
+  }
 
-    //----------------------------------
-    // Maximum number of blocks:
-
-    Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0];
-    Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1];
-    Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2];
-
-    Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor;
-    Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock;
-    Impl::CudaInternal::m_maxBlocksPerSM =
-        Impl::CudaInternal::m_cudaArch < 500
-            ? 16
-            : (Impl::CudaInternal::m_cudaArch < 750
-                   ? 32
-                   : (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32));
-    Impl::CudaInternal::m_maxThreadsPerSM =
-        cudaProp.maxThreadsPerMultiProcessor;
-    Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock;
+  int compiled_major = Impl::CudaInternal::m_cudaArch / 100;
+  int compiled_minor = (Impl::CudaInternal::m_cudaArch % 100) / 10;
+
+  if ((compiled_major > cudaProp.major) ||
+      ((compiled_major == cudaProp.major) &&
+       (compiled_minor > cudaProp.minor))) {
+    std::stringstream ss;
+    ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for "
+          "compute capability "
+       << compiled_major << "." << compiled_minor
+       << " on device with compute capability " << cudaProp.major << "."
+       << cudaProp.minor << " is not supported by CUDA!\n";
+    std::string msg = ss.str();
+    Kokkos::abort(msg.c_str());
+  }
+  if (Kokkos::show_warnings() &&
+      (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor)) {
+    std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled "
+                 "for compute capability "
+              << compiled_major << "." << compiled_minor
+              << " on device with compute capability " << cudaProp.major << "."
+              << cudaProp.minor
+              << " , this will likely reduce potential performance."
+              << std::endl;
+  }
 
-    //----------------------------------
+  // number of multiprocessors
+  Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount;
 
-    Impl::CudaInternal::m_scratchUnifiedSupported = cudaProp.unifiedAddressing;
+  //----------------------------------
+  // Maximum number of warps,
+  // at most one warp per thread in a warp for reduction.
+  Impl::CudaInternal::m_maxWarpCount =
+      cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize;
 
-    if (Kokkos::show_warnings() &&
-        !Impl::CudaInternal::m_scratchUnifiedSupported) {
-      std::cerr << "Kokkos::Cuda device " << cudaProp.name << " capability "
-                << cudaProp.major << "." << cudaProp.minor
-                << " does not support unified virtual address space"
-                << std::endl;
-    }
-  } else {
-    std::ostringstream msg;
-    msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED: Device ";
-    msg << dev_info.m_cudaProp[cuda_device_id].major;
-    msg << ".";
-    msg << dev_info.m_cudaProp[cuda_device_id].minor;
-    msg << " has insufficient capability, required 3.0 or better";
-    Kokkos::Impl::throw_runtime_exception(msg.str());
+  if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) {
+    Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize;
+  }
+
+  //----------------------------------
+  // Maximum number of blocks:
+
+  Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0];
+  Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1];
+  Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2];
+
+  Impl::CudaInternal::m_shmemPerSM       = cudaProp.sharedMemPerMultiprocessor;
+  Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock;
+  Impl::CudaInternal::m_maxBlocksPerSM =
+      Impl::CudaInternal::m_cudaArch < 500
+          ? 16
+          : (Impl::CudaInternal::m_cudaArch < 750
+                 ? 32
+                 : (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32));
+  Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor;
+  Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock;
+
+  //----------------------------------
+
+  Impl::CudaInternal::m_scratchUnifiedSupported = cudaProp.unifiedAddressing;
+
+  if (Kokkos::show_warnings() &&
+      !Impl::CudaInternal::m_scratchUnifiedSupported) {
+    std::cerr << "Kokkos::Cuda device " << cudaProp.name << " capability "
+              << cudaProp.major << "." << cudaProp.minor
+              << " does not support unified virtual address space" << std::endl;
   }
 
   cudaStream_t singleton_stream;