Commit

create cudaAPI function wrappers
Thomas Conrad Clevenger authored and tcclevenger committed Jul 27, 2023
1 parent 2f12ebb commit 4d8629f
Showing 7 changed files with 558 additions and 137 deletions.
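Note: the pattern throughout this commit is uniform. Raw CUDA runtime calls (cudaMalloc, cudaMemcpy, cudaFree, cudaGraph*, ...) are routed through wrapper member functions of Impl::CudaInternal, invoked either on the internal instance behind an execution space (exec_space.impl_internal_space_instance()->...) or on the process-wide singleton (CudaInternal::singleton()) when no instance is at hand, while error handling stays with KOKKOS_IMPL_CUDA_SAFE_CALL at the call site. The wrapper definitions live in Kokkos_Cuda_Instance.hpp and are not part of the excerpt below; the following is only a hypothetical sketch of what such a wrapper could look like, with the member name m_cudaDev and the cudaSetDevice call being assumptions.

#include <cuda_runtime_api.h>

// Hypothetical sketch only: the real wrappers are members of
// Kokkos::Impl::CudaInternal (Kokkos_Cuda_Instance.hpp) and are not shown in
// this diff. Member names and function bodies here are illustrative.
struct CudaInternalSketch {
  int m_cudaDev = 0;  // device id owned by this instance (assumed member)

  cudaError_t cuda_malloc_wrapper(void** ptr, size_t size) const {
    // Pin the runtime call to this instance's device, then forward it.
    cudaSetDevice(m_cudaDev);
    return cudaMalloc(ptr, size);
  }

  cudaError_t cuda_free_wrapper(void* ptr) const {
    cudaSetDevice(m_cudaDev);
    return cudaFree(ptr);
  }
};

Callers keep wrapping these in KOKKOS_IMPL_CUDA_SAFE_CALL, so the error-to-exception conversion stays at the call site while device selection can be centralized in one place.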
97 changes: 62 additions & 35 deletions core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -43,7 +43,8 @@
cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() {
static cudaStream_t s = nullptr;
if (s == nullptr) {
cudaStreamCreate(&s);
KOKKOS_IMPL_CUDA_SAFE_CALL(
(CudaInternal::singleton().cuda_stream_create_wrapper(&s)));
}
return s;
}
@@ -66,19 +67,22 @@ static std::atomic<int> num_uvm_allocations(0);
} // namespace

void DeepCopyCuda(void *dst, const void *src, size_t n) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper(
dst, src, n, cudaMemcpyDefault)));
}

void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src,
size_t n) {
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
(instance.impl_internal_space_instance()->cuda_memcpy_async_wrapper(
dst, src, n, cudaMemcpyDefault, instance.cuda_stream())));
}

void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
cudaStream_t s = cuda_get_deep_copy_stream();
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
(CudaInternal::singleton().cuda_memcpy_async_wrapper(
dst, src, n, cudaMemcpyDefault, s)));
Impl::cuda_stream_synchronize(
s,
Kokkos::Tools::Experimental::SpecialSynchronizationCases::
@@ -171,25 +175,36 @@ void *impl_allocate_common(const Cuda &exec_space, const char *arg_label,
if (arg_alloc_size >= memory_threshold_g) {
if (exec_space_provided) {
cudaStream_t stream = exec_space.cuda_stream();
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
error_code =
exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper(
&ptr, arg_alloc_size, stream);
exec_space.fence("Kokkos::Cuda: backend fence after async malloc");
} else {
error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
error_code =
exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper(
&ptr, arg_alloc_size, 0);
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async malloc");
}
} else {
error_code = cudaMalloc(&ptr, arg_alloc_size);
error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
&ptr, arg_alloc_size);
}
#else
(void)exec_space;
(void)exec_space_provided;
auto error_code = cudaMalloc(&ptr, arg_alloc_size);
cudaError_t error_code;
if (exec_space_provided) {
error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
&ptr, arg_alloc_size);
} else {
error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper(
&ptr, arg_alloc_size);
}
#endif
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
cudaGetLastError(); // This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -240,18 +255,22 @@ void *CudaUVMSpace::impl_allocate(
Kokkos::Impl::num_uvm_allocations++;

auto error_code =
cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper(
&ptr, arg_alloc_size, cudaMemAttachGlobal);

#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId);
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_mem_advise_wrapper(
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId)));
#endif

if (error_code != cudaSuccess) { // TODO tag as unlikely branch
cudaGetLastError(); // This is the only way to clear the last error,
// which we should do here since we're turning it
// into an exception here
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -281,11 +300,13 @@ void *CudaHostPinnedSpace::impl_allocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
void *ptr = nullptr;

auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper(
&ptr, arg_alloc_size, cudaHostAllocDefault);
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
cudaGetLastError(); // This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -327,14 +348,18 @@ void CudaSpace::impl_deallocate(
if (arg_alloc_size >= memory_threshold_g) {
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence before async free");
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_free_async_wrapper(
arg_alloc_ptr, 0)));
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async free");
} else {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
}
#else
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
#endif
} catch (...) {
}
@@ -353,10 +378,7 @@ void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
}
void CudaUVMSpace::impl_deallocate(
const char *arg_label, void *const arg_alloc_ptr,
const size_t arg_alloc_size

,
const size_t arg_logical_size,
const size_t arg_alloc_size, const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle) const {
Cuda::impl_static_fence(
"Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation");
@@ -369,7 +391,8 @@ void CudaUVMSpace::impl_deallocate(
try {
if (arg_alloc_ptr != nullptr) {
Kokkos::Impl::num_uvm_allocations--;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
}
} catch (...) {
}
@@ -399,7 +422,8 @@ void CudaHostPinnedSpace::impl_deallocate(
reported_size);
}
try {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL((
Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr)));
} catch (...) {
}
}
@@ -570,16 +594,19 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
bool to_device) {
if ((ptr == nullptr) || (bytes == 0)) return;
cudaPointerAttributes attr;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL((
space.impl_internal_space_instance()->cuda_pointer_get_attributes_wrapper(
&attr, ptr)));
// I measured this and it turns out prefetching towards the host slows
// DualView syncs down. Probably because the latency is not too bad in the
// first place for the pull down. If we want to change that, provide
// cudaCpuDeviceId as the device if to_device is false
bool is_managed = attr.type == cudaMemoryTypeManaged;
if (to_device && is_managed &&
space.cuda_device_prop().concurrentManagedAccess) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync(
ptr, bytes, space.cuda_device(), space.cuda_stream()));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(space.impl_internal_space_instance()->cuda_mem_prefetch_async_wrapper(
ptr, bytes, space.cuda_device(), space.cuda_stream())));
}
}

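Note on the call sites above: every wrapped call is passed to KOKKOS_IMPL_CUDA_SAFE_CALL inside an extra pair of parentheses. Presumably this keeps the whole expression a single preprocessor argument even when it contains top-level commas (explicit template arguments, for example). A minimal illustration with a hypothetical macro, not the Kokkos definition:

#include <cstdio>
#include <utility>

// Hypothetical macro used only to show the preprocessor issue.
#define SAFE_CALL(expr) report((expr), #expr)

void report(int value, const char* text) { std::printf("%s -> %d\n", text, value); }

int main() {
  // SAFE_CALL(std::pair<int, int>(1, 2).first);  // error: the comma inside
  //                                              // <int, int> splits the macro
  //                                              // argument in two
  SAFE_CALL((std::pair<int, int>(1, 2).first));   // the extra parentheses keep
                                                  // it as one argument
  return 0;
}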
43 changes: 30 additions & 13 deletions core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
@@ -33,6 +33,7 @@
#include <Cuda/Kokkos_Cuda.hpp>
#include <cuda_runtime_api.h>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>

namespace Kokkos {
namespace Impl {
@@ -55,8 +56,11 @@ struct GraphImpl<Kokkos::Cuda> {
constexpr size_t error_log_size = 256;
cudaGraphNode_t error_node = nullptr;
char error_log[error_log_size];
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate(
&m_graph_exec, m_graph, &error_node, error_log, error_log_size));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(m_execution_space.impl_internal_space_instance()
->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph,
&error_node, error_log,
error_log_size)));
// TODO @graphs print out errors
}

@@ -83,24 +87,31 @@ struct GraphImpl<Kokkos::Cuda> {
m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
KOKKOS_EXPECTS(bool(m_graph))
if (bool(m_graph_exec)) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(m_execution_space.impl_internal_space_instance()
->cuda_graph_exec_destroy_wrapper(m_graph_exec)));
}
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(m_execution_space.impl_internal_space_instance()
->cuda_graph_destroy_wrapper(m_graph)));
};

explicit GraphImpl(Kokkos::Cuda arg_instance)
: m_execution_space(std::move(arg_instance)) {
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_create_wrapper(&m_graph, cuda_graph_flags_t{0})));
}

void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
// All of the predecessors are just added as normal, so all we need to
// do here is add an empty node
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_add_empty_node_wrapper(
&(arg_node_ptr->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0)));
}

template <class NodeImpl>
@@ -146,15 +157,19 @@ struct GraphImpl<Kokkos::Cuda> {
KOKKOS_EXPECTS(bool(cuda_node))

KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_add_dependencies_wrapper(m_graph, &pred_cuda_node,
&cuda_node, 1)));
}

void submit() {
if (!bool(m_graph_exec)) {
_instantiate_graph();
}
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream()));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_launch_wrapper(m_graph_exec,
m_execution_space.cuda_stream())));
}

execution_space const& get_execution_space() const noexcept {
@@ -167,9 +182,11 @@
auto rv = std::make_shared<root_node_impl_t>(
get_execution_space(), _graph_node_is_root_ctor_tag{});
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_add_empty_node_wrapper(&(rv->node_details_t::node),
m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0)));
KOKKOS_ENSURES(bool(rv->node_details_t::node))
return rv;
}
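The graph-related call sites follow the same forwarding idea. As a hypothetical sketch (the real definitions are in Kokkos_Cuda_Instance.hpp and are not part of this diff), a graph wrapper might look like:

#include <cuda_runtime_api.h>

// Hypothetical sketch only; wrapper names mirror the call sites above, but the
// bodies and the m_cudaDev member are assumptions.
struct CudaGraphWrapperSketch {
  int m_cudaDev = 0;  // assumed member holding this instance's device id

  cudaError_t cuda_graph_create_wrapper(cudaGraph_t* graph,
                                        unsigned int flags) const {
    cudaSetDevice(m_cudaDev);
    return cudaGraphCreate(graph, flags);
  }

  cudaError_t cuda_graph_launch_wrapper(cudaGraphExec_t graph_exec,
                                        cudaStream_t stream) const {
    cudaSetDevice(m_cudaDev);
    return cudaGraphLaunch(graph_exec, stream);
  }
};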
