Commit

create cudaAPI function wrappers
Thomas Conrad Clevenger authored and tcclevenger committed Jul 27, 2023
1 parent 2f12ebb commit 4d8629f
Showing 7 changed files with 558 additions and 137 deletions.
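Note: the pattern throughout this commit is uniform. Raw CUDA runtime calls (cudaMalloc, cudaMemcpy, cudaFree, cudaGraph*, ...) are routed through wrapper member functions of Impl::CudaInternal, invoked either on the internal instance behind an execution space (exec_space.impl_internal_space_instance()->...) or on the process-wide singleton (CudaInternal::singleton()) when no instance is at hand, while error handling stays with KOKKOS_IMPL_CUDA_SAFE_CALL at the call site. The wrapper definitions live in Kokkos_Cuda_Instance.hpp and are not part of the excerpt below; the following is only a hypothetical sketch of what such a wrapper could look like, with the member name m_cudaDev and the cudaSetDevice call being assumptions.

#include <cuda_runtime_api.h>

// Hypothetical sketch only: the real wrappers are members of
// Kokkos::Impl::CudaInternal (Kokkos_Cuda_Instance.hpp) and are not shown in
// this diff. Member names and function bodies here are illustrative.
struct CudaInternalSketch {
  int m_cudaDev = 0;  // device id owned by this instance (assumed member)

  cudaError_t cuda_malloc_wrapper(void** ptr, size_t size) const {
    // Pin the runtime call to this instance's device, then forward it.
    cudaSetDevice(m_cudaDev);
    return cudaMalloc(ptr, size);
  }

  cudaError_t cuda_free_wrapper(void* ptr) const {
    cudaSetDevice(m_cudaDev);
    return cudaFree(ptr);
  }
};

Callers keep wrapping these in KOKKOS_IMPL_CUDA_SAFE_CALL, so the error-to-exception conversion stays at the call site while device selection can be centralized in one place.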
97 changes: 62 additions & 35 deletions core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -43,7 +43,8 @@
cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() {
static cudaStream_t s = nullptr;
if (s == nullptr) {
cudaStreamCreate(&s);
KOKKOS_IMPL_CUDA_SAFE_CALL(
(CudaInternal::singleton().cuda_stream_create_wrapper(&s)));
}
return s;
}
@@ -66,19 +67,22 @@ static std::atomic<int> num_uvm_allocations(0);
} // namespace

void DeepCopyCuda(void *dst, const void *src, size_t n) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper(
dst, src, n, cudaMemcpyDefault)));
}

void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src,
size_t n) {
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
(instance.impl_internal_space_instance()->cuda_memcpy_async_wrapper(
dst, src, n, cudaMemcpyDefault, instance.cuda_stream())));
}

void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
cudaStream_t s = cuda_get_deep_copy_stream();
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
(CudaInternal::singleton().cuda_memcpy_async_wrapper(
dst, src, n, cudaMemcpyDefault, s)));
Impl::cuda_stream_synchronize(
s,
Kokkos::Tools::Experimental::SpecialSynchronizationCases::
@@ -171,25 +175,36 @@ void *impl_allocate_common(const Cuda &exec_space, const char *arg_label,
if (arg_alloc_size >= memory_threshold_g) {
if (exec_space_provided) {
cudaStream_t stream = exec_space.cuda_stream();
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
error_code =
exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper(
&ptr, arg_alloc_size, stream);
exec_space.fence("Kokkos::Cuda: backend fence after async malloc");
} else {
error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
error_code =
exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper(
&ptr, arg_alloc_size, 0);
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async malloc");
}
} else {
error_code = cudaMalloc(&ptr, arg_alloc_size);
error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
&ptr, arg_alloc_size);
}
#else
(void)exec_space;
(void)exec_space_provided;
auto error_code = cudaMalloc(&ptr, arg_alloc_size);
cudaError_t error_code;
if (exec_space_provided) {
error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
&ptr, arg_alloc_size);
} else {
error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper(
&ptr, arg_alloc_size);
}
#endif
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
cudaGetLastError(); // This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -240,18 +255,22 @@ void *CudaUVMSpace::impl_allocate(
Kokkos::Impl::num_uvm_allocations++;

auto error_code =
cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper(
&ptr, arg_alloc_size, cudaMemAttachGlobal);

#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId);
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_mem_advise_wrapper(
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId)));
#endif

if (error_code != cudaSuccess) { // TODO tag as unlikely branch
cudaGetLastError(); // This is the only way to clear the last error,
// which we should do here since we're turning it
// into an exception here
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -281,11 +300,13 @@ void *CudaHostPinnedSpace::impl_allocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
void *ptr = nullptr;

auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper(
&ptr, arg_alloc_size, cudaHostAllocDefault);
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
cudaGetLastError(); // This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -327,14 +348,18 @@ void CudaSpace::impl_deallocate(
if (arg_alloc_size >= memory_threshold_g) {
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence before async free");
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_free_async_wrapper(
arg_alloc_ptr, 0)));
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async free");
} else {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
}
#else
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
#endif
} catch (...) {
}
@@ -353,10 +378,7 @@ void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
}
void CudaUVMSpace::impl_deallocate(
const char *arg_label, void *const arg_alloc_ptr,
const size_t arg_alloc_size

,
const size_t arg_logical_size,
const size_t arg_alloc_size, const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle) const {
Cuda::impl_static_fence(
"Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation");
@@ -369,7 +391,8 @@ void CudaUVMSpace::impl_deallocate(
try {
if (arg_alloc_ptr != nullptr) {
Kokkos::Impl::num_uvm_allocations--;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
}
} catch (...) {
}
@@ -399,7 +422,8 @@ void CudaHostPinnedSpace::impl_deallocate(
reported_size);
}
try {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL((
Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr)));
} catch (...) {
}
}
@@ -570,16 +594,19 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
bool to_device) {
if ((ptr == nullptr) || (bytes == 0)) return;
cudaPointerAttributes attr;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL((
space.impl_internal_space_instance()->cuda_pointer_get_attributes_wrapper(
&attr, ptr)));
// I measured this and it turns out prefetching towards the host slows
// DualView syncs down. Probably because the latency is not too bad in the
// first place for the pull down. If we want to change that, provide
// cudaCpuDeviceId as the device if to_device is false
bool is_managed = attr.type == cudaMemoryTypeManaged;
if (to_device && is_managed &&
space.cuda_device_prop().concurrentManagedAccess) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync(
ptr, bytes, space.cuda_device(), space.cuda_stream()));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(space.impl_internal_space_instance()->cuda_mem_prefetch_async_wrapper(
ptr, bytes, space.cuda_device(), space.cuda_stream())));
}
}

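Note on the call sites above: every wrapped call is passed to KOKKOS_IMPL_CUDA_SAFE_CALL inside an extra pair of parentheses. Presumably this keeps the whole expression a single preprocessor argument even when it contains top-level commas (explicit template arguments, for example). A minimal illustration with a hypothetical macro, not the Kokkos definition:

#include <cstdio>
#include <utility>

// Hypothetical macro used only to show the preprocessor issue.
#define SAFE_CALL(expr) report((expr), #expr)

void report(int value, const char* text) { std::printf("%s -> %d\n", text, value); }

int main() {
  // SAFE_CALL(std::pair<int, int>(1, 2).first);  // error: the comma inside
  //                                              // <int, int> splits the macro
  //                                              // argument in two
  SAFE_CALL((std::pair<int, int>(1, 2).first));   // the extra parentheses keep
                                                  // it as one argument
  return 0;
}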
43 changes: 30 additions & 13 deletions core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
@@ -33,6 +33,7 @@
#include <Cuda/Kokkos_Cuda.hpp>
#include <cuda_runtime_api.h>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>

namespace Kokkos {
namespace Impl {
@@ -55,8 +56,11 @@ struct GraphImpl<Kokkos::Cuda> {
constexpr size_t error_log_size = 256;
cudaGraphNode_t error_node = nullptr;
char error_log[error_log_size];
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate(
&m_graph_exec, m_graph, &error_node, error_log, error_log_size));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(m_execution_space.impl_internal_space_instance()
->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph,
&error_node, error_log,
error_log_size)));
// TODO @graphs print out errors
}

@@ -83,24 +87,31 @@ struct GraphImpl<Kokkos::Cuda> {
m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
KOKKOS_EXPECTS(bool(m_graph))
if (bool(m_graph_exec)) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(m_execution_space.impl_internal_space_instance()
->cuda_graph_exec_destroy_wrapper(m_graph_exec)));
}
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
KOKKOS_IMPL_CUDA_SAFE_CALL(
(m_execution_space.impl_internal_space_instance()
->cuda_graph_destroy_wrapper(m_graph)));
};

explicit GraphImpl(Kokkos::Cuda arg_instance)
: m_execution_space(std::move(arg_instance)) {
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_create_wrapper(&m_graph, cuda_graph_flags_t{0})));
}

void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
// All of the predecessors are just added as normal, so all we need to
// do here is add an empty node
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_add_empty_node_wrapper(
&(arg_node_ptr->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0)));
}

template <class NodeImpl>
@@ -146,15 +157,19 @@ struct GraphImpl<Kokkos::Cuda> {
KOKKOS_EXPECTS(bool(cuda_node))

KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_add_dependencies_wrapper(m_graph, &pred_cuda_node,
&cuda_node, 1)));
}

void submit() {
if (!bool(m_graph_exec)) {
_instantiate_graph();
}
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream()));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_launch_wrapper(m_graph_exec,
m_execution_space.cuda_stream())));
}

execution_space const& get_execution_space() const noexcept {
@@ -167,9 +182,11 @@
auto rv = std::make_shared<root_node_impl_t>(
get_execution_space(), _graph_node_is_root_ctor_tag{});
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
(m_execution_space.impl_internal_space_instance()
->cuda_graph_add_empty_node_wrapper(&(rv->node_details_t::node),
m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0)));
KOKKOS_ENSURES(bool(rv->node_details_t::node))
return rv;
}
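The graph-related call sites follow the same forwarding idea. As a hypothetical sketch (the real definitions are in Kokkos_Cuda_Instance.hpp and are not part of this diff), a graph wrapper might look like:

#include <cuda_runtime_api.h>

// Hypothetical sketch only; wrapper names mirror the call sites above, but the
// bodies and the m_cudaDev member are assumptions.
struct CudaGraphWrapperSketch {
  int m_cudaDev = 0;  // assumed member holding this instance's device id

  cudaError_t cuda_graph_create_wrapper(cudaGraph_t* graph,
                                        unsigned int flags) const {
    cudaSetDevice(m_cudaDev);
    return cudaGraphCreate(graph, flags);
  }

  cudaError_t cuda_graph_launch_wrapper(cudaGraphExec_t graph_exec,
                                        cudaStream_t stream) const {
    cudaSetDevice(m_cudaDev);
    return cudaGraphLaunch(graph_exec, stream);
  }
};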
