Skip to content

Commit

Permalink
view(uvm): fence if need in allocation (kokkos#6005)
Browse files Browse the repository at this point in the history
  • Loading branch information
romintomasetti committed May 4, 2023
1 parent 56ef02c commit 5c2d948
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 112 deletions.
7 changes: 4 additions & 3 deletions containers/src/Kokkos_DualView.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,8 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
: modified_flags(t_modified_flags("DualView::modified_flags")),
d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) {
// without UVM, host View mirrors
if (Kokkos::Impl::has_type<Impl::WithoutInitializing_t, P...>::value)
if constexpr (Kokkos::Impl::has_type<Impl::WithoutInitializing_t,
P...>::value)
h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view);
else
h_view = Kokkos::create_mirror_view(d_view);
Expand Down Expand Up @@ -576,8 +577,8 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
impl_report_host_sync();
}
}
if (std::is_same<typename t_host::memory_space,
typename t_dev::memory_space>::value) {
if constexpr (std::is_same<typename t_host::memory_space,
typename t_dev::memory_space>::value) {
typename t_dev::execution_space().fence(
"Kokkos::DualView<>::sync: fence after syncing DualView");
typename t_host::execution_space().fence(
Expand Down
25 changes: 0 additions & 25 deletions containers/src/Kokkos_DynRankView.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1089,37 +1089,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
"execution space");
}

//------------------------------------------------------------
#if defined(KOKKOS_ENABLE_CUDA)
// If allocating in CudaUVMSpace must fence before and after
// the allocation to protect against possible concurrent access
// on the CPU and the GPU.
// Fence using the trait's executon space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from usng Kokkos::Cuda directly.
if (std::is_same<Kokkos::CudaUVMSpace,
typename traits::device_type::memory_space>::value) {
typename traits::device_type::memory_space::execution_space().fence(
"Kokkos::DynRankView<>::DynRankView: fence before UVM allocation");
}
#endif
//------------------------------------------------------------

Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
prop_copy,
Impl::DynRankDimTraits<typename traits::specialize>::
template createLayout<traits, P...>(arg_prop, arg_layout),
Impl::ViewCtorProp<P...>::has_execution_space);

//------------------------------------------------------------
#if defined(KOKKOS_ENABLE_CUDA)
if (std::is_same<Kokkos::CudaUVMSpace,
typename traits::device_type::memory_space>::value) {
typename traits::device_type::memory_space::execution_space().fence(
"Kokkos::DynRankView<>::DynRankView: fence after UVM allocation");
}
#endif
//------------------------------------------------------------

// Setup and initialization complete, start tracking
m_track.assign_allocated_record_to_uninitialized(record);
}
Expand Down
25 changes: 0 additions & 25 deletions containers/src/Kokkos_OffsetView.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1191,35 +1191,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
"execution space");
}

//------------------------------------------------------------
#if defined(KOKKOS_ENABLE_CUDA)
// If allocating in CudaUVMSpace must fence before and after
// the allocation to protect against possible concurrent access
// on the CPU and the GPU.
// Fence using the trait's executon space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from usng Kokkos::Cuda directly.
if (std::is_same<Kokkos::CudaUVMSpace,
typename traits::device_type::memory_space>::value) {
typename traits::device_type::memory_space::execution_space().fence(
"Kokkos::OffsetView::OffsetView(): fence before UVM allocation");
}
#endif
//------------------------------------------------------------

Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
prop_copy, arg_layout,
Kokkos::Impl::ViewCtorProp<P...>::has_execution_space);

//------------------------------------------------------------
#if defined(KOKKOS_ENABLE_CUDA)
if (std::is_same<Kokkos::CudaUVMSpace,
typename traits::device_type::memory_space>::value) {
typename traits::device_type::memory_space::execution_space().fence(
"Kokkos::OffsetView::OffsetView(): fence after UVM allocation");
}
#endif
//------------------------------------------------------------

// Setup and initialization complete, start tracking
m_track.assign_allocated_record_to_uninitialized(record);

Expand Down
54 changes: 24 additions & 30 deletions containers/unit_tests/TestWithoutInitializing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@

#include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp>

/// Some tests are skipped for @c CudaUVM memory space.
/// @todo To be revised according to the future of @c KOKKOS_ENABLE_CUDA_UVM.
///@{
#ifdef KOKKOS_ENABLE_CUDA
#define GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE \
if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, \
Kokkos::CudaUVMSpace>) \
GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
#else
#define GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
#endif
///@}

TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels());
Expand Down Expand Up @@ -125,11 +138,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) {
}

TEST(TEST_CATEGORY, realloc_exec_space_dualview) {
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<typename TEST_EXECSPACE::memory_space,
Kokkos::CudaUVMSpace>::value)
GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
#endif
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE

using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableFences());
Expand Down Expand Up @@ -221,11 +230,8 @@ TEST(TEST_CATEGORY, resize_exec_space_dynrankview) {
}

TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) {
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<typename TEST_EXECSPACE::memory_space,
Kokkos::CudaUVMSpace>::value)
GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
#endif
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE

// FIXME_THREADS The Threads backend fences every parallel_for
#ifdef KOKKOS_ENABLE_THREADS
if (std::is_same<TEST_EXECSPACE, Kokkos::Threads>::value)
Expand Down Expand Up @@ -363,11 +369,8 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) {
}

TEST(TEST_CATEGORY, realloc_exec_space_scatterview) {
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<typename TEST_EXECSPACE::memory_space,
Kokkos::CudaUVMSpace>::value)
GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
#endif
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE

// FIXME_THREADS The Threads backend fences every parallel_for
#ifdef KOKKOS_ENABLE_THREADS
if (std::is_same<typename TEST_EXECSPACE, Kokkos::Threads>::value)
Expand Down Expand Up @@ -477,11 +480,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynrankview_viewctor) {
}

TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynrankview) {
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<typename TEST_EXECSPACE::memory_space,
Kokkos::CudaUVMSpace>::value)
return;
#endif
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE

using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
Config::EnableFences());
Expand Down Expand Up @@ -584,11 +584,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_offsetview_view_ctor) {
}

TEST(TEST_CATEGORY, create_mirror_view_and_copy_offsetview) {
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<typename TEST_EXECSPACE::memory_space,
Kokkos::CudaUVMSpace>::value)
return;
#endif
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE

using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
Config::EnableFences());
Expand Down Expand Up @@ -659,11 +656,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {
}

TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<typename TEST_EXECSPACE::memory_space,
Kokkos::CudaUVMSpace>::value)
return;
#endif
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE

using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
Config::EnableFences());
Expand Down
11 changes: 7 additions & 4 deletions core/src/Cuda/Kokkos_CudaSpace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,11 @@ void *impl_allocate_common(const Cuda &exec_space, const char *arg_label,
if (exec_space_provided) {
cudaStream_t stream = exec_space.cuda_stream();
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
exec_space.fence("Kokkos::Cuda: backend fence after async malloc");
} else {
error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async malloc");
}
} else {
error_code = cudaMalloc(&ptr, arg_alloc_size);
Expand Down Expand Up @@ -324,9 +325,11 @@ void CudaSpace::impl_deallocate(
#error CUDART_VERSION undefined!
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence before async free");
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async free");
} else {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
}
Expand Down
25 changes: 0 additions & 25 deletions core/src/Kokkos_View.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1418,34 +1418,9 @@ class View : public ViewTraits<DataType, Properties...> {
std::is_same<typename traits::specialize, void>::value, i0, i1, i2, i3,
i4, i5, i6, i7, alloc_name);

//------------------------------------------------------------
#if defined(KOKKOS_ENABLE_CUDA)
// If allocating in CudaUVMSpace must fence before and after
// the allocation to protect against possible concurrent access
// on the CPU and the GPU.
// Fence using the trait's execution space (which will be Kokkos::Cuda)
// to avoid incomplete type errors from using Kokkos::Cuda directly.
if (std::is_same<Kokkos::CudaUVMSpace,
typename traits::device_type::memory_space>::value) {
typename traits::device_type::memory_space::execution_space().fence(
"Kokkos::View<...>::View: fence before allocating UVM");
}
#endif
//------------------------------------------------------------

Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
prop_copy, arg_layout, Impl::ViewCtorProp<P...>::has_execution_space);

//------------------------------------------------------------
#if defined(KOKKOS_ENABLE_CUDA)
if (std::is_same<Kokkos::CudaUVMSpace,
typename traits::device_type::memory_space>::value) {
typename traits::device_type::memory_space::execution_space().fence(
"Kokkos::View<...>::View: fence after allocating UVM");
}
#endif
//------------------------------------------------------------

// Setup and initialization complete, start tracking
m_track.m_tracker.assign_allocated_record_to_uninitialized(record);
}
Expand Down

0 comments on commit 5c2d948

Please sign in to comment.