Skip to content

Commit

Permalink
Update the OpenACC parallel_reduce() constructs with Range/MDRange/Team (kokkos#6072)
Browse files Browse the repository at this point in the history

* Update the OpenACC parallel_reduce() constructs with Range/MDRange/Team
Policy to support reductions on device data.

* Update as suggested by the code review.

* Add comments as suggested by the code review.

* Undo the unit test CMake change.

* Update the OpenACC parallel_reduce() implementations to correctly handle
the cases where the number of iterations is zero.
Update reduction-related unit tests to disable unsupported tests for the
OpenACC backend.
Update CMakeLists.txt in the unit test directory to enable the
reduction-related unit tests supported by the OpenACC backend.

* Re-enabled supported unit tests.

* Disable TestOpenACC_Reducers_a.cpp because it fails when compiled with NVHPC V22.5 or older.

* Disable unsupported unit test.
  • Loading branch information
seyonglee committed May 8, 2023
1 parent cf82edc commit 1c0e3bf
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 55 deletions.
33 changes: 28 additions & 5 deletions core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,29 +51,40 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
CombinedFunctorReducerType m_functor_reducer;
Policy m_policy;
Pointer m_result_ptr;
bool m_result_ptr_on_device;

public:
template <class ViewType>
ParallelReduce(const CombinedFunctorReducerType& functor_reducer,
const Policy& policy, const ViewType& result)
: m_functor_reducer(functor_reducer),
m_policy(policy),
m_result_ptr(result.data()) {}
m_result_ptr(result.data()),
m_result_ptr_on_device(
MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
typename ViewType::memory_space>::accessible) {}

void execute() const {
static_assert(1 < Policy::rank && Policy::rank < 7);
static_assert(Policy::inner_direction == Iterate::Left ||
Policy::inner_direction == Iterate::Right);
constexpr int rank = Policy::rank;
ValueType val;
const ReducerType& reducer = m_functor_reducer.get_reducer();
reducer.init(&val);

for (int i = 0; i < rank; ++i) {
if (m_policy.m_lower[i] >= m_policy.m_upper[i]) {
if (m_result_ptr_on_device) {
acc_memcpy_to_device(m_result_ptr, &val, sizeof(ValueType));
} else {
*m_result_ptr = val;
}
return;
}
}

ValueType val;
const ReducerType& reducer = m_functor_reducer.get_reducer();
reducer.init(&val);
int const async_arg = m_policy.space().acc_async_queue();

Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper(
Kokkos::Experimental::Impl::FunctorAdapter<
Expand All @@ -85,8 +96,20 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
Sum<ValueType>, typename ReducerType::functor_type>(val),
m_policy);

// The OpenACC backend supports only built-in Reducer types; thus the
// reducer.final() call below is a no-op.
reducer.final(&val);
*m_result_ptr = val;
// The acc_wait(async_arg) call in each branch of the if-else statement below
// is needed because the OpenACC compute kernel above can execute
// asynchronously and val is a local host variable of execute().
if (m_result_ptr_on_device) {
acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(ValueType),
async_arg);
acc_wait(async_arg);
} else {
acc_wait(async_arg);
*m_result_ptr = val;
}
}
};

Expand Down
33 changes: 28 additions & 5 deletions core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,26 +52,37 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
CombinedFunctorReducerType m_functor_reducer;
Policy m_policy;
Pointer m_result_ptr;
bool m_result_ptr_on_device;

public:
template <class ViewType>
ParallelReduce(CombinedFunctorReducerType const& functor_reducer,
Policy const& policy, ViewType const& result)
: m_functor_reducer(functor_reducer),
m_policy(policy),
m_result_ptr(result.data()) {}
m_result_ptr(result.data()),
m_result_ptr_on_device(
MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
typename ViewType::memory_space>::accessible) {}

void execute() const {
auto const begin = m_policy.begin();
auto const end = m_policy.end();

ValueType val;
ReducerType const& reducer = m_functor_reducer.get_reducer();
reducer.init(&val);

if (end <= begin) {
if (m_result_ptr_on_device == false) {
*m_result_ptr = val;
} else {
acc_memcpy_to_device(m_result_ptr, &val, sizeof(ValueType));
}
return;
}

ValueType val;
ReducerType const& reducer = m_functor_reducer.get_reducer();
reducer.init(&val);
int const async_arg = m_policy.space().acc_async_queue();

Kokkos::Experimental::Impl::OpenACCParallelReduceHelper(
Kokkos::Experimental::Impl::FunctorAdapter<
Expand All @@ -83,8 +94,20 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
Sum<ValueType>, typename ReducerType::functor_type>(val),
m_policy);

// The OpenACC backend supports only built-in Reducer types; thus the
// reducer.final() call below is a no-op.
reducer.final(&val);
*m_result_ptr = val;
// The acc_wait(async_arg) call in each branch of the if-else statement below
// is needed because the OpenACC compute kernel above can execute
// asynchronously and val is a local host variable of execute().
if (m_result_ptr_on_device == false) {
acc_wait(async_arg);
*m_result_ptr = val;
} else {
acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(ValueType),
async_arg);
acc_wait(async_arg);
}
}
};

Expand Down
38 changes: 31 additions & 7 deletions core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,37 +63,61 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
CombinedFunctorReducerType m_functor_reducer;
Policy m_policy;
pointer_type m_result_ptr;
bool m_result_ptr_on_device;

public:
void execute() const {
auto league_size = m_policy.league_size();
auto team_size = m_policy.team_size();
auto vector_length = m_policy.impl_vector_length();

value_type tmp;
int const async_arg = m_policy.space().acc_async_queue();
value_type val;
const ReducerType& reducer = m_functor_reducer.get_reducer();
reducer.init(&tmp);
reducer.init(&val);
if (league_size <= 0) {
if (m_result_ptr_on_device == false) {
*m_result_ptr = val;
} else {
acc_memcpy_to_device(m_result_ptr, &val, sizeof(value_type));
}
return;
}

Kokkos::Experimental::Impl::OpenACCParallelReduceTeamHelper(
Kokkos::Experimental::Impl::FunctorAdapter<
FunctorType, Policy, KOKKOS_IMPL_OPENACC_LOOP_CLAUSE>(
m_functor_reducer.get_functor()),
std::conditional_t<
std::is_same_v<FunctorType, typename ReducerType::functor_type>,
Sum<value_type>, typename ReducerType::functor_type>(tmp),
Sum<value_type>, typename ReducerType::functor_type>(val),
m_policy);

reducer.final(&tmp);

m_result_ptr[0] = tmp;
// The OpenACC backend supports only built-in Reducer types; thus the
// reducer.final() call below is a no-op.
reducer.final(&val);
// The acc_wait(async_arg) call in each branch of the if-else statement below
// is needed because the OpenACC compute kernel above can execute
// asynchronously and val is a local host variable of execute().
if (m_result_ptr_on_device == false) {
acc_wait(async_arg);
*m_result_ptr = val;
} else {
acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(value_type),
async_arg);
acc_wait(async_arg);
}
}

template <class ViewType>
ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
const Policy& arg_policy, const ViewType& arg_result_view)
: m_functor_reducer(arg_functor_reducer),
m_policy(arg_policy),
m_result_ptr(arg_result_view.data()) {}
m_result_ptr(arg_result_view.data()),
m_result_ptr_on_device(
MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
typename ViewType::memory_space>::accessible) {}
};

namespace Kokkos {
Expand Down
39 changes: 1 addition & 38 deletions core/unit_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -367,23 +367,16 @@ if(Kokkos_ENABLE_OPENACC)
list(REMOVE_ITEM OpenACC_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamCombinedReducers.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamMDRange.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamReductionScan.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScan.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_e.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_a.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_b.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_subview.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewOfClass.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_WorkGraph.cpp
Expand Down Expand Up @@ -492,61 +485,31 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions1.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions2.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_a.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_b.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_c.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_d.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_e.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_f.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_g.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reduce.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_b.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_c.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp #fails if NVHPC V22.5 or lower.
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_e.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_a.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c01.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c04.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c06.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c07.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c09.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c10.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c12.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c13.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_a.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_b.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_c.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_d.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_f.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp
)
Expand Down
6 changes: 6 additions & 0 deletions core/unit_test/TestReduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,10 @@ class TestReduceDynamic {

TestReduceDynamic(const size_type nwork) {
run_test_dynamic(nwork);
#ifndef KOKKOS_ENABLE_OPENACC
// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
run_test_dynamic_minmax(nwork);
#endif
run_test_dynamic_final(nwork);
}

Expand Down Expand Up @@ -542,6 +545,8 @@ TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) {

// FIXME_OPENMPTARGET: Not yet implemented.
#ifndef KOKKOS_ENABLE_OPENMPTARGET
// FIXME_OPENACC: Not yet implemented.
#ifndef KOKKOS_ENABLE_OPENACC
TEST(TEST_CATEGORY, int_combined_reduce) {
using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>;
constexpr uint64_t nw = 1000;
Expand Down Expand Up @@ -619,4 +624,5 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) {
}
}
#endif
#endif
} // namespace Test
18 changes: 18 additions & 0 deletions core/unit_test/TestReducers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -982,14 +982,23 @@ struct TestReducers {
test_sum(10001);
test_prod(35);
test_min(10003);
#if !defined(KOKKOS_ENABLE_OPENACC)
// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
test_minloc(10003);
#endif
test_max(10007);
#if !defined(KOKKOS_ENABLE_OPENACC)
// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
test_maxloc(10007);
#endif
#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \
(KOKKOS_COMPILER_CLANG < 1300)
// FIXME_OPENMPTARGET - The minmaxloc test fails with LLVM <= 13.
#else
#if !defined(KOKKOS_ENABLE_OPENACC)
// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
test_minmaxloc(10007);
#endif
#endif
}

Expand All @@ -1000,14 +1009,23 @@ struct TestReducers {
test_sum(10001);
test_prod(sizeof(Scalar) > 4 ? 35 : 19); // avoid int overflow (see above)
test_min(10003);
#if !defined(KOKKOS_ENABLE_OPENACC)
// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
test_minloc(10003);
#endif
test_max(10007);
#if !defined(KOKKOS_ENABLE_OPENACC)
// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
test_maxloc(10007);
#endif
#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \
(KOKKOS_COMPILER_CLANG < 1300)
// FIXME_OPENMPTARGET - The minmaxloc test fails with LLVM <= 13.
#else
#if !defined(KOKKOS_ENABLE_OPENACC)
// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
test_minmaxloc(10007);
#endif
#endif
test_BAnd(35);
test_BOr(35);
Expand Down

0 comments on commit 1c0e3bf

Please sign in to comment.