Skip to content

Commit

Permalink
Change the default execution policy behavior of the OpenACC backend from synchronous to asynchronous executions.
Browse files Browse the repository at this point in the history

- Change the default OpenACC async_arg value from acc_async_sync to acc_async_noval.
- Add acc_wait(async_arg) to scalar reduction operations (parallel_reduce()).
  • Loading branch information
seyonglee committed Jan 31, 2024
1 parent d2913cb commit eecd917
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 11 deletions.
26 changes: 17 additions & 9 deletions core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace,
// value checking is added as a safeguard. (The current NVHPC (V22.5)
// supports OpenACC V2.7.)
if (n > 0) {
acc_memcpy_device(dst, const_cast<void*>(src), n);
acc_memcpy_device_async(dst, const_cast<void*>(src), n, acc_async_noval);
}
}
DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst,
Expand All @@ -52,15 +52,15 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace,
ExecutionSpace> {
// Copies n bytes from src to dst using the OpenACC device-memory copy routine;
// nothing is copied when n == 0.
// (Rendered diff, +/- markers lost: of the two copy calls below, the first is
// the pre-commit line and the second is its post-commit replacement.)
DeepCopy(void* dst, const void* src, size_t n) {
if (n > 0) {
// Removed by this commit: copy call that takes no async argument.
acc_memcpy_device(dst, const_cast<void*>(src), n);
// Added by this commit: copy issued with the acc_async_noval async argument.
// NOTE(review): nothing in this constructor waits for the copy to finish --
// presumably callers fence before relying on dst; confirm.
acc_memcpy_device_async(dst, const_cast<void*>(src), n, acc_async_noval);
}
}
// Fences exec, then copies n bytes from src to dst using the OpenACC
// device-memory copy routine; nothing is copied when n == 0.
// (Rendered diff, +/- markers lost: of the two copy calls below, the first is
// the pre-commit line and the second is its post-commit replacement.)
DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
// The fence is issued unconditionally, even when n == 0.
exec.fence(
"Kokkos::Impl::DeepCopy<OpenACCSpace, OpenACCSpace, "
"ExecutionSpace>::DeepCopy: fence before copy");
if (n > 0) {
// Removed by this commit: copy call that takes no async argument.
acc_memcpy_device(dst, const_cast<void*>(src), n);
// Added by this commit: copy issued with the acc_async_noval async argument.
// NOTE(review): the async argument is the fixed acc_async_noval rather than
// a value obtained from exec, and no wait follows here -- confirm intended.
acc_memcpy_device_async(dst, const_cast<void*>(src), n, acc_async_noval);
}
}
};
Expand All @@ -70,7 +70,9 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace,
Kokkos::HostSpace,
Kokkos::Experimental::OpenACC> {
// Copies n bytes from src to dst using the OpenACC to-device copy routine;
// nothing is copied when n == 0.
// (Rendered diff, +/- markers lost: the first `if` statement is the pre-commit
// line; the second, three-line `if` statement is its post-commit replacement.)
DeepCopy(void* dst, const void* src, size_t n) {
// Removed by this commit: copy call that takes no async argument.
if (n > 0) acc_memcpy_to_device(dst, const_cast<void*>(src), n);
// Added by this commit: copy issued with the acc_async_noval async argument.
// NOTE(review): nothing here waits for the copy -- presumably src must stay
// valid and unmodified until a later fence; confirm.
if (n > 0)
acc_memcpy_to_device_async(dst, const_cast<void*>(src), n,
acc_async_noval);
}
DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst,
const void* src, size_t n) {
Expand All @@ -85,15 +87,17 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace,
Kokkos::HostSpace, ExecutionSpace> {
// Copies n bytes from src to dst using the OpenACC to-device copy routine;
// nothing is copied when n == 0.
// (Rendered diff, +/- markers lost: of the two copy calls below, the first is
// the pre-commit line and the second is its post-commit replacement.)
DeepCopy(void* dst, const void* src, size_t n) {
if (n > 0) {
// Removed by this commit: copy call that takes no async argument.
acc_memcpy_to_device(dst, const_cast<void*>(src), n);
// Added by this commit: copy issued with the acc_async_noval async argument.
// NOTE(review): nothing here waits for the copy -- presumably src must stay
// valid and unmodified until a later fence; confirm.
acc_memcpy_to_device_async(dst, const_cast<void*>(src), n,
acc_async_noval);
}
}
// Fences exec, then copies n bytes from src to dst using the OpenACC to-device
// copy routine; nothing is copied when n == 0.
// (Rendered diff, +/- markers lost: of the two copy calls below, the first is
// the pre-commit line and the second is its post-commit replacement.)
DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
// The fence is issued unconditionally, even when n == 0.
exec.fence(
"Kokkos::Impl::DeepCopy<OpenACCSpace, HostSpace, "
"ExecutionSpace>::DeepCopy: fence before copy");
if (n > 0) {
// Removed by this commit: copy call that takes no async argument.
acc_memcpy_to_device(dst, const_cast<void*>(src), n);
// Added by this commit: copy issued with the acc_async_noval async argument.
// NOTE(review): the async argument is the fixed acc_async_noval rather than
// a value obtained from exec, and no wait follows here -- confirm intended.
acc_memcpy_to_device_async(dst, const_cast<void*>(src), n,
acc_async_noval);
}
}
};
Expand All @@ -104,7 +108,8 @@ struct Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
Kokkos::Experimental::OpenACC> {
// Copies n bytes from src to dst using the OpenACC from-device copy routine;
// nothing is copied when n == 0.
// (Rendered diff, +/- markers lost: of the two copy calls below, the first is
// the pre-commit line and the second is its post-commit replacement.)
DeepCopy(void* dst, const void* src, size_t n) {
if (n > 0) {
// Removed by this commit: copy call that takes no async argument.
acc_memcpy_from_device(dst, const_cast<void*>(src), n);
// Added by this commit: copy issued with the acc_async_noval async argument.
// NOTE(review): dst is presumably host memory and nothing here waits for
// the copy, so reading dst right after construction looks unsafe without a
// fence -- confirm that callers synchronize.
acc_memcpy_from_device_async(dst, const_cast<void*>(src), n,
acc_async_noval);
}
}
DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst,
Expand All @@ -120,14 +125,17 @@ template <class ExecutionSpace>
struct Kokkos::Impl::DeepCopy<
Kokkos::HostSpace, Kokkos::Experimental::OpenACCSpace, ExecutionSpace> {
// Copies n bytes from src to dst using the OpenACC from-device copy routine;
// nothing is copied when n == 0.
// (Rendered diff, +/- markers lost: the first `if` statement is the pre-commit
// line; the second, three-line `if` statement is its post-commit replacement.)
DeepCopy(void* dst, const void* src, size_t n) {
// Removed by this commit: copy call that takes no async argument.
if (n > 0) acc_memcpy_from_device(dst, const_cast<void*>(src), n);
// Added by this commit: copy issued with the acc_async_noval async argument.
// NOTE(review): dst is presumably host memory and nothing here waits for the
// copy, so reading dst right after construction looks unsafe without a
// fence -- confirm that callers synchronize.
if (n > 0)
acc_memcpy_from_device_async(dst, const_cast<void*>(src), n,
acc_async_noval);
}
// Fences exec, then copies n bytes from src to dst using the OpenACC
// from-device copy routine; nothing is copied when n == 0.
// (Rendered diff, +/- markers lost: of the two copy calls below, the first is
// the pre-commit line and the second is its post-commit replacement.)
DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
// The fence is issued unconditionally, even when n == 0.
exec.fence(
"Kokkos::Impl::DeepCopy<HostSpace, OpenACCSpace, "
"ExecutionSpace>::DeepCopy: fence before copy");
if (n > 0) {
// Removed by this commit: copy call that takes no async argument.
acc_memcpy_from_device(dst, const_cast<void*>(src), n);
// Added by this commit: copy issued with the acc_async_noval async argument.
// NOTE(review): the async argument is the fixed acc_async_noval rather than
// a value obtained from exec, and no wait follows the copy into what is
// presumably host memory -- confirm that callers synchronize.
acc_memcpy_from_device_async(dst, const_cast<void*>(src), n,
acc_async_noval);
}
}
};
Expand Down
4 changes: 2 additions & 2 deletions core/src/OpenACC/Kokkos_OpenACC_Instance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ class OpenACCInternal {

public:
static int m_acc_device_num;
int m_async_arg = acc_async_sync;
int m_async_arg = acc_async_noval;

OpenACCInternal() = default;

static OpenACCInternal& singleton();

bool verify_is_initialized(const char* const label) const;

void initialize(int async_arg = acc_async_sync);
void initialize(int async_arg = acc_async_noval);
void finalize();
bool is_initialized() const;

Expand Down
10 changes: 10 additions & 0 deletions core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
functor(i0, i1, val); \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand All @@ -159,6 +160,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
functor(i0, i1, val); \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -188,6 +190,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
} \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -217,6 +220,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
} \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -248,6 +252,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
} \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -279,6 +284,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
} \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -314,6 +320,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
} \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -349,6 +356,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
} \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -388,6 +396,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
} \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -427,6 +436,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
} \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
} // namespace Kokkos::Experimental::Impl
Expand Down
2 changes: 2 additions & 0 deletions core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
functor(i, val); \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
\
Expand Down Expand Up @@ -169,6 +170,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
functor(i, val); \
} \
} \
acc_wait(async_arg); \
aval = val; \
} \
} // namespace Kokkos::Experimental::Impl
Expand Down
1 change: 1 addition & 0 deletions core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
vector_length); \
functor(team, val); \
} \
acc_wait(async_arg); \
aval = val; \
} \
} // namespace Kokkos::Experimental::Impl
Expand Down

0 comments on commit eecd917

Please sign in to comment.