Skip to content

Commit

Permalink
SYCL: Implement DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION path (k…
Browse files Browse the repository at this point in the history
…okkos#6534)

* SYCL: Implement DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION path

* Sync with desul

* [ci skip] Try improving comments

* Configure DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION instead of compiler flag

* Print SYCL_EXT_ONEAPI_DEVICE_GLOBAL in configuration
  • Loading branch information
masterleinad committed Nov 8, 2023
1 parent 8008496 commit 26464df
Show file tree
Hide file tree
Showing 13 changed files with 113 additions and 24 deletions.
13 changes: 9 additions & 4 deletions cmake/kokkos_arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -585,16 +585,20 @@ IF (KOKKOS_ENABLE_SYCL)
ENDIF()

# Check support for device_global variables
# FIXME_SYCL Even if SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we still can't
# use device global variables with shared libraries
IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS)
# FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device
# global variables with shared libraries using the "non-separable compilation"
# implementation. Otherwise, the feature is not supported when building shared
# libraries. Thus, we don't even check for support if shared libraries are
# requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined.
IF(KOKKOS_ENABLE_SYCL)
STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}")
INCLUDE(CheckCXXSymbolExists)
CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL)
IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL)
SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON)
# Use the non-separable compilation implementation to support shared libraries as well.
COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED)
ELSE()
ELSEIF(NOT BUILD_SHARED_LIBS)
INCLUDE(CheckCXXSourceCompiles)
CHECK_CXX_SOURCE_COMPILES("
#include <sycl/sycl.hpp>
Expand All @@ -614,6 +618,7 @@ IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS)
KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)

IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
# Only the separable compilation implementation is supported.
COMPILER_SPECIFIC_FLAGS(
DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
)
Expand Down
3 changes: 3 additions & 0 deletions core/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ IF (NOT desul_FOUND)
ENDIF()
IF(KOKKOS_ENABLE_SYCL)
SET(DESUL_ATOMICS_ENABLE_SYCL ON)
IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL)
SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON)
ENDIF()
ENDIF()
IF(KOKKOS_ENABLE_OPENMPTARGET)
SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP
Expand Down
5 changes: 5 additions & 0 deletions core/src/SYCL/Kokkos_SYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const {
#else
os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n";
#endif
#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL
os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n";
#else
os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n";
#endif

#ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n";
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,

const BarePolicy bare_policy(m_policy);

desul::ensure_sycl_lock_arrays_on_device(q);

auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
const auto range = compute_ranges();
const sycl::range<3> global_range = range.get_global_range();
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
const Kokkos::Experimental::SYCL& space = policy.space();
sycl::queue& q = space.sycl_queue();

desul::ensure_sycl_lock_arrays_on_device(q);

auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
cgh.depends_on(memcpy_event);
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
const Kokkos::Experimental::SYCL& space = policy.space();
sycl::queue& q = space.sycl_queue();

desul::ensure_sycl_lock_arrays_on_device(q);

auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
// FIXME_SYCL accessors seem to need a size greater than zero at least for
// host queues
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,

sycl::event last_reduction_event;

desul::ensure_sycl_lock_arrays_on_device(q);

// If n_tiles==0 we only call init() and final() working with the global
// scratch memory but don't copy back to m_result_ptr yet.
if (n_tiles == 0) {
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,

sycl::event last_reduction_event;

desul::ensure_sycl_lock_arrays_on_device(q);

// If size<=1 we only call init(), the functor and possibly final once
// working with the global scratch memory but don't copy back to
// m_result_ptr yet.
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,

sycl::event last_reduction_event;

desul::ensure_sycl_lock_arrays_on_device(q);

// If size<=1 we only call init(), the functor and possibly final once
// working with the global scratch memory but don't copy back to
// m_result_ptr yet.
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ class ParallelScanSYCLBase {
sycl::device_ptr<value_type> global_mem;
sycl::device_ptr<value_type> group_results;

desul::ensure_sycl_lock_arrays_on_device(q);

auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) {
sycl::local_accessor<unsigned int> num_teams_done(1, cgh);

Expand Down
1 change: 1 addition & 0 deletions tpls/desul/Config.hpp.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ SPDX-License-Identifier: (BSD-3-Clause)
#cmakedefine DESUL_ATOMICS_ENABLE_HIP
#cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION
#cmakedefine DESUL_ATOMICS_ENABLE_SYCL
#cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION
#cmakedefine DESUL_ATOMICS_ENABLE_OPENMP

#endif
80 changes: 74 additions & 6 deletions tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,35 @@ void finalize_lock_arrays_sycl(sycl::queue q);
* \brief This global variable in SYCL space is what kernels use to get access
* to the lock arrays.
*
* There is only one single instance of this global variable for the entire
* executable, whose definition will be in Kokkos_SYCL_Locks.cpp (and whose
* declaration here must be extern). This one instance will be initialized
* by initialize_host_sycl_lock_arrays and need not be modified afterwards.
* When relocatable device code is enabled, there is only one single instance of this
* global variable for the entire executable, whose definition will be in
* Kokkos_SYCL_Locks.cpp (and whose declaration here must then be extern). This one
* instance will be initialized by initialize_host_sycl_lock_arrays and need not be
* modified afterwards.
*
* When relocatable device code is disabled, an instance of this variable will be
* created in every translation unit that sees this header file (we make this clear by
* marking it static, meaning no other translation unit can link to it). Since the
* Kokkos_SYCL_Locks.cpp translation unit cannot initialize the instances in other
* translation units, we must update this SYCL global variable based on the Host global
* variable prior to running any kernels that will use it. That is the purpose of the
* ensure_sycl_lock_arrays_on_device function.
*/
SYCL_EXTERNAL extern sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_DEVICE;
#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION
SYCL_EXTERNAL extern
#else
static
#endif
sycl_device_global<int32_t*>
SYCL_SPACE_ATOMIC_LOCKS_DEVICE;

SYCL_EXTERNAL extern sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_NODE;
#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION
SYCL_EXTERNAL extern
#else
static
#endif
sycl_device_global<int32_t*>
SYCL_SPACE_ATOMIC_LOCKS_NODE;

#define SYCL_SPACE_ATOMIC_MASK 0x1FFFF

Expand Down Expand Up @@ -128,6 +149,34 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) {
lock_node_ref.exchange(0);
}

#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION
inline
#else
inline static
#endif
void
copy_sycl_lock_arrays_to_device(sycl::queue q) {
static bool once = [&q]() {
#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL
q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE,
&SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h,
sizeof(int32_t*));
q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE,
&SYCL_SPACE_ATOMIC_LOCKS_NODE_h,
sizeof(int32_t*));
#else
auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h;
auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h;
q.single_task([=] {
SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr;
SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr;
});
#endif
return true;
}();
(void)once;
}

#else // not supported

template <typename /*AlwaysInt*/ = int>
Expand Down Expand Up @@ -155,7 +204,26 @@ inline bool lock_address_sycl(void*, MemoryScopeNode) {
inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); }

inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); }

#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION
inline
#else
inline static
#endif
void
copy_sycl_lock_arrays_to_device(sycl::queue) {
}

#endif
} // namespace Impl

#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION
inline void ensure_sycl_lock_arrays_on_device(sycl::queue) {}
#else
static inline void ensure_sycl_lock_arrays_on_device(sycl::queue q) {
Impl::copy_sycl_lock_arrays_to_device(q);
}
#endif

} // namespace desul
#endif
21 changes: 7 additions & 14 deletions tpls/desul/src/Lock_Array_SYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ SPDX-License-Identifier: (BSD-3-Clause)

namespace desul::Impl {

#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION
SYCL_EXTERNAL
sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_DEVICE;
SYCL_EXTERNAL
sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_NODE;
#endif

int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
Expand All @@ -31,19 +33,7 @@ void init_lock_arrays_sycl<int>(sycl::queue q) {
SYCL_SPACE_ATOMIC_LOCKS_NODE_h =
sycl::malloc_host<int32_t>(SYCL_SPACE_ATOMIC_MASK + 1, q);

// FIXME_SYCL Once supported, the following should be replaced by
// q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE,
// &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h,
// sizeof(int32_t*));
// q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE,
// &SYCL_SPACE_ATOMIC_LOCKS_NODE_h,
// sizeof(int32_t*));
auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h;
auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h;
q.single_task([=] {
SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr;
SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr;
});
copy_sycl_lock_arrays_to_device(q);

q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h,
0,
Expand All @@ -63,7 +53,10 @@ void finalize_lock_arrays_sycl<int>(sycl::queue q) {
sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q);
SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION
copy_sycl_lock_arrays_to_device(q);
#endif
}

} // namespace desul::Impl
} // namespace desul::Impl
#endif

0 comments on commit 26464df

Please sign in to comment.