Skip to content

Commit

Permalink
SYCL: Prepare Parallel* for Graphs (kokkos#6988)
Browse files Browse the repository at this point in the history
* SYCL: Make Parallel* copyable

* Address review comments

* Refactor Team policies further

* Fix alias for SYCL TeamPolicy ParallelReduce

* Improve const-correctness in Kokkos_SYCL_ParallelReduce_Team

* Fix up Kokkos_SYCL_ParallelReduce_Team.hpp
  • Loading branch information
masterleinad committed May 8, 2024
1 parent d61d75a commit 50a862c
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 108 deletions.
6 changes: 0 additions & 6 deletions core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,6 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
functor_wrapper.register_event(event);
}

ParallelFor(const ParallelFor&) = delete;
ParallelFor(ParallelFor&&) = delete;
ParallelFor& operator=(const ParallelFor&) = delete;
ParallelFor& operator=(ParallelFor&&) = delete;
~ParallelFor() = default;

ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
: m_functor(arg_functor),
m_policy(arg_policy),
Expand Down
6 changes: 0 additions & 6 deletions core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,6 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
functor_wrapper.register_event(event);
}

ParallelFor(const ParallelFor&) = delete;
ParallelFor(ParallelFor&&) = delete;
ParallelFor& operator=(const ParallelFor&) = delete;
ParallelFor& operator=(ParallelFor&&) = delete;
~ParallelFor() = default;

ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
: m_functor(arg_functor), m_policy(arg_policy) {}
};
Expand Down
53 changes: 25 additions & 28 deletions core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ template <typename FunctorType, typename... Properties>
class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
Kokkos::Experimental::SYCL> {
public:
using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
using Policy = TeamPolicy<Properties...>;
using functor_type = FunctorType;
using size_type = ::Kokkos::Experimental::SYCL::size_type;

Expand All @@ -44,19 +44,14 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
size_type const m_vector_size;
int m_shmem_begin;
int m_shmem_size;
sycl_device_ptr<char> m_global_scratch_ptr;
size_t m_scratch_size[2];
// Only let one ParallelFor instance at a time use the team scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
int m_scratch_pool_id = -1;

template <typename FunctorWrapper>
sycl::event sycl_direct_launch(const Policy& policy,
sycl::event sycl_direct_launch(const sycl_device_ptr<char> global_scratch_ptr,
const FunctorWrapper& functor_wrapper,
const sycl::event& memcpy_event) const {
// Convenience references
const Kokkos::Experimental::SYCL& space = policy.space();
const Kokkos::Experimental::SYCL& space = m_policy.space();
sycl::queue& q = space.sycl_queue();

desul::ensure_sycl_lock_arrays_on_device(q);
Expand All @@ -72,7 +67,6 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
// Avoid capturing *this since it might not be trivially copyable
const auto shmem_begin = m_shmem_begin;
const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
sycl_device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;

auto lambda = [=](sycl::nd_item<2> item) {
const member_type team_member(
Expand Down Expand Up @@ -125,28 +119,39 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
inline void execute() const {
if (m_league_size == 0) return;

auto& space = *m_policy.space().impl_internal_space_instance();
auto& instance = *m_policy.space().impl_internal_space_instance();

// Only let one instance at a time resize the instance's scratch memory
// allocations.
std::scoped_lock<std::mutex> team_scratch_lock(
instance.m_team_scratch_mutex);

// Functor's reduce memory, team scan memory, and team shared memory depend
// upon team size.
int scratch_pool_id = instance.acquire_team_scratch_space();
const sycl_device_ptr<char> global_scratch_ptr =
static_cast<sycl_device_ptr<char>>(instance.resize_team_scratch_space(
scratch_pool_id,
static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));

Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
indirectKernelMem = space.get_indirect_kernel_mem();
indirectKernelMem = instance.get_indirect_kernel_mem();

auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
m_functor, indirectKernelMem);

sycl::event event = sycl_direct_launch(m_policy, functor_wrapper,
sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper,
functor_wrapper.get_copy_event());
functor_wrapper.register_event(event);
space.register_team_scratch_event(m_scratch_pool_id, event);
instance.register_team_scratch_event(scratch_pool_id, event);
}

ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
: m_functor(arg_functor),
m_policy(arg_policy),
m_league_size(arg_policy.league_size()),
m_team_size(arg_policy.team_size()),
m_vector_size(arg_policy.impl_vector_length()),
m_scratch_buffers_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
m_vector_size(arg_policy.impl_vector_length()) {
// FIXME_SYCL optimize
if (m_team_size < 0)
m_team_size =
Expand All @@ -159,22 +164,14 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_scratch_size[0] = m_shmem_size;
m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);

// Functor's reduce memory, team scan memory, and team shared memory depend
// upon team size.
auto& space = *m_policy.space().impl_internal_space_instance();
m_scratch_pool_id = space.acquire_team_scratch_space();
m_global_scratch_ptr =
static_cast<sycl_device_ptr<char>>(space.resize_team_scratch_space(
m_scratch_pool_id,
static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));

if (static_cast<int>(space.m_maxShmemPerBlock) <
const auto& instance = *m_policy.space().impl_internal_space_instance();
if (static_cast<int>(instance.m_maxShmemPerBlock) <
m_shmem_size - m_shmem_begin) {
std::stringstream out;
out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
"Requested "
<< m_shmem_size - m_shmem_begin << " bytes but maximum is "
<< space.m_maxShmemPerBlock << '\n';
<< instance.m_maxShmemPerBlock << '\n';
Kokkos::Impl::throw_runtime_exception(out.str());
}

Expand Down
14 changes: 7 additions & 7 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_result_ptr(v.data()),
m_result_ptr_device_accessible(
MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
typename View::memory_space>::accessible),
m_scratch_buffers_lock(
m_space.impl_internal_space_instance()->m_mutexScratchSpace) {}
typename View::memory_space>::accessible) {}

private:
template <typename CombinedFunctorReducerWrapper>
Expand Down Expand Up @@ -330,6 +328,12 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
void execute() const {
Kokkos::Experimental::Impl::SYCLInternal& instance =
*m_space.impl_internal_space_instance();

// Only let one instance at a time resize the instance's scratch memory
// allocations.
std::scoped_lock<std::mutex> scratch_buffers_lock(
instance.m_mutexScratchSpace);

using IndirectKernelMem =
Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem();
Expand All @@ -349,10 +353,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const Kokkos::Experimental::SYCL& m_space;
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;

// Only let one ParallelReduce instance at a time use the host scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
};

#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */
14 changes: 7 additions & 7 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_result_ptr(v.data()),
m_result_ptr_device_accessible(
MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
typename View::memory_space>::accessible),
m_scratch_buffers_lock(
p.space().impl_internal_space_instance()->m_mutexScratchSpace) {}
typename View::memory_space>::accessible) {}

private:
template <typename PolicyType, typename CombinedFunctorReducerWrapper>
Expand Down Expand Up @@ -347,6 +345,12 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
void execute() const {
Kokkos::Experimental::Impl::SYCLInternal& instance =
*m_policy.space().impl_internal_space_instance();

// Only let one instance at a time resize the instance's scratch memory
// allocations.
std::scoped_lock<std::mutex> scratch_buffers_lock(
instance.m_mutexScratchSpace);

using IndirectKernelMem =
Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem();
Expand All @@ -366,10 +370,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const Policy m_policy;
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;

// Only let one ParallelReduce instance at a time use the host scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
};

#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */
84 changes: 38 additions & 46 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
Kokkos::TeamPolicy<Properties...>,
Kokkos::Experimental::SYCL> {
public:
using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
using Policy = TeamPolicy<Properties...>;
using FunctorType = typename CombinedFunctorReducerType::functor_type;
using ReducerType = typename CombinedFunctorReducerType::reducer_type;

Expand All @@ -54,24 +54,18 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const bool m_result_ptr_device_accessible;
size_type m_shmem_begin;
size_type m_shmem_size;
sycl_device_ptr<char> m_global_scratch_ptr;
size_t m_scratch_size[2];
const size_type m_league_size;
int m_team_size;
const size_type m_vector_size;
// Only let one ParallelReduce instance at a time use the team scratch memory
// and the host scratch memory. The constructor acquires the mutex which is
// released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
int m_scratch_pool_id = -1;

template <typename PolicyType, typename CombinedFunctorReducerWrapper>
template <typename CombinedFunctorReducerWrapper>
sycl::event sycl_direct_launch(
const PolicyType& policy,
const sycl_device_ptr<char> global_scratch_ptr,
const CombinedFunctorReducerWrapper& functor_reducer_wrapper,
const sycl::event& memcpy_event) const {
// Convenience references
const Kokkos::Experimental::SYCL& space = policy.space();
const Kokkos::Experimental::SYCL& space = m_policy.space();
Kokkos::Experimental::Impl::SYCLInternal& instance =
*space.impl_internal_space_instance();
sycl::queue& q = space.sycl_queue();
Expand Down Expand Up @@ -113,7 +107,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
// Avoid capturing *this since it might not be trivially copyable
const auto shmem_begin = m_shmem_begin;
const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
sycl_device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;

#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
cgh.depends_on(memcpy_event);
Expand Down Expand Up @@ -170,7 +163,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const auto shmem_begin = m_shmem_begin;
const auto league_size = m_league_size;
const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
sycl_device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
sycl::local_accessor<unsigned int> num_teams_done(1, cgh);

auto team_reduction_factory =
Expand Down Expand Up @@ -386,6 +378,22 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
inline void execute() {
Kokkos::Experimental::Impl::SYCLInternal& instance =
*m_policy.space().impl_internal_space_instance();

// Only let one instance at a time resize the instance's scratch memory
// allocations.
std::scoped_lock<std::mutex> scratch_buffers_lock(
instance.m_mutexScratchSpace);
std::scoped_lock<std::mutex> team_scratch_lock(
instance.m_team_scratch_mutex);

// Functor's reduce memory, team scan memory, and team shared memory depend
// upon team size.
int scratch_pool_id = instance.acquire_team_scratch_space();
const sycl_device_ptr<char> global_scratch_ptr =
static_cast<sycl_device_ptr<char>>(instance.resize_team_scratch_space(
scratch_pool_id,
static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));

using IndirectKernelMem =
Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem();
Expand All @@ -395,14 +403,24 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
indirectKernelMem);

sycl::event event =
sycl_direct_launch(m_policy, functor_reducer_wrapper,
sycl_direct_launch(global_scratch_ptr, functor_reducer_wrapper,
functor_reducer_wrapper.get_copy_event());
functor_reducer_wrapper.register_event(event);
instance.register_team_scratch_event(m_scratch_pool_id, event);
instance.register_team_scratch_event(scratch_pool_id, event);
}

private:
void initialize() {
template <class ViewType>
ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer,
Policy const& arg_policy, ViewType const& arg_result)
: m_functor_reducer(arg_functor_reducer),
m_policy(arg_policy),
m_result_ptr(arg_result.data()),
m_result_ptr_device_accessible(
MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
typename ViewType::memory_space>::accessible),
m_league_size(arg_policy.league_size()),
m_team_size(arg_policy.team_size()),
m_vector_size(arg_policy.impl_vector_length()) {
// FIXME_SYCL optimize
if (m_team_size < 0)
m_team_size = m_policy.team_size_recommended(
Expand All @@ -423,22 +441,15 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_scratch_size[0] = m_shmem_size;
m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);

// Functor's reduce memory, team scan memory, and team shared memory depend
// upon team size.
auto& space = *m_policy.space().impl_internal_space_instance();
m_scratch_pool_id = space.acquire_team_scratch_space();
m_global_scratch_ptr =
static_cast<sycl_device_ptr<char>>(space.resize_team_scratch_space(
m_scratch_pool_id,
static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));

if (static_cast<int>(space.m_maxShmemPerBlock) <
const Kokkos::Experimental::Impl::SYCLInternal& instance =
*m_policy.space().impl_internal_space_instance();
if (static_cast<int>(instance.m_maxShmemPerBlock) <
m_shmem_size - m_shmem_begin) {
std::stringstream out;
out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
"Requested "
<< m_shmem_size - m_shmem_begin << " bytes but maximum is "
<< space.m_maxShmemPerBlock << '\n';
<< instance.m_maxShmemPerBlock << '\n';
Kokkos::Impl::throw_runtime_exception(out.str());
}

Expand All @@ -448,25 +459,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
Kokkos::Impl::throw_runtime_exception(
"Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
}

public:
template <class ViewType>
ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer,
Policy const& arg_policy, ViewType const& arg_result)
: m_functor_reducer(arg_functor_reducer),
m_policy(arg_policy),
m_result_ptr(arg_result.data()),
m_result_ptr_device_accessible(
MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
typename ViewType::memory_space>::accessible),
m_league_size(arg_policy.league_size()),
m_team_size(arg_policy.team_size()),
m_vector_size(arg_policy.impl_vector_length()),
m_scratch_buffers_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
initialize();
}
};

#endif

0 comments on commit 50a862c

Please sign in to comment.