Skip to content

Commit

Permalink
SYCL: Use host-pinned memory to copy reduction/scan result (kokkos#6500)
Browse files Browse the repository at this point in the history
* SYCL: Use host-pinned memory to copy reduction/scan result

* Remove unused variable

* m_shared_memory_lock -> m_host_scratch_lock; improve comments

* Add comment for choosing memcpy over fence+deep_copy

* m_[host_]scratch_lock->m_scratch_buffers_lock
  • Loading branch information
masterleinad committed Oct 26, 2023
1 parent 6056c6b commit 0975671
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 65 deletions.
30 changes: 29 additions & 1 deletion core/src/SYCL/Kokkos_SYCL_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Kokkos::View<uint32_t*, SYCLDeviceUSMSpace> sycl_global_unique_token_locks(
}

SYCLInternal::~SYCLInternal() {
if (!was_finalized || m_scratchSpace || m_scratchFlags) {
if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) {
std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call "
"Kokkos::Experimental::SYCL::finalize()"
<< std::endl;
Expand Down Expand Up @@ -199,11 +199,15 @@ void SYCLInternal::finalize() {
using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>;
if (nullptr != m_scratchSpace)
RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace));
if (nullptr != m_scratchHost)
RecordSYCL::decrement(RecordSYCL::get_record(m_scratchHost));
if (nullptr != m_scratchFlags)
RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags));
m_syclDev = -1;
m_scratchSpaceCount = 0;
m_scratchSpace = nullptr;
m_scratchHostCount = 0;
m_scratchHost = nullptr;
m_scratchFlagsCount = 0;
m_scratchFlags = nullptr;

Expand Down Expand Up @@ -250,6 +254,30 @@ sycl::device_ptr<void> SYCLInternal::scratch_space(const std::size_t size) {
return m_scratchSpace;
}

// Return a host-pinned (SYCL host USM) scratch buffer of at least `size`
// bytes, growing (never shrinking) the cached allocation on demand.
// Used to stage reduction/scan results so they can be memcpy'd to the
// caller after a fence instead of requiring a device-to-host DeepCopy.
// NOTE: callers must serialize access (see m_scratch_buffers_lock users);
// this function itself performs no locking.
sycl::host_ptr<void> SYCLInternal::scratch_host(const std::size_t size) {
  // Fix: the verification label previously said "scratch_unified" (a
  // leftover from the CUDA backend naming); use the actual function name
  // so a failed-initialization diagnostic points at the right call site.
  if (verify_is_initialized("scratch_host") &&
      m_scratchHostCount < scratch_count(size)) {
    m_scratchHostCount = scratch_count(size);

    using Record = Kokkos::Impl::SharedAllocationRecord<
        Kokkos::Experimental::SYCLHostUSMSpace, void>;

    // Release the old, too-small buffer before allocating the larger one.
    if (m_scratchHost) Record::decrement(Record::get_record(m_scratchHost));

    // Guard against std::size_t overflow when scaling by the grain size.
    std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort(
        m_scratchHostCount, sizeScratchGrain);
    Record* const r = Record::allocate(
        Kokkos::Experimental::SYCLHostUSMSpace(*m_queue),
        "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size);

    Record::increment(r);

    m_scratchHost = reinterpret_cast<size_type*>(r->data());
  }

  return m_scratchHost;
}

sycl::device_ptr<void> SYCLInternal::scratch_flags(const std::size_t size) {
if (verify_is_initialized("scratch_flags") &&
m_scratchFlagsCount < scratch_count(size)) {
Expand Down
3 changes: 3 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_Instance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class SYCLInternal {

sycl::device_ptr<void> scratch_space(const std::size_t size);
sycl::device_ptr<void> scratch_flags(const std::size_t size);
sycl::host_ptr<void> scratch_host(const std::size_t size);
int acquire_team_scratch_space();
sycl::device_ptr<void> resize_team_scratch_space(int scratch_pool_id,
std::int64_t bytes,
Expand All @@ -60,6 +61,8 @@ class SYCLInternal {

std::size_t m_scratchSpaceCount = 0;
sycl::device_ptr<size_type> m_scratchSpace = nullptr;
std::size_t m_scratchHostCount = 0;
sycl::host_ptr<size_type> m_scratchHost = nullptr;
std::size_t m_scratchFlagsCount = 0;
sycl::device_ptr<size_type> m_scratchFlags = nullptr;
// mutex to access shared memory
Expand Down
12 changes: 6 additions & 6 deletions core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
int m_shmem_size;
sycl::device_ptr<char> m_global_scratch_ptr;
size_t m_scratch_size[2];
// Only let one ParallelFor/Reduce modify the team scratch memory. The
// constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_lock;
// Only let one ParallelFor instance at a time use the team scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
int m_scratch_pool_id = -1;

template <typename FunctorWrapper>
Expand Down Expand Up @@ -141,9 +141,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_league_size(arg_policy.league_size()),
m_team_size(arg_policy.team_size()),
m_vector_size(arg_policy.impl_vector_length()),
m_scratch_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
m_scratch_buffers_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
// FIXME_SYCL optimize
if (m_team_size < 0)
m_team_size =
Expand Down
37 changes: 24 additions & 13 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_result_ptr_device_accessible(
MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
typename View::memory_space>::accessible),
m_shared_memory_lock(
m_scratch_buffers_lock(
m_space.impl_internal_space_instance()->m_mutexScratchSpace) {}

private:
Expand All @@ -95,6 +95,11 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const unsigned int value_count =
m_functor_reducer.get_reducer().value_count();
sycl::device_ptr<value_type> results_ptr;
auto host_result_ptr =
(m_result_ptr && !m_result_ptr_device_accessible)
? static_cast<sycl::host_ptr<value_type>>(
instance.scratch_host(sizeof(value_type) * value_count))
: nullptr;

sycl::event last_reduction_event;

Expand All @@ -109,8 +114,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
#endif
results_ptr = static_cast<sycl::device_ptr<value_type>>(
instance.scratch_space(sizeof(value_type) * value_count));
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(host_result_ptr);
cgh.single_task([=]() {
const CombinedFunctorReducerType& functor_reducer =
functor_reducer_wrapper.get_functor();
Expand Down Expand Up @@ -148,8 +155,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,

results_ptr = static_cast<sycl::device_ptr<value_type>>(
instance.scratch_space(sizeof(value_type) * value_count * n_wgroups));
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(host_result_ptr);
auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
instance.scratch_flags(sizeof(unsigned int)));

Expand Down Expand Up @@ -296,11 +305,13 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
// At this point, the reduced value is written to the entry in results_ptr
// and all that is left is to copy it back to the given result pointer if
// necessary.
if (m_result_ptr && !m_result_ptr_device_accessible) {
Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
Kokkos::Experimental::SYCLDeviceUSMSpace>(
m_space, m_result_ptr, results_ptr,
sizeof(*m_result_ptr) * value_count);
// Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower.
if (host_result_ptr) {
m_space.fence(
"Kokkos::Impl::ParallelReduce<SYCL, MDRangePolicy>::execute: result "
"not device-accessible");
std::memcpy(m_result_ptr, host_result_ptr,
sizeof(value_type) * value_count);
}

return last_reduction_event;
Expand Down Expand Up @@ -335,9 +346,9 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;

// Only let one Parallel/Scan modify the shared memory. The
// constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_shared_memory_lock;
// Only let one ParallelReduce instance at a time use the host scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
};

#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */
31 changes: 20 additions & 11 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_result_ptr_device_accessible(
MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
typename View::memory_space>::accessible),
m_shared_memory_lock(
m_scratch_buffers_lock(
p.space().impl_internal_space_instance()->m_mutexScratchSpace) {}

private:
Expand All @@ -70,8 +70,15 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const unsigned int value_count =
m_functor_reducer.get_reducer().value_count();
sycl::device_ptr<value_type> results_ptr = nullptr;
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto host_result_ptr =
(m_result_ptr && !m_result_ptr_device_accessible)
? static_cast<sycl::host_ptr<value_type>>(
instance.scratch_host(sizeof(value_type) * value_count))
: nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(host_result_ptr);

sycl::event last_reduction_event;

Expand Down Expand Up @@ -320,11 +327,13 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
// At this point, the reduced value is written to the entry in results_ptr
// and all that is left is to copy it back to the given result pointer if
// necessary.
if (m_result_ptr && !m_result_ptr_device_accessible) {
Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
Kokkos::Experimental::SYCLDeviceUSMSpace>(
space, m_result_ptr, results_ptr,
sizeof(*m_result_ptr) * value_count);
// Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower.
if (host_result_ptr) {
space.fence(
"Kokkos::Impl::ParallelReduce<SYCL, RangePolicy>::execute: result "
"not device-accessible");
std::memcpy(m_result_ptr, host_result_ptr,
sizeof(*m_result_ptr) * value_count);
}

return last_reduction_event;
Expand Down Expand Up @@ -354,9 +363,9 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;

// Only let one Parallel/Scan modify the shared memory. The
// constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_shared_memory_lock;
// Only let one ParallelReduce instance at a time use the host scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
};

#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */
43 changes: 28 additions & 15 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const size_type m_league_size;
int m_team_size;
const size_type m_vector_size;
// Only let one ParallelFor/Reduce modify the team scratch memory. The
// constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_lock;
// Only let one ParallelReduce instance at a time use the team scratch memory
// and the host scratch memory. The constructor acquires the mutex which is
// released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
int m_scratch_pool_id = -1;

template <typename PolicyType, typename CombinedFunctorReducerWrapper>
Expand All @@ -79,6 +80,11 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_functor_reducer.get_reducer().value_count();
std::size_t size = std::size_t(m_league_size) * m_team_size * m_vector_size;
value_type* results_ptr = nullptr;
auto host_result_ptr =
(m_result_ptr && !m_result_ptr_device_accessible)
? static_cast<sycl::host_ptr<value_type>>(
instance.scratch_host(sizeof(value_type) * value_count))
: nullptr;

sycl::event last_reduction_event;

Expand All @@ -89,8 +95,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
results_ptr =
static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
sizeof(value_type) * std::max(value_count, 1u)));
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(host_result_ptr);

auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
// FIXME_SYCL accessors seem to need a size greater than zero at least
Expand Down Expand Up @@ -164,8 +172,11 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
auto team_reduction_factory =
[&](sycl::local_accessor<value_type, 1> local_mem,
sycl::device_ptr<value_type> results_ptr) {
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(
host_result_ptr);
auto lambda = [=](sycl::nd_item<2> item) {
auto n_wgroups = item.get_group_range()[1];
int wgroup_size =
Expand Down Expand Up @@ -358,11 +369,13 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
// At this point, the reduced value is written to the entry in results_ptr
// and all that is left is to copy it back to the given result pointer if
// necessary.
if (m_result_ptr && !m_result_ptr_device_accessible) {
Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
Kokkos::Experimental::SYCLDeviceUSMSpace>(
space, m_result_ptr, results_ptr,
sizeof(*m_result_ptr) * value_count);
// Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower.
if (host_result_ptr) {
space.fence(
"Kokkos::Impl::ParallelReduce<SYCL, TeamPolicy>::execute: result not "
"device-accessible");
std::memcpy(m_result_ptr, host_result_ptr,
sizeof(*m_result_ptr) * value_count);
}

return last_reduction_event;
Expand Down Expand Up @@ -448,9 +461,9 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_league_size(arg_policy.league_size()),
m_team_size(arg_policy.team_size()),
m_vector_size(arg_policy.impl_vector_length()),
m_scratch_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
m_scratch_buffers_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
initialize();
}
};
Expand Down

0 comments on commit 0975671

Please sign in to comment.