Skip to content

Commit

Permalink
Avoid unnecessary zero-memset of the scratch flags in SYCL (kokkos#6739)
Browse files Browse the repository at this point in the history
* SYCL: Homogenize scratch_flags with CUDA and HIP

* Add comments for CUDA and HIP

* Fix typo [ci skip]

---------

Co-authored-by: Daniel Arndt <arndtd@ornl.gov>
  • Loading branch information
dalg24 and masterleinad committed Jan 25, 2024
1 parent d560c47 commit 650ac40
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 4 deletions.
3 changes: 3 additions & 0 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,9 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {
m_scratchFlags = static_cast<size_type *>(
mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size));

// We only zero-initialize the allocation when we actually allocate.
// It's the responsibility of the features using scratch_flags,
// namely parallel_reduce and parallel_scan, to reset the used values to 0.
KOKKOS_IMPL_CUDA_SAFE_CALL(
(cuda_memset_wrapper(m_scratchFlags, 0, alloc_size)));
}
Expand Down
3 changes: 3 additions & 0 deletions core/src/HIP/Kokkos_HIP_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,9 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) {
m_scratchFlags = static_cast<size_type *>(
mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size));

// We only zero-initialize the allocation when we actually allocate.
// It's the responsibility of the features using scratch_flags,
// namely parallel_reduce and parallel_scan, to reset the used values to 0.
KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size));
}

Expand Down
12 changes: 8 additions & 4 deletions core/src/SYCL/Kokkos_SYCL_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,16 @@ sycl::device_ptr<void> SYCLInternal::scratch_flags(const std::size_t size) {
m_scratchFlagsCount, sizeScratchGrain);
m_scratchFlags = static_cast<size_type*>(mem_space.allocate(
"Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size));
}
auto memset_event = m_queue->memset(m_scratchFlags, 0,
m_scratchFlagsCount * sizeScratchGrain);

// We only zero-initialize the allocation when we actually allocate.
// It's the responsibility of the features using scratch_flags,
// namely parallel_reduce and parallel_scan, to reset the used values to 0.
auto memset_event = m_queue->memset(m_scratchFlags, 0,
m_scratchFlagsCount * sizeScratchGrain);
#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
m_queue->ext_oneapi_submit_barrier(std::vector{memset_event});
m_queue->ext_oneapi_submit_barrier(std::vector{memset_event});
#endif
}

return m_scratchFlags;
}
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
}
item.barrier(sycl::access::fence_space::local_space);
if (num_teams_done[0] == n_wgroups) {
if (local_id == 0) *scratch_flags = 0;
if (local_id >= static_cast<int>(n_wgroups))
reducer.init(&local_mem[local_id * value_count]);
else {
Expand Down Expand Up @@ -279,6 +280,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
}
item.barrier(sycl::access::fence_space::local_space);
if (num_teams_done[0] == n_wgroups) {
if (local_id == 0) *scratch_flags = 0;
if (local_id >= static_cast<int>(n_wgroups))
reducer.init(&local_value);
else {
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
}
item.barrier(sycl::access::fence_space::local_space);
if (num_teams_done[0] == n_wgroups) {
if (local_id == 0) *scratch_flags = 0;
if (local_id >= n_wgroups)
reducer.init(&local_mem[local_id * value_count]);
else {
Expand Down Expand Up @@ -219,6 +220,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
}
item.barrier(sycl::access::fence_space::local_space);
if (num_teams_done[0] == n_wgroups) {
if (local_id == 0) *scratch_flags = 0;
if (local_id >= n_wgroups)
reducer.init(&local_value);
else {
Expand Down
2 changes: 2 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
}
sycl::group_barrier(item.get_group());
if (num_teams_done[0] == n_wgroups) {
if (local_id == 0) *scratch_flags = 0;
if (local_id >= n_wgroups)
reducer.init(&local_mem[local_id * value_count]);
else {
Expand Down Expand Up @@ -281,6 +282,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
}
item.barrier(sycl::access::fence_space::local_space);
if (num_teams_done[0] == n_wgroups) {
if (local_id == 0) *scratch_flags = 0;
if (local_id >= n_wgroups)
reducer.init(&local_value);
else {
Expand Down
1 change: 1 addition & 0 deletions core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ class ParallelScanSYCLBase {
}
item.barrier(sycl::access::fence_space::global_space);
if (num_teams_done[0] == n_wgroups) {
if (local_id == 0) *scratch_flags = 0;
value_type total;
reducer.init(&total);

Expand Down

0 comments on commit 650ac40

Please sign in to comment.