Skip to content

Commit

Permalink
add more warp sync for cuda reductions
Browse files Browse the repository at this point in the history
  • Loading branch information
tcclevenger committed Jan 24, 2024
1 parent e1415f8 commit 57126af
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 0 deletions.
5 changes: 5 additions & 0 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,11 @@ class ParallelReduce<CombinedFunctorReducerType,

if (CudaTraits::WarpSize < word_count.value) {
__syncthreads();
} else {
// In the above call to final(), shared might have been updated by a
// single thread within a warp without synchronization. Synchronize the
// threads within the warp to avoid a potential race condition.
__syncwarp(0xffffffff);
}

for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
Expand Down
5 changes: 5 additions & 0 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,11 @@ class ParallelReduce<CombinedFunctorReducerType,

if (CudaTraits::WarpSize < word_count.value) {
__syncthreads();
} else {
// In the above call to final(), shared might have been updated by a
// single thread within a warp without synchronization. Synchronize the
// threads within the warp to avoid a potential race condition.
__syncwarp(0xffffffff);
}

for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
Expand Down

0 comments on commit 57126af

Please sign in to comment.