From 39079046e525f36fc357737a12677903fbdae8a5 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 10 Jul 2024 18:17:13 +0200 Subject: [PATCH] Revert "[oneDPL] Remove local (in-group) atomic usage from __parallel_find_or (#1668)" This reverts commit a1798c3bb598b13b62f01a5408838f0d2096f28e. --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 130 +++++++----------- .../oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h | 5 - 2 files changed, 50 insertions(+), 85 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ea3eb27ced5..24ec9f57462 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -965,8 +965,6 @@ struct __parallel_find_forward_tag using _AtomicType = oneapi::dpl::__internal::__difference_t<_RangeType>; #endif - using _LocalResultsReduceOp = __dpl_sycl::__minimum<_AtomicType>; - // The template parameter is intended to unify __init_value in tags. template constexpr static _AtomicType @@ -975,21 +973,12 @@ struct __parallel_find_forward_tag return __val; } - // As far as we make search from begin to the end of data, we should save the first (minimal) found state - // in the __save_state_to (local state) / __save_state_to_atomic (global state) methods. - - template + template static void - __save_state_to_atomic(__dpl_sycl::__atomic_ref<_AtomicType, _Space>& __atomic, _AtomicType __new_state) + __save_state_to_atomic(_TAtomic& __found, _AtomicType __new_state) { - __atomic.fetch_min(__new_state); - } - - template - static void - __save_state_to(_TFoundState& __found, _AtomicType __new_state) - { - __found = std::min(__found, __new_state); + // As far as we make search from begin to the end of data, we should save the first (minimal) found state. + __found.fetch_min(__new_state); } }; @@ -1004,29 +993,18 @@ struct __parallel_find_backward_tag using _AtomicType = oneapi::dpl::__internal::__difference_t<_RangeType>; #endif - using _LocalResultsReduceOp = __dpl_sycl::__maximum<_AtomicType>; - template constexpr static _AtomicType __init_value(_DiffType) { return _AtomicType{-1}; } - // As far as we make search from end to the begin of data, we should save the last (maximal) found state - // in the __save_state_to (local state) / __save_state_to_atomic (global state) methods. - - template + template static void - __save_state_to_atomic(__dpl_sycl::__atomic_ref<_AtomicType, _Space>& __atomic, _AtomicType __new_state) + __save_state_to_atomic(_TAtomic& __found, _AtomicType __new_state) { - __atomic.fetch_max(__new_state); - } - - template - static void - __save_state_to(_TFoundState& __found, _AtomicType __new_state) - { - __found = std::max(__found, __new_state); + // As far as we make search from end to the begin of data, we should save the last (maximal) found state. + __found.fetch_max(__new_state); } }; @@ -1035,8 +1013,6 @@ struct __parallel_or_tag { using _AtomicType = int32_t; - using _LocalResultsReduceOp = __dpl_sycl::__bit_or<_AtomicType>; - // The template parameter is intended to unify __init_value in tags. template constexpr static _AtomicType __init_value(_DiffType) @@ -1044,22 +1020,12 @@ struct __parallel_or_tag return 0; } - // Store that a match was found. Its position is not relevant for or semantics - // in the __save_state_to (local state) / __save_state_to_atomic (global state) methods. - static constexpr _AtomicType __found_state = 1; - - template + template static void - __save_state_to_atomic(__dpl_sycl::__atomic_ref<_AtomicType, _Space>& __atomic, _AtomicType /*__new_state*/) + __save_state_to_atomic(_TAtomic& __found, _AtomicType /*__new_state*/) { - __atomic.store(__found_state); - } - - template - static void - __save_state_to(_TFoundState& __found, _AtomicType /*__new_state*/) - { - __found = __found_state; + // Store that a match was found. Its position is not relevant for or semantics. + __found.store(1); } }; @@ -1086,11 +1052,11 @@ struct __early_exit_find_or { _Pred __pred; - template void operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, - _LocalFoundState& __found_local, _BrickTag __brick_tag, _Ranges&&... __rngs) const + _LocalAtomic& __found_local, _BrickTag __brick_tag, _Ranges&&... __rngs) const { // There are 3 possible tag types here: // - __parallel_find_forward_tag : in case when we find the first value in the data; @@ -1115,22 +1081,27 @@ struct __early_exit_find_or bool __something_was_found = false; for (_IterSize __i = 0; !__something_was_found && __i < __n_iter; ++__i) { + //in case of find-semantic __shifted_idx must be the same type as the atomic for a correct comparison + using _ShiftedIdxType = ::std::conditional_t<_OrTagType::value, decltype(__init_index + __i * __shift), + decltype(__found_local.load())>; + _IterSize __current_iter = __i; if constexpr (__is_backward_tag(__brick_tag)) __current_iter = __n_iter - 1 - __i; - const auto __shifted_idx = __init_index + __current_iter * __shift; - + _ShiftedIdxType __shifted_idx = __init_index + __current_iter * __shift; + // TODO:[Performance] the issue with atomic load (in comparison with __shifted_idx for early exit) + // should be investigated later, with other HW if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) { - // Update local found state - _BrickTag::__save_state_to(__found_local, __shifted_idx); + // Update local (for group) atomic state with the found index + _BrickTag::__save_state_to_atomic(__found_local, __shifted_idx); // This break is mandatory from the performance point of view. // This break is safe for all our cases: // 1) __parallel_find_forward_tag : when we search for the first matching data entry, we process data from start to end (forward direction). // This means that after first found entry there is no reason to process data anymore. - // 2) __parallel_find_backward_tag : when we search for the last matching data entry, we process data from end to start (backward direction). + // 2) __parallel_find_backward_tag : when we search for the last matching data entry, we process data from end to start (backward direction). // This means that after the first found entry there is no reason to process data anymore too. // 3) __parallel_or_tag : when we search for any matching data entry, we process data from start to end (forward direction). // This means that after the first found entry there is no reason to process data anymore too. @@ -1168,20 +1139,15 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli assert(__rng_n > 0); // TODO: find a way to generalize getting of reliable work-group size - std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + auto __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec); #if _ONEDPL_COMPILE_KERNEL auto __kernel = __internal::__kernel_compiler<_FindOrKernel>::__compile(__exec); __wgroup_size = ::std::min(__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel)); #endif - -#if _ONEDPL_FPGA_EMU - // Limit the maximum work-group size to minimize the cost of work-group reduction. - // Limiting this also helps to avoid huge work-group sizes on some devices (e.g., FPGU emulation). - __wgroup_size = std::min(__wgroup_size, (std::size_t)2048); -#endif auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); auto __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size); + // TODO: try to change __n_groups with another formula for more perfect load balancing __n_groups = ::std::min(__n_groups, decltype(__n_groups)(__max_cu)); auto __n_iter = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); @@ -1195,13 +1161,15 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli // scope is to copy data back to __result after destruction of temporary sycl:buffer { - sycl::buffer<_AtomicType, 1> __result_sycl_buf(&__result, 1); // temporary storage for global atomic + auto __temp = sycl::buffer<_AtomicType, 1>(&__result, 1); // temporary storage for global atomic // main parallel_for __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - auto __result_sycl_buf_acc = __result_sycl_buf.template get_access(__cgh); + auto __temp_acc = __temp.template get_access(__cgh); + // create local accessor to connect atomic with + __dpl_sycl::__local_accessor<_AtomicType> __temp_local(1, __cgh); #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT __cgh.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif @@ -1214,27 +1182,29 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli [=](sycl::nd_item __item_id) { auto __local_idx = __item_id.get_local_id(0); - // 1. Set initial value to local found state - _AtomicType __found_local = __init_value; + __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( + *__dpl_sycl::__get_accessor_ptr(__temp_acc)); + __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::local_space> __found_local( + *__dpl_sycl::__get_accessor_ptr(__temp_local)); - // 2. Find any element that satisfies pred - __pred(__item_id, __n_iter, __wgroup_size, __found_local, __brick_tag, __rngs...); + // 1. Set initial value to local atomic + if (__local_idx == 0) + __found_local.store(__init_value); + __dpl_sycl::__group_barrier(__item_id); - // 3. Reduce over group: find __dpl_sycl::__minimum (for the __parallel_find_forward_tag), - // find __dpl_sycl::__maximum (for the __parallel_find_backward_tag) - // or update state with __dpl_sycl::__bit_or (for the __parallel_or_tag) - // inside all our group items - __found_local = __dpl_sycl::__reduce_over_group(__item_id.get_group(), __found_local, - typename _BrickTag::_LocalResultsReduceOp{}); + // 2. Find any element that satisfies pred and set local atomic value to global atomic + __pred(__item_id, __n_iter, __wgroup_size, __found_local, __brick_tag, __rngs...); + __dpl_sycl::__group_barrier(__item_id); - // Set local found state value value to global atomic - if (__local_idx == 0 && __found_local != __init_value) + // Set local atomic value to global atomic + if (__local_idx == 0) { - __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( - *__dpl_sycl::__get_accessor_ptr(__result_sycl_buf_acc)); - - // Update global (for all groups) atomic state with the found index - _BrickTag::__save_state_to_atomic(__found, __found_local); + const auto __found_local_state = __found_local.load(); + if (__found_local_state != __init_value) + { + // Update global (for all groups) atomic state with the found index + _BrickTag::__save_state_to_atomic(__found, __found_local_state); + } } }); }); @@ -1242,7 +1212,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli } if constexpr (__or_tag_check) - return __result != __init_value; + return __result; else return __result != __init_value ? __result : __rng_n; } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h index 043338e096c..a21d5d7a1ec 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h @@ -130,8 +130,6 @@ using __maximum = sycl::maximum<_T>; template using __minimum = sycl::minimum<_T>; -template -using __bit_or = sycl::bit_or<_T>; #else // _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT template using __plus = sycl::ONEAPI::plus<_T>; @@ -141,9 +139,6 @@ using __maximum = sycl::ONEAPI::maximum<_T>; template using __minimum = sycl::ONEAPI::minimum<_T>; - -template -using __bit_or = sycl::ONEAPI::bit_or<_T>; #endif // _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT template