Skip to content

Commit

Permalink
[libc++][PSTL] Add a GCD backend
Browse files Browse the repository at this point in the history
Reviewed By: ldionne, #libc

Spies: arichardson, mgrang, krytarowski, libcxx-commits, h-vetinari

Differential Revision: https://reviews.llvm.org/D151717
  • Loading branch information
philnik777 committed Jul 12, 2023
1 parent fb90d5f commit 2b2e7f6
Show file tree
Hide file tree
Showing 18 changed files with 386 additions and 8 deletions.
4 changes: 3 additions & 1 deletion libcxx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -797,9 +797,11 @@ if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "serial")
config_define(1 _LIBCPP_PSTL_CPU_BACKEND_SERIAL)
elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "std_thread")
config_define(1 _LIBCPP_PSTL_CPU_BACKEND_THREAD)
elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
else()
message(FATAL_ERROR "LIBCXX_PSTL_CPU_BACKEND is set to ${LIBCXX_PSTL_CPU_BACKEND}, which is not a valid backend.
Valid backends are: serial, std_thread")
Valid backends are: serial, std_thread and libdispatch")
endif()

if (LIBCXX_ABI_DEFINES)
Expand Down
1 change: 1 addition & 0 deletions libcxx/cmake/caches/Apple.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "")
set(LIBCXX_ENABLE_SHARED ON CACHE BOOL "")
set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "")
set(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS ON CACHE BOOL "")
# Select the libdispatch (GCD) PSTL backend. This file is an initial-cache script (cmake -C), and only CACHE
# entries set by such a script are visible to the project; a plain set() would be dropped and the project's
# default backend would be used instead.
set(LIBCXX_PSTL_CPU_BACKEND libdispatch CACHE STRING "")

set(LIBCXX_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")
set(LIBCXXABI_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ set(files
__algorithm/pstl_backends/cpu_backends/fill.h
__algorithm/pstl_backends/cpu_backends/find_if.h
__algorithm/pstl_backends/cpu_backends/for_each.h
__algorithm/pstl_backends/cpu_backends/libdispatch.h
__algorithm/pstl_backends/cpu_backends/merge.h
__algorithm/pstl_backends/cpu_backends/serial.h
__algorithm/pstl_backends/cpu_backends/stable_sort.h
Expand Down
3 changes: 2 additions & 1 deletion libcxx/include/__algorithm/pstl_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,8 @@ struct __select_backend<std::execution::unsequenced_policy> {
};
# endif

# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) || \
defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
template <>
struct __select_backend<std::execution::parallel_policy> {
using type = __cpu_backend_tag;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
# include <__algorithm/pstl_backends/cpu_backends/serial.h>
#elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
# include <__algorithm/pstl_backends/cpu_backends/thread.h>
#elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
# include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
#else
# error "Invalid CPU backend choice"
#endif
Expand Down
226 changes: 226 additions & 0 deletions libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H

#include <__algorithm/lower_bound.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/construct_at.h>
#include <__memory/unique_ptr.h>
#include <__memory_resource/memory_resource.h>
#include <__numeric/reduce.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/terminate_on_exception.h>
#include <cstddef>
#include <new>
#include <vector>

#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

_LIBCPP_BEGIN_NAMESPACE_STD

namespace __par_backend {
inline namespace __libdispatch {

// Type-erased entry point for chunked execution. Only declared here; the definition lives outside this header and
// is exported from the library ABI.
//
// __chunk_count: number of chunk indices to run.
// __context:     opaque pointer handed back to __func unchanged.
// __func:        callback invoked as __func(__context, __chunk).
//
// NOTE(review): presumably a thin wrapper over ::dispatch_apply that calls __func once for every index in
// [0, __chunk_count) -- confirm against the definition.
//
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

// Convenience overload accepting any callable invocable as `__func(size_t)`.
//
// The callable is passed to the type-erased overload by address (as the context pointer), together with a
// captureless trampoline that casts the context back to _Func and invokes it with the chunk index.
template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  // Captureless, so it converts implicitly to the plain function pointer the type-erased overload expects.
  auto __trampoline = [](void* __context, size_t __chunk) {
    _Func& __callable = *static_cast<_Func*>(__context);
    __callable(__chunk);
  };
  __libdispatch::__dispatch_apply(__chunk_count, &__func, __trampoline);
}

// Describes how a range of elements is split into chunks for parallel execution.
// The users in this header treat chunk 0 as having __first_chunk_size_ elements and every other chunk as having
// __chunk_size_ elements.
struct __chunk_partitions {
ptrdiff_t __chunk_count_; // includes the first chunk
ptrdiff_t __chunk_size_; // number of elements in every chunk except the first
ptrdiff_t __first_chunk_size_; // number of elements in chunk 0
};

// Both functions are only declared here; their definitions are outside this header and exported from the library
// ABI. They are marked [[gnu::const]], i.e. the compiler may assume the result depends only on the arguments.

// Not used anywhere else in this header.
// NOTE(review): presumably the memory resource intended for the backend's temporary allocations -- confirm.
[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI pmr::memory_resource* __get_memory_resource();
// Computes the chunk layout (see __chunk_partitions) used to process __size elements.
[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size);

// Splits [__first, __last) into chunks and calls `__func(chunk_begin, chunk_end)` once per chunk through
// __dispatch_apply.
//
// Chunk 0 covers the first __first_chunk_size_ elements; every later chunk covers __chunk_size_ elements.
template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI void
__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
  const auto __layout = __libdispatch::__partition_chunks(__last - __first);

  auto __run_chunk = [&](size_t __chunk) {
    // Defaults describe chunk 0, which starts at the beginning of the range and has its own size.
    size_t __offset    = 0;
    ptrdiff_t __length = __layout.__first_chunk_size_;

    if (__chunk != 0) {
      // Later chunks are uniformly sized; the second term corrects for the differently sized first chunk.
      __offset = (__chunk * __layout.__chunk_size_) + (__layout.__first_chunk_size_ - __layout.__chunk_size_);
      __length = __layout.__chunk_size_;
    }

    __func(__first + __offset, __first + __offset + __length);
  };

  // Perform the chunked execution.
  __libdispatch::__dispatch_apply(__layout.__chunk_count_, __run_chunk);
}

// One boundary of a chunked merge: a position in each of the two input ranges plus the matching position in the
// output range. __parallel_merge stores a sequence of these and merges the inputs between consecutive boundaries.
template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
struct __merge_range {
__merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
: __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

_RandomAccessIterator1 __mid1_; // boundary position in the first input range
_RandomAccessIterator2 __mid2_; // boundary position in the second input range
_RandomAccessIteratorOut __result_; // output position corresponding to this boundary
};

// Merges the sorted ranges [__first1, __last1) and [__first2, __last2) into __result by cutting both inputs into
// matching chunks and running `__leaf_merge` on each pair of chunks through __dispatch_apply.
//
// __comp:       ordering used both to locate the cut points and by __leaf_merge.
// __leaf_merge: callable invoked as __leaf_merge(first1, last1, first2, last2, result, comp) for one chunk pair.
//
// The larger input is cut into chunks of the sizes given by __partition_chunks; the matching cut in the other input
// is found by binary search. The cuts are chosen so that, for equivalent elements, everything from the first range
// ends up in the same or an earlier chunk than everything from the second range, which keeps the overall merge
// stable as long as __leaf_merge itself is stable.
template <typename _RandomAccessIterator1,
          typename _RandomAccessIterator2,
          typename _RandomAccessIterator3,
          typename _Compare,
          typename _LeafMerge>
_LIBCPP_HIDE_FROM_ABI void __parallel_merge(
    _RandomAccessIterator1 __first1,
    _RandomAccessIterator1 __last1,
    _RandomAccessIterator2 __first2,
    _RandomAccessIterator2 __last2,
    _RandomAccessIterator3 __result,
    _Compare __comp,
    _LeafMerge __leaf_merge) {
  __chunk_partitions __partitions =
      __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

  if (__partitions.__chunk_count_ == 0)
    return;

  // A single chunk needs no cut points: merge everything directly.
  if (__partitions.__chunk_count_ == 1) {
    __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
    return;
  }

  using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;

  // N chunks are delimited by N + 1 boundaries.
  vector<__merge_range_t> __ranges;
  __ranges.reserve(__partitions.__chunk_count_ + 1);

  // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
  std::__terminate_on_exception([&] {
    // Boundary 0: the very beginning of both inputs and of the output.
    __ranges.emplace_back(__first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

    // Advances __first1, __first2 and __result past one more chunk and returns the boundary that ends it.
    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          // Elements of the second range equivalent to the pivot must come after every equivalent element of the
          // first range, so they are left for a later chunk.
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          // The pivot comes from the second range and is merged in this chunk, so every element of the first range
          // that is equivalent to it has to be merged in this chunk too; otherwise it would be written after the
          // pivot and the merge would not be stable. Hence upper_bound rather than lower_bound.
          auto __m1 = std::upper_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };

    // handle first chunk
    __ranges.emplace_back(__compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      __ranges.emplace_back(__compute_chunk(__partitions.__chunk_size_));

    // handle last chunk: it simply runs to the end of both inputs
    __ranges.emplace_back(__last1, __last2, __result);

    // Chunk i merges the inputs between boundaries i and i + 1 into the output position stored in boundary i.
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters  = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });
  });
}

// Chunked transform-reduce over [__first, __last).
//
// __transform: called with an iterator (not a dereferenced element) and returns the transformed value.
// __init:      initial value, combined with the per-chunk results at the end.
// __combiner:  binary operation used to combine two values.
// __reduction: called as __reduction(chunk_first, chunk_last, chunk_init) to reduce the remainder of one chunk.
//
// Returns __init combined with the transformed elements; for an empty range that is __init itself.
template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
_LIBCPP_HIDE_FROM_ABI _Value __parallel_transform_reduce(
    _RandomAccessIterator __first,
    _RandomAccessIterator __last,
    _Transform __transform,
    _Value __init,
    _Combiner __combiner,
    _Reduction __reduction) {
  // Nothing to transform. This must be handled before chunking: the per-chunk code below unconditionally reads the
  // first element (and, for any chunk size other than 1, the second element) of its chunk, which would be out of
  // bounds for an empty chunk.
  if (__first == __last)
    return __init;

  auto __partitions = __libdispatch::__partition_chunks(__last - __first);

  // Destroys and releases the per-chunk results. All __count slots are constructed by the time this runs, because
  // every chunk constructs its own slot and __dispatch_apply is noexcept.
  auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
    std::destroy_n(__ptr, __count);
    std::allocator<_Value>().deallocate(__ptr, __count);
  };

  // TODO: use __uninitialized_buffer
  // TODO: allocate one element per worker instead of one element per chunk
  // Raw storage with one slot per chunk; slots are constructed in place so _Value need not be default constructible.
  unique_ptr<_Value[], decltype(__destroy)> __values(
      std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

  // __dispatch_apply is noexcept
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    if (__this_chunk_size != 1) {
      // Seed the chunk's reduction with its first two transformed elements, so no identity value is required, then
      // reduce the rest of the chunk onto that seed.
      std::__construct_at(
          __values.get() + __chunk,
          __reduction(__first + __index + 2,
                      __first + __index + __this_chunk_size,
                      __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
    } else {
      // A single-element chunk has nothing to combine.
      std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
    }
  });

  // Fold the per-chunk results into __init.
  return std::__terminate_on_exception([&] {
    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  });
}

// Stable sort entry point of this backend. No chunking is done here yet: the whole range [__first, __last) is
// handed to __leaf_sort in a single call with the comparator __comp.
// TODO: parallelize this
template <class _RandomAccessIterator, class _Comp, class _LeafSort>
_LIBCPP_HIDE_FROM_ABI void __parallel_stable_sort(
_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
__leaf_sort(__first, __last, __comp);
}

// Cancellation hook of this backend; intentionally a no-op here (the body is empty).
_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}

} // namespace __libdispatch
} // namespace __par_backend

_LIBCPP_END_NAMESPACE_STD

#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
[__transform](_ForwardIterator __iter) { return __transform(*__iter); },
std::move(__init),
__reduce,
[=](_ForwardIterator __brick_first, _ForwardIterator __brick_last, _Tp __brick_init) {
[__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{},
std::move(__brick_first),
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/__config_site.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
// PSTL backends
#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH

// Hardening.
#cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT
Expand Down
3 changes: 2 additions & 1 deletion libcxx/include/__numeric/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <__config>
#include <__functional/operations.h>
#include <__iterator/iterator_traits.h>
#include <__utility/move.h>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
Expand All @@ -25,7 +26,7 @@ template <class _InputIterator, class _Tp, class _BinaryOp>
_LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp reduce(_InputIterator __first, _InputIterator __last,
_Tp __init, _BinaryOp __b) {
for (; __first != __last; ++__first)
__init = __b(__init, *__first);
__init = __b(std::move(__init), *__first);
return __init;
}

Expand Down
1 change: 1 addition & 0 deletions libcxx/include/__utility/terminate_on_exception.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <__config>
#include <__exception/terminate.h>
#include <new>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
Expand Down
3 changes: 3 additions & 0 deletions libcxx/include/module.modulemap.in
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,9 @@ module std [system] {
module pstl_backends_cpu_backends_for_each {
private header "__algorithm/pstl_backends/cpu_backends/for_each.h"
}
// Private submodule for the libdispatch CPU backend header of the parallel algorithms.
module pstl_backends_cpu_backends_libdispatch {
private header "__algorithm/pstl_backends/cpu_backends/libdispatch.h"
}
module pstl_backends_cpu_backends_merge {
private header "__algorithm/pstl_backends/cpu_backends/merge.h"
}
Expand Down
4 changes: 4 additions & 0 deletions libcxx/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,10 @@ set(LIBCXX_EXPERIMENTAL_SOURCES
experimental/memory_resource.cpp
)

# Compile pstl/libdispatch.cpp into the experimental library only when the libdispatch PSTL backend is selected.
if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
  list(APPEND LIBCXX_EXPERIMENTAL_SOURCES
    pstl/libdispatch.cpp
    )
endif()

add_library(cxx_experimental STATIC ${LIBCXX_EXPERIMENTAL_SOURCES})
target_link_libraries(cxx_experimental PUBLIC cxx-headers)
if (LIBCXX_ENABLE_SHARED)
Expand Down
Loading

0 comments on commit 2b2e7f6

Please sign in to comment.