Skip to content

Commit

Permalink
slim API and move code to impl
Browse files Browse the repository at this point in the history
  • Loading branch information
fnrizzi committed Jun 23, 2023
1 parent 5b6cb80 commit afc873b
Show file tree
Hide file tree
Showing 2 changed files with 191 additions and 161 deletions.
175 changes: 14 additions & 161 deletions algorithms/src/sorting/Kokkos_SortPublicAPI.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,57 +18,10 @@
#define KOKKOS_SORT_PUBLIC_API_HPP_

#include "./impl/Kokkos_SortImpl.hpp"
#include "Kokkos_BinOpsPublicAPI.hpp"
#include "Kokkos_BinSortPublicAPI.hpp"
#include <std_algorithms/Kokkos_BeginEnd.hpp>
#include <Kokkos_Core.hpp>
#include <algorithm>

#if defined(KOKKOS_ENABLE_CUDA)

// Workaround for `Instruction 'shfl' without '.sync' is not supported on
// .target sm_70 and higher from PTX ISA version 6.4`.
// Also see https://github.com/NVIDIA/cub/pull/170.
#if !defined(CUB_USE_COOPERATIVE_GROUPS)
#define CUB_USE_COOPERATIVE_GROUPS
#endif

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"

#if defined(KOKKOS_COMPILER_CLANG)
// Some versions of Clang fail to compile Thrust, failing with errors like
// this:
// <snip>/thrust/system/cuda/detail/core/agent_launcher.h:557:11:
// error: use of undeclared identifier 'va_printf'
// The exact combination of versions for Clang and Thrust (or CUDA) for this
// failure was not investigated, however even very recent version combination
// (Clang 10.0.0 and Cuda 10.0) demonstrated failure.
//
// Defining _CubLog here locally allows us to avoid that code path, however
// disabling some debugging diagnostics
#pragma push_macro("_CubLog")
#ifdef _CubLog
#undef _CubLog
#endif
#define _CubLog
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#pragma pop_macro("_CubLog")
#else
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#endif

#pragma GCC diagnostic pop

#endif

#if defined(KOKKOS_ENABLE_ONEDPL)
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#endif

namespace Kokkos {

// ---------------------------------------------------------------
Expand All @@ -77,130 +30,30 @@ namespace Kokkos {

// clang-format off
template <class ExecutionSpace, class DataType, class... Properties>
std::enable_if_t<
(Kokkos::is_execution_space<ExecutionSpace>::value) &&
(!SpaceAccessibility<
HostSpace, typename Kokkos::View<DataType, Properties...>::memory_space
>::accessible)
>
// clang-format on
sort(const ExecutionSpace& exec,
const Kokkos::View<DataType, Properties...>& view) {
// Although we are using BinSort below, which could work on rank-2 views,
// for now view must be rank-1 because the Impl::min_max_functor
// used below only works for rank-1 views
using ViewType = Kokkos::View<DataType, Properties...>;
static_assert(ViewType::rank == 1,
"Kokkos::sort: currently only supports rank-1 Views.");

if (view.extent(0) == 0) {
return;
}

Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
parallel_reduce("Kokkos::Sort::FindExtent",
Kokkos::RangePolicy<typename ViewType::execution_space>(
exec, 0, view.extent(0)),
Impl::min_max_functor<ViewType>(view), reducer);
if (result.min_val == result.max_val) return;
// For integral types the number of bins may be larger than the range
// in which case we can exactly have one unique value per bin
// and then don't need to sort bins.
bool sort_in_bins = true;
// TODO: figure out better max_bins then this ...
int64_t max_bins = view.extent(0) / 2;
if (std::is_integral<typename ViewType::non_const_value_type>::value) {
// Cast to double to avoid possible overflow when using integer
auto const max_val = static_cast<double>(result.max_val);
auto const min_val = static_cast<double>(result.min_val);
// using 10M as the cutoff for special behavior (roughly 40MB for the count
// array)
if ((max_val - min_val) < 10000000) {
max_bins = max_val - min_val + 1;
sort_in_bins = false;
}
}
if (std::is_floating_point<typename ViewType::non_const_value_type>::value) {
KOKKOS_ASSERT(std::isfinite(static_cast<double>(result.max_val) -
static_cast<double>(result.min_val)));
}

using CompType = BinOp1D<ViewType>;
BinSort<ViewType, CompType> bin_sort(
view, CompType(max_bins, result.min_val, result.max_val), sort_in_bins);
bin_sort.create_permute_vector(exec);
bin_sort.sort(exec, view);
}

#if defined(KOKKOS_ENABLE_ONEDPL)
template <class DataType, class... Properties>
void sort(const Experimental::SYCL& space,
void sort([[maybe_unused]] const ExecutionSpace& exec,
const Kokkos::View<DataType, Properties...>& view) {
// constraints
using ViewType = Kokkos::View<DataType, Properties...>;
static_assert(SpaceAccessibility<Experimental::SYCL,
typename ViewType::memory_space>::accessible,
"SYCL execution space is not able to access the memory space "
"of the View argument!");

// Can't use Experimental::begin/end here since the oneDPL then assumes that
// the data is on the host.
static_assert(
ViewType::rank == 1 &&
(std::is_same<typename ViewType::array_layout, LayoutRight>::value ||
std::is_same<typename ViewType::array_layout, LayoutLeft>::value),
"SYCL sort only supports contiguous rank-1 Views.");

if (view.extent(0) == 0) {
return;
}

auto queue = space.sycl_queue();
auto policy = oneapi::dpl::execution::make_device_policy(queue);
const int n = view.extent(0);
oneapi::dpl::sort(policy, view.data(), view.data() + n);
}
#endif

// clang-format off
template <class ExecutionSpace, class DataType, class... Properties>
std::enable_if_t<
(Kokkos::is_execution_space<ExecutionSpace>::value) &&
(SpaceAccessibility<
HostSpace, typename Kokkos::View<DataType, Properties...>::memory_space
>::accessible)
>
// clang-format on
sort(const ExecutionSpace&, const Kokkos::View<DataType, Properties...>& view) {
using ViewType = Kokkos::View<DataType, Properties...>;
using MemSpace = typename ViewType::memory_space;
static_assert(ViewType::rank == 1,
"Kokkos::sort: currently only supports rank-1 Views.");
static_assert(SpaceAccessibility<ExecutionSpace, MemSpace>::accessible,
"Kokkos::sort: execution space instance is not able to access "
"the memory space of the "
"View argument!");

if (view.extent(0) == 0) {
if (view.extent(0) <= 1) {
return;
}
auto first = Experimental::begin(view);
auto last = Experimental::end(view);
std::sort(first, last);
}

#if defined(KOKKOS_ENABLE_CUDA)
template <class DataType, class... Properties>
void sort(const Cuda& space,
const Kokkos::View<DataType, Properties...>& view) {
using ViewType = Kokkos::View<DataType, Properties...>;
static_assert(ViewType::rank == 1,
"Kokkos::sort: currently only supports rank-1 Views.");

if (view.extent(0) == 0) {
return;
if constexpr (SpaceAccessibility<HostSpace, MemSpace>::accessible) {
auto first = ::Kokkos::Experimental::begin(view);
auto last = ::Kokkos::Experimental::end(view);
std::sort(first, last);
} else {
Impl::sort_device_view_without_comparator(exec, view);
}
const auto exec = thrust::cuda::par.on(space.cuda_stream());
auto first = Experimental::begin(view);
auto last = Experimental::end(view);
thrust::sort(exec, first, last);
}
#endif

template <class DataType, class... Properties>
void sort(const Kokkos::View<DataType, Properties...>& view) {
Expand Down
177 changes: 177 additions & 0 deletions algorithms/src/sorting/impl/Kokkos_SortImpl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,56 @@
#ifndef KOKKOS_SORT_FREE_FUNCS_IMPL_HPP_
#define KOKKOS_SORT_FREE_FUNCS_IMPL_HPP_

#include "../Kokkos_BinOpsPublicAPI.hpp"
#include "../Kokkos_BinSortPublicAPI.hpp"
#include <std_algorithms/Kokkos_BeginEnd.hpp>
#include <Kokkos_Core.hpp>

#if defined(KOKKOS_ENABLE_CUDA)

// Workaround for `Instruction 'shfl' without '.sync' is not supported on
// .target sm_70 and higher from PTX ISA version 6.4`.
// Also see https://github.com/NVIDIA/cub/pull/170.
#if !defined(CUB_USE_COOPERATIVE_GROUPS)
#define CUB_USE_COOPERATIVE_GROUPS
#endif

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"

#if defined(KOKKOS_COMPILER_CLANG)
// Some versions of Clang fail to compile Thrust, failing with errors like
// this:
// <snip>/thrust/system/cuda/detail/core/agent_launcher.h:557:11:
// error: use of undeclared identifier 'va_printf'
// The exact combination of versions for Clang and Thrust (or CUDA) for this
// failure was not investigated, however even very recent version combination
// (Clang 10.0.0 and Cuda 10.0) demonstrated failure.
//
// Defining _CubLog here locally allows us to avoid that code path, however
// disabling some debugging diagnostics
#pragma push_macro("_CubLog")
#ifdef _CubLog
#undef _CubLog
#endif
#define _CubLog
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#pragma pop_macro("_CubLog")
#else
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#endif

#pragma GCC diagnostic pop

#endif

#if defined(KOKKOS_ENABLE_ONEDPL)
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#endif

namespace Kokkos {
namespace Impl {

Expand All @@ -37,6 +85,135 @@ struct min_max_functor {
}
};

template <class ExecutionSpace, class DataType, class... Properties>
void sort_via_binsort(const ExecutionSpace& exec,
const Kokkos::View<DataType, Properties...>& view) {
// Although we are using BinSort below, which could work on rank-2 views,
// for now view must be rank-1 because the min_max_functor
// used below only works for rank-1 views
using ViewType = Kokkos::View<DataType, Properties...>;
static_assert(ViewType::rank == 1,
"Kokkos::sort: currently only supports rank-1 Views.");

if (view.extent(0) == 0) {
return;
}

Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
parallel_reduce("Kokkos::Sort::FindExtent",
Kokkos::RangePolicy<typename ViewType::execution_space>(
exec, 0, view.extent(0)),
min_max_functor<ViewType>(view), reducer);
if (result.min_val == result.max_val) return;
// For integral types the number of bins may be larger than the range
// in which case we can exactly have one unique value per bin
// and then don't need to sort bins.
bool sort_in_bins = true;
// TODO: figure out better max_bins then this ...
int64_t max_bins = view.extent(0) / 2;
if (std::is_integral<typename ViewType::non_const_value_type>::value) {
// Cast to double to avoid possible overflow when using integer
auto const max_val = static_cast<double>(result.max_val);
auto const min_val = static_cast<double>(result.min_val);
// using 10M as the cutoff for special behavior (roughly 40MB for the count
// array)
if ((max_val - min_val) < 10000000) {
max_bins = max_val - min_val + 1;
sort_in_bins = false;
}
}
if (std::is_floating_point<typename ViewType::non_const_value_type>::value) {
KOKKOS_ASSERT(std::isfinite(static_cast<double>(result.max_val) -
static_cast<double>(result.min_val)));
}

using CompType = BinOp1D<ViewType>;
BinSort<ViewType, CompType> bin_sort(
view, CompType(max_bins, result.min_val, result.max_val), sort_in_bins);
bin_sort.create_permute_vector(exec);
bin_sort.sort(exec, view);
}

#if defined(KOKKOS_ENABLE_CUDA)
template <class DataType, class... Properties>
void sort_cudathrust(const Cuda& space,
const Kokkos::View<DataType, Properties...>& view) {
using ViewType = Kokkos::View<DataType, Properties...>;
static_assert(ViewType::rank == 1,
"Kokkos::sort: currently only supports rank-1 Views.");

if (view.extent(0) == 0) {
return;
}
const auto exec = thrust::cuda::par.on(space.cuda_stream());
auto first = ::Kokkos::Experimental::begin(view);
auto last = ::Kokkos::Experimental::end(view);
thrust::sort(exec, first, last);
}
#endif

#if defined(KOKKOS_ENABLE_ONEDPL)
template <class DataType, class... Properties>
void sort_onedpl(const Experimental::SYCL& space,
const Kokkos::View<DataType, Properties...>& view) {
using ViewType = Kokkos::View<DataType, Properties...>;
static_assert(SpaceAccessibility<Experimental::SYCL,
typename ViewType::memory_space>::accessible,
"SYCL execution space is not able to access the memory space "
"of the View argument!");

// Can't use Experimental::begin/end here since the oneDPL then assumes that
// the data is on the host.
static_assert(
ViewType::rank == 1 &&
(std::is_same<typename ViewType::array_layout, LayoutRight>::value ||
std::is_same<typename ViewType::array_layout, LayoutLeft>::value),
"SYCL sort only supports contiguous rank-1 Views.");

if (view.extent(0) == 0) {
return;
}

auto queue = space.sycl_queue();
auto policy = oneapi::dpl::execution::make_device_policy(queue);
const int n = view.extent(0);
oneapi::dpl::sort(policy, view.data(), view.data() + n);
}
#endif

// --------------------------------------------------
//
// specialize cases for sorting without comparator
//
// --------------------------------------------------

#if defined(KOKKOS_ENABLE_CUDA)
template <class DataType, class... Properties>
void sort_device_view_without_comparator(
const Cuda& exec, const Kokkos::View<DataType, Properties...>& view) {
sort_cudathrust(exec, view);
}
#endif

#if defined(KOKKOS_ENABLE_ONEDPL)
template <class DataType, class... Properties>
void sort_device_view_without_comparator(
const Experimental::SYCL& exec,
const Kokkos::View<DataType, Properties...>& view) {
sort_onedpl(exec, view);
}
#endif

// fallback case
template <class ExecutionSpace, class DataType, class... Properties>
std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>
sort_device_view_without_comparator(
const ExecutionSpace& exec,
const Kokkos::View<DataType, Properties...>& view) {
sort_via_binsort(exec, view);
}

} // namespace Impl
} // namespace Kokkos
#endif

0 comments on commit afc873b

Please sign in to comment.