Skip to content

Commit

Permalink
SIMD: Add abs() for all int types (kokkos#6069)
Browse files Browse the repository at this point in the history
* Added abs() for all simd types except for avx2 int64_t

* Fixed abs() for AVX2 int64_t

* Added abs() test for device side

* Changed from using Kokkos::abs usage to manual impl

* Removed [[nodiscard]] from abs() in AVX2

* Added a comment about _mm256_abs_epi64 being not available in AVX2

* Converted scalar abs() to use Kokkos::abs() to avoid spurious warnings from nvc++

* Update simd/src/Kokkos_SIMD_Scalar.hpp based on a review

Co-authored-by: Damien L-G <dalg24+github@gmail.com>

* Fix scalar abs()

---------

Co-authored-by: Damien L-G <dalg24+github@gmail.com>
  • Loading branch information
ldh4 and dalg24 committed Jul 5, 2023
1 parent 07a0f0c commit 35c1e3c
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 40 deletions.
22 changes: 22 additions & 0 deletions simd/src/Kokkos_SIMD_AVX2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,13 @@ class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> {
_mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int32_t, simd_abi::avx2_fixed_size<4>> abs(
simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& a) {
__m128i const rhs = static_cast<__m128i>(a);
return simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(_mm_abs_epi32(rhs));
}

[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int32_t, simd_abi::avx2_fixed_size<4>>
condition(simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down Expand Up @@ -784,6 +791,15 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> {
_mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
}

// Manually computing absolute values, because _mm256_abs_epi64
// is not in AVX2; it's available in AVX512.
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::avx2_fixed_size<4>> abs(
simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(
[&](std::size_t i) { return (a[i] < 0) ? -a[i] : a[i]; });
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::avx2_fixed_size<4>> condition(
simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down Expand Up @@ -909,6 +925,12 @@ simd<std::int64_t, simd_abi::avx2_fixed_size<4>>::simd(
_mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> abs(
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& a) {
return a;
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> condition(
simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down
32 changes: 29 additions & 3 deletions simd/src/Kokkos_SIMD_AVX512.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,14 @@ class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> {
return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(0) - a;
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int32_t, simd_abi::avx512_fixed_size<8>> abs(
simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& a) {
__m256i const rhs = static_cast<__m256i>(a);
return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
_mm256_abs_epi32(rhs));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int32_t, simd_abi::avx512_fixed_size<8>> condition(
simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>> const& a,
Expand Down Expand Up @@ -341,6 +349,12 @@ class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> {
_mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> abs(
simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& a) {
return a;
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> condition(
simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& a,
Expand Down Expand Up @@ -470,6 +484,14 @@ class simd<std::int64_t, simd_abi::avx512_fixed_size<8>> {
return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(0) - a;
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::avx512_fixed_size<8>> abs(
simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& a) {
__m512i const rhs = static_cast<__m512i>(a);
return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
_mm512_abs_epi64(rhs));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::avx512_fixed_size<8>> condition(
simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>> const& a,
Expand Down Expand Up @@ -605,6 +627,12 @@ class simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> {
_mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> abs(
simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& a) {
return a;
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> condition(
simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& a,
Expand Down Expand Up @@ -751,9 +779,7 @@ KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx512_fixed_size<8>> abs(
simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
__m512d const rhs = static_cast<__m512d>(a);
return simd<double, simd_abi::avx512_fixed_size<8>>(reinterpret_cast<__m512d>(
_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
reinterpret_cast<__m512i>(rhs))));
return simd<double, simd_abi::avx512_fixed_size<8>>(_mm512_abs_pd(rhs));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
Expand Down
20 changes: 20 additions & 0 deletions simd/src/Kokkos_SIMD_NEON.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,13 @@ class simd<std::int32_t, simd_abi::neon_fixed_size<2>> {
vadd_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int32_t, simd_abi::neon_fixed_size<2>> abs(
simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& a) {
return simd<std::int32_t, simd_abi::neon_fixed_size<2>>(
vabs_s32(static_cast<int32x2_t>(a)));
}

[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int32_t, simd_abi::neon_fixed_size<2>>
condition(simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>> const& a,
Expand Down Expand Up @@ -719,6 +726,13 @@ class simd<std::int64_t, simd_abi::neon_fixed_size<2>> {
vaddq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::neon_fixed_size<2>> abs(
simd<std::int64_t, simd_abi::neon_fixed_size<2>> const& a) {
return simd<std::int64_t, simd_abi::neon_fixed_size<2>>(
vabsq_s64(static_cast<int64x2_t>(a)));
}

[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::neon_fixed_size<2>>
condition(simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>> const& a,
Expand Down Expand Up @@ -859,6 +873,12 @@ simd<std::int64_t, simd_abi::neon_fixed_size<2>>::simd(
simd<std::uint64_t, simd_abi::neon_fixed_size<2>> const& other)
: m_value(vreinterpretq_s64_u64(static_cast<uint64x2_t>(other))) {}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::neon_fixed_size<2>> abs(
simd<std::uint64_t, simd_abi::neon_fixed_size<2>> const& a) {
return a;
}

[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::neon_fixed_size<2>>
condition(simd_mask<std::uint64_t, simd_abi::neon_fixed_size<2>> const& a,
Expand Down
5 changes: 4 additions & 1 deletion simd/src/Kokkos_SIMD_Scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,10 @@ template <class T>
template <class T>
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> abs(
simd<T, simd_abi::scalar> const& a) {
return simd<T, simd_abi::scalar>(std::abs(static_cast<T>(a)));
if constexpr (std::is_signed_v<T>) {
return (a < 0 ? -a : a);
}
return a;
}

template <class T>
Expand Down
132 changes: 96 additions & 36 deletions simd/unit_tests/TestSIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,8 @@ class load_as_scalars {
};

template <class Abi, class Loader, class BinaryOp, class T>
void host_check_binary_op_one_loader(BinaryOp binary_op, std::size_t n,
T const* first_args,
T const* second_args) {
void host_check_math_op_one_loader(BinaryOp binary_op, std::size_t n,
T const* first_args, T const* second_args) {
Loader loader;
using simd_type = Kokkos::Experimental::simd<T, Abi>;
std::size_t constexpr width = simd_type::size();
Expand All @@ -180,8 +179,29 @@ void host_check_binary_op_one_loader(BinaryOp binary_op, std::size_t n,
}
}

template <class Abi, class Loader, class UnaryOp, class T>
void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n,
T const* args) {
Loader loader;
using simd_type = Kokkos::Experimental::simd<T, Abi>;
std::size_t constexpr width = simd_type::size();
for (std::size_t i = 0; i < n; i += width) {
std::size_t const nremaining = n - i;
std::size_t const nlanes = Kokkos::min(nremaining, width);
simd_type arg;
bool const loaded_arg = loader.host_load(args + i, nlanes, arg);
if (!loaded_arg) continue;
simd_type expected_result;
for (std::size_t lane = 0; lane < nlanes; ++lane) {
expected_result[lane] = unary_op.on_host_serial(arg[lane]);
}
simd_type const computed_result = unary_op.on_host(arg);
host_check_equality(expected_result, computed_result, nlanes);
}
}

template <class Abi, class Loader, class BinaryOp, class T>
KOKKOS_INLINE_FUNCTION void device_check_binary_op_one_loader(
KOKKOS_INLINE_FUNCTION void device_check_math_op_one_loader(
BinaryOp binary_op, std::size_t n, T const* first_args,
T const* second_args) {
Loader loader;
Expand All @@ -208,28 +228,43 @@ KOKKOS_INLINE_FUNCTION void device_check_binary_op_one_loader(
}
}

template <class Abi, class BinaryOp, class T>
inline void host_check_binary_op_all_loaders(BinaryOp binary_op, std::size_t n,
T const* first_args,
T const* second_args) {
host_check_binary_op_one_loader<Abi, load_element_aligned>(
binary_op, n, first_args, second_args);
host_check_binary_op_one_loader<Abi, load_masked>(binary_op, n, first_args,
second_args);
host_check_binary_op_one_loader<Abi, load_as_scalars>(
binary_op, n, first_args, second_args);
template <class Abi, class Loader, class UnaryOp, class T>
KOKKOS_INLINE_FUNCTION void device_check_math_op_one_loader(UnaryOp unary_op,
std::size_t n,
T const* args) {
Loader loader;
using simd_type = Kokkos::Experimental::simd<T, Abi>;
std::size_t constexpr width = simd_type::size();
for (std::size_t i = 0; i < n; i += width) {
std::size_t const nremaining = n - i;
std::size_t const nlanes = Kokkos::min(nremaining, width);
simd_type arg;
bool const loaded_arg = loader.device_load(args + i, nlanes, arg);
if (!loaded_arg) continue;
simd_type expected_result;
for (std::size_t lane = 0; lane < nlanes; ++lane) {
expected_result[lane] = unary_op.on_device_serial(arg[lane]);
}
simd_type const computed_result = unary_op.on_device(arg);
device_check_equality(expected_result, computed_result, nlanes);
}
}

template <class Abi, class Op, class... T>
inline void host_check_math_op_all_loaders(Op op, std::size_t n,
T const*... args) {
host_check_math_op_one_loader<Abi, load_element_aligned>(op, n, args...);
host_check_math_op_one_loader<Abi, load_masked>(op, n, args...);
host_check_math_op_one_loader<Abi, load_as_scalars>(op, n, args...);
}

template <class Abi, class BinaryOp, class T>
KOKKOS_INLINE_FUNCTION void device_check_binary_op_all_loaders(
BinaryOp binary_op, std::size_t n, T const* first_args,
T const* second_args) {
device_check_binary_op_one_loader<Abi, load_element_aligned>(
binary_op, n, first_args, second_args);
device_check_binary_op_one_loader<Abi, load_masked>(binary_op, n, first_args,
second_args);
device_check_binary_op_one_loader<Abi, load_as_scalars>(
binary_op, n, first_args, second_args);
template <class Abi, class Op, class... T>
KOKKOS_INLINE_FUNCTION void device_check_math_op_all_loaders(Op op,
std::size_t n,
T const*... args) {
device_check_math_op_one_loader<Abi, load_element_aligned>(op, n, args...);
device_check_math_op_one_loader<Abi, load_masked>(op, n, args...);
device_check_math_op_one_loader<Abi, load_as_scalars>(op, n, args...);
}

class plus {
Expand Down Expand Up @@ -280,18 +315,41 @@ class divides {
}
};

class absolutes {
public:
template <typename T>
auto on_host(T const& a) const {
return Kokkos::Experimental::abs(a);
}
template <typename T>
auto on_host_serial(T const& a) const {
if constexpr (std::is_signed_v<T>) {
return Kokkos::abs<T>(a);
}
return a;
}
template <typename T>
KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const {
return Kokkos::Experimental::abs(a);
}
template <typename T>
KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const {
return Kokkos::abs<T>(a);
}
};

template <typename Abi, typename DataType, size_t n>
inline void host_check_all_math_ops(const DataType (&first_args)[n],
const DataType (&second_args)[n]) {
host_check_binary_op_all_loaders<Abi>(plus(), n, first_args, second_args);
host_check_binary_op_all_loaders<Abi>(minus(), n, first_args, second_args);
host_check_binary_op_all_loaders<Abi>(multiplies(), n, first_args,
second_args);
host_check_math_op_all_loaders<Abi>(plus(), n, first_args, second_args);
host_check_math_op_all_loaders<Abi>(minus(), n, first_args, second_args);
host_check_math_op_all_loaders<Abi>(multiplies(), n, first_args, second_args);

// TODO: Place fallback division implementations for all simd integer types
if constexpr (std::is_same_v<DataType, double>)
host_check_binary_op_all_loaders<Abi>(divides(), n, first_args,
second_args);
host_check_math_op_all_loaders<Abi>(divides(), n, first_args, second_args);

host_check_math_op_all_loaders<Abi>(absolutes(), n, first_args);
}

template <typename Abi, typename DataType>
Expand Down Expand Up @@ -385,14 +443,16 @@ inline void host_check_condition() {
template <typename Abi, typename DataType, size_t n>
KOKKOS_INLINE_FUNCTION void device_check_all_math_ops(
const DataType (&first_args)[n], const DataType (&second_args)[n]) {
device_check_binary_op_all_loaders<Abi>(plus(), n, first_args, second_args);
device_check_binary_op_all_loaders<Abi>(minus(), n, first_args, second_args);
device_check_binary_op_all_loaders<Abi>(multiplies(), n, first_args,
second_args);
device_check_math_op_all_loaders<Abi>(plus(), n, first_args, second_args);
device_check_math_op_all_loaders<Abi>(minus(), n, first_args, second_args);
device_check_math_op_all_loaders<Abi>(multiplies(), n, first_args,
second_args);

if constexpr (std::is_same_v<DataType, double>)
device_check_binary_op_all_loaders<Abi>(divides(), n, first_args,
second_args);
device_check_math_op_all_loaders<Abi>(divides(), n, first_args,
second_args);

device_check_math_op_all_loaders<Abi>(absolutes(), n, first_args);
}

template <typename Abi, typename DataType>
Expand Down

0 comments on commit 35c1e3c

Please sign in to comment.