Skip to content

Commit

Permalink
SIMD: make binary op tests to test against all data types (kokkos#5913)
Browse files Browse the repository at this point in the history
* Making simd tests work for all available data types
Added missing functions in AVX512

* AVX2

- uint64_t type compatibility problem with long long

* Filled in NEON backend for testing

* clang formatted

* Updated AVX2 on uint64_t

* clang-formatted

* Remove simd constructors that are either not specified by TS Extension for parallelism v2
or unused

* Refactor public mask/value functions in all const_where_exp structs
to an impl namespace

* Refactor size check to be tested with the rest of unit tests for all data types

* clang-formatted

* Fix friend decl in scalar abi

* Fixed friend decl in simd scalar

* Converted recursive calls to a fold expression

* Fixes to avx512 based on reviews

* Some consistency changes

* Reinterpret cast for masked loads in avx2 to avoid compiler warnings

* Modified device-side unit tests to check against all data types as well

* Minor refactoring on function signatures that takes in refs to test arrays

* Inserted a todo on integer divisions in the test file

* Removed a whitespace

* Fixed a few template parameter lists in the unit test file
  • Loading branch information
ldh4 committed Jun 2, 2023
1 parent 62ba94c commit 915c174
Show file tree
Hide file tree
Showing 7 changed files with 720 additions and 149 deletions.
17 changes: 14 additions & 3 deletions simd/src/Kokkos_SIMD.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ using host_native = avx2_fixed_size<4>;
#elif defined(__ARM_NEON)
using host_native = neon_fixed_size<2>;
#else
using host_native = scalar;
using host_native = scalar;
#endif

template <class T>
Expand Down Expand Up @@ -136,14 +136,25 @@ namespace Impl {
template <class... Abis>
class abi_set {};

template <typename... Ts>
class data_types {};

#if defined(KOKKOS_ARCH_AVX512XEON)
using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx512_fixed_size<8>>;
using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx512_fixed_size<8>>;
using data_type_set = data_types<std::int32_t, std::uint32_t, std::int64_t,
std::uint64_t, double>;
#elif defined(KOKKOS_ARCH_AVX2)
using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx2_fixed_size<4>>;
using data_type_set =
data_types<std::int32_t, std::int64_t, std::uint64_t, double>;
#elif defined(__ARM_NEON)
using host_abi_set = abi_set<simd_abi::scalar, simd_abi::neon_fixed_size<2>>;
using data_type_set =
data_types<std::int32_t, std::int64_t, std::uint64_t, double>;
#else
using host_abi_set = abi_set<simd_abi::scalar>;
using host_abi_set = abi_set<simd_abi::scalar>;
using data_type_set = data_types<std::int32_t, std::uint32_t, std::int64_t,
std::uint64_t, double>;
#endif

using device_abi_set = abi_set<simd_abi::scalar>;
Expand Down
239 changes: 206 additions & 33 deletions simd/src/Kokkos_SIMD_AVX2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,9 +390,6 @@ class simd<double, simd_abi::avx2_fixed_size<4>> {
bool> = false>
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
: m_value(_mm256_set1_pd(value_type(value))) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(double a, double b, double c,
double d)
: m_value(_mm256_setr_pd(a, b, c, d)) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
__m256d const& value_in)
: m_value(value_in) {}
Expand Down Expand Up @@ -587,19 +584,17 @@ class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> {
bool> = false>
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
: m_value(_mm_set1_epi32(value_type(value))) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(std::int32_t a, std::int32_t b,
std::int32_t c, std::int32_t d)
: m_value(_mm_setr_epi32(a, b, c, d)) {}
template <class G,
std::enable_if_t<
std::is_invocable_r_v<value_type, G,
std::integral_constant<std::size_t, 0>>,
bool> = false>
KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
: simd(gen(std::integral_constant<std::size_t, 0>()),
gen(std::integral_constant<std::size_t, 1>()),
gen(std::integral_constant<std::size_t, 2>()),
gen(std::integral_constant<std::size_t, 3>())) {}
: m_value(_mm_setr_epi32(gen(std::integral_constant<std::size_t, 0>()),
gen(std::integral_constant<std::size_t, 1>()),
gen(std::integral_constant<std::size_t, 2>()),
gen(std::integral_constant<std::size_t, 3>()))) {
}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
__m128i const& value_in)
: m_value(value_in) {}
Expand Down Expand Up @@ -700,19 +695,17 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> {
bool> = false>
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
: m_value(_mm256_set1_epi64x(value_type(value))) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(std::int64_t a, std::int64_t b,
std::int64_t c, std::int64_t d)
: m_value(_mm256_setr_epi64x(a, b, c, d)) {}
template <class G,
std::enable_if_t<
std::is_invocable_r_v<value_type, G,
std::integral_constant<std::size_t, 0>>,
bool> = false>
KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
: simd(gen(std::integral_constant<std::size_t, 0>()),
gen(std::integral_constant<std::size_t, 1>()),
gen(std::integral_constant<std::size_t, 2>()),
gen(std::integral_constant<std::size_t, 3>())) {}
: m_value(_mm256_setr_epi64x(
gen(std::integral_constant<std::size_t, 0>()),
gen(std::integral_constant<std::size_t, 1>()),
gen(std::integral_constant<std::size_t, 2>()),
gen(std::integral_constant<std::size_t, 3>()))) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
__m256i const& value_in)
: m_value(value_in) {}
Expand Down Expand Up @@ -783,6 +776,14 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> {
return simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(0) - a;
}

// Lane-wise addition of two 4-wide AVX2 int64 simd values.
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::avx2_fixed_size<4>>
operator+(simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& lhs,
          simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& rhs) {
  __m256i const a   = static_cast<__m256i>(lhs);
  __m256i const b   = static_cast<__m256i>(rhs);
  __m256i const sum = _mm256_add_epi64(a, b);
  return simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(sum);
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::avx2_fixed_size<4>> condition(
simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down Expand Up @@ -816,6 +817,17 @@ class simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> {
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
: m_value(_mm256_set1_epi64x(
Kokkos::bit_cast<std::int64_t>(value_type(value)))) {}
template <class G,
std::enable_if_t<
std::is_invocable_r_v<value_type, G,
std::integral_constant<std::size_t, 0>>,
bool> = false>
KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
: m_value(_mm256_setr_epi64x(
gen(std::integral_constant<std::size_t, 0>()),
gen(std::integral_constant<std::size_t, 1>()),
gen(std::integral_constant<std::size_t, 2>()),
gen(std::integral_constant<std::size_t, 3>()))) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m256i const& value_in)
: m_value(value_in) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
Expand All @@ -831,6 +843,11 @@ class simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> {
operator[](std::size_t i) const {
return reinterpret_cast<value_type const*>(&m_value)[i];
}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
element_aligned_tag) {
m_value = _mm256_maskload_epi64(reinterpret_cast<long long const*>(ptr),
static_cast<__m256i>(mask_type(true)));
}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
operator>>(unsigned int rhs) const {
return _mm256_srli_epi64(m_value, rhs);
Expand Down Expand Up @@ -876,6 +893,22 @@ simd<std::int64_t, simd_abi::avx2_fixed_size<4>>::simd(
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& other)
: m_value(static_cast<__m256i>(other)) {}

// Lane-wise addition of two 4-wide AVX2 uint64 simd values.
// Two's-complement addition is identical for signed/unsigned lanes,
// so the signed intrinsic is reused here.
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>
operator+(simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& lhs,
          simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& rhs) {
  __m256i const a   = static_cast<__m256i>(lhs);
  __m256i const b   = static_cast<__m256i>(rhs);
  __m256i const sum = _mm256_add_epi64(a, b);
  return simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>(sum);
}

// Lane-wise subtraction of two 4-wide AVX2 uint64 simd values.
// Two's-complement subtraction is identical for signed/unsigned lanes,
// so the signed intrinsic is reused here.
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>
operator-(simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& lhs,
          simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& rhs) {
  __m256i const a    = static_cast<__m256i>(lhs);
  __m256i const b    = static_cast<__m256i>(rhs);
  __m256i const diff = _mm256_sub_epi64(a, b);
  return simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>(diff);
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> condition(
simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down Expand Up @@ -910,14 +943,7 @@ class const_where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>,
public:
const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
: m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
mask() const {
return m_mask;
}
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
value() const {
return m_value;
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
void copy_to(double* mem, element_aligned_tag) const {
_mm256_maskstore_pd(mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)),
Expand All @@ -931,6 +957,12 @@ class const_where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>,
if (m_mask[lane]) mem[index[lane]] = m_value[lane];
}
}

friend constexpr auto const& Impl::mask<double, abi_type>(
const_where_expression<mask_type, value_type> const& x);

friend constexpr auto const& Impl::value<double, abi_type>(
const_where_expression<mask_type, value_type> const& x);
};

template <>
Expand Down Expand Up @@ -987,19 +1019,18 @@ class const_where_expression<
public:
const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
: m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
mask() const {
return m_mask;
}
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
value() const {
return m_value;
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
void copy_to(std::int32_t* mem, element_aligned_tag) const {
_mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask),
static_cast<__m128i>(m_value));
}

friend constexpr auto const& Impl::mask<std::int32_t, abi_type>(
const_where_expression<mask_type, value_type> const& x);

friend constexpr auto const& Impl::value<std::int32_t, abi_type>(
const_where_expression<mask_type, value_type> const& x);
};

template <>
Expand All @@ -1017,6 +1048,148 @@ class where_expression<simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>>,
void copy_from(std::int32_t const* mem, element_aligned_tag) {
m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask)));
}
template <
class U,
std::enable_if_t<std::is_convertible_v<
U, simd<std::int32_t, simd_abi::avx2_fixed_size<4>>>,
bool> = false>
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
auto const x_as_value_type =
static_cast<simd<std::int32_t, simd_abi::avx2_fixed_size<4>>>(
std::forward<U>(x));
m_value = simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(_mm_castps_si128(
_mm_blendv_ps(_mm_castsi128_ps(static_cast<__m128i>(m_value)),
_mm_castsi128_ps(static_cast<__m128i>(x_as_value_type)),
_mm_castsi128_ps(static_cast<__m128i>(m_mask)))));
}
};

// Read-only masked view over a 4-wide AVX2 int64 simd value.
// Holds references to a value/mask pair; copy_to performs a masked
// store that writes only the lanes whose mask bit is set.
template <>
class const_where_expression<
    simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>>,
    simd<std::int64_t, simd_abi::avx2_fixed_size<4>>> {
 public:
  using abi_type = simd_abi::avx2_fixed_size<4>;
  using value_type = simd<std::int64_t, abi_type>;
  using mask_type = simd_mask<std::int64_t, abi_type>;

 protected:
  // Non-owning references; the referenced simd/mask must outlive this object.
  value_type& m_value;
  mask_type const& m_mask;

 public:
  // const_cast lets the derived where_expression mutate m_value while this
  // const variant exposes no mutating operations itself.
  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}

  // Masked element-aligned store of the active lanes to mem.
  // reinterpret_cast to long long* matches the intrinsic's signature
  // (same representation as std::int64_t on this target).
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
      std::int64_t* mem, element_aligned_tag) const {
    _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem),
                           static_cast<__m256i>(m_mask),
                           static_cast<__m256i>(m_value));
  }

  // Accessors live in the Impl namespace; grant them access to the
  // protected members instead of exposing public mask()/value().
  friend constexpr auto const& Impl::mask<std::int64_t, abi_type>(
      const_where_expression<mask_type, value_type> const& x);

  friend constexpr auto const& Impl::value<std::int64_t, abi_type>(
      const_where_expression<mask_type, value_type> const& x);
};

// Writable masked view over a 4-wide AVX2 int64 simd value.
// Extends the read-only expression with a masked load (copy_from) and a
// masked assignment (operator=) that blends only the active lanes.
template <>
class where_expression<simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>>,
                       simd<std::int64_t, simd_abi::avx2_fixed_size<4>>>
    : public const_where_expression<
          simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>>,
          simd<std::int64_t, simd_abi::avx2_fixed_size<4>>> {
 public:
  where_expression(
      simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> const& mask_arg,
      simd<std::int64_t, simd_abi::avx2_fixed_size<4>>& value_arg)
      : const_where_expression(mask_arg, value_arg) {}
  // Masked element-aligned load; _mm256_maskload_epi64 zeroes the
  // inactive lanes (it does not leave them unchanged).
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::int64_t const* mem,
                                                       element_aligned_tag) {
    m_value = value_type(_mm256_maskload_epi64(
        reinterpret_cast<long long const*>(mem), static_cast<__m256i>(m_mask)));
  }
  // Masked assignment: converts x to the simd value type and blends it
  // into m_value on the active lanes via a double-typed blendv.
  // Template parameter is named U (was lowercase u) for consistency with
  // the other where_expression specializations.
  template <
      class U,
      std::enable_if_t<std::is_convertible_v<
                           U, simd<std::int64_t, simd_abi::avx2_fixed_size<4>>>,
                       bool> = false>
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
    auto const x_as_value_type =
        static_cast<simd<std::int64_t, simd_abi::avx2_fixed_size<4>>>(
            std::forward<U>(x));
    m_value = simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(
        _mm256_castpd_si256(_mm256_blendv_pd(
            _mm256_castsi256_pd(static_cast<__m256i>(m_value)),
            _mm256_castsi256_pd(static_cast<__m256i>(x_as_value_type)),
            _mm256_castsi256_pd(static_cast<__m256i>(m_mask)))));
  }
};

// Read-only masked view over a 4-wide AVX2 uint64 simd value.
// Holds references to a value/mask pair; copy_to performs a masked
// store that writes only the lanes whose mask bit is set.
template <>
class const_where_expression<
    simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>>,
    simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>> {
 public:
  using abi_type = simd_abi::avx2_fixed_size<4>;
  using value_type = simd<std::uint64_t, abi_type>;
  using mask_type = simd_mask<std::uint64_t, abi_type>;

 protected:
  // Non-owning references; the referenced simd/mask must outlive this object.
  value_type& m_value;
  mask_type const& m_mask;

 public:
  // const_cast lets the derived where_expression mutate m_value while this
  // const variant exposes no mutating operations itself.
  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}

  // Masked element-aligned store of the active lanes to mem.
  // reinterpret_cast to long long* matches the intrinsic's signature;
  // the signed intrinsic is shared since lane width is what matters.
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
      std::uint64_t* mem, element_aligned_tag) const {
    _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem),
                           static_cast<__m256i>(m_mask),
                           static_cast<__m256i>(m_value));
  }

  // Accessors live in the Impl namespace; grant them access to the
  // protected members instead of exposing public mask()/value().
  friend constexpr auto const& Impl::mask<std::uint64_t, abi_type>(
      const_where_expression<mask_type, value_type> const& x);

  friend constexpr auto const& Impl::value<std::uint64_t, abi_type>(
      const_where_expression<mask_type, value_type> const& x);
};

// Writable masked view over a 4-wide AVX2 uint64 simd value.
// Extends the read-only expression with a masked load (copy_from) and a
// masked assignment (operator=) that blends only the active lanes.
template <>
class where_expression<simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>>,
                       simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>>
    : public const_where_expression<
          simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>>,
          simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>> {
 public:
  where_expression(
      simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& mask_arg,
      simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>& value_arg)
      : const_where_expression(mask_arg, value_arg) {}
  // Masked element-aligned load; _mm256_maskload_epi64 zeroes the
  // inactive lanes (it does not leave them unchanged).
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::uint64_t const* mem,
                                                       element_aligned_tag) {
    m_value = value_type(_mm256_maskload_epi64(
        reinterpret_cast<long long const*>(mem), static_cast<__m256i>(m_mask)));
  }
  // Masked assignment: converts x to the simd value type and blends it
  // into m_value on the active lanes via a double-typed blendv.
  // Template parameter is named U (was lowercase u) for consistency with
  // the other where_expression specializations.
  template <
      class U,
      std::enable_if_t<
          std::is_convertible_v<
              U, simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>>,
          bool> = false>
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
    auto const x_as_value_type =
        static_cast<simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>>(
            std::forward<U>(x));
    m_value = simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>(
        _mm256_castpd_si256(_mm256_blendv_pd(
            _mm256_castsi256_pd(static_cast<__m256i>(m_value)),
            _mm256_castsi256_pd(static_cast<__m256i>(x_as_value_type)),
            _mm256_castsi256_pd(static_cast<__m256i>(m_mask)))));
  }
};

} // namespace Experimental
Expand Down

0 comments on commit 915c174

Please sign in to comment.