Skip to content

Commit

Permalink
SIMD: Math functions should be in namespace Kokkos
Browse files Browse the repository at this point in the history
  • Loading branch information
masterleinad committed Sep 22, 2023
1 parent e1c8266 commit b855631
Show file tree
Hide file tree
Showing 6 changed files with 505 additions and 241 deletions.
209 changes: 132 additions & 77 deletions simd/src/Kokkos_SIMD_AVX2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -629,82 +629,100 @@ class simd<double, simd_abi::avx2_fixed_size<4>> {
}
};

} // namespace Experimental

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> copysign(
simd<double, simd_abi::avx2_fixed_size<4>> const& a,
simd<double, simd_abi::avx2_fixed_size<4>> const& b) {
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> copysign(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a,
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& b) {
__m256d const sign_mask = _mm256_set1_pd(-0.0);
return simd<double, simd_abi::avx2_fixed_size<4>>(
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_xor_pd(_mm256_andnot_pd(sign_mask, static_cast<__m256d>(a)),
_mm256_and_pd(sign_mask, static_cast<__m256d>(b))));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> abs(
simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> abs(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
__m256d const sign_mask = _mm256_set1_pd(-0.0);
return simd<double, simd_abi::avx2_fixed_size<4>>(
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_andnot_pd(sign_mask, static_cast<__m256d>(a)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> sqrt(
simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<double, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> sqrt(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_sqrt_pd(static_cast<__m256d>(a)));
}

#ifdef __INTEL_COMPILER

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> cbrt(
simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<double, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> cbrt(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_cbrt_pd(static_cast<__m256d>(a)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> exp(
simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<double, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> exp(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_exp_pd(static_cast<__m256d>(a)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> log(
simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<double, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> log(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_log_pd(static_cast<__m256d>(a)));
}

#endif

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> fma(
simd<double, simd_abi::avx2_fixed_size<4>> const& a,
simd<double, simd_abi::avx2_fixed_size<4>> const& b,
simd<double, simd_abi::avx2_fixed_size<4>> const& c) {
return simd<double, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> fma(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a,
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& b,
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& c) {
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_fmadd_pd(static_cast<__m256d>(a), static_cast<__m256d>(b),
static_cast<__m256d>(c)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> max(
simd<double, simd_abi::avx2_fixed_size<4>> const& a,
simd<double, simd_abi::avx2_fixed_size<4>> const& b) {
return simd<double, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> max(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a,
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& b) {
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_max_pd(static_cast<__m256d>(a), static_cast<__m256d>(b)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> min(
simd<double, simd_abi::avx2_fixed_size<4>> const& a,
simd<double, simd_abi::avx2_fixed_size<4>> const& b) {
return simd<double, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> min(
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& a,
Experimental::simd<double,
Experimental::simd_abi::avx2_fixed_size<4>> const& b) {
return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm256_min_pd(static_cast<__m256d>(a), static_cast<__m256d>(b)));
}

namespace Experimental {

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<double, simd_abi::avx2_fixed_size<4>> condition(
simd_mask<double, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down Expand Up @@ -814,81 +832,100 @@ class simd<float, simd_abi::avx2_fixed_size<4>> {
}
};

} // namespace Experimental

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> copysign(
simd<float, simd_abi::avx2_fixed_size<4>> const& a,
simd<float, simd_abi::avx2_fixed_size<4>> const& b) {
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> copysign(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a,
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
b) {
__m128 const sign_mask = _mm_set1_ps(-0.0);
return simd<float, simd_abi::avx2_fixed_size<4>>(
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_xor_ps(_mm_andnot_ps(sign_mask, static_cast<__m128>(a)),
_mm_and_ps(sign_mask, static_cast<__m128>(b))));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> abs(
simd<float, simd_abi::avx2_fixed_size<4>> const& a) {
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> abs(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a) {
__m128 const sign_mask = _mm_set1_ps(-0.0);
return simd<float, simd_abi::avx2_fixed_size<4>>(
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_andnot_ps(sign_mask, static_cast<__m128>(a)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> sqrt(
simd<float, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<float, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> sqrt(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a) {
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_sqrt_ps(static_cast<__m128>(a)));
}

#ifdef __INTEL_COMPILER

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> cbrt(
simd<float, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<float, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> cbrt(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a) {
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_cbrt_ps(static_cast<__m128>(a)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> exp(
simd<float, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<float, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> exp(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a) {
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_exp_ps(static_cast<__m128>(a)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> log(
simd<float, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<float, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> log(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a) {
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_log_ps(static_cast<__m128>(a)));
}

#endif

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> fma(
simd<float, simd_abi::avx2_fixed_size<4>> const& a,
simd<float, simd_abi::avx2_fixed_size<4>> const& b,
simd<float, simd_abi::avx2_fixed_size<4>> const& c) {
return simd<float, simd_abi::avx2_fixed_size<4>>(_mm_fmadd_ps(
static_cast<__m128>(a), static_cast<__m128>(b), static_cast<__m128>(c)));
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> fma(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a,
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
b,
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
c) {
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_fmadd_ps(static_cast<__m128>(a), static_cast<__m128>(b),
static_cast<__m128>(c)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> max(
simd<float, simd_abi::avx2_fixed_size<4>> const& a,
simd<float, simd_abi::avx2_fixed_size<4>> const& b) {
return simd<float, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> max(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a,
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
b) {
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_max_ps(static_cast<__m128>(a), static_cast<__m128>(b)));
}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> min(
simd<float, simd_abi::avx2_fixed_size<4>> const& a,
simd<float, simd_abi::avx2_fixed_size<4>> const& b) {
return simd<float, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> min(
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
a,
Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
b) {
return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_min_ps(static_cast<__m128>(a), static_cast<__m128>(b)));
}

namespace Experimental {

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<float, simd_abi::avx2_fixed_size<4>> condition(
simd_mask<float, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down Expand Up @@ -1021,13 +1058,20 @@ class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> {
}
};

} // namespace Experimental

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int32_t, simd_abi::avx2_fixed_size<4>> abs(
simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& a) {
Experimental::simd<std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>>
abs(Experimental::simd<std::int32_t,
Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
__m128i const rhs = static_cast<__m128i>(a);
return simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(_mm_abs_epi32(rhs));
return Experimental::simd<std::int32_t,
Experimental::simd_abi::avx2_fixed_size<4>>(
_mm_abs_epi32(rhs));
}

namespace Experimental {

[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int32_t, simd_abi::avx2_fixed_size<4>>
condition(simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down Expand Up @@ -1177,15 +1221,21 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> {
}
};

} // namespace Experimental

// Manually computing absolute values, because _mm256_abs_epi64
// is not in AVX2; it's available in AVX512.
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::avx2_fixed_size<4>>
abs(simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& a) {
return simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(
Experimental::simd<std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>>
abs(Experimental::simd<
std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
return Experimental::simd<std::int64_t,
Experimental::simd_abi::avx2_fixed_size<4>>(
[&](std::size_t i) { return (a[i] < 0) ? -a[i] : a[i]; });
}

namespace Experimental {

[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::int64_t, simd_abi::avx2_fixed_size<4>>
condition(simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> const& a,
Expand Down Expand Up @@ -1313,12 +1363,6 @@ simd<std::int64_t, simd_abi::avx2_fixed_size<4>>::simd(
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& other)
: m_value(static_cast<__m256i>(other)) {}

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> abs(
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& a) {
return a;
}

[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>
condition(simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& a,
Expand All @@ -1338,6 +1382,17 @@ simd<std::int32_t, simd_abi::avx2_fixed_size<4>>::simd(
}
}

} // namespace Experimental

KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
Experimental::simd<std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>>
abs(Experimental::simd<std::uint64_t,
Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
return a;
}

namespace Experimental {

template <>
class const_where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>,
simd<double, simd_abi::avx2_fixed_size<4>>> {
Expand Down

0 comments on commit b855631

Please sign in to comment.