40 changes: 15 additions & 25 deletions libc/src/string/memory_utils/memcmp_implementations.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY LIBC_LOOP_NOUNROLL
#include "src/__support/macros/properties/architectures.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_riscv.h"
#include "src/string/memory_utils/utils.h" // CPtr MemcmpReturnType

#include <stddef.h> // size_t
Expand All @@ -26,21 +27,17 @@
namespace __llvm_libc {

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
LIBC_LOOP_NOUNROLL
for (; offset < count; ++offset)
if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
return value;
return MemcmpReturnType::ZERO();
inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
return generic::Memcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
}

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint64_t);
if (count <= 2 * kAlign)
return inline_memcmp_byte_per_byte(p1, p2, 0, count);
return inline_memcmp_byte_per_byte(p1, p2, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
Expand All @@ -56,21 +53,20 @@ inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
b = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
p2, offset);
uint64_t a = load64_aligned<uint64_t>(p1, offset);
if (a != b) {
// TODO use cmp_neq_uint64_t from D148717 once it's submitted.
return Endian::to_big_endian(a) < Endian::to_big_endian(b) ? -1 : 1;
}
if (a != b)
return cmp_neq_uint64_t(Endian::to_big_endian(a),
Endian::to_big_endian(b));
}
return inline_memcmp_byte_per_byte(p1, p2, offset, count);
return inline_memcmp_byte_per_byte(p1, p2, count, offset);
}

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint32_t);
if (count <= 2 * kAlign)
return inline_memcmp_byte_per_byte(p1, p2, 0, count);
return inline_memcmp_byte_per_byte(p1, p2, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
Expand All @@ -83,16 +79,10 @@ inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
else
b = load32_aligned<uint8_t, uint16_t, uint8_t>(p2, offset);
uint32_t a = load32_aligned<uint32_t>(p1, offset);
if (a != b) {
// TODO use cmp_uint32_t from D148717 once it's submitted.
// We perform the difference as an uint64_t.
const int64_t diff = static_cast<int64_t>(Endian::to_big_endian(a)) -
static_cast<int64_t>(Endian::to_big_endian(b));
// And reduce the uint64_t into an uint32_t.
return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
}
if (a != b)
return cmp_uint32_t(Endian::to_big_endian(a), Endian::to_big_endian(b));
}
return inline_memcmp_byte_per_byte(p1, p2, offset, count);
return inline_memcmp_byte_per_byte(p1, p2, count, offset);
}

LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
Expand All @@ -105,7 +95,7 @@ LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
return inline_memcmp_aligned_access_32bit(p1, p2, count);
#else
return inline_memcmp_byte_per_byte(p1, p2, 0, count);
return inline_memcmp_byte_per_byte(p1, p2, count);
#endif
}

Expand Down
24 changes: 12 additions & 12 deletions libc/src/string/memory_utils/memmove_implementations.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,27 +38,27 @@ LIBC_INLINE void inline_memmove(Ptr dst, CPtr src, size_t count) {
#if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
#if defined(LIBC_TARGET_ARCH_IS_X86)
#if defined(__AVX512F__)
using uint128_t = uint8x16_t;
using uint256_t = uint8x32_t;
using uint512_t = uint8x64_t;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
#elif defined(__AVX__)
using uint128_t = uint8x16_t;
using uint256_t = uint8x32_t;
using uint512_t = cpp::array<uint8x32_t, 2>;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
using uint128_t = uint8x16_t;
using uint256_t = cpp::array<uint8x16_t, 2>;
using uint512_t = cpp::array<uint8x16_t, 4>;
using uint128_t = generic_v128;
using uint256_t = cpp::array<generic_v128, 2>;
using uint512_t = cpp::array<generic_v128, 4>;
#else
using uint128_t = cpp::array<uint64_t, 2>;
using uint256_t = cpp::array<uint64_t, 4>;
using uint512_t = cpp::array<uint64_t, 8>;
#endif
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
static_assert(aarch64::kNeon, "aarch64 supports vector types");
using uint128_t = uint8x16_t;
using uint256_t = uint8x32_t;
using uint512_t = uint8x64_t;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
#endif
if (count == 0)
return;
Expand Down
24 changes: 12 additions & 12 deletions libc/src/string/memory_utils/memset_implementations.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,17 @@ inline_memset_aligned_access_64bit(Ptr dst, uint8_t value, size_t count) {
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
using uint128_t = uint8x16_t;
using uint256_t = uint8x32_t;
using uint512_t = uint8x64_t;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
#elif defined(__AVX__)
using uint128_t = uint8x16_t;
using uint256_t = uint8x32_t;
using uint512_t = cpp::array<uint8x32_t, 2>;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
using uint128_t = uint8x16_t;
using uint256_t = cpp::array<uint8x16_t, 2>;
using uint512_t = cpp::array<uint8x16_t, 4>;
using uint128_t = generic_v128;
using uint256_t = cpp::array<generic_v128, 2>;
using uint512_t = cpp::array<generic_v128, 4>;
#else
using uint128_t = cpp::array<uint64_t, 2>;
using uint256_t = cpp::array<uint64_t, 4>;
Expand Down Expand Up @@ -106,9 +106,9 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
[[maybe_unused]] LIBC_INLINE static void
inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
static_assert(aarch64::kNeon, "aarch64 supports vector types");
using uint128_t = uint8x16_t;
using uint256_t = uint8x32_t;
using uint512_t = uint8x64_t;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
if (count == 0)
return;
if (count <= 3) {
Expand Down
96 changes: 95 additions & 1 deletion libc/src/string/memory_utils/op_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ struct BzeroCacheLine {
offset += SIZE;
} while (offset < count - SIZE);
// Unaligned store, we can't use 'dc zva' here.
generic::Memset<uint8x64_t>::tail(dst, value, count);
generic::Memset<generic_v512>::tail(dst, value, count);
}
};

Expand Down Expand Up @@ -171,6 +171,100 @@ template <size_t Size> struct Bcmp {

} // namespace __llvm_libc::aarch64

namespace __llvm_libc::generic {

// AArch64-tuned specializations of the generic comparison primitives
// (eq / neq / cmp) declared in op_generic.h. 'cmp_is_expensive' is
// 'false_type' for every type here, signaling to the generic machinery that
// calling 'cmp' directly is acceptable on this target.

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
// Equality of two (possibly unaligned) 16-bit loads.
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
// XOR is non-zero iff the two halfwords differ.
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
// Big-endian loads make lexicographic (memcmp) order match integer order.
// The int32_t difference of two uint16_t values cannot overflow.
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
// Three-way compare returning -1/0/1; a plain 32-bit subtraction could
// overflow, so compare explicitly.
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
// Note: returns 0/1 (a boolean), not an XOR mask; any non-zero value is a
// valid "differs" result for neq.
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::ZERO();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t (16 bytes, processed as two 64-bit words)
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    // 'cond' is 0 or 1; returning it early reports the first differing word.
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::ZERO();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t (32 bytes, processed as four 64-bit words)
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::ZERO();
}
} // namespace __llvm_libc::generic

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
451 changes: 247 additions & 204 deletions libc/src/string/memory_utils/op_generic.h

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions libc/src/string/memory_utils/op_riscv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
//===-- RISC-V implementation of memory function building blocks ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides RISC-V specific building blocks to compose memory
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

namespace __llvm_libc::generic {

// RISC-V specializations of the generic comparison primitives
// (eq / neq / cmp / cmp_neq) declared in op_generic.h.

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
// XOR is non-zero iff the two halfwords differ.
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
// Big-endian loads make lexicographic (memcmp) order match integer order.
// The int32_t difference of two uint16_t values cannot overflow.
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
// Declared but intentionally left undefined: with cmp_is_expensive<uint16_t>
// being false_type, 'cmp' is used directly and 'cmp_neq' is never called;
// any accidental ODR-use would fail at link time.
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return cmp_uint32_t(a, b);
}
// Declared but intentionally left undefined (see the uint16_t note above).
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
// cmp_is_expensive is true_type here: the generic code is expected to test
// 'neq' first and only call 'cmp_neq' once the words are known to differ
// (see op_generic.h).
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
// Returns 0/1 rather than an XOR mask; any non-zero value means "differs".
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return !eq<uint64_t>(p1, p2, offset);
}
// Declared but intentionally left undefined: with cmp_is_expensive<uint64_t>
// being true_type, callers go through 'neq' + 'cmp_neq' instead of 'cmp'.
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  return cmp_neq_uint64_t(a, b);
}

} // namespace __llvm_libc::generic

#endif // LIBC_TARGET_ARCH_IS_ANY_RISCV
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H
381 changes: 181 additions & 200 deletions libc/src/string/memory_utils/op_x86.h

Large diffs are not rendered by default.

63 changes: 62 additions & 1 deletion libc/src/string/memory_utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
#include "src/__support/endian.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN
#include "src/__support/macros/properties/architectures.h"

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
#include <stdint.h> // intptr_t / uintptr_t / INT32_MAX / INT32_MIN

namespace __llvm_libc {

Expand Down Expand Up @@ -149,6 +150,56 @@ template <typename T> struct StrictIntegralType {
using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;

// This implements the semantic of 'memcmp' returning a negative value when 'a'
// is less than 'b', '0' when 'a' equals 'b' and a positive number otherwise.
LIBC_INLINE MemcmpReturnType cmp_uint32_t(uint32_t a, uint32_t b) {
  // We perform the difference as an int64_t so it cannot overflow.
  const int64_t diff = static_cast<int64_t>(a) - static_cast<int64_t>(b);
  // For the int64_t to int32_t conversion we want the following properties:
  // - int32_t[31:31] == 1 iff diff < 0
  // - int32_t[31:0] == 0 iff diff == 0

  // We also observe that:
  // - When diff < 0: diff[63:32] == 0xffffffff and diff[31:0] != 0
  // - When diff > 0: diff[63:32] == 0 and diff[31:0] != 0
  // - When diff == 0: diff[63:32] == 0 and diff[31:0] == 0
  // - https://godbolt.org/z/8W7qWP6e5
  // - This implies that we can only look at diff[32:32] for determining the
  //   sign bit for the returned int32_t.

  // So, we do the following:
  // - int32_t[31:31] = diff[32:32]
  // - int32_t[30:0] = diff[31:0] == 0 ? 0 : non-0.

  // And, we can achieve the above by the expression below. We could have also
  // used (diff >> 1) | (diff & 0x1) but (diff & 0xFFFF) is faster than
  // (diff & 0x1). https://godbolt.org/z/j3b569rW1
  return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
}

// Returns a negative value if 'a' is less than 'b' and a positive value
// otherwise. This implements the semantic of 'memcmp' when we know that 'a'
// and 'b' differ.
LIBC_INLINE MemcmpReturnType cmp_neq_uint64_t(uint64_t a, uint64_t b) {
#if defined(LIBC_TARGET_ARCH_IS_X86_64)
  // On x86, we choose the returned values so that they are just one unit
  // apart as this allows for better code generation.
  static constexpr int32_t POSITIVE = INT32_MAX;
  static constexpr int32_t NEGATIVE = INT32_MIN;
  static_assert(cpp::bit_cast<uint32_t>(NEGATIVE) -
                    cpp::bit_cast<uint32_t>(POSITIVE) ==
                1);
#else
  // On RISC-V we simply use '1' and '-1' as it leads to branchless code.
  // On ARMv8, both strategies lead to the same performance.
  static constexpr int32_t POSITIVE = 1;
  static constexpr int32_t NEGATIVE = -1;
#endif
  static_assert(POSITIVE > 0);
  static_assert(NEGATIVE < 0);
  return a < b ? NEGATIVE : POSITIVE;
}

// Loads bytes from memory (possibly unaligned) and materializes them as
// type.
template <typename T> LIBC_INLINE T load(CPtr ptr) {
Expand Down Expand Up @@ -280,6 +331,16 @@ void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
}

// Helper computing the distance from 'ptr' to the next SIZE-aligned address.
// Lets callers process the unaligned prefix of a buffer before switching to
// aligned accesses.
template <size_t SIZE> struct AlignHelper {
  // LIBC_INLINE added for consistency with the member functions below and the
  // rest of this file.
  LIBC_INLINE AlignHelper(CPtr ptr)
      : offset_(distance_to_next_aligned<SIZE>(ptr)) {}

  // True iff 'ptr' was not already SIZE-aligned at construction time.
  // NOTE(review): this assumes distance_to_next_aligned returns SIZE (not 0)
  // for an already-aligned pointer — confirm against its definition.
  LIBC_INLINE bool not_aligned() const { return offset_ != SIZE; }
  // Number of bytes to advance to reach the next SIZE-aligned address.
  LIBC_INLINE uintptr_t offset() const { return offset_; }

private:
  uintptr_t offset_;
};

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
91 changes: 44 additions & 47 deletions libc/src/string/memory_utils/x86_64/memcmp_implementations.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,79 +18,76 @@ namespace __llvm_libc {

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
return generic::Memcmp<uint64_t>::loop_and_tail_align_above(384, p1, p2,
count);
}

#if defined(__SSE4_1__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
inline_memcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
return generic::Memcmp<__m128i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __SSE4_1__

#if defined(__AVX2__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
return generic::Memcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
return value;
align_to_next_boundary<32, Arg::P1>(p1, p2, count);
}
return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
return generic::Memcmp<__m256i>::head_tail(p1, p2, count);
return generic::Memcmp<__m256i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __AVX2__

#if defined(__AVX512BW__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
return generic::Memcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
return generic::Memcmp<__m256i>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
return generic::Memcmp<__m512i>::head_tail(p1, p2, count);
return generic::Memcmp<__m512i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __AVX512BW__

LIBC_INLINE MemcmpReturnType inline_memcmp_x86(CPtr p1, CPtr p2, size_t count) {

if (count == 0)
return MemcmpReturnType::ZERO();
if (count == 1)
return generic::Memcmp<1>::block(p1, p2);
return generic::Memcmp<uint8_t>::block(p1, p2);
if (count == 2)
return generic::Memcmp<2>::block(p1, p2);
return generic::Memcmp<uint16_t>::block(p1, p2);
if (count == 3)
return generic::Memcmp<3>::block(p1, p2);
if (count <= 8)
return generic::Memcmp<4>::head_tail(p1, p2, count);
return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
if (count == 4)
return generic::Memcmp<uint32_t>::block(p1, p2);
if (count == 5)
return generic::MemcmpSequence<uint32_t, uint8_t>::block(p1, p2);
if (count == 6)
return generic::MemcmpSequence<uint32_t, uint16_t>::block(p1, p2);
if (count == 7)
return generic::Memcmp<uint32_t>::head_tail(p1, p2, 7);
if (count == 8)
return generic::Memcmp<uint64_t>::block(p1, p2);
if (count <= 16)
return generic::Memcmp<8>::head_tail(p1, p2, count);
if constexpr (x86::kAvx512BW)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
else if constexpr (x86::kAvx2)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
else if constexpr (x86::kSse2)
return inline_memcmp_x86_sse2_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);
#if defined(__AVX512BW__)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
#elif defined(__AVX2__)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
#elif defined(__SSE4_1__)
return inline_memcmp_x86_sse41_gt16(p1, p2, count);
#else
return inline_memcmp_generic_gt16(p1, p2, count);
#endif
}

} // namespace __llvm_libc

#endif // LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCMP_IMPLEMENTATIONS_H
111 changes: 57 additions & 54 deletions libc/test/src/string/memory_utils/op_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,11 @@
#include "memory_check_utils.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_generic.h" // LLVM_LIBC_HAS_UINT64
#include "src/string/memory_utils/op_riscv.h"
#include "src/string/memory_utils/op_x86.h"
#include "test/UnitTest/Test.h"

#if defined(LIBC_TARGET_ARCH_IS_X86_64) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
#define LLVM_LIBC_HAS_UINT64
#endif

namespace __llvm_libc {

template <typename T> struct has_head_tail {
Expand Down Expand Up @@ -131,13 +128,13 @@ using MemsetImplementations = testing::TypeList<
generic::Memset<uint64_t>, generic::Memset<cpp::array<uint64_t, 2>>,
#endif
#ifdef __AVX512F__
generic::Memset<uint8x64_t>, generic::Memset<cpp::array<uint8x64_t, 2>>,
generic::Memset<generic_v512>, generic::Memset<cpp::array<generic_v512, 2>>,
#endif
#ifdef __AVX__
generic::Memset<uint8x32_t>, generic::Memset<cpp::array<uint8x32_t, 2>>,
generic::Memset<generic_v256>, generic::Memset<cpp::array<generic_v256, 2>>,
#endif
#ifdef __SSE2__
generic::Memset<uint8x16_t>, generic::Memset<cpp::array<uint8x16_t, 2>>,
generic::Memset<generic_v128>, generic::Memset<cpp::array<generic_v128, 2>>,
#endif
generic::Memset<uint32_t>, generic::Memset<cpp::array<uint32_t, 2>>, //
generic::Memset<uint16_t>, generic::Memset<cpp::array<uint16_t, 2>>, //
Expand Down Expand Up @@ -194,35 +191,36 @@ TYPED_TEST(LlvmLibcOpTest, Memset, MemsetImplementations) {
}

using BcmpImplementations = testing::TypeList<
#ifdef __SSE2__
x86::sse2::Bcmp<16>, //
x86::sse2::Bcmp<32>, //
x86::sse2::Bcmp<64>, //
x86::sse2::Bcmp<128>, //
#endif
#ifdef LIBC_TARGET_ARCH_IS_X86_64
#ifdef __SSE4_1__
generic::Bcmp<__m128i>,
#endif // __SSE4_1__
#ifdef __AVX2__
x86::avx2::Bcmp<32>, //
x86::avx2::Bcmp<64>, //
x86::avx2::Bcmp<128>, //
#endif
generic::Bcmp<__m256i>,
#endif // __AVX2__
#ifdef __AVX512BW__
x86::avx512bw::Bcmp<64>, //
x86::avx512bw::Bcmp<128>, //
#endif
generic::Bcmp<__m512i>,
#endif // __AVX512BW__

#endif // LIBC_TARGET_ARCH_IS_X86_64
#ifdef LIBC_TARGET_ARCH_IS_AARCH64
aarch64::Bcmp<16>, //
aarch64::Bcmp<32>, //
aarch64::Bcmp<32>,
#endif
#ifndef LIBC_TARGET_ARCH_IS_ARM // Removing non uint8_t types for ARM
generic::Bcmp<uint16_t>,
generic::Bcmp<uint32_t>, //
#ifdef LLVM_LIBC_HAS_UINT64
generic::Bcmp<8>, //
#endif
generic::Bcmp<1>, //
generic::Bcmp<2>, //
generic::Bcmp<4>, //
generic::Bcmp<16>, //
generic::Bcmp<32>, //
generic::Bcmp<64> //
>;
generic::Bcmp<uint64_t>,
#endif // LLVM_LIBC_HAS_UINT64
generic::BcmpSequence<uint16_t, uint8_t>,
generic::BcmpSequence<uint32_t, uint8_t>, //
generic::BcmpSequence<uint32_t, uint16_t>, //
generic::BcmpSequence<uint32_t, uint16_t, uint8_t>,
#endif // LIBC_TARGET_ARCH_IS_ARM
generic::BcmpSequence<uint8_t, uint8_t>,
generic::BcmpSequence<uint8_t, uint8_t, uint8_t>, //
generic::Bcmp<uint8_t>>;

// Adapt CheckBcmp signature to op implementation signatures.
template <auto FnImpl>
Expand All @@ -247,7 +245,8 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
ASSERT_TRUE((CheckBcmp<BlockImpl>(span1, span2, kSize)));
}
}
{ // Test head tail operations from kSize to 2 * kSize.
if constexpr (has_head_tail<Impl>::value) {
// Test head tail operations from kSize to 2 * kSize.
static constexpr auto HeadTailImpl = CmpAdaptor<Impl::head_tail>;
Buffer Buffer1(2 * kSize);
Buffer Buffer2(2 * kSize);
Expand All @@ -258,7 +257,8 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
ASSERT_TRUE((CheckBcmp<HeadTailImpl>(span1, span2, size)));
}
}
{ // Test loop operations from kSize to 3 * kSize.
if constexpr (has_loop_and_tail<Impl>::value) {
// Test loop operations from kSize to 3 * kSize.
if constexpr (kSize > 1) {
static constexpr auto LoopImpl = CmpAdaptor<Impl::loop_and_tail>;
Buffer Buffer1(3 * kSize);
Expand All @@ -274,32 +274,33 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
}

using MemcmpImplementations = testing::TypeList<
#ifdef LIBC_TARGET_ARCH_IS_X86_64
#ifdef __SSE2__
x86::sse2::Memcmp<16>, //
x86::sse2::Memcmp<32>, //
x86::sse2::Memcmp<64>, //
x86::sse2::Memcmp<128>, //
generic::Memcmp<__m128i>, //
#endif
#ifdef __AVX2__
x86::avx2::Memcmp<32>, //
x86::avx2::Memcmp<64>, //
x86::avx2::Memcmp<128>, //
generic::Memcmp<__m256i>, //
#endif
#ifdef __AVX512BW__
x86::avx512bw::Memcmp<64>, //
x86::avx512bw::Memcmp<128>, //
generic::Memcmp<__m512i>, //
#endif
#ifdef LLVM_LIBC_HAS_UINT64
generic::Memcmp<8>, //
#endif // LIBC_TARGET_ARCH_IS_X86_64
#ifdef LIBC_TARGET_ARCH_IS_AARCH64
generic::Memcmp<uint8x16_t>, //
generic::Memcmp<uint8x16x2_t>,
#endif
generic::Memcmp<1>, //
generic::Memcmp<2>, //
generic::Memcmp<3>, //
generic::Memcmp<4>, //
generic::Memcmp<16>, //
generic::Memcmp<32>, //
generic::Memcmp<64> //
>;
#ifndef LIBC_TARGET_ARCH_IS_ARM // Removing non uint8_t types for ARM
generic::Memcmp<uint16_t>,
generic::Memcmp<uint32_t>, //
#ifdef LLVM_LIBC_HAS_UINT64
generic::Memcmp<uint64_t>,
#endif // LLVM_LIBC_HAS_UINT64
generic::MemcmpSequence<uint16_t, uint8_t>,
generic::MemcmpSequence<uint32_t, uint16_t, uint8_t>, //
#endif // LIBC_TARGET_ARCH_IS_ARM
generic::MemcmpSequence<uint8_t, uint8_t>,
generic::MemcmpSequence<uint8_t, uint8_t, uint8_t>,
generic::Memcmp<uint8_t>>;

TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
using Impl = ParamType;
Expand All @@ -314,7 +315,8 @@ TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
ASSERT_TRUE((CheckMemcmp<BlockImpl>(span1, span2, kSize)));
}
}
{ // Test head tail operations from kSize to 2 * kSize.
if constexpr (has_head_tail<Impl>::value) {
// Test head tail operations from kSize to 2 * kSize.
static constexpr auto HeadTailImpl = CmpAdaptor<Impl::head_tail>;
Buffer Buffer1(2 * kSize);
Buffer Buffer2(2 * kSize);
Expand All @@ -325,7 +327,8 @@ TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
ASSERT_TRUE((CheckMemcmp<HeadTailImpl>(span1, span2, size)));
}
}
{ // Test loop operations from kSize to 3 * kSize.
if constexpr (has_loop_and_tail<Impl>::value) {
// Test loop operations from kSize to 3 * kSize.
if constexpr (kSize > 1) {
static constexpr auto LoopImpl = CmpAdaptor<Impl::loop_and_tail>;
Buffer Buffer1(3 * kSize);
Expand Down
1 change: 1 addition & 0 deletions utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -1960,6 +1960,7 @@ libc_support_library(
"src/string/memory_utils/op_aarch64.h",
"src/string/memory_utils/op_builtin.h",
"src/string/memory_utils/op_generic.h",
"src/string/memory_utils/op_riscv.h",
"src/string/memory_utils/op_x86.h",
"src/string/memory_utils/utils.h",
],
Expand Down