@@ -0,0 +1,174 @@
//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_AARCH64)

#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace __llvm_libc::aarch64 {

static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

template <size_t Size> struct BzeroCacheLine {
  static constexpr size_t SIZE = Size;

  static inline void block(Ptr dst, uint8_t) {
    static_assert(Size == 64);
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    static constexpr size_t kMaxSize = kNeon ? 16 : 8;
    generic::Memset<Size, kMaxSize>::tail(dst, value, count);
  }
};

inline static bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the
  // block size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}

} // namespace neon
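
// Illustrative sketch (an assumption, not part of the upstream header): how a
// bzero/memset implementation might dispatch on hasZva(). The caller must
// guarantee that `dst` is 64-byte aligned and `count >= 64`; alignment
// handling is elided here, and the generic::Memset fallback signature is
// assumed from op_generic.h.
static inline void bzero_cacheline_sketch(Ptr dst, size_t count) {
  if (neon::hasZva()) // 'dc zva' zeroes a full 64-byte cache line at once.
    neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
  else // Fall back to regular wide stores.
    generic::Memset<64, 16>::loop_and_tail(dst, 0, count);
}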

///////////////////////////////////////////////////////////////////////////////
// Memset

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, narrow the 2x64b lanes to 2x32b with a
      // saturating move. Second, compute the max of the 2x32b lanes to get a
      // single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size <= 8) {
      return generic::Bcmp<Size>::head_tail(p1, p2, count);
    } else if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // anbocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t anbocpdq = vreinterpretq_u64_u8(vorrq_u8(anbo, cpdq));
      uint32x2_t anbocpdq_reduced = vqmovn_u64(anbocpdq);
      return vmaxv_u32(anbocpdq_reduced);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
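
// Illustrative sketch (an assumption, not part of the upstream header): how a
// bcmp dispatcher might use this specialization. head_tail covers any count
// in [16, 64] with overlapping loads; larger counts use the block loop.
// Counts below 16 (handled by smaller generic specializations) are elided.
static inline BcmpReturnType bcmp_dispatch_sketch(CPtr p1, CPtr p2,
                                                  size_t count) {
  if (count <= 32)
    return Bcmp<16>::head_tail(p1, p2, count); // two overlapping 16B loads
  if (count <= 64)
    return Bcmp<32>::head_tail(p1, p2, count); // four overlapping 16B loads
  return Bcmp<32>::loop_and_tail(p1, p2, count);
}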

} // namespace __llvm_libc::aarch64

#endif // LLVM_LIBC_ARCH_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
@@ -0,0 +1,148 @@
//===-- Implementation using the __builtin_XXX_inline ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks to compose memory functions.
// They rely on the compiler to generate the best possible code through the use
// of the `__builtin_XXX_inline` builtins. These builtins are currently only
// available in Clang.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H

#include "src/string/memory_utils/utils.h"

namespace __llvm_libc::builtin {

///////////////////////////////////////////////////////////////////////////////
// Memcpy
template <size_t Size> struct Memcpy {
  static constexpr size_t SIZE = Size;
  static inline void block(Ptr __restrict dst, CPtr __restrict src) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
    return __builtin_memcpy_inline(dst, src, SIZE);
#else
    deferred_static_assert("Missing __builtin_memcpy_inline");
    (void)dst;
    (void)src;
#endif
  }

  static inline void tail(Ptr __restrict dst, CPtr __restrict src,
                          size_t count) {
    block(dst + count - SIZE, src + count - SIZE);
  }

  static inline void head_tail(Ptr __restrict dst, CPtr __restrict src,
                               size_t count) {
    block(dst, src);
    tail(dst, src, count);
  }

  static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
                                   size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      block(dst + offset, src + offset);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, src, count);
  }
};
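
// Illustrative sketch (an assumption, not part of the upstream header): how
// these primitives are meant to compose into a full memcpy. head_tail covers
// any count in [Size, 2 * Size] with two possibly overlapping blocks;
// loop_and_tail handles larger counts. The cutoffs below are made up for
// illustration, and counts below 16 (handled by smaller exact-size blocks)
// are elided.
static inline void memcpy_dispatch_sketch(Ptr __restrict dst,
                                          CPtr __restrict src, size_t count) {
  if (count <= 32)
    return Memcpy<16>::head_tail(dst, src, count); // 16 <= count <= 32
  if (count <= 64)
    return Memcpy<32>::head_tail(dst, src, count); // 32 < count <= 64
  return Memcpy<32>::loop_and_tail(dst, src, count); // count > 64
}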

///////////////////////////////////////////////////////////////////////////////
// Memset
template <size_t Size> struct Memset {
  using ME = Memset;
  static constexpr size_t SIZE = Size;
  static inline void block(Ptr dst, uint8_t value) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
    __builtin_memset_inline(dst, value, Size);
#else
    deferred_static_assert("Missing __builtin_memset_inline");
    (void)dst;
    (void)value;
#endif
  }

  static inline void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }

  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }

  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, value, count);
  }
};
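
// Illustrative note (an assumption, not part of the upstream header): Memset
// composes exactly like Memcpy above, e.g. for 16 <= count <= 64:
//
//   if (count <= 32)
//     builtin::Memset<16>::head_tail(dst, value, count);
//   else
//     builtin::Memset<32>::head_tail(dst, value, count);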

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  using ME = Bcmp;
  static constexpr size_t SIZE = Size;
  static inline BcmpReturnType block(CPtr, CPtr) {
    deferred_static_assert("Missing __builtin_memcmp_inline");
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return BcmpReturnType::ZERO();
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
template <size_t Size> struct Memcmp {
  using ME = Memcmp;
  static constexpr size_t SIZE = Size;
  static inline MemcmpReturnType block(CPtr, CPtr) {
    deferred_static_assert("Missing __builtin_memcmp_inline");
    return MemcmpReturnType::ZERO();
  }

  static inline MemcmpReturnType tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return MemcmpReturnType::ZERO();
  }

  static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return MemcmpReturnType::ZERO();
  }

  static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return MemcmpReturnType::ZERO();
  }
};

} // namespace __llvm_libc::builtin

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
@@ -0,0 +1,219 @@
//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_X86_64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __SSE2__
#include <immintrin.h>
#else
// Define fake intrinsics so that the compiler does not fail on undefined
// functions when SSE2 is not available.
#define _mm512_cmpneq_epi8_mask(A, B) 0
#define _mm_movemask_epi8(A) 0
#define _mm256_movemask_epi8(A) 0
#endif // __SSE2__

namespace __llvm_libc::x86 {

// A set of constants to check compile time features.
static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  static void repmovsb(char *dst, const char *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};
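
// Illustrative sketch (an assumption, not part of the upstream header):
// `rep movsb` is typically reserved for very large copies, where the
// microcoded string move is competitive with a vector loop. The threshold
// below is made up for illustration.
static inline void memcpy_huge_sketch(char *dst, const char *src,
                                      size_t count) {
  constexpr size_t kRepMovsbThreshold = 1 << 20; // hypothetical cutoff
  if (count >= kRepMovsbThreshold)
    return Memcpy::repmovsb(dst, src, count);
  // ... smaller sizes would use the builtin/generic blocks, elided ...
}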

///////////////////////////////////////////////////////////////////////////////
// Bcmp

// Base implementation for the Bcmp specializations.
// - BlockSize is either 16, 32 or 64 depending on the available compile time
//   features; it is used to switch between a single native operation and a
//   sequence of native operations.
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      return BlockBcmp(p1, p2);
    } else if constexpr (Size % BlockSize == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - Size, p2 + count - Size);
  }

  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1, p2) | tail(p1, p2, count);
  }

  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += Size;
    } while (offset < count - Size);
    return tail(p1, p2, count);
  }
};
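
// Illustrative note (an assumption, not part of the upstream header):
// head_tail covers any count in [Size, 2 * Size] with two possibly
// overlapping fixed-size blocks, e.g. with the sse2 specialization below:
//
//   Bcmp<16>::head_tail(p1, p2, 24); // compares [0, 16) and [8, 24)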

namespace sse2 {
static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(16)));
  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
  const int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2));
  return static_cast<uint32_t>(mask);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
} // namespace sse2

namespace avx2 {
static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(32)));
  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
  const int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2));
  // _mm256_movemask_epi8 returns an int but it is to be interpreted as a
  // 32-bit mask.
  return static_cast<uint32_t>(mask);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
} // namespace avx2

namespace avx512bw {
static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(64)));
  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
  const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
  const bool mask_is_set = mask != 0;
  return static_cast<uint32_t>(mask_is_set);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
} // namespace avx512bw
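
// Illustrative sketch (an assumption, not part of the upstream header): how a
// caller might pick the widest compiled-in Bcmp variant for a 64-byte block.
static inline BcmpReturnType bcmp64_dispatch_sketch(CPtr p1, CPtr p2) {
#if defined(__AVX512BW__)
  return avx512bw::Bcmp<64>::block(p1, p2);
#elif defined(__AVX2__)
  return avx2::Bcmp<64>::block(p1, p2);
#else
  return sse2::Bcmp<64>::block(p1, p2);
#endif
}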

// Assuming that the mask is non zero, the index of the first mismatching byte
// is the number of trailing zeros in the mask. We count trailing rather than
// leading zeros because x86 is little endian.
static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
                                                 uint64_t mask) {
  const size_t diff_index = __builtin_ctzll(mask);
  // memcmp semantics compare bytes as unsigned char.
  const int16_t ca = static_cast<unsigned char>(p1[diff_index]);
  const int16_t cb = static_cast<unsigned char>(p2[diff_index]);
  return ca - cb;
}
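
// Worked example (illustrative): if the buffers first differ at byte 3, bits
// 0 to 2 of the mask are clear and bit 3 is set, so __builtin_ctzll returns 3
// and the result is p1[3] - p2[3], widened to avoid overflow.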

///////////////////////////////////////////////////////////////////////////////
// Memcmp

// Base implementation for the Memcmp specializations.
// - BlockSize is either 16, 32 or 64 depending on the available compile time
//   features; it is used to switch between a single native operation and a
//   sequence of native operations.
// - BlockMemcmp is the function that implements the memcmp logic.
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
struct MemcmpImpl {
  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      return BlockMemcmp(p1, p2);
    } else if constexpr (Size % BlockSize == 0) {
      // Use the cheaper bcmp to locate the first differing block, then run
      // the ordered memcmp on that block only.
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
          return BlockMemcmp(p1 + offset, p2 + offset);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return MemcmpReturnType::ZERO();
  }

  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - Size, p2 + count - Size);
  }

  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if (auto value = block(p1, p2))
      return value;
    return tail(p1, p2, count);
  }

  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += Size;
    } while (offset < count - Size);
    return tail(p1, p2, count);
  }
};

namespace sse2 {
static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(16)));
  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
  if (int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2)))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
} // namespace sse2

namespace avx2 {
static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(32)));
  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
  if (int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2)))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
} // namespace avx2

namespace avx512bw {
static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(64)));
  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
  if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
} // namespace avx512bw
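
// Illustrative sketch (an assumption, not part of the upstream header): the
// same compile time selection applies to Memcmp, here for a 64-byte block.
static inline MemcmpReturnType memcmp64_dispatch_sketch(CPtr p1, CPtr p2) {
#if defined(__AVX512BW__)
  return avx512bw::Memcmp<64>::block(p1, p2);
#elif defined(__AVX2__)
  return avx2::Memcmp<64>::block(p1, p2);
#else
  return sse2::Memcmp<64>::block(p1, p2);
#endif
}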

} // namespace __llvm_libc::x86

#endif // LLVM_LIBC_ARCH_X86_64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H