[reland][libc] Switch to new implementation of mem* functions
The new framework makes it explicit which processor feature is being
used and allows for easier per-platform customization:
 - The ARM CPU now uses trivial implementations to reduce code size.
 - Memcmp, Bcmp, and Memmove have been optimized for x86.
 - Bcmp has been optimized for aarch64.

This is a reland of https://reviews.llvm.org/D135134 (b3f1d58, 0284148)

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D136595
gchatelet committed Nov 2, 2022
1 parent 17c9d4d commit 67437dd
Showing 13 changed files with 525 additions and 323 deletions.
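
To make the commit message concrete: the dispatch style it describes can be modelled with compile-time feature flags and `if constexpr`, so the processor feature in use is visible at the call site and only the selected branch ends up in the emitted code. The sketch below is a standalone illustration under assumed names (`features::kHasAvx2`, `copy_avx2`, `copy_generic`), not the libc framework itself.

```cpp
#include <stddef.h>
#include <string.h>

// Hypothetical compile-time feature flag; a real build would derive this
// from compiler macros such as __AVX2__.
namespace features {
#if defined(__AVX2__)
inline constexpr bool kHasAvx2 = true;
#else
inline constexpr bool kHasAvx2 = false;
#endif
} // namespace features

// Placeholder per-feature implementations.
inline void copy_generic(char *dst, const char *src, size_t n) {
  for (size_t i = 0; i < n; ++i)
    dst[i] = src[i];
}
inline void copy_avx2(char *dst, const char *src, size_t n) {
  memcpy(dst, src, n); // stand-in for a wide-vector implementation
}

// Resolved at compile time: the unused branch is eliminated, which is what
// keeps per-platform customization explicit and cheap.
inline void copy_dispatch(char *dst, const char *src, size_t n) {
  if constexpr (features::kHasAvx2)
    copy_avx2(dst, src, n);
  else
    copy_generic(dst, src, n);
}
```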
2 changes: 1 addition & 1 deletion libc/src/stdio/printf_core/string_writer.cpp
@@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) {
len = available_capacity;

if (len > 0) {
- inline_memset(cur_buffer, new_char, len);
+ inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
cur_buffer += len;
available_capacity -= len;
}
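
The only change here is the explicit cast of the fill character, presumably because the new `inline_memset` takes its fill value as `uint8_t` rather than relying on an implicit conversion from `char`. A minimal sketch of that call-site pattern, with a hypothetical `fill_bytes` helper standing in for `inline_memset`:

```cpp
#include <stddef.h>
#include <stdint.h>

// Hypothetical memset-style helper that takes the fill value as an explicit
// byte type rather than a (possibly signed) char.
inline void fill_bytes(char *dst, uint8_t value, size_t len) {
  for (size_t i = 0; i < len; ++i)
    dst[i] = static_cast<char>(value);
}

inline void write_run(char *buffer, char new_char, size_t len) {
  // The cast documents the narrowing at the call site instead of leaving it
  // to an implicit char -> uint8_t conversion.
  fill_bytes(buffer, static_cast<uint8_t>(new_char), len);
}
```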
3 changes: 1 addition & 2 deletions libc/src/string/bcmp.cpp
@@ -14,8 +14,7 @@ namespace __llvm_libc {

LLVM_LIBC_FUNCTION(int, bcmp,
(const void *lhs, const void *rhs, size_t count)) {
- return inline_bcmp(static_cast<const char *>(lhs),
- static_cast<const char *>(rhs), count);
+ return inline_bcmp(lhs, rhs, count);
}

} // namespace __llvm_libc
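
A usage note on the function simplified here: `bcmp` only reports whether the buffers differ (zero means equal, any non-zero value means different), with no ordering implied. The reference model below captures just that contract; it is not the optimized implementation in this commit.

```cpp
#include <assert.h>
#include <stddef.h>

// Reference semantics of bcmp: zero iff the buffers are byte-wise equal.
static int bcmp_reference(const void *lhs, const void *rhs, size_t count) {
  const unsigned char *a = static_cast<const unsigned char *>(lhs);
  const unsigned char *b = static_cast<const unsigned char *>(rhs);
  for (size_t i = 0; i < count; ++i)
    if (a[i] != b[i])
      return 1; // any non-zero value is a valid "differs" result
  return 0;
}

int main() {
  const char x[] = "hello";
  const char y[] = "hellp";
  assert(bcmp_reference(x, x, sizeof x) == 0);
  assert(bcmp_reference(x, y, sizeof x) != 0);
  return 0;
}
```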
3 changes: 1 addition & 2 deletions libc/src/string/memcmp.cpp
@@ -15,8 +15,7 @@ namespace __llvm_libc {

LLVM_LIBC_FUNCTION(int, memcmp,
(const void *lhs, const void *rhs, size_t count)) {
- return inline_memcmp(static_cast<const char *>(lhs),
- static_cast<const char *>(rhs), count);
+ return inline_memcmp(lhs, rhs, count);
}

} // namespace __llvm_libc
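
In contrast to `bcmp`, `memcmp` must also order the buffers by the first differing byte, compared as `unsigned char`. A small standalone check of that contract (not part of this commit):

```cpp
#include <assert.h>
#include <string.h>

int main() {
  // The first differing byte decides the sign: 'o' < 'p', so a < b.
  const char a[] = "hello";
  const char b[] = "hellp";
  assert(memcmp(a, b, 5) < 0);
  assert(memcmp(b, a, 5) > 0);
  // Bytes compare as unsigned char, so 0xFF ranks above 0x01.
  const unsigned char c[] = {0xFF};
  const unsigned char d[] = {0x01};
  assert(memcmp(c, d, 1) > 0);
  return 0;
}
```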
3 changes: 1 addition & 2 deletions libc/src/string/memcpy.cpp
@@ -15,8 +15,7 @@ namespace __llvm_libc {
LLVM_LIBC_FUNCTION(void *, memcpy,
(void *__restrict dst, const void *__restrict src,
size_t size)) {
- inline_memcpy(reinterpret_cast<char *>(dst),
- reinterpret_cast<const char *>(src), size);
+ inline_memcpy(dst, src, size);
return dst;
}

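
Across these call sites the casts disappear because the `inline_*` entry points now accept `void *` directly, alongside the typed `Ptr`/`CPtr` versions (the `bcmp` and `bzero` hunks below add exactly such overloads). A minimal sketch of that layering, assuming `Ptr`/`CPtr` are plain byte-pointer aliases:

```cpp
#include <stddef.h>

using Ptr = char *;        // assumed aliases for this sketch, mirroring the
using CPtr = const char *; // typed pointers used in the diff

// Typed core implementation.
inline void copy_impl(Ptr dst, CPtr src, size_t count) {
  for (size_t i = 0; i < count; ++i)
    dst[i] = src[i];
}

// void* convenience overload: public entry points forward their arguments
// unchanged, and the reinterpret_casts live in exactly one place.
inline void copy_impl(void *dst, const void *src, size_t count) {
  copy_impl(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src), count);
}
```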
104 changes: 86 additions & 18 deletions libc/src/string/memmove.cpp
@@ -9,42 +9,110 @@
#include "src/string/memmove.h"

#include "src/__support/common.h"
#include "src/__support/integer_operations.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include <stddef.h> // size_t, ptrdiff_t

#include <stdio.h>

namespace __llvm_libc {

static inline void inline_memmove(char *dst, const char *src, size_t count) {
using namespace __llvm_libc::scalar;
[[maybe_unused]] static inline void
inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) {
if ((count == 0) || (dst == src))
return;
if (dst < src) {
#pragma nounroll
for (size_t offset = 0; offset < count; ++offset)
builtin::Memcpy<1>::block(dst + offset, src + offset);
} else {
#pragma nounroll
for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
builtin::Memcpy<1>::block(dst + offset, src + offset);
}
}

template <size_t MaxSize>
[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src,
size_t count) {
if (count == 0)
return;
if (count == 1)
return move<_1>(dst, src);
return generic::Memmove<1, MaxSize>::block(dst, src);
if (count <= 4)
return move<HeadTail<_2>>(dst, src, count);
return generic::Memmove<2, MaxSize>::head_tail(dst, src, count);
if (count <= 8)
return move<HeadTail<_4>>(dst, src, count);
return generic::Memmove<4, MaxSize>::head_tail(dst, src, count);
if (count <= 16)
return move<HeadTail<_8>>(dst, src, count);
return generic::Memmove<8, MaxSize>::head_tail(dst, src, count);
if (count <= 32)
return move<HeadTail<_16>>(dst, src, count);
return generic::Memmove<16, MaxSize>::head_tail(dst, src, count);
if (count <= 64)
return move<HeadTail<_32>>(dst, src, count);
return generic::Memmove<32, MaxSize>::head_tail(dst, src, count);
if (count <= 128)
return move<HeadTail<_64>>(dst, src, count);
return generic::Memmove<64, MaxSize>::head_tail(dst, src, count);
if (dst < src) {
generic::Memmove<32, MaxSize>::template align_forward<Arg::Src>(dst, src,
count);
return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src,
count);
} else {
generic::Memmove<32, MaxSize>::template align_backward<Arg::Src>(dst, src,
count);
return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src,
count);
}
}

using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
if (dst < src)
return move<AlignedMoveLoop>(dst, src, count);
else if (dst > src)
return move_backward<AlignedMoveLoop>(dst, src, count);
static inline void inline_memmove(Ptr dst, CPtr src, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
#if defined(LLVM_LIBC_ARCH_X86)
static constexpr size_t kMaxSize = x86::kAvx512F ? 64
: x86::kAvx ? 32
: x86::kSse2 ? 16
: 8;
#elif defined(LLVM_LIBC_ARCH_AARCH64)
static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
#endif
// return inline_memmove_generic<kMaxSize>(dst, src, count);
if (count == 0)
return;
if (count == 1)
return generic::Memmove<1, kMaxSize>::block(dst, src);
if (count <= 4)
return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count);
if (count <= 8)
return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count);
if (count <= 16)
return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count);
if (count <= 32)
return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count);
if (count <= 64)
return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count);
if (count <= 128)
return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count);
if (dst < src) {
generic::Memmove<32, kMaxSize>::align_forward<Arg::Src>(dst, src, count);
return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src,
count);
} else {
generic::Memmove<32, kMaxSize>::align_backward<Arg::Src>(dst, src, count);
return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src,
count);
}
#elif defined(LLVM_LIBC_ARCH_ARM)
return inline_memmove_embedded_tiny(dst, src, count);
#else
#error "Unsupported platform"
#endif
}

LLVM_LIBC_FUNCTION(void *, memmove,
(void *dst, const void *src, size_t count)) {
inline_memmove(reinterpret_cast<char *>(dst),
reinterpret_cast<const char *>(src), count);
inline_memmove(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src),
count);
return dst;
}

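
Both the `embedded_tiny` path and the generic path above reduce to the classic overlap rule for `memmove`: copy forward when the destination starts below the source, backward otherwise, so no byte is overwritten before it has been read. A byte-at-a-time standalone model of just that rule, leaving out the size-bucketed fast paths:

```cpp
#include <stddef.h>

// Overlap-safe copy: forward when dst precedes src, backward otherwise.
inline void memmove_model(char *dst, const char *src, size_t count) {
  if (count == 0 || dst == src)
    return;
  if (dst < src) {
    // Copy low-to-high: each source byte is read before the region
    // containing it can be written.
    for (size_t i = 0; i < count; ++i)
      dst[i] = src[i];
  } else {
    // Copy high-to-low for the symmetric overlap case.
    for (size_t i = count; i != 0; --i)
      dst[i - 1] = src[i - 1];
  }
}
```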
176 changes: 148 additions & 28 deletions libc/src/string/memory_utils/bcmp_implementations.h
@@ -11,49 +11,169 @@

#include "src/__support/architectures.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"

#include <stddef.h> // size_t

namespace __llvm_libc {

// Fixed-size difference between 'lhs' and 'rhs'.
template <typename Element> bool differs(const char *lhs, const char *rhs) {
return !Element::equals(lhs, rhs);
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
#pragma nounroll
for (size_t offset = 0; offset < count; ++offset)
if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
return value;
return BcmpReturnType::ZERO();
}
// Runtime-size difference between 'lhs' and 'rhs'.
template <typename Element>
bool differs(const char *lhs, const char *rhs, size_t size) {
return !Element::equals(lhs, rhs, size);

#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
if (count < 256)
return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
if (auto value = generic::Bcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)

static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
using namespace ::__llvm_libc::x86;
#elif defined(LLVM_LIBC_ARCH_AARCH64)
using namespace ::__llvm_libc::aarch64;
#else
using namespace ::__llvm_libc::scalar;
#endif
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
if (count < 256)
return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
}

[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
if (unlikely(count >= 256)) {
if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
}

[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
if (unlikely(count >= 256)) {
if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
}

[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
size_t count) {
if (count == 0)
return 0;
return BcmpReturnType::ZERO();
if (count == 1)
return differs<_1>(lhs, rhs);
return generic::Bcmp<1>::block(p1, p2);
if (count == 2)
return differs<_2>(lhs, rhs);
if (count == 3)
return differs<_3>(lhs, rhs);
return generic::Bcmp<2>::block(p1, p2);
if (count <= 4)
return generic::Bcmp<2>::head_tail(p1, p2, count);
if (count <= 8)
return differs<HeadTail<_4>>(lhs, rhs, count);
return generic::Bcmp<4>::head_tail(p1, p2, count);
if (count <= 16)
return differs<HeadTail<_8>>(lhs, rhs, count);
if (count <= 32)
return differs<HeadTail<_16>>(lhs, rhs, count);
return generic::Bcmp<8>::head_tail(p1, p2, count);
if constexpr (x86::kAvx512BW)
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
else if constexpr (x86::kAvx2)
return inline_bcmp_x86_avx2_gt16(p1, p2, count);
else if constexpr (x86::kSse2)
return inline_bcmp_x86_sse2_gt16(p1, p2, count);
else
return inline_bcmp_generic_gt16(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86)

#if defined(LLVM_LIBC_ARCH_AARCH64)
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) {
if (likely(count <= 32)) {
if (unlikely(count >= 16)) {
return generic::Bcmp<16>::head_tail(p1, p2, count);
}
switch (count) {
case 0:
return BcmpReturnType::ZERO();
case 1:
return generic::Bcmp<1>::block(p1, p2);
case 2:
return generic::Bcmp<2>::block(p1, p2);
case 3:
return generic::Bcmp<2>::head_tail(p1, p2, count);
case 4:
return generic::Bcmp<4>::block(p1, p2);
case 5:
case 6:
case 7:
return generic::Bcmp<4>::head_tail(p1, p2, count);
case 8:
return generic::Bcmp<8>::block(p1, p2);
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
return generic::Bcmp<8>::head_tail(p1, p2, count);
}
}

if (count <= 64)
return differs<HeadTail<_32>>(lhs, rhs, count);
if (count <= 128)
return differs<HeadTail<_64>>(lhs, rhs, count);
return differs<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
return generic::Bcmp<32>::head_tail(p1, p2, count);

// Aligned loop if > 256, otherwise normal loop
if (count > 256) {
if (auto value = generic::Bcmp<32>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_AARCH64)

static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
return inline_bcmp_x86(p1, p2, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
return inline_bcmp_aarch64(p1, p2, count);
#elif defined(LLVM_LIBC_ARCH_ARM)
return inline_bcmp_embedded_tiny(p1, p2, count);
#else
#error "Unsupported platform"
#endif
}

static inline int inline_bcmp(const void *p1, const void *p2, size_t count) {
return static_cast<int>(inline_bcmp(reinterpret_cast<CPtr>(p1),
reinterpret_cast<CPtr>(p2), count));
}

} // namespace __llvm_libc
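
A recurring building block in this file is `head_tail`: for a count between N and 2N, comparing the first N bytes and the last N bytes covers every byte, because the two windows overlap (or touch) in the middle, so no loop is needed at small sizes. A standalone model of the idea for N = 8, using plain integer loads rather than the library's `Bcmp<N>` types:

```cpp
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Equality of two 8-byte windows, loaded with memcpy to avoid alignment and
// aliasing issues.
inline bool eq8(const char *p1, const char *p2) {
  uint64_t a, b;
  memcpy(&a, p1, sizeof a);
  memcpy(&b, p2, sizeof b);
  return a == b;
}

// head_tail model, valid for 8 <= count <= 16: the head window [0, 8) and the
// tail window [count - 8, count) together cover the whole buffer.
inline int bcmp_head_tail_8(const char *p1, const char *p2, size_t count) {
  const bool equal = eq8(p1, p2) && eq8(p1 + count - 8, p2 + count - 8);
  return equal ? 0 : 1;
}
```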
6 changes: 5 additions & 1 deletion libc/src/string/memory_utils/bzero_implementations.h
@@ -15,10 +15,14 @@

namespace __llvm_libc {

- inline static void inline_bzero(char *dst, size_t count) {
+ inline static void inline_bzero(Ptr dst, size_t count) {
inline_memset(dst, 0, count);
}

+ inline static void inline_bzero(void *dst, size_t count) {
+ inline_bzero(reinterpret_cast<Ptr>(dst), count);
+ }

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H

