Skip to content

Commit

Permalink
[libc] Add optimized memcpy for RISCV
Browse files Browse the repository at this point in the history
This patch adds two versions of memcpy optimized for architectures where unaligned accesses are either illegal or extremely slow.
It is currently enabled for RISCV 64 and RISCV 32 but it could be used for ARM 32 architectures as well.

Here is the before / after output of `libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_Memcpy` on a quad core Linux starfive RISCV 64 board running at 1.5GHz.

Before:
```
Run on (4 X 1500 MHz CPU s)
CPU Caches:
  L1 Instruction 32 KiB (x4)
  L1 Data 32 KiB (x4)
  L2 Unified 2048 KiB (x1)
------------------------------------------------------------------------
Benchmark              Time             CPU   Iterations UserCounters...
------------------------------------------------------------------------
BM_Memcpy/0/0        474 ns          474 ns      1483776 bytes_per_cycle=0.243492/s bytes_per_second=348.318M/s items_per_second=2.11097M/s __llvm_libc::memcpy,memcpy Google A
BM_Memcpy/1/0        210 ns          209 ns      3649536 bytes_per_cycle=0.233819/s bytes_per_second=334.481M/s items_per_second=4.77519M/s __llvm_libc::memcpy,memcpy Google B
BM_Memcpy/2/0       1814 ns         1814 ns       396288 bytes_per_cycle=0.247899/s bytes_per_second=354.622M/s items_per_second=551.402k/s __llvm_libc::memcpy,memcpy Google D
BM_Memcpy/3/0       89.3 ns         89.2 ns      7459840 bytes_per_cycle=0.217415/s bytes_per_second=311.014M/s items_per_second=11.2071M/s __llvm_libc::memcpy,memcpy Google L
BM_Memcpy/4/0        134 ns          134 ns      3815424 bytes_per_cycle=0.226584/s bytes_per_second=324.131M/s items_per_second=7.44567M/s __llvm_libc::memcpy,memcpy Google M
BM_Memcpy/5/0       52.8 ns         52.6 ns     11001856 bytes_per_cycle=0.194893/s bytes_per_second=278.797M/s items_per_second=19.0284M/s __llvm_libc::memcpy,memcpy Google Q
BM_Memcpy/6/0        180 ns          180 ns      4101120 bytes_per_cycle=0.231884/s bytes_per_second=331.713M/s items_per_second=5.55957M/s __llvm_libc::memcpy,memcpy Google S
BM_Memcpy/7/0        195 ns          195 ns      3906560 bytes_per_cycle=0.232951/s bytes_per_second=333.239M/s items_per_second=5.1217M/s __llvm_libc::memcpy,memcpy Google U
BM_Memcpy/8/0        152 ns          152 ns      4789248 bytes_per_cycle=0.227507/s bytes_per_second=325.452M/s items_per_second=6.58187M/s __llvm_libc::memcpy,memcpy Google W
BM_Memcpy/9/0       6036 ns         6033 ns       118784 bytes_per_cycle=0.249158/s bytes_per_second=356.423M/s items_per_second=165.75k/s __llvm_libc::memcpy,uniform 384 to 4096
```

After:
```
BM_Memcpy/0/0        126 ns          126 ns      5770240 bytes_per_cycle=1.04707/s bytes_per_second=1.46273G/s items_per_second=7.9385M/s __llvm_libc::memcpy,memcpy Google A
BM_Memcpy/1/0       75.1 ns         75.0 ns     10204160 bytes_per_cycle=0.691143/s bytes_per_second=988.687M/s items_per_second=13.3289M/s __llvm_libc::memcpy,memcpy Google B
BM_Memcpy/2/0        333 ns          333 ns      2174976 bytes_per_cycle=1.39297/s bytes_per_second=1.94596G/s items_per_second=3.00002M/s __llvm_libc::memcpy,memcpy Google D
BM_Memcpy/3/0       49.6 ns         49.5 ns     16092160 bytes_per_cycle=0.710161/s bytes_per_second=1015.89M/s items_per_second=20.1844M/s __llvm_libc::memcpy,memcpy Google L
BM_Memcpy/4/0       57.7 ns         57.7 ns     11213824 bytes_per_cycle=0.561557/s bytes_per_second=803.314M/s items_per_second=17.3228M/s __llvm_libc::memcpy,memcpy Google M
BM_Memcpy/5/0       48.0 ns         47.9 ns     16437248 bytes_per_cycle=0.346708/s bytes_per_second=495.97M/s items_per_second=20.8571M/s __llvm_libc::memcpy,memcpy Google Q
BM_Memcpy/6/0       67.5 ns         67.5 ns     10616832 bytes_per_cycle=0.614173/s bytes_per_second=878.582M/s items_per_second=14.8142M/s __llvm_libc::memcpy,memcpy Google S
BM_Memcpy/7/0       84.7 ns         84.6 ns     10480640 bytes_per_cycle=0.819077/s bytes_per_second=1.14424G/s items_per_second=11.8174M/s __llvm_libc::memcpy,memcpy Google U
BM_Memcpy/8/0       61.7 ns         61.6 ns     11191296 bytes_per_cycle=0.550078/s bytes_per_second=786.893M/s items_per_second=16.2279M/s __llvm_libc::memcpy,memcpy Google W
BM_Memcpy/9/0        981 ns          981 ns       703488 bytes_per_cycle=1.52333/s bytes_per_second=2.12807G/s items_per_second=1019.81k/s __llvm_libc::memcpy,uniform 384 to 4096
```

It is not as good as glibc for now, so there's room for improvement. Given the roughly doubled numbers for large copies, I suspect a path pumping 16 bytes at once would help.
```
BM_Memcpy/0/1        146 ns         82.5 ns      8576000 bytes_per_cycle=1.35236/s bytes_per_second=1.88922G/s items_per_second=12.1169M/s glibc memcpy,memcpy Google A
BM_Memcpy/1/1        112 ns         63.7 ns     10634240 bytes_per_cycle=0.628018/s bytes_per_second=898.387M/s items_per_second=15.702M/s glibc memcpy,memcpy Google B
BM_Memcpy/2/1        315 ns          180 ns      4079616 bytes_per_cycle=2.65229/s bytes_per_second=3.7052G/s items_per_second=5.54764M/s glibc memcpy,memcpy Google D
BM_Memcpy/3/1       85.3 ns         43.1 ns     15854592 bytes_per_cycle=0.774164/s bytes_per_second=1107.45M/s items_per_second=23.2249M/s glibc memcpy,memcpy Google L
BM_Memcpy/4/1        105 ns         54.3 ns     13427712 bytes_per_cycle=0.7793/s bytes_per_second=1114.8M/s items_per_second=18.4109M/s glibc memcpy,memcpy Google M
BM_Memcpy/5/1       77.1 ns         43.2 ns     16476160 bytes_per_cycle=0.279808/s bytes_per_second=400.269M/s items_per_second=23.1428M/s glibc memcpy,memcpy Google Q
BM_Memcpy/6/1        112 ns         62.7 ns     11236352 bytes_per_cycle=0.676078/s bytes_per_second=967.137M/s items_per_second=15.9387M/s glibc memcpy,memcpy Google S
BM_Memcpy/7/1        131 ns         65.5 ns     11751424 bytes_per_cycle=0.965616/s bytes_per_second=1.34895G/s items_per_second=15.2762M/s glibc memcpy,memcpy Google U
BM_Memcpy/8/1        104 ns         55.0 ns     12314624 bytes_per_cycle=0.583336/s bytes_per_second=834.468M/s items_per_second=18.1937M/s glibc memcpy,memcpy Google W
BM_Memcpy/9/1        932 ns          466 ns      1480704 bytes_per_cycle=3.17342/s bytes_per_second=4.43321G/s items_per_second=2.14679M/s glibc memcpy,uniform 384 to 4096
```

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D150202
  • Loading branch information
gchatelet committed May 10, 2023
1 parent f109b10 commit f4a3549
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 8 deletions.
4 changes: 4 additions & 0 deletions libc/src/__support/macros/properties/architectures.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@
#define LIBC_TARGET_ARCH_IS_RISCV64
#endif

// 32-bit RISC-V target: `__riscv` is defined by the compiler and the XLEN
// (native register width) macro reports 32 bits.
#if defined(__riscv) && (__riscv_xlen == 32)
#define LIBC_TARGET_ARCH_IS_RISCV32
#endif

#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
#define LIBC_TARGET_ARCH_IS_ANY_ARM
#endif
Expand Down
1 change: 1 addition & 0 deletions libc/src/string/memory_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_header_library(
x86_64/memcmp_implementations.h
x86_64/memcpy_implementations.h
DEPS
libc.src.__support.common
libc.src.__support.CPP.bit
libc.src.__support.CPP.cstddef
libc.src.__support.CPP.type_traits
Expand Down
67 changes: 61 additions & 6 deletions libc/src/string/memory_utils/memcpy_implementations.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,79 @@
namespace __llvm_libc {

// Copies bytes one at a time from `src + offset` up to (but excluding)
// `src + count` into `dst`. Serves both as the tiny-copy fallback and as the
// head/tail handler for the aligned-access strategies below.
// NOTE(review): this span was a diff with the removed `inline_memcpy_embedded_tiny`
// lines interleaved into the new function; only the post-change version is kept.
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_byte_per_byte(Ptr dst, CPtr src, size_t offset, size_t count) {
  // Keep the loop small: unrolling a byte loop bloats code for no gain here.
  LIBC_LOOP_NOUNROLL
  for (; offset < count; ++offset)
    dst[offset] = src[offset];
}

// Copies `count` bytes using only loads/stores that are aligned to 32 bits or
// less, for architectures where unaligned accesses are illegal or very slow
// (e.g. RISCV32).
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_aligned_access_32bit(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
constexpr size_t kAlign = sizeof(uint32_t);
// Too small to amortize the alignment bookkeeping: copy byte by byte.
if (count <= 2 * kAlign)
return inline_memcpy_byte_per_byte(dst, src, 0, count);
// Copy the unaligned head byte by byte so that `dst + offset` is
// kAlign-aligned for the main loop.
size_t bytes_to_dst_align = distance_to_align_up<kAlign>(dst);
inline_memcpy_byte_per_byte(dst, src, 0, bytes_to_dst_align);
size_t offset = bytes_to_dst_align;
// `src + offset` may still be misaligned; its residual alignment (0, 2, or
// odd) selects how each 32-bit word is assembled below.
size_t src_alignment = distance_to_align_down<kAlign>(src + offset);
// Main loop: one aligned 32-bit store per iteration. The bound
// `count - kAlign` cannot underflow because count > 2 * kAlign here.
for (; offset < count - kAlign; offset += kAlign) {
uint32_t value;
if (src_alignment == 0)
value = load32_aligned<uint32_t>(src, offset);
else if (src_alignment == 2)
value = load32_aligned<uint16_t, uint16_t>(src, offset);
else
value = load32_aligned<uint8_t, uint16_t, uint8_t>(src, offset);
store32_aligned<uint32_t>(value, dst, offset);
}
// Copy the remaining tail (at most kAlign bytes) byte by byte.
inline_memcpy_byte_per_byte(dst, src, offset, count);
}

// Copies `count` bytes using only loads/stores that are aligned to 64 bits or
// less, for architectures where unaligned accesses are illegal or very slow
// (e.g. RISCV64).
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_aligned_access_64bit(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
constexpr size_t kAlign = sizeof(uint64_t);
// Too small to amortize the alignment bookkeeping: copy byte by byte.
if (count <= 2 * kAlign)
return inline_memcpy_byte_per_byte(dst, src, 0, count);
// Copy the unaligned head byte by byte so that `dst + offset` is
// kAlign-aligned for the main loop.
size_t bytes_to_dst_align = distance_to_align_up<kAlign>(dst);
inline_memcpy_byte_per_byte(dst, src, 0, bytes_to_dst_align);
size_t offset = bytes_to_dst_align;
// `src + offset` may still be misaligned; its residual alignment (0, 4, 2,
// or odd) selects how each 64-bit word is assembled below.
size_t src_alignment = distance_to_align_down<kAlign>(src + offset);
// Main loop: one aligned 64-bit store per iteration. The bound
// `count - kAlign` cannot underflow because count > 2 * kAlign here.
for (; offset < count - kAlign; offset += kAlign) {
uint64_t value;
if (src_alignment == 0)
value = load64_aligned<uint64_t>(src, offset);
else if (src_alignment == 4)
value = load64_aligned<uint32_t, uint32_t>(src, offset);
else if (src_alignment == 2)
value =
load64_aligned<uint16_t, uint16_t, uint16_t, uint16_t>(src, offset);
else
value = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
src, offset);
store64_aligned<uint64_t>(value, dst, offset);
}
// Copy the remaining tail (at most kAlign bytes) byte by byte.
inline_memcpy_byte_per_byte(dst, src, offset, count);
}

// Dispatches to the memcpy implementation selected at compile time for the
// target architecture (or the byte-per-byte fallback for embedded/tiny
// builds and unknown targets).
// NOTE(review): this span was a diff with removed `inline_memcpy_embedded_tiny`
// call sites interleaved with their replacements; only the post-change
// version is kept.
LIBC_INLINE void inline_memcpy(Ptr __restrict dst, CPtr __restrict src,
                               size_t count) {
  using namespace __llvm_libc::builtin;
#if defined(LIBC_COPT_MEMCPY_USE_EMBEDDED_TINY)
  return inline_memcpy_byte_per_byte(dst, src, 0, count);
#elif defined(LIBC_TARGET_ARCH_IS_X86)
  return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
  return inline_memcpy_aarch64(dst, src, count);
#elif defined(LIBC_TARGET_ARCH_IS_RISCV64)
  return inline_memcpy_aligned_access_64bit(dst, src, count);
#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
  return inline_memcpy_aligned_access_32bit(dst, src, count);
#else
  return inline_memcpy_byte_per_byte(dst, src, 0, count);
#endif
}

Expand Down
87 changes: 85 additions & 2 deletions libc/src/string/memory_utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/cstddef.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/macros/attributes.h" //LIBC_INLINE
#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN
#include "src/__support/endian.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
Expand Down Expand Up @@ -97,8 +98,15 @@ LIBC_INLINE void memcpy_inline(void *__restrict dst,
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
__builtin_memcpy_inline(dst, src, Size);
#else
// In memory functions `memcpy_inline` is instantiated several times with
// different value of the Size parameter. This doesn't play well with GCC's
// Value Range Analysis that wrongly detects out of bounds accesses. We disable
// the 'array-bounds' warning for the purpose of this function.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
for (size_t i = 0; i < Size; ++i)
static_cast<char *>(dst)[i] = static_cast<const char *>(src)[i];
#pragma GCC diagnostic pop
#endif
}

Expand Down Expand Up @@ -153,6 +161,81 @@ template <typename T> LIBC_INLINE void store(Ptr ptr, T value) {
memcpy_inline<sizeof(T)>(ptr, &value);
}

// On architectures that do not allow for unaligned access we perform several
// aligned accesses and recombine them through shifts and logical operations.
// For instance, if we know that the pointer is 2-byte aligned we can decompose
// a 64-bit operation into four 16-bit operations.

// Loads a 'ValueType' by decomposing it into several loads that are assumed to
// be aligned.
// e.g. load_aligned<uint32_t, uint16_t, uint16_t>(ptr);
template <typename ValueType, typename T, typename... TS>
ValueType load_aligned(CPtr src) {
static_assert(sizeof(ValueType) >= (sizeof(T) + ... + sizeof(TS)));
const ValueType value = load<T>(assume_aligned<sizeof(T)>(src));
if constexpr (sizeof...(TS) > 0) {
constexpr size_t shift = sizeof(T) * 8;
const ValueType next = load_aligned<ValueType, TS...>(src + sizeof(T));
if constexpr (Endian::IS_LITTLE)
return value | (next << shift);
else if constexpr (Endian::IS_BIG)
return (value << shift) | next;
else
deferred_static_assert("Invalid endianness");
} else {
return value;
}
}

// Alias for loading a 'uint32_t'.
template <typename T, typename... TS>
auto load32_aligned(CPtr src, size_t offset) {
static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint32_t));
return load_aligned<uint32_t, T, TS...>(src + offset);
}

// Alias for loading a 'uint64_t'.
template <typename T, typename... TS>
auto load64_aligned(CPtr src, size_t offset) {
static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint64_t));
return load_aligned<uint64_t, T, TS...>(src + offset);
}

// Stores a 'ValueType' by decomposing it into several stores that are assumed
// to be aligned.
// e.g. store_aligned<uint32_t, uint16_t, uint16_t>(value, ptr);
template <typename ValueType, typename T, typename... TS>
void store_aligned(ValueType value, Ptr dst) {
static_assert(sizeof(ValueType) >= (sizeof(T) + ... + sizeof(TS)));
constexpr size_t shift = sizeof(T) * 8;
if constexpr (Endian::IS_LITTLE) {
store<T>(assume_aligned<sizeof(T)>(dst), value & ~T(0));
if constexpr (sizeof...(TS) > 0)
store_aligned<ValueType, TS...>(value >> shift, dst + sizeof(T));
} else if constexpr (Endian::IS_BIG) {
constexpr size_t OFFSET = (0 + ... + sizeof(TS));
store<T>(assume_aligned<sizeof(T)>(dst + OFFSET), value & ~T(0));
if constexpr (sizeof...(TS) > 0)
store_aligned<ValueType, TS...>(value >> shift, dst);
} else {
deferred_static_assert("Invalid endianness");
}
}

// Alias for storing a 'uint32_t'.
template <typename T, typename... TS>
void store32_aligned(uint32_t value, Ptr dst, size_t offset) {
static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint32_t));
store_aligned<uint32_t, T, TS...>(value, dst + offset);
}

// Alias for storing a 'uint64_t'.
template <typename T, typename... TS>
void store64_aligned(uint64_t value, Ptr dst, size_t offset) {
static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint64_t));
store_aligned<uint64_t, T, TS...>(value, dst + offset);
}

// Advances the pointers p1 and p2 by offset bytes and decrease count by the
// same amount.
template <typename T1, typename T2>
Expand Down
40 changes: 40 additions & 0 deletions libc/test/src/string/memory_utils/utils_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,4 +144,44 @@ TEST(LlvmLibcUtilsTest, Align2) {
}
}

TEST(LlvmLibcUtilsTest, LoadStoreAligned) {
  // Round-trip a known 64-bit pattern through several load/store
  // decompositions and check both directions reproduce it exactly.
  const uint64_t init = 0xDEAD'C0DE'BEEF'F00D;
  CPtr const src = reinterpret_cast<CPtr>(&init);
  uint64_t store;
  Ptr const dst = reinterpret_cast<Ptr>(&store);

  using LoadFun = uint64_t (*)(CPtr);
  using StoreFun = void (*)(uint64_t, Ptr);

  {
    // Single 64-bit access.
    LoadFun ld = load_aligned<uint64_t, uint64_t>;
    StoreFun st = store_aligned<uint64_t, uint64_t>;
    const uint64_t round_tripped = ld(src);
    EXPECT_EQ(init, round_tripped);
    store = 0;
    st(init, dst);
    EXPECT_EQ(init, store);
  }

  {
    // Two 32-bit accesses.
    LoadFun ld = load_aligned<uint64_t, uint32_t, uint32_t>;
    StoreFun st = store_aligned<uint64_t, uint32_t, uint32_t>;
    const uint64_t round_tripped = ld(src);
    EXPECT_EQ(init, round_tripped);
    store = 0;
    st(init, dst);
    EXPECT_EQ(init, store);
  }

  {
    // Mixed-width accesses: 4 + 2 + 1 + 1 bytes.
    LoadFun ld = load_aligned<uint64_t, uint32_t, uint16_t, uint8_t, uint8_t>;
    StoreFun st = store_aligned<uint64_t, uint32_t, uint16_t, uint8_t, uint8_t>;
    const uint64_t round_tripped = ld(src);
    EXPECT_EQ(init, round_tripped);
    store = 0;
    st(init, dst);
    EXPECT_EQ(init, store);
  }
}

} // namespace __llvm_libc

0 comments on commit f4a3549

Please sign in to comment.