22 changes: 22 additions & 0 deletions libc/src/string/memcpy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===--------------------- Implementation of memcpy -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/string/memcpy.h"
#include "src/__support/common.h"
#include "src/string/memcpy_arch_specific.h"

namespace __llvm_libc {

void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
const void *__restrict src, size_t size) {
memcpy_no_return(reinterpret_cast<char *>(dst),
reinterpret_cast<const char *>(src), size);
return dst;
}

} // namespace __llvm_libc
21 changes: 21 additions & 0 deletions libc/src/string/memcpy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//===----------------- Implementation header for memcpy -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H
#define LLVM_LIBC_SRC_STRING_MEMCPY_H

#include "include/string.h"
#include <stddef.h> // size_t

namespace __llvm_libc {

void *memcpy(void *__restrict, const void *__restrict, size_t);

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMCPY_H
65 changes: 65 additions & 0 deletions libc/src/string/memcpy_arch_specific.h.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
//===-------------- Implementation of arch specific memcpy ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H

%%include_file(${memcpy_arch_specific})

namespace __llvm_libc {

// Design rationale
// ================
//
// Using a profiler to observe size distributions for calls into libc
// functions, it was found most operations act on a small number of bytes.
// This makes it important to favor small sizes.
//
// The tests for `count` are in ascending order so the cost of branching is
// proportional to the cost of copying.
//
// The function is written in C++ for several reasons:
// - The compiler can __see__ the code, this is useful when performing Profile
// Guided Optimization as the optimized code can take advantage of branching
// probabilities.
// - It also allows for easier customization and favors testing multiple
// implementation parameters.
// - As compilers and processors get better, the generated code is improved
// with little change on the code side.
static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
size_t count) {
if (count == 0)
return;
if (count == 1)
return Copy<1>(dst, src);
if (count == 2)
return Copy<2>(dst, src);
if (count == 3)
return Copy<3>(dst, src);
if (count == 4)
return Copy<4>(dst, src);
if (count < 8)
return CopyOverlap<4>(dst, src, count);
if (count == 8)
return Copy<8>(dst, src);
if (count < 16)
return CopyOverlap<8>(dst, src, count);
if (count == 16)
return Copy<16>(dst, src);
if (count < 32)
return CopyOverlap<16>(dst, src, count);
if (count < 64)
return CopyOverlap<32>(dst, src, count);
if (count < 128)
return CopyOverlap<64>(dst, src, count);
CopyGE128(dst, src, count);
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
7 changes: 5 additions & 2 deletions libc/src/string/memory_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ add_gen_header(

add_header_library(
memory_utils
HDRS utils.h
DEPENDS cacheline_size
HDRS
utils.h
memcpy_utils.h
DEPENDS
cacheline_size
)
100 changes: 100 additions & 0 deletions libc/src/string/memory_utils/memcpy_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
//===---------------------------- Memcpy utils ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
#define LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H

#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t

// __builtin_memcpy_inline guarantees to never call external functions.
// Unfortunately it is not widely available.
#if defined(__clang__) && __has_builtin(__builtin_memcpy_inline)
#define USE_BUILTIN_MEMCPY_INLINE
#elif defined(__GNUC__)
#define USE_BUILTIN_MEMCPY
#endif

// This is useful for testing.
#if defined(LLVM_LIBC_MEMCPY_MONITOR)
extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
const char *__restrict, size_t);
#endif

namespace __llvm_libc {

// Copies `kBlockSize` bytes from `src` to `dst`.
template <size_t kBlockSize>
static void Copy(char *__restrict dst, const char *__restrict src) {
#if defined(LLVM_LIBC_MEMCPY_MONITOR)
LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
#elif defined(USE_BUILTIN_MEMCPY_INLINE)
__builtin_memcpy_inline(dst, src, kBlockSize);
#elif defined(USE_BUILTIN_MEMCPY)
__builtin_memcpy(dst, src, kBlockSize);
#else
for (size_t i = 0; i < kBlockSize; ++i)
dst[i] = src[i];
#endif
}

// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
// `dst + count - kBlockSize`.
// Precondition: `count >= kBlockSize`.
template <size_t kBlockSize>
static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
size_t count) {
const size_t offset = count - kBlockSize;
Copy<kBlockSize>(dst + offset, src + offset);
}

// Copies `kBlockSize` bytes twice with an overlap between the two.
//
// [1234567812345678123]
// [__XXXXXXXXXXXXXX___]
// [__XXXXXXXX_________]
// [________XXXXXXXX___]
//
// Precondition: `count >= kBlockSize && count <= kBlockSize`.
template <size_t kBlockSize>
static void CopyOverlap(char *__restrict dst, const char *__restrict src,
size_t count) {
Copy<kBlockSize>(dst, src);
CopyLastBlock<kBlockSize>(dst, src, count);
}

// Copies `count` bytes by blocks of `kBlockSize` bytes.
// Copies at the start and end of the buffer are unaligned.
// Copies in the middle of the buffer are aligned to `kBlockSize`.
//
// e.g. with
// [12345678123456781234567812345678]
// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
// [__XXXXXXXX______________________]
// [________XXXXXXXX________________]
// [________________XXXXXXXX________]
// [_____________________XXXXXXXX___]
//
// Precondition: `count > 2 * kBlockSize` for efficiency.
// `count >= kBlockSize` for correctness.
template <size_t kBlockSize>
static void CopyAligned(char *__restrict dst, const char *__restrict src,
size_t count) {
Copy<kBlockSize>(dst, src); // Copy first block

// Copy aligned blocks
size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
for (; offset + kBlockSize < count; offset += kBlockSize)
Copy<kBlockSize>(dst + offset, src + offset);

CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
7 changes: 6 additions & 1 deletion libc/src/string/memory_utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ static constexpr size_t ge_power2(size_t value) {
return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
}

template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
// The logic is not straightforward and involves unsigned modulo arithmetic
Expand All @@ -51,7 +56,7 @@ template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
}

// Returns the offset from `ptr` to the next cache line.
static intptr_t offset_to_next_cache_line(const void *ptr) {
static inline intptr_t offset_to_next_cache_line(const void *ptr) {
return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
}

Expand Down
4 changes: 4 additions & 0 deletions libc/src/string/x86/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_none" REJECT "${ALL_CPU_FEATURES}")
add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_sse" REQUIRE "SSE" REJECT "SSE2")
add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx" REQUIRE "AVX" REJECT "AVX2")
add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx512f" REQUIRE "AVX512F")
35 changes: 35 additions & 0 deletions libc/src/string/x86/memcpy_arch_specific.h.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include "src/string/memory_utils/memcpy_utils.h"

namespace __llvm_libc {

static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
size_t count) {
// FIXME: Add MSVC suppport with
// #include <intrin.h>
// __movsb(reinterpret_cast<unsigned char *>(dst),
// reinterpret_cast<const unsigned char *>(src), count);
asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
}

#if defined(__AVX__)
#define BEST_SIZE 64
#else
#define BEST_SIZE 32
#endif

static void CopyGE128(char *__restrict dst, const char *__restrict src,
size_t count) {
#if defined(__AVX__)
if (count < 256)
return CopyOverlap<128>(dst, src, count);
#endif
// kRepMovsBSize == -1 : Only CopyAligned is used.
// kRepMovsBSize == 0 : Only RepMovsb is used.
// else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
constexpr size_t kRepMovsBSize = -1;
if (count <= kRepMovsBSize)
return CopyAligned<BEST_SIZE>(dst, src, count);
CopyRepMovsb(dst, src, count);
}

} // namespace __llvm_libc
21 changes: 21 additions & 0 deletions libc/test/src/string/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,24 @@ add_libc_unittest(
DEPENDS
strcpy
)

# Tests all implementations of memcpy that can run on the host.
get_property(memcpy_implementations GLOBAL PROPERTY memcpy_implementations)
foreach(memcpy_config_name IN LISTS memcpy_implementations)
get_target_property(require_cpu_features ${memcpy_config_name} REQUIRE_CPU_FEATURES)
host_supports(can_run "${require_cpu_features}")
if(can_run)
add_libc_unittest(
${memcpy_config_name}_test
SUITE
libc_string_unittests
SRCS
memcpy_test.cpp
DEPENDS
${memcpy_config_name}
)
else()
message(STATUS "Skipping test for '${memcpy_config_name}' insufficient host cpu features")
endif()
endforeach()

53 changes: 53 additions & 0 deletions libc/test/src/string/memcpy_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===----------------------- Unittests for memcpy -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "utils/CPP/ArrayRef.h"
#include "utils/UnitTest/Test.h"
#include "src/string/memcpy.h"

using __llvm_libc::cpp::Array;
using __llvm_libc::cpp::ArrayRef;
using __llvm_libc::cpp::MutableArrayRef;
using Data = Array<char, 2048>;

static const ArrayRef<char> kNumbers("0123456789", 10);
static const ArrayRef<char> kDeadcode("DEADC0DE", 8);

// Returns a Data object filled with a repetition of `filler`.
Data getData(ArrayRef<char> filler) {
Data out;
for (size_t i = 0; i < out.size(); ++i)
out[i] = filler[i % filler.size()];
return out;
}

TEST(MemcpyTest, Thorough) {
const Data groundtruth = getData(kNumbers);
const Data dirty = getData(kDeadcode);
for (size_t count = 0; count < 1024; ++count) {
for (size_t align = 0; align < 64; ++align) {
auto buffer = dirty;
const char *const src = groundtruth.data();
char *const dst = &buffer[align];
__llvm_libc::memcpy(dst, src, count);
// Everything before copy is untouched.
for (size_t i = 0; i < align; ++i)
ASSERT_EQ(buffer[i], dirty[i]);
// Everything in between is copied.
for (size_t i = 0; i < count; ++i)
ASSERT_EQ(buffer[align + i], groundtruth[i]);
// Everything after copy is untouched.
for (size_t i = align + count; i < dirty.size(); ++i)
ASSERT_EQ(buffer[i], dirty[i]);
}
}
}

// FIXME: Add tests with reads and writes on the boundary of a read/write
// protected page to check we're not reading nor writing prior/past the allowed
// regions.
7 changes: 7 additions & 0 deletions libc/test/src/string/memory_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,14 @@ add_libc_unittest(
libc_string_unittests
SRCS
utils_test.cpp
memcpy_utils_test.cpp
DEPENDS
memory_utils
standalone_cpp
)

target_compile_definitions(
utils_test
PRIVATE
LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor
)
208 changes: 208 additions & 0 deletions libc/test/src/string/memory_utils/memcpy_utils_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
//===-------------------- Unittests for memory_utils ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/string/memory_utils/memcpy_utils.h"
#include "utils/CPP/Array.h"
#include "utils/UnitTest/Test.h"

#include <assert.h>
#include <stdint.h> // uintptr_t

#ifndef LLVM_LIBC_MEMCPY_MONITOR
#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test.
#endif

namespace __llvm_libc {

struct Buffer {
static constexpr size_t kMaxBuffer = 1024;
char buffer[kMaxBuffer + 1];
size_t last = 0;

void Clear() {
last = 0;
for (size_t i = 0; i < kMaxBuffer; ++i)
buffer[i] = '0';
buffer[kMaxBuffer] = '\0';
}

void Increment(const void *ptr) {
const auto offset = reinterpret_cast<uintptr_t>(ptr);
assert(offset < kMaxBuffer);
++buffer[offset];
if (offset > last)
last = offset;
}

char *Finish() {
assert(last < kMaxBuffer);
buffer[last + 1] = '\0';
return buffer;
}
};

struct Trace {
Buffer read;
Buffer write;

void Add(char *__restrict dst, const char *__restrict src, size_t count) {
for (size_t i = 0; i < count; ++i)
read.Increment(src + i);
for (size_t i = 0; i < count; ++i)
write.Increment(dst + i);
}

void Clear() {
read.Clear();
write.Clear();
}

char *Read() { return read.Finish(); }
char *Write() { return write.Finish(); }
};

static Trace &GetTrace() {
static thread_local Trace events;
return events;
}

extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst,
const char *__restrict src,
size_t count) {
GetTrace().Add(dst, src, count);
}

char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); }

TEST(MemcpyUtilsTest, CopyTrivial) {
auto &trace = GetTrace();

trace.Clear();
Copy<1>(I(0), I(0));
EXPECT_STREQ(trace.Write(), "1");
EXPECT_STREQ(trace.Read(), "1");

trace.Clear();
Copy<2>(I(0), I(0));
EXPECT_STREQ(trace.Write(), "11");
EXPECT_STREQ(trace.Read(), "11");

trace.Clear();
Copy<4>(I(0), I(0));
EXPECT_STREQ(trace.Write(), "1111");
EXPECT_STREQ(trace.Read(), "1111");

trace.Clear();
Copy<8>(I(0), I(0));
EXPECT_STREQ(trace.Write(), "11111111");
EXPECT_STREQ(trace.Read(), "11111111");

trace.Clear();
Copy<16>(I(0), I(0));
EXPECT_STREQ(trace.Write(), "1111111111111111");
EXPECT_STREQ(trace.Read(), "1111111111111111");

trace.Clear();
Copy<32>(I(0), I(0));
EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");

trace.Clear();
Copy<64>(I(0), I(0));
EXPECT_STREQ(
trace.Write(),
"1111111111111111111111111111111111111111111111111111111111111111");
EXPECT_STREQ(
trace.Read(),
"1111111111111111111111111111111111111111111111111111111111111111");
}

TEST(MemcpyUtilsTest, CopyOffset) {
auto &trace = GetTrace();

trace.Clear();
Copy<1>(I(3), I(1));
EXPECT_STREQ(trace.Write(), "0001");
EXPECT_STREQ(trace.Read(), "01");

trace.Clear();
Copy<1>(I(2), I(1));
EXPECT_STREQ(trace.Write(), "001");
EXPECT_STREQ(trace.Read(), "01");
}

TEST(MemcpyUtilsTest, CopyOverlap) {
auto &trace = GetTrace();

trace.Clear();
CopyOverlap<2>(I(0), I(0), 2);
EXPECT_STREQ(trace.Write(), "22");
EXPECT_STREQ(trace.Read(), "22");

trace.Clear();
CopyOverlap<2>(I(0), I(0), 3);
EXPECT_STREQ(trace.Write(), "121");
EXPECT_STREQ(trace.Read(), "121");

trace.Clear();
CopyOverlap<2>(I(0), I(0), 4);
EXPECT_STREQ(trace.Write(), "1111");
EXPECT_STREQ(trace.Read(), "1111");

trace.Clear();
CopyOverlap<4>(I(2), I(1), 7);
EXPECT_STREQ(trace.Write(), "001112111");
EXPECT_STREQ(trace.Read(), "01112111");
}

TEST(MemcpyUtilsTest, CopyAligned) {
auto &trace = GetTrace();
// Destination is aligned already.
// "1111000000000"
// + "0000111100000"
// + "0000000011110"
// + "0000000001111"
// = "1111111112221"
trace.Clear();
CopyAligned<4>(I(0), I(0), 13);
EXPECT_STREQ(trace.Write(), "1111111112221");
EXPECT_STREQ(trace.Read(), "1111111112221");

// Misaligned destination
// "01111000000000"
// + "00001111000000"
// + "00000000111100"
// + "00000000001111"
// = "01112111112211"
trace.Clear();
CopyAligned<4>(I(1), I(0), 13);
EXPECT_STREQ(trace.Write(), "01112111112211");
EXPECT_STREQ(trace.Read(), "1112111112211");
}

TEST(MemcpyUtilsTest, MaxReloads) {
auto &trace = GetTrace();
for (size_t alignment = 0; alignment < 32; ++alignment) {
for (size_t count = 64; count < 768; ++count) {
trace.Clear();
// We should never reload more than twice when copying from count = 2x32.
CopyAligned<32>(I(alignment), I(0), count);
const char *const written = trace.Write();
// First bytes are untouched.
for (size_t i = 0; i < alignment; ++i)
EXPECT_EQ(written[i], '0');
// Next bytes are loaded once or twice but no more.
for (size_t i = alignment; i < count; ++i) {
EXPECT_GE(written[i], '1');
EXPECT_LE(written[i], '2');
}
}
}
}

} // namespace __llvm_libc
8 changes: 8 additions & 0 deletions libc/test/src/string/memory_utils/utils_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,14 @@ TEST(UtilsTest, OffsetToNextAligned) {
EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
}

TEST(UtilsTest, OffsetFromLastAligned) {
EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
}

TEST(UtilsTest, OffsetToNextCacheLine) {
EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));
Expand Down