Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64][compiler-rt] Add memcpy, memset, memmove, memchr builtins. #77496

Merged
merged 7 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion compiler-rt/lib/builtins/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ set(aarch64_SOURCES
)

if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

COMPILER_RT_HAS_ASM_SME tests if the compiler supports SME assembly, but not if the compiler supports the C/C++ level attributes. You'll need to add another check to the function which sets this variable and probably rename the variable to COMPILER_RT_HAS_AARCH64_SME to make it clear the compiler supports more than just the ASM. (and also to add "aarch64" to the name to avoid future name clashes with other meanings of 'sme')

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c)
list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
message(STATUS "AArch64 SME ABI routines enabled")
else()
message(STATUS "AArch64 SME ABI routines disabled")
Expand Down
102 changes: 102 additions & 0 deletions compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#include <stdlib.h>
Copy link
Contributor

@peterwaller-arm peterwaller-arm Jan 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've proposed #79454 to switch this out for stddef.h which is provided by the compiler resource headers rather than the libc (since this is inside builtins).


// WARNING: When building the scalar versions of these functions you need to
// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
// from recognising a loop idiom and planting calls to memcpy!
Comment on lines +3 to +5
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a comment here that seems important, but doesn't seem to be addressed by your patch?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


static void *
__arm_sc_memcpy_fwd(void *dest, const void *src,
size_t n) __arm_streaming_compatible __arm_preserves_za {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;

for (size_t i = 0; i < n; i++) {
destp[i] = srcp[i];
}

return dest;
}

// If dest and src overlap then behaviour is undefined, hence we can add the
// restrict keywords here. This also matches the definition of the libc memcpy
// according to the man page.
void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
size_t n) __arm_streaming_compatible __arm_preserves_za {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compiler building compiler-rt may not support __arm_streaming_compatible. You'll want to check for the SME attributes when configuring the project with cmake.

I'd also suggest removing __arm_preserves_za in anticipation of this ACLE PR landing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, not sure what extra required for __arm_streaming_compatible avaliblity check, We have already check that in compiler-rt/lib/builtins/CMakeLists.txt?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

return __arm_sc_memcpy_fwd(dest, src, n);
}

void *__arm_sc_memset(void *dest, int c,
size_t n) __arm_streaming_compatible __arm_preserves_za {
unsigned char *destp = (unsigned char *)dest;
unsigned char c8 = (unsigned char)c;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

delete blank line after variable declaration

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

for (size_t i = 0; i < n; i++) {
destp[i] = c8;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

drop braces, ++i

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

}

return dest;
}

static void *
__arm_sc_memcpy_rev(void *dest, const void *src,
size_t n) __arm_streaming_compatible __arm_preserves_za {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;

// TODO: Improve performance by copying larger chunks in reverse, or by
// using SVE.
while (n > 0) {
n--;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

destp[n] = srcp[n];
}
return dest;
}

// Semantically a memmove is equivalent to the following:
// 1. Copy the entire contents of src to a temporary array that does not
// overlap with src or dest.
// 2. Copy the contents of the temporary array into dest.
void *__arm_sc_memmove(void *dest, const void *src,
size_t n) __arm_streaming_compatible __arm_preserves_za {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;

// If src and dest are identical there is nothing to do!
if ((destp == srcp) || (n == 0))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder whether the two conditions (destp == srcp) || (n == 0) are useful.

A lot of libc implementations don't check destp == srcp. Some don't check n==0

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, I think we can safely remove those now. Thanks for finding this.

return destp;

// If src and dest don't overlap then just invoke memcpy
if ((srcp > (destp + n)) || (destp > (srcp + n)))
return __arm_sc_memcpy_fwd(dest, src, n);

// Overlap case 1:
// src: Low | -> | High
// dest: Low | -> | High
// Here src is always ahead of dest at a higher addres. If we first read a
// chunk of data from src we can safely write the same chunk to dest without
// corrupting future reads of src.
if (srcp > destp)
return __arm_sc_memcpy_fwd(dest, src, n);

// Overlap case 2:
// src: Low | -> | High
// dest: Low | -> | High
// While we're in the overlap region we're always corrupting future reads of
// src when writing to dest. An efficient way to do this is to copy the data
// in reverse by starting at the highest address.
return __arm_sc_memcpy_rev(dest, src, n);
}

const void *
__arm_sc_memchr(const void *src, int c,
size_t n) __arm_streaming_compatible __arm_preserves_za {
const unsigned char *srcp = (const unsigned char *)src;
unsigned char c8 = (unsigned char)c;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

delete blank line

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

for (size_t i = 0; i < n; i++) {
if (srcp[i] == c8)
return &srcp[i];
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

drop braces

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


return NULL;
}
183 changes: 183 additions & 0 deletions compiler-rt/test/builtins/Unit/sme-string-test.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
// REQUIRES: linux, aarch64-target-arch
// RUN: %clang_builtins %s %librt -o %t && %run %t

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define N 1024
#define NREPS 1234
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this test is testing more than just functionality (i.e. it seems to be testing performance). This can be seen from e.g. NREPS and get_time_diff. I think you'll want to make this purely a functional test and try to test out specific edge cases for each of the routines.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


static uint8_t dst[N], src[N];

extern void *__arm_sc_memcpy(void *, const void *, size_t);
extern void *__arm_sc_memset(void *, int, size_t);
extern void *__arm_sc_memmove(void *, const void *, size_t);
extern void *__arm_sc_memchr(const void *, int, size_t);

void init(void) {
for (int i = 0; i < N; i++) {
src[i] = i * 2;
dst[i] = i + 1;
}
}

void reinit_dst(int n) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's worth making this a .cpp file so that you can have a class with a constructor to do the init and have a sum method, rather than having to call functions like reinit_dst. We can reduce the sizes of the arrays a bit more to reduce runtime of the test, which makes the impact of reinitialising the entire array negligible.

In general I'd like to see a more structured way to test these routines, where we can see that some of the edge-cases are being tested. At the moment it looks like you've stripped the timing functionality from this file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

for (int i = 0; i < n; i++) {
dst[i] = i + 1;
}
}

int sum(uint8_t *dest, int n) {
int t = 0;
for (int i = 0; i < n; i++) {
t += dest[i];
}
return t;
}

long get_time_diff(struct timespec tv[2]) {
long us0 = (tv[0].tv_sec * 1000000) + (tv[0].tv_nsec / 1000);
long us1 = (tv[1].tv_sec * 1000000) + (tv[1].tv_nsec / 1000);
return us1 - us0;
}

int main() {
struct timespec tv[2];

init();

// Test correctness of memcpy
for (int i = 0; i < 67; i++) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a test that goes from 0 to 67. Is 67a special number that was chosen for a reason?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

int t[2];
if (!__arm_sc_memcpy(dst, src, i)) {
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
abort();
}
t[0] = sum(dst, N);
reinit_dst(i);
memcpy(dst, src, i);
t[1] = sum(dst, N);
reinit_dst(i);
if (t[0] != t[1]) {
fprintf(stderr, "__arm_sc_memcpy doesn't match memcpy behaviour!\n");
abort();
}
}

#ifdef TEST_PERF
// Collect perf data for memcpy
clock_gettime(CLOCK_REALTIME, &tv[0]);
for (int r = 0; r < NREPS; r++) {
for (int i = 0; i < 67; i++) {
int t[2];
if (!__arm_sc_memcpy(dst, src, i)) {
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
abort();
}
}
}
reinit_dst(67);
clock_gettime(CLOCK_REALTIME, &tv[1]);
printf("memcpy time = %ld\n", get_time_diff(tv));
#endif

// Test correctness of memset
for (int i = 0; i < 67; i++) {
int t[2];
if (!__arm_sc_memset(dst, src[i], i)) {
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
abort();
}
t[0] = sum(dst, N);
reinit_dst(i);
memset(dst, src[i], i);
t[1] = sum(dst, N);
reinit_dst(i);
if (t[0] != t[1]) {
fprintf(stderr, "__arm_sc_memcpy doesn't match memset behaviour!\n");
abort();
}
}

#ifdef TEST_PERF
// Collect perf data for memset
clock_gettime(CLOCK_REALTIME, &tv[0]);
for (int r = 0; r < NREPS; r++) {
for (int i = 0; i < 67; i++) {
if (!__arm_sc_memset(dst, src[i], i)) {
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
abort();
}
}
}
reinit_dst(67);
clock_gettime(CLOCK_REALTIME, &tv[1]);
printf("memset time = %ld\n", get_time_diff(tv));
#endif

// Test correctness of memchr
for (int i = 0; i < 67; i++) {
for (int j = 0; j < 67; j++) {
uint8_t *t[2];
t[0] = __arm_sc_memchr(src, src[j], i);
t[1] = memchr(src, src[j], i);
if (t[0] != t[1]) {
fprintf(stderr, "__arm_sc_memchr doesn't match memchr behaviour!\n");
abort();
}
}
}

#ifdef TEST_PERF
// Collect perf data for memchr
clock_gettime(CLOCK_REALTIME, &tv[0]);
for (int r = 0; r < NREPS; r++) {
for (int i = 0; i < 67; i++) {
for (int j = 0; j < 67; j++) {
__arm_sc_memchr(src, src[j], i);
}
}
}
clock_gettime(CLOCK_REALTIME, &tv[1]);
printf("memchr time = %ld\n", get_time_diff(tv));
#endif

// Test correctness for memmove
for (int i = 0; i < 67; i++) {
for (int j = 0; j < 67; j++) {
int t[2];
if (!__arm_sc_memmove(&dst[66 - j], &dst[j], i)) {
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memmove!\n");
abort();
}
t[0] = sum(dst, N);
reinit_dst(200);
memmove(&dst[66 - j], &dst[j], i);
t[1] = sum(dst, N);
reinit_dst(200);
if (t[0] != t[1]) {
fprintf(stderr, "__arm_sc_memmove doesn't match memmove behaviour!\n");
abort();
}
}
}

#ifdef TEST_PERF
// Collect perf data for memmove
clock_gettime(CLOCK_REALTIME, &tv[0]);
for (int r = 0; r < NREPS; r++) {
for (int i = 0; i < 67; i++) {
for (int j = 0; j < 67; j++) {
__arm_sc_memmove(&dst[66 - j], &dst[j], i);
}
}
}
clock_gettime(CLOCK_REALTIME, &tv[1]);
printf("memmove time = %ld\n", get_time_diff(tv));
#endif

return 0;
}
Loading