From 2fc690e01f525b78918ae393f402871e8c89f974 Mon Sep 17 00:00:00 2001 From: Anjan Roy Date: Thu, 25 Jan 2024 23:35:08 +0400 Subject: [PATCH] Add support for running constant-time tests on aarch64, Apple Silicon and wider spectrum of non-x86_64 targets Signed-off-by: Anjan Roy --- src/dudect.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/src/dudect.h b/src/dudect.h index ee40912..c9ddd40 100644 --- a/src/dudect.h +++ b/src/dudect.h @@ -71,8 +71,15 @@ extern "C" { #include #include + +#if defined __x86_64__ #include #include +#elif defined __APPLE__ +#include +#else +#include +#endif #ifdef DUDECT_VISIBLITY_STATIC #define DUDECT_VISIBILITY static @@ -262,8 +269,10 @@ uint8_t randombit(void) { return (ret & 1); } +#if defined __x86_64__ + /* - Returns current CPU tick count from *T*ime *S*tamp *C*ounter. + Returns current CPU tick count from x86_64 *T*ime *S*tamp *C*ounter. To enforce CPU to issue RDTSC instruction where we want it to, we put a `mfence` instruction before issuing `rdtsc`, which should make all memory load/ store operations, prior to RDTSC, globally visible. @@ -279,6 +288,60 @@ static inline int64_t cpucycles(void) { return (int64_t)__rdtsc(); } +#elif defined __aarch64__ && defined __linux__ + +/* + Returns current CPU cycle count from aarch64 *P*erformance *M*onitors *C*ycle Counter (PMCCNTR_EL0). + + To enforce CPU to complete all pending memory access operations, appearing before PMCCTR_EL0, we issue a + *D*ata *S*ynchronization *B*arrier instruction right before reading CPU cycle counter. + + Note, issuing PMCCTR_EL0 instruction from the userspace will probably result in panicing with + a message "illegal instruction executed". So we've to install a Linux Kernel Module. I've tested the + LKM @ https://github.com/jerinjacobk/armv8_pmu_cycle_counter_el0 and it works fine. + + See PMCCTR_EL0 documentation @ https://developer.arm.com/documentation/ddi0595/2021-09/External-Registers/PMCCNTR-EL0--Performance-Monitors-Cycle-Counter?lang=en + See DSB documentation @ https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/DSB--Data-Synchronization-Barrier- + + Also see https://github.com/itzmeanjan/criterion-cycles-per-byte/blob/a270a496/src/lib.rs#L61-L74 +*/ +static inline int64_t cpucycles(void) { + uint64_t val = 0; + __asm__ volatile("dsb sy; mrs %0, pmccntr_el0" : "=r"(val)); + return (int64_t)val; +} + +#elif defined __APPLE__ + +/* + Returns the number of "mach time units" elapsed since system startup, on non-x86_64 Apple targets. + + See https://github.com/google/benchmark/blob/4682db08/src/cycleclock.h#L63-L73 +*/ +static inline int64_t cpucycles(void) { + return (int64_t)mach_absolute_time(); +} + +#else + +/* + Returns approximate measurement of CPU time consumed by the calling process (i.e., CPU time + consumed by all threads in the process). + + See https://www.man7.org/linux/man-pages/man3/clock_gettime.3.html +*/ +static inline int64_t cpucycles(void) { + const clockid_t clock_id = CLOCK_PROCESS_CPUTIME_ID; + struct timespec ts = {0}; + + (void)clock_gettime(clock_id, &ts); + + const uint64_t ns = (uint64_t)ts.tv_sec * (uint64_t)1e9 + (uint64_t)ts.tv_nsec; + return (int64_t)ns; +} + +#endif + // threshold values for Welch's t-test #define t_threshold_bananas 500 // test failed, with overwhelming probability #define t_threshold_moderate \