diff --git a/cmake/build_configurations/compiler_options.cmake b/cmake/build_configurations/compiler_options.cmake index c112418da028..4d72f4436387 100644 --- a/cmake/build_configurations/compiler_options.cmake +++ b/cmake/build_configurations/compiler_options.cmake @@ -30,7 +30,11 @@ IF(UNIX) # Default GCC flags IF(CMAKE_COMPILER_IS_GNUCC) - SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing -march=armv8-a+crypto+crc") + ELSE() + SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing") + ENDIF() # Disable inline optimizations for valgrind testing to avoid false positives IF(WITH_VALGRIND) SET(COMMON_C_FLAGS "-fno-inline ${COMMON_C_FLAGS}") @@ -54,7 +58,11 @@ IF(UNIX) SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 ${COMMON_C_FLAGS}") ENDIF() IF(CMAKE_COMPILER_IS_GNUCXX) - SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing -march=armv8-a+crypto+crc") + ELSE() + SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing") + ENDIF() # GCC 6 has C++14 as default, set it explicitly to the old default. 
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GXX_VERSION) diff --git a/config.h.cmake b/config.h.cmake index c89493a6e559..d87ef82c93c8 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -448,4 +448,7 @@ /* For default value of --early_plugin_load */ #cmakedefine DEFAULT_EARLY_PLUGIN_LOAD @DEFAULT_EARLY_PLUGIN_LOAD@ +/* Support ARMv8 CRC instructions */ +#cmakedefine ENABLE_ARMV8_CRC32 + #endif diff --git a/configure.cmake b/configure.cmake index cb8aa678e889..e239187569cb 100644 --- a/configure.cmake +++ b/configure.cmake @@ -929,3 +929,9 @@ CHECK_TYPE_SIZE("socklen_t" SIZEOF_SOCKLEN_T) IF(SIZEOF_SOCKLEN_T) SET(HAVE_SOCKLEN_T 1) ENDIF() + +# Enable crc32 on AArch64 Platform +IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + MESSAGE(STATUS "ARMv8 crc32 enabled.") + SET(ENABLE_ARMV8_CRC32 1) +ENDIF() diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc index 979713f1f3bd..b7d003c44306 100644 --- a/storage/innobase/ut/ut0crc32.cc +++ b/storage/innobase/ut/ut0crc32.cc @@ -114,11 +114,62 @@ ut_crc32_swap_byteorder( | i >> 56); } + /* CRC32 hardware implementation. 
*/ +/*For AArch64*/ +#ifdef ENABLE_ARMV8_CRC32 +#include <arm_acle.h> +#include <arm_neon.h> + +#define ARM_CRC32_INTRINSIC + +#define CRC32C3X8(buffer,ITR) \ + crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\ + crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\ + crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR))); + +#define CRC32C7X3X8(buffer,ITR) do {\ + CRC32C3X8(buffer,(ITR)*7+0) \ + CRC32C3X8(buffer,(ITR)*7+1) \ + CRC32C3X8(buffer,(ITR)*7+2) \ + CRC32C3X8(buffer,(ITR)*7+3) \ + CRC32C3X8(buffer,(ITR)*7+4) \ + CRC32C3X8(buffer,(ITR)*7+5) \ + CRC32C3X8(buffer,(ITR)*7+6) \ + } while(0) + +#define PREF4X64L1(buffer,PREF_OFFSET, ITR) \ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64)); + +#define PREF1KL1(buffer,PREF_OFFSET) \ + PREF4X64L1(buffer,(PREF_OFFSET), 0) \ + PREF4X64L1(buffer,(PREF_OFFSET), 4) \ + PREF4X64L1(buffer,(PREF_OFFSET), 8) \ + PREF4X64L1(buffer,(PREF_OFFSET), 12) + +#define PREF4X64L2(buffer,PREF_OFFSET, ITR) \ + __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\ + __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\ + __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\ + __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64)); + +#define PREF1KL2(buffer,PREF_OFFSET) \ + PREF4X64L2(buffer,(PREF_OFFSET), 0) \ + PREF4X64L2(buffer,(PREF_OFFSET), 4) \ + PREF4X64L2(buffer,(PREF_OFFSET), 8) \ + PREF4X64L2(buffer,(PREF_OFFSET), 12) +#else +#undef ARM_CRC32_INTRINSIC +#endif + /* Flag that tells whether the
CPU supports CRC32 or not */ bool ut_crc32_sse2_enabled = false; + #if defined(__GNUC__) && defined(__x86_64__) /********************************************************************//** Fetches CPU info */ @@ -421,6 +472,107 @@ ut_crc32_byte_by_byte_hw( } #endif /* defined(__GNUC__) && defined(__x86_64__) */ + +#ifdef ARM_CRC32_INTRINSIC +uint32_t +ut_crc32_byte_by_byte_aarch64( + const uint8_t* buf, + uint64_t len) +{ + uint32_t crc = 0xFFFFFFFFU; + + ut_a(ut_crc32_sse2_enabled); + + while (len > 0) { + crc = __crc32cb(crc, *buf++); + len--; + } + + return(~crc); +} + +uint32_t +ut_crc32_aarch64( + const uint8_t* buf, + uint64_t len) +{ + register uint32_t crc = 0xFFFFFFFFU; + register const uint16_t *buf2; + register const uint32_t *buf4; + register const uint64_t *buf8; + + ut_a(ut_crc32_sse2_enabled); + + uint32_t crc0, crc1, crc2; + int64_t length = (int64_t)len; + buf8 = (const uint64_t *)(const void *)buf; + + /* Calculate reflected crc with PMULL Instruction */ + const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014; + uint64_t t0, t1; + + /* crc done "by 3" for fixed input block size of 1024 bytes */ + while ((length -= 1024) >= 0) { + /* Prefetch data for following block to avoid cache miss */ + PREF1KL2(buf,1024*3); + /* Do first 8 bytes here for better pipelining */ + crc0 = __crc32cd(crc, *buf8++); + crc1 = 0; + crc2 = 0; + + /* Process block inline + Process crc0 last to avoid dependency with above */ + CRC32C7X3X8(buf8,0); + CRC32C7X3X8(buf8,1); + CRC32C7X3X8(buf8,2); + CRC32C7X3X8(buf8,3); + CRC32C7X3X8(buf8,4); + CRC32C7X3X8(buf8,5); + + buf8 += 42*3; + /* Prefetch data for following block to avoid cache miss */ + PREF1KL1((uint8_t *)buf8,1024); + + /* Merge crc0 and crc1 into crc2 + crc1 multiply by K2 + crc0 multiply by K1 */ + + t1 = (uint64_t)vmull_p64(crc1, k2); + t0 = (uint64_t)vmull_p64(crc0, k1); + crc = __crc32cd(crc2, *buf8++); + crc1 = __crc32cd(0, t1); + crc ^= crc1; + crc0 = __crc32cd(0, t0); + crc ^= crc0; + } + + if(!(length += 1024)) + 
return (~crc); + + while ((length -= sizeof(uint64_t)) >= 0) { + crc = __crc32cd(crc, *buf8++); + } + + /* The following is more efficient than the straight loop */ + buf4 = (const uint32_t *)(const void *)buf8; + if (length & sizeof(uint32_t)) { + crc = __crc32cw(crc, *buf4++); + } + + buf2 = (const uint16_t *)(const void *)buf4; + if (length & sizeof(uint16_t)) { + crc = __crc32ch(crc, *buf2++); + } + + buf = (const uint8_t *)(const void *)buf2; + if (length & sizeof(uint8_t)) + crc = __crc32cb(crc, *buf); + + return(~crc); +} +#endif + + /* CRC32 software implementation. */ /* Precalculated table used to generate the CRC32 if the CPU does not @@ -431,7 +583,7 @@ static bool ut_crc32_slice8_table_initialized = false; /********************************************************************//** Initializes the table that is used to generate the CRC32 if the CPU does not have support for it. */ -static +//static void ut_crc32_slice8_table_init() /*========================*/ @@ -727,6 +879,15 @@ ut_crc32_init() #endif /* defined(__GNUC__) && defined(__x86_64__) */ +#ifdef ARM_CRC32_INTRINSIC + ut_crc32_sse2_enabled = 0x1; + if (ut_crc32_sse2_enabled) { + ut_crc32 = ut_crc32_aarch64; + ut_crc32_legacy_big_endian = NULL; + ut_crc32_byte_by_byte = ut_crc32_byte_by_byte_aarch64; + } +#endif + if (!ut_crc32_sse2_enabled) { ut_crc32_slice8_table_init(); ut_crc32 = ut_crc32_sw;