diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 822f9ca2b8251..f075c67778fc8 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -93,7 +93,9 @@ else()
   # Windows specific files
   libomp_append(LIBOMP_CXXFILES z_Windows_NT_util.cpp)
   libomp_append(LIBOMP_CXXFILES z_Windows_NT-586_util.cpp)
-  libomp_append(LIBOMP_ASMFILES z_Windows_NT-586_asm.asm) # Windows assembly file
+  if(${LIBOMP_ARCH} STREQUAL "i386" OR ${LIBOMP_ARCH} STREQUAL "x86_64")
+    libomp_append(LIBOMP_ASMFILES z_Windows_NT-586_asm.asm) # Windows assembly file
+  endif()
 else() # Unix specific files
   libomp_append(LIBOMP_CXXFILES z_Linux_util.cpp)
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index c6be679494cec..473746887574f 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -792,7 +792,9 @@ kmp_set_disp_num_buffers 890
 %endif
 
+# These are specific to x86 and x64
 %ifndef arch_64
+    %ifndef arch_aarch64
 
     # ATOMIC extensions for OpenMP 3.1 spec (x86 and x64 only)
 
@@ -1196,6 +1198,7 @@ kmp_set_disp_num_buffers 890
         __kmpc_atomic_float10_div_cpt_rev_fp
     %endif
 
+    %endif # arch_aarch64
 %endif # arch_64
 
 %ifdef HAVE_QUAD
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 470344aa817eb..327ebbeae1e9a 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -3457,7 +3457,7 @@ extern void __kmp_check_stack_overlap(kmp_info_t *thr);
 extern void __kmp_expand_host_name(char *buffer, size_t size);
 extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern);
 
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64 || (KMP_OS_WINDOWS && KMP_ARCH_AARCH64)
 extern void __kmp_initialize_system_tick(void); /* Initialize timer tick value */
 #endif
 
diff --git a/openmp/runtime/src/kmp_atomic.cpp b/openmp/runtime/src/kmp_atomic.cpp
index a71a1b33ae380..fcc06216a4fa5 100644
--- a/openmp/runtime/src/kmp_atomic.cpp
+++ b/openmp/runtime/src/kmp_atomic.cpp
@@ -832,6 +832,39 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,
 // end of the first part of the workaround for C78287
 #endif // USE_CMPXCHG_FIX
 
+#if KMP_OS_WINDOWS && KMP_ARCH_AARCH64
+// Undo explicit type casts to get MSVC ARM64 to build. Uses
+// OP_CMPXCHG_WORKAROUND definition for OP_CMPXCHG
+#undef OP_CMPXCHG
+#define OP_CMPXCHG(TYPE, BITS, OP) \
+  { \
+    struct _sss { \
+      TYPE cmp; \
+      kmp_int##BITS *vvv; \
+    }; \
+    struct _sss old_value, new_value; \
+    old_value.vvv = (kmp_int##BITS *)&old_value.cmp; \
+    new_value.vvv = (kmp_int##BITS *)&new_value.cmp; \
+    *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \
+    new_value.cmp = old_value.cmp OP rhs; \
+    while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \
+        (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) old_value.vvv, \
+        *VOLATILE_CAST(kmp_int##BITS *) new_value.vvv)) { \
+      KMP_DO_PAUSE; \
+ \
+      *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \
+      new_value.cmp = old_value.cmp OP rhs; \
+    } \
+  }
+
+#undef OP_UPDATE_CRITICAL
+#define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \
+  (*lhs) = (*lhs)OP rhs; \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
+
+#endif // KMP_OS_WINDOWS && KMP_ARCH_AARCH64
+
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
 
 // ------------------------------------------------------------------------
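Note (illustrative, not part of the patch): the OP_CMPXCHG redefinition above is the usual compare-and-swap retry loop, minus the explicit casts that MSVC's ARM64 frontend rejects: read the current value, compute old OP rhs, and retry until KMP_COMPARE_AND_STORE observes an unchanged location; OP_UPDATE_CRITICAL is the lock-based fallback that takes the per-type atomic lock instead. A minimal standalone sketch of the same retry pattern, written with std::atomic purely for illustration rather than the runtime's own wrappers:

    #include <atomic>
    #include <cstdint>

    // Illustrative only: the retry loop OP_CMPXCHG expands to, shown for a
    // 32-bit add. The runtime reinterprets the operand as kmp_int##BITS and
    // calls KMP_COMPARE_AND_STORE_ACQ##BITS instead of std::atomic.
    int32_t atomic_add_cas(std::atomic<int32_t> &loc, int32_t rhs) {
      int32_t old_value = loc.load(std::memory_order_relaxed);
      int32_t new_value = old_value + rhs;
      // compare_exchange_weak refreshes old_value on failure, so recompute
      // the new value and retry until the store succeeds.
      while (!loc.compare_exchange_weak(old_value, new_value,
                                        std::memory_order_acq_rel,
                                        std::memory_order_relaxed)) {
        new_value = old_value + rhs;
      }
      return old_value;
    }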
diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index fe02ccfae06ff..011c517f34b13 100644
--- a/openmp/runtime/src/kmp_os.h
+++ b/openmp/runtime/src/kmp_os.h
@@ -139,7 +139,7 @@ typedef struct kmp_struct64 kmp_uint64;
 #undef KMP_USE_X87CONTROL
 #define KMP_USE_X87CONTROL 1
 #endif
-#if KMP_ARCH_X86_64
+#if KMP_ARCH_X86_64 || KMP_ARCH_AARCH64
 #define KMP_INTPTR 1
 typedef __int64 kmp_intptr_t;
 typedef unsigned __int64 kmp_uintptr_t;
@@ -463,8 +463,13 @@ inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) {
   return *(kmp_real32 *)&tmp;
 }
 
-// Routines that we still need to implement in assembly.
-extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
+#define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v))
+#define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v))
+#define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v))
+#define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v))
+#define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v))
+#define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v))
+
 extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v);
 extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v);
 extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v);
@@ -474,6 +479,113 @@ extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v);
 extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v);
 extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v);
 
+#if KMP_ARCH_AARCH64 && KMP_COMPILER_MSVC && !KMP_COMPILER_CLANG
+#define KMP_TEST_THEN_INC64(p) _InterlockedExchangeAdd64((p), 1LL)
+#define KMP_TEST_THEN_INC_ACQ64(p) _InterlockedExchangeAdd64_acq((p), 1LL)
+#define KMP_TEST_THEN_ADD4_64(p) _InterlockedExchangeAdd64((p), 4LL)
+// #define KMP_TEST_THEN_ADD4_ACQ64(p) _InterlockedExchangeAdd64_acq((p), 4LL)
+// #define KMP_TEST_THEN_DEC64(p) _InterlockedExchangeAdd64((p), -1LL)
+// #define KMP_TEST_THEN_DEC_ACQ64(p) _InterlockedExchangeAdd64_acq((p), -1LL)
+// #define KMP_TEST_THEN_ADD8(p, v) _InterlockedExchangeAdd8((p), (v))
+#define KMP_TEST_THEN_ADD64(p, v) _InterlockedExchangeAdd64((p), (v))
+
+#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
+  __kmp_compare_and_store_acq8((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \
+  __kmp_compare_and_store_rel8((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \
+  __kmp_compare_and_store_acq16((p), (cv), (sv))
+// #define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \
+//   __kmp_compare_and_store_rel16((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \
+  __kmp_compare_and_store_acq32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
+                                (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \
+  __kmp_compare_and_store_rel32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
+                                (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
+  __kmp_compare_and_store_acq64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
+                                (kmp_int64)(sv))
+#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
+  __kmp_compare_and_store_rel64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
+                                (kmp_int64)(sv))
+#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
+  __kmp_compare_and_store_ptr((void *volatile *)(p), (void *)(cv), (void *)(sv))
+
+// KMP_COMPARE_AND_STORE expects this order: pointer, compare, exchange
+// _InterlockedCompareExchange expects this order: pointer, exchange, compare
+// KMP_COMPARE_AND_STORE also returns a bool indicating a successful write. A
+// write is successful if the return value of _InterlockedCompareExchange is the
+// same as the compare value.
+inline kmp_int8 __kmp_compare_and_store_acq8(volatile kmp_int8 *p, kmp_int8 cv,
+                                             kmp_int8 sv) {
+  return _InterlockedCompareExchange8_acq(p, sv, cv) == cv;
+}
+
+inline kmp_int8 __kmp_compare_and_store_rel8(volatile kmp_int8 *p, kmp_int8 cv,
+                                             kmp_int8 sv) {
+  return _InterlockedCompareExchange8_rel(p, sv, cv) == cv;
+}
+
+inline kmp_int16 __kmp_compare_and_store_acq16(volatile kmp_int16 *p,
+                                               kmp_int16 cv, kmp_int16 sv) {
+  return _InterlockedCompareExchange16_acq(p, sv, cv) == cv;
+}
+
+inline kmp_int16 __kmp_compare_and_store_rel16(volatile kmp_int16 *p,
+                                               kmp_int16 cv, kmp_int16 sv) {
+  return _InterlockedCompareExchange16_rel(p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_acq32(volatile kmp_int32 *p,
+                                               kmp_int32 cv, kmp_int32 sv) {
+  return _InterlockedCompareExchange_acq((volatile long *)p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_rel32(volatile kmp_int32 *p,
+                                               kmp_int32 cv, kmp_int32 sv) {
+  return _InterlockedCompareExchange_rel((volatile long *)p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_acq64(volatile kmp_int64 *p,
+                                               kmp_int64 cv, kmp_int64 sv) {
+  return _InterlockedCompareExchange64_acq(p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_rel64(volatile kmp_int64 *p,
+                                               kmp_int64 cv, kmp_int64 sv) {
+  return _InterlockedCompareExchange64_rel(p, sv, cv) == cv;
+}
+
+inline kmp_int32 __kmp_compare_and_store_ptr(void *volatile *p, void *cv,
+                                             void *sv) {
+  return _InterlockedCompareExchangePointer(p, sv, cv) == cv;
+}
+
+// The _RET versions return the value instead of a bool
+// #define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
+//   _InterlockedCompareExchange8((p), (sv), (cv))
+// #define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
+//   _InterlockedCompareExchange16((p), (sv), (cv))
+#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
+  _InterlockedCompareExchange64((volatile kmp_int64 *)(p), (kmp_int64)(sv), \
+                                (kmp_int64)(cv))
+
+// #define KMP_XCHG_FIXED8(p, v) \
+//   _InterlockedExchange8((volatile kmp_int8 *)(p), (kmp_int8)(v));
+// #define KMP_XCHG_FIXED16(p, v) _InterlockedExchange16((p), (v));
+// #define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v)));
+
+// inline kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v) {
+//   kmp_int64 tmp = _InterlockedExchange64((volatile kmp_int64 *)p, *(kmp_int64
+//   *)&v); return *(kmp_real64 *)&tmp;
+// }
+
+#else // !KMP_ARCH_AARCH64
+
+// Routines that we still need to implement in assembly.
+extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
+
 extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv,
                                          kmp_int8 sv);
 extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv,
@@ -514,12 +626,6 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
 
 #define KMP_TEST_THEN_ADD8(p, v) __kmp_test_then_add8((p), (v))
 #define KMP_TEST_THEN_ADD64(p, v) __kmp_test_then_add64((p), (v))
-#define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v))
-#define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v))
-#define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v))
-#define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v))
-#define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v))
-#define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v))
 
 #define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
   __kmp_compare_and_store8((p), (cv), (sv))
@@ -567,6 +673,7 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
 //#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v));
 //#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v));
 #define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v));
+#endif
 
 #elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
 
@@ -885,8 +992,13 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
 
 #if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \
     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
+#if KMP_OS_WINDOWS
+#undef KMP_MB
+#define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst)
+#else /* !KMP_OS_WINDOWS */
 #define KMP_MB() __sync_synchronize()
 #endif
+#endif
 
 #ifndef KMP_MB
 #define KMP_MB() /* nothing to do */
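Note (illustrative, not part of the patch): the __kmp_compare_and_store_* wrappers above exist because the two interfaces disagree on argument order and return convention. KMP_COMPARE_AND_STORE takes (pointer, compare value, store value) and yields a success flag, while the MSVC _InterlockedCompareExchange family takes (pointer, exchange, comparand) and returns the previous value. A minimal sketch of that adaptation for the 32-bit case, assuming an MSVC toolchain and only the documented intrinsic:

    #include <intrin.h>

    // Swap the last two arguments and turn "previous value equals comparand"
    // into a success bool, as the inline wrappers in the patch do.
    inline bool cas32_succeeded(volatile long *p, long compare, long exchange) {
      return _InterlockedCompareExchange(p, exchange, compare) == compare;
    }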
diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h
index 4dbc0158880ea..bbbd72dd6951f 100644
--- a/openmp/runtime/src/kmp_platform.h
+++ b/openmp/runtime/src/kmp_platform.h
@@ -97,6 +97,9 @@
 #if defined(_M_AMD64) || defined(__x86_64)
 #undef KMP_ARCH_X86_64
 #define KMP_ARCH_X86_64 1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#undef KMP_ARCH_AARCH64
+#define KMP_ARCH_AARCH64 1
 #else
 #undef KMP_ARCH_X86
 #define KMP_ARCH_X86 1
diff --git a/openmp/runtime/src/z_Windows_NT-586_util.cpp b/openmp/runtime/src/z_Windows_NT-586_util.cpp
index b3728a5d975f4..991943c1b2b57 100644
--- a/openmp/runtime/src/z_Windows_NT-586_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT-586_util.cpp
@@ -12,7 +12,7 @@
 
 #include "kmp.h"
 
-#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64)
 /* Only 32-bit "add-exchange" instruction on IA-32 architecture causes us to
    use compare_and_store for these routines */
 
@@ -22,7 +22,7 @@ kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 d) {
   old_value = TCR_1(*p);
   new_value = old_value | d;
 
-  while (!__kmp_compare_and_store8(p, old_value, new_value)) {
+  while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) {
     KMP_CPU_PAUSE();
     old_value = TCR_1(*p);
     new_value = old_value | d;
@@ -36,7 +36,7 @@ kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 d) {
   old_value = TCR_1(*p);
   new_value = old_value & d;
 
-  while (!__kmp_compare_and_store8(p, old_value, new_value)) {
+  while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) {
     KMP_CPU_PAUSE();
     old_value = TCR_1(*p);
     new_value = old_value & d;
@@ -50,8 +50,8 @@ kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 d) {
   old_value = TCR_4(*p);
   new_value = old_value | d;
 
-  while (!__kmp_compare_and_store32((volatile kmp_int32 *)p, old_value,
-                                    new_value)) {
+  while (!KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)p, old_value,
+                                      new_value)) {
     KMP_CPU_PAUSE();
     old_value = TCR_4(*p);
     new_value = old_value | d;
@@ -65,8 +65,8 @@ kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) {
   old_value = TCR_4(*p);
   new_value = old_value & d;
 
-  while (!__kmp_compare_and_store32((volatile kmp_int32 *)p, old_value,
-                                    new_value)) {
+  while (!KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)p, old_value,
+                                      new_value)) {
     KMP_CPU_PAUSE();
     old_value = TCR_4(*p);
     new_value = old_value & d;
@@ -74,6 +74,7 @@ kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) {
   return old_value;
 }
 
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
 kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 d) {
   kmp_int64 old_value, new_value;
 
@@ -101,14 +102,15 @@ kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 d) {
   return old_value;
 }
 #endif /* KMP_ARCH_X86 */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
 
 kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 d) {
   kmp_uint64 old_value, new_value;
 
   old_value = TCR_8(*p);
   new_value = old_value | d;
 
-  while (!__kmp_compare_and_store64((volatile kmp_int64 *)p, old_value,
-                                    new_value)) {
+  while (!KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)p, old_value,
+                                      new_value)) {
     KMP_CPU_PAUSE();
     old_value = TCR_8(*p);
     new_value = old_value | d;
@@ -122,8 +124,8 @@ kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 d) {
   old_value = TCR_8(*p);
   new_value = old_value & d;
 
-  while (!__kmp_compare_and_store64((volatile kmp_int64 *)p, old_value,
-                                    new_value)) {
+  while (!KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)p, old_value,
+                                      new_value)) {
     KMP_CPU_PAUSE();
     old_value = TCR_8(*p);
     new_value = old_value & d;
@@ -132,4 +134,57 @@ kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 d) {
   return old_value;
 }
 
-#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+#if KMP_ARCH_AARCH64
+int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+                           void *p_argv[]
+#if OMPT_SUPPORT
+                           ,
+                           void **exit_frame_ptr
+#endif
+) {
+#if OMPT_SUPPORT
+  *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+#endif
+
+  switch (argc) {
+  case 0:
+    (*pkfn)(&gtid, &tid);
+    break;
+  case 1:
+    (*pkfn)(&gtid, &tid, p_argv[0]);
+    break;
+  case 2:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
+    break;
+  case 3:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
+    break;
+  case 4:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
+    break;
+  case 5:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
+    break;
+  default: {
+    // p_argv[6] and onwards must be passed on the stack since 8 registers are
+    // already used.
+    size_t len = (argc - 6) * sizeof(void *);
+    void *argbuf = alloca(len);
+    memcpy(argbuf, &p_argv[6], len);
+  }
+    [[fallthrough]];
+  case 6:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5]);
+    break;
+  }
+
+#if OMPT_SUPPORT
+  *exit_frame_ptr = 0;
+#endif
+
+  return 1;
+}
+#endif
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 */
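Note (illustrative, not part of the patch): the C++ __kmp_invoke_microtask above replaces the assembly trampoline used on x86/x64. Under the Windows AArch64 ABI the first eight integer arguments are passed in registers x0-x7, so &gtid, &tid and up to six p_argv entries fit in registers; for argc > 6 the remaining pointers must already sit on the stack, which is what the alloca/memcpy in the default case arranges before falling through to the six-argument call. A reduced sketch of the same dispatch-by-argc idea, using a hypothetical variadic task type rather than kmp's microtask_t:

    // Hypothetical stand-in for kmp's variadic microtask signature.
    typedef void (*task_fn)(int *gtid, int *tid, ...);

    // Call fn with the first `argc` pointers from argv, mirroring the switch
    // in __kmp_invoke_microtask for small argument counts.
    static void invoke_small(task_fn fn, int *gtid, int *tid, int argc,
                             void **argv) {
      switch (argc) {
      case 0:
        fn(gtid, tid);
        break;
      case 1:
        fn(gtid, tid, argv[0]);
        break;
      case 2:
        fn(gtid, tid, argv[0], argv[1]);
        break;
      default:
        // Beyond the register budget the extra arguments have to be staged on
        // the stack, which is why the real routine uses alloca() first.
        break;
      }
    }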