Skip to content

Commit

Permalink
[openmp] Provide an assembly implementation of __kmp_invoke_microtask…
Browse files Browse the repository at this point in the history
… on ARM

This fixes passing an arbitrarily large number of arguments to
microtasks, fixing the misc_bugs/many-microtask-args.c testcase on
ARM.

Differential Revision: https://reviews.llvm.org/D138704
  • Loading branch information
mstorsjo committed Dec 8, 2022
1 parent f850035 commit c2a289d
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 101 deletions.
4 changes: 2 additions & 2 deletions openmp/runtime/src/CMakeLists.txt
Expand Up @@ -95,8 +95,8 @@ else()
libomp_append(LIBOMP_CXXFILES z_Windows_NT-586_util.cpp)
if(${LIBOMP_ARCH} STREQUAL "i386" OR ${LIBOMP_ARCH} STREQUAL "x86_64")
libomp_append(LIBOMP_ASMFILES z_Windows_NT-586_asm.asm) # Windows assembly file
elseif(${LIBOMP_ARCH} STREQUAL "aarch64" AND (NOT MSVC OR CMAKE_C_COMPILER_ID STREQUAL "Clang"))
# z_Linux_asm.S works for AArch64 Windows too.
elseif((${LIBOMP_ARCH} STREQUAL "aarch64" OR ${LIBOMP_ARCH} STREQUAL "arm") AND (NOT MSVC OR CMAKE_C_COMPILER_ID STREQUAL "Clang"))
# z_Linux_asm.S works for AArch64 and ARM Windows too.
libomp_append(LIBOMP_GNUASMFILES z_Linux_asm.S)
else()
# AArch64 with MSVC gets implementations of the functions from
Expand Down
152 changes: 150 additions & 2 deletions openmp/runtime/src/z_Linux_asm.S
Expand Up @@ -108,7 +108,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_x86_64

#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)

# if KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
Expand Down Expand Up @@ -160,7 +160,11 @@ KMP_PREFIX_UNDERSCORE(\proc):
.cfi_endproc
// Not sure why we need .type and .size for the functions
ALIGN 2
#if KMP_ARCH_ARM
.type \proc,%function
#else
.type \proc,@function
#endif
.size \proc,.-\proc
.endm

Expand All @@ -172,7 +176,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
.endm
# endif // KMP_OS_DARWIN

#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)

.macro COMMON name, size, align_power
#if KMP_OS_DARWIN
Expand Down Expand Up @@ -1358,6 +1362,148 @@ KMP_LABEL(kmp_1):

#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */

#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM

//------------------------------------------------------------------------
// int
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
// int gtid, int tid,
// int argc, void *p_argv[]
// #if OMPT_SUPPORT
// ,
// void **exit_frame_ptr
// #endif
// ) {
// #if OMPT_SUPPORT
// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
// #endif
//
// (*pkfn)( & gtid, & tid, argv[0], ... );
//
// // FIXME: This is done at call-site and can be removed here.
// #if OMPT_SUPPORT
// *exit_frame_ptr = 0;
// #endif
//
// return 1;
// }
//
// Hand-written 32-bit ARM implementation. It copies gtid and tid into the
// local stack frame, passes their addresses plus the argc entries of p_argv
// to pkfn (first four arguments in r0-r3, the rest on the stack), and
// returns 1.
//
// parameters:
// r0: pkfn
// r1: gtid
// r2: tid
// r3: argc
// [sp, #0] at entry: p_argv (loaded into r4 after the prologue push)
// [sp, #4] at entry: &exit_frame (loaded into r5; only used with OMPT_SUPPORT)
//
// locals:
// __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
// __tid: tid parm pushed on stack so can pass &tid to pkfn
// (both are byte offsets below the base of the saved-register area)
//
// reg temps:
// r4: initially p_argv, then used to hold pkfn address
// r5: initially &exit_frame, then used as temporary for the stack size
// calculation and as the count of remaining pkfn parms
// r6: used to traverse p_argv array
// r7: frame pointer (in some configurations)
// r8: first holds the base of the callee saved area (sp after the push),
// then is the write cursor for outgoing stack parameters
// r9: saved and restored, but not otherwise used by this routine
// r10: used to preserve exit_frame_ptr, callee-save
// r11: frame pointer (in some configurations)
// r12: used as temporary when copying stack parameters
//
// return: r0 (always 1/TRUE)
//

// Byte offsets, subtracted from the base of the saved-register area (r8),
// of the local copies of gtid and tid.
__gtid = 4
__tid = 8

// -- Begin __kmp_invoke_microtask
// mark_begin;
.text
PROC __kmp_invoke_microtask

// Pushing one extra register (r3) to keep the stack aligned
// for when we call pkfn below
push {r3-r11,lr}
// Load p_argv and &exit_frame
// (10 registers were pushed above, so the first stack-passed argument
// is now at sp + 10*4.)
ldrd r4, r5, [sp, #10*4]

// FPOFF is the byte offset, from sp after the push, of the slot holding
// the saved frame pointer register: r7 is the 5th register in the push
// list (index 4) and r11 is the 9th (index 8).
# if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS)
# define FP r7
# define FPOFF 4*4
#else
# define FP r11
# define FPOFF 8*4
#endif
add FP, sp, #FPOFF
# if OMPT_SUPPORT
// Keep exit_frame_ptr in callee-saved r10 so it survives the call to
// pkfn, and publish the frame pointer value through it.
mov r10, r5
str FP, [r10]
# endif
// r8 = base of the saved-register area; gtid/tid are stored just below it.
mov r8, sp

// Calculate how much stack to allocate, in increments of 8 bytes.
// We strictly need 4*(argc-2) bytes (2 arguments are passed in
// registers) but allocate 4*argc for simplicity (to avoid needing
// to handle the argc<2 cases). We align the number of bytes
// allocated to 8 bytes, to keep the stack aligned. (Since we
// already allocate more than enough, it's ok to round down
// instead of up for the alignment.) We allocate another extra
// 8 bytes for gtid and tid.
// In effect: r5 = 1 + argc/2, then sp -= r5 * 8.
mov r5, #1
add r5, r5, r3, lsr #1
sub sp, sp, r5, lsl #3

// Store the local copies of gtid and tid, then move the incoming
// arguments out of r0-r3 so those registers can hold the pkfn arguments:
// r5 = argc, r6 = p_argv, r4 = pkfn.
str r1, [r8, #-__gtid]
str r2, [r8, #-__tid]
mov r5, r3
mov r6, r4
mov r4, r0

// Prepare the first 2 parameters to pkfn - pointers to gtid and tid
// in our stack frame.
sub r0, r8, #__gtid
sub r1, r8, #__tid

// From here on r8 is the write cursor for outgoing stack parameters,
// starting at the new sp.
mov r8, sp

// Load p_argv[0] and p_argv[1] into r2 and r3, if argc >= 1/2
cmp r5, #0
beq KMP_LABEL(kmp_1)
ldr r2, [r6]

subs r5, r5, #1
beq KMP_LABEL(kmp_1)
// Pre-indexed load with writeback: r6 advances to &p_argv[1].
ldr r3, [r6, #4]!

// Loop, loading the rest of p_argv and writing the elements on the
// stack.
// Each iteration decrements the remaining count first and exits when it
// reaches zero; otherwise it advances r6 to the next p_argv element and
// stores that element at r8, post-incrementing r8 by 4.
KMP_LABEL(kmp_0):
subs r5, r5, #1
beq KMP_LABEL(kmp_1)
ldr r12, [r6, #4]!
str r12, [r8], #4
b KMP_LABEL(kmp_0)
KMP_LABEL(kmp_1):
// Call pkfn, then set the return value to 1.
blx r4
mov r0, #1

// Restore sp to its value right after the prologue push (FP was set to
// that sp + FPOFF), discarding the outgoing-argument area.
sub r4, FP, #FPOFF
mov sp, r4
# undef FP
# undef FPOFF

# if OMPT_SUPPORT
// Clear *exit_frame_ptr (r10 was preserved across the call).
mov r1, #0
str r1, [r10]
# endif
// Restore the saved registers and return by popping the saved lr into pc.
pop {r3-r11,pc}

DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask

#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */

#if KMP_ARCH_PPC64

//------------------------------------------------------------------------
Expand Down Expand Up @@ -1919,7 +2065,9 @@ __kmp_invoke_microtask:
.global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
.4byte .gomp_critical_user_
#ifdef __ELF__
.size __kmp_unnamed_critical_addr,4
#endif
#endif /* KMP_ARCH_ARM */

#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
Expand Down
3 changes: 2 additions & 1 deletion openmp/runtime/src/z_Linux_util.cpp
Expand Up @@ -2448,7 +2448,8 @@ int __kmp_get_load_balance(int max) {

#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \
((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \
KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64)
KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
KMP_ARCH_ARM)

// we really only need the case with 1 argument, because CLANG always builds
// a struct of pointers to shared variables referenced in the outlined function
Expand Down
91 changes: 0 additions & 91 deletions openmp/runtime/src/z_Windows_NT-586_util.cpp
Expand Up @@ -189,95 +189,4 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
}
#endif

#if KMP_ARCH_ARM
// C fallback for __kmp_invoke_microtask on ARM. It mirrors the generic
// fallback in z_Linux_util.cpp (the one used on Linux on ARM): the microtask
// is called with pointers to the local copies of gtid and tid, followed by
// the first argc entries of p_argv. Only 0..15 extra arguments are handled;
// any other argc prints a diagnostic to stderr and terminates the process.
// Always returns 1 when the microtask call returns.
int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
                           void *p_argv[]
#if OMPT_SUPPORT
                           ,
                           void **exit_frame_ptr
#endif
) {
#if OMPT_SUPPORT
  // Publish this frame's address for the duration of the microtask call.
  *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
#endif

  // Short alias so each dispatch case stays readable.
  void **const av = p_argv;

  // A variadic call cannot be built at run time in portable C/C++, so every
  // supported argument count gets its own explicit call.
  switch (argc) {
  case 0:
    pkfn(&gtid, &tid);
    break;
  case 1:
    pkfn(&gtid, &tid, av[0]);
    break;
  case 2:
    pkfn(&gtid, &tid, av[0], av[1]);
    break;
  case 3:
    pkfn(&gtid, &tid, av[0], av[1], av[2]);
    break;
  case 4:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3]);
    break;
  case 5:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4]);
    break;
  case 6:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5]);
    break;
  case 7:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6]);
    break;
  case 8:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6], av[7]);
    break;
  case 9:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6], av[7],
         av[8]);
    break;
  case 10:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6], av[7],
         av[8], av[9]);
    break;
  case 11:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6], av[7],
         av[8], av[9], av[10]);
    break;
  case 12:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6], av[7],
         av[8], av[9], av[10], av[11]);
    break;
  case 13:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6], av[7],
         av[8], av[9], av[10], av[11], av[12]);
    break;
  case 14:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6], av[7],
         av[8], av[9], av[10], av[11], av[12], av[13]);
    break;
  case 15:
    pkfn(&gtid, &tid, av[0], av[1], av[2], av[3], av[4], av[5], av[6], av[7],
         av[8], av[9], av[10], av[11], av[12], av[13], av[14]);
    break;
  default:
    // Unsupported argument count: report and abort the process.
    fprintf(stderr, "Too many args to microtask: %d!\n", argc);
    fflush(stderr);
    exit(-1);
  }

#if OMPT_SUPPORT
  // The microtask has returned; clear the published frame address.
  *exit_frame_ptr = 0;
#endif

  return 1;
}
#endif

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 || KMP_ARCH_ARM */
5 changes: 0 additions & 5 deletions openmp/runtime/test/misc_bugs/many-microtask-args.c
@@ -1,11 +1,6 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>

// This test fails with Clang unless __kmp_invoke_microtask supports at least
// 17 arguments. On ARM, the fallback C implementation of __kmp_invoke_microtask
// is used, and that one only currently supports up to 15 arguments.
// XFAIL: arm

int main()
{

Expand Down

0 comments on commit c2a289d

Please sign in to comment.