Skip to content

Commit

Permalink
[OpenMP] Do not SPMDize generic regions with no parallelism
Browse files Browse the repository at this point in the history
This patch changes SPMDization so that it no longer triggers for regions
that contain no parallelism. SPMDizing such a region would introduce
unnecessary barriers that slow down the single-threaded code.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D109438
  • Loading branch information
jhuber6 committed Sep 8, 2021
1 parent e567356 commit 6b9a3ec
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 12 deletions.
11 changes: 8 additions & 3 deletions llvm/lib/Transforms/IPO/OpenMPOpt.cpp
Expand Up @@ -599,6 +599,12 @@ struct KernelInfoState : AbstractState {
return true;
}

/// Reports whether at least one parallel region has been recorded as reached
/// for this kernel, i.e. whether either the known or the unknown
/// reached-parallel-region container is non-empty.
bool mayContainParallelRegion() {
  // "No parallel region" means both containers are empty; negate that.
  const bool NoParallelRegionReached =
      ReachedKnownParallelRegions.empty() &&
      ReachedUnknownParallelRegions.empty();
  return !NoParallelRegionReached;
}

/// Produce the best state of potential values (described upstream as the
/// empty set) by constructing a KernelInfoState with `true`.
static KernelInfoState getBestState() {
  return KernelInfoState(true);
}

Expand Down Expand Up @@ -3003,7 +3009,7 @@ struct AAKernelInfoFunction : AAKernelInfo {

// If we can, we change the execution mode to SPMD-mode; otherwise we build a
// custom state machine.
if (!changeToSPMDMode(A))
if (!mayContainParallelRegion() || !changeToSPMDMode(A))
buildCustomStateMachine(A);

return ChangeStatus::CHANGED;
Expand Down Expand Up @@ -3308,8 +3314,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
// happen if there simply are no parallel regions. In the resulting kernel
// all worker threads will simply exit right away, leaving the main thread
// to do the work alone.
if (ReachedKnownParallelRegions.empty() &&
ReachedUnknownParallelRegions.empty()) {
if (!mayContainParallelRegion()) {
++NumOpenMPTargetRegionKernelsWithoutStateMachine;

auto Remark = [&](OptimizationRemark OR) {
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/OpenMP/always_inline_device.ll
Expand Up @@ -12,11 +12,11 @@ define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
; CHECK: Function Attrs: convergent norecurse nounwind
; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false)
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; CHECK: user_code.entry:
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false)
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
; CHECK-NEXT: ret void
; CHECK: worker.exit:
; CHECK-NEXT: ret void
Expand Down
Expand Up @@ -54,18 +54,40 @@ define weak void @kernel1() #0 {
define weak void @kernel2() #0 {
; CHECK-LABEL: define {{[^@]+}}@kernel2
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; CHECK: common.ret:
; CHECK-NEXT: ret void
; CHECK: user_code.entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) #[[ATTR1]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @helper0() #[[ATTR1]]
; CHECK-NEXT: call void @helper1() #[[ATTR1]]
; CHECK-NEXT: call void @helper2() #[[ATTR1]]
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0)
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
; CHECK-NEXT: ret void
;
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
entry:
%captured_vars_addrs = alloca [0 x i8*], align 8
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true)
%exec_user_code = icmp eq i32 %i, -1
br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:
ret void

user_code.entry:
%0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null)
%1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
call void @helper0()
call void @helper1()
call void @helper2()
call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0)
call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true)
ret void
}

Expand Down Expand Up @@ -136,9 +158,31 @@ define internal void @helper2() {
ret void
}

; Outlined function with an empty body: it takes two noalias i32* arguments
; and returns immediately. Its address is passed (bitcast to i8*) to the
; @__kmpc_parallel_51 call in @kernel2.
define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
ret void
}

; Wrapper function with an empty body: it takes an i16 (zeroext) and an i32
; argument and returns immediately. Its address is passed (bitcast to i8*),
; alongside @__omp_outlined__, to the @__kmpc_parallel_51 call in @kernel2.
define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) {
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
ret void
}

declare i32 @__kmpc_get_hardware_num_threads_in_block()
declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1
declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1
declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
declare i32 @__kmpc_global_thread_num(%struct.ident_t*)


!llvm.module.flags = !{!0, !1}
Expand All @@ -155,7 +199,8 @@ attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"}
;.
; CHECK: attributes #[[ATTR0]] = { "omp_target_num_teams"="777" "omp_target_thread_limit"="666" }
; CHECK: attributes #[[ATTR1]] = { nounwind }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { alwaysinline }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
Expand Down
54 changes: 50 additions & 4 deletions llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
Expand Up @@ -38,14 +38,36 @@ define weak void @is_spmd() {

define weak void @will_be_spmd() {
; CHECK-LABEL: define {{[^@]+}}@will_be_spmd() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; CHECK: common.ret:
; CHECK-NEXT: ret void
; CHECK: user_code.entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @is_spmd_helper2()
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0)
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
; CHECK-NEXT: ret void
;
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
entry:
%captured_vars_addrs = alloca [0 x i8*], align 8
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true)
%exec_user_code = icmp eq i32 %i, -1
br i1 %exec_user_code, label %user_code.entry, label %common.ret

common.ret:
ret void

user_code.entry:
%0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null)
%1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
call void @is_spmd_helper2()
call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0)
call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true)
ret void
}

Expand Down Expand Up @@ -153,10 +175,32 @@ define internal void @is_mixed_helper() {
ret void
}

; Outlined function with an empty body: it takes two noalias i32* arguments
; and returns immediately. Its address is passed (bitcast to i8*) to the
; @__kmpc_parallel_51 call in @will_be_spmd.
define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
ret void
}

; Wrapper function with an empty body: it takes an i16 (zeroext) and an i32
; argument and returns immediately. Its address is passed (bitcast to i8*),
; alongside @__omp_outlined__, to the @__kmpc_parallel_51 call in
; @will_be_spmd.
define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) {
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
ret void
}

declare void @spmd_compatible() "llvm.assume"="ompx_spmd_amenable"
declare i8 @__kmpc_is_spmd_exec_mode()
declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1
declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1
declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext)
declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext)
declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
declare void @foo()
declare void @bar()

Expand All @@ -171,6 +215,8 @@ declare void @bar()
!5 = !{void ()* @will_not_be_spmd, !"kernel", i32 1}
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { alwaysinline }
; CHECK: attributes #[[ATTR2]] = { nounwind }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
Expand Down

0 comments on commit 6b9a3ec

Please sign in to comment.