diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index e1b4b1f68d1e7..880e5560e03c6 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2674,6 +2674,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { if (!isValidState()) return false; + bool ForwardIsOk = true; const Instruction *CurI; // Check forward until a call or the block end is reached. @@ -2682,19 +2683,18 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { auto *CB = dyn_cast(CurI); if (!CB) continue; - if (CB != &I && AlignedBarriers.contains(const_cast(CB))) { - break; - } + if (CB != &I && AlignedBarriers.contains(const_cast(CB))) + return true; const auto &It = CEDMap.find({CB, PRE}); if (It == CEDMap.end()) continue; if (!It->getSecond().IsReachingAlignedBarrierOnly) - return false; + ForwardIsOk = false; break; } while ((CurI = CurI->getNextNonDebugInstruction())); if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly) - return false; + ForwardIsOk = false; // Check backward until a call or the block beginning is reached. CurI = &I; @@ -2702,9 +2702,8 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { auto *CB = dyn_cast(CurI); if (!CB) continue; - if (CB != &I && AlignedBarriers.contains(const_cast(CB))) { - break; - } + if (CB != &I && AlignedBarriers.contains(const_cast(CB))) + return true; const auto &It = CEDMap.find({CB, POST}); if (It == CEDMap.end()) continue; @@ -2713,6 +2712,11 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { return false; } while ((CurI = CurI->getPrevNonDebugInstruction())); + // Delayed decision on the forward pass to allow aligned barrier detection + // in the backwards traversal. + if (!ForwardIsOk) + return false; + if (!CurI) { const BasicBlock *BB = I.getParent(); if (BB == &BB->getParent()->getEntryBlock()) diff --git a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll index ed4912810be28..f431906f7027a 100644 --- a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll +++ b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll @@ -21,6 +21,8 @@ target triple = "amdgcn-amd-amdhsa" @QB3 = internal addrspace(3) global i32 undef, align 4 @QC3 = internal addrspace(3) global i32 undef, align 4 @QD3 = internal addrspace(3) global i32 undef, align 4 +@UAA1 = internal addrspace(3) global i32 undef, align 4 +@UAA2 = internal addrspace(3) global i32 undef, align 4 @str = private unnamed_addr addrspace(4) constant [1 x i8] c"\00", align 1 ; Make sure we do not delete the stores to @G without also replacing the load with `1`. @@ -40,6 +42,8 @@ target triple = "amdgcn-amd-amdhsa" ; TUNIT: @QB3 = internal addrspace(3) global i32 undef, align 4 ; TUNIT: @QC3 = internal addrspace(3) global i32 undef, align 4 ; TUNIT: @QD3 = internal addrspace(3) global i32 undef, align 4 +; TUNIT: @UAA1 = internal addrspace(3) global i32 undef, align 4 +; TUNIT: @UAA2 = internal addrspace(3) global i32 undef, align 4 ; TUNIT: @str = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1 ; TUNIT: @kernel_nested_parallelism = weak constant i8 0 ;. @@ -58,6 +62,8 @@ target triple = "amdgcn-amd-amdhsa" ; CGSCC: @QB3 = internal addrspace(3) global i32 undef, align 4 ; CGSCC: @QC3 = internal addrspace(3) global i32 undef, align 4 ; CGSCC: @QD3 = internal addrspace(3) global i32 undef, align 4 +; CGSCC: @UAA1 = internal addrspace(3) global i32 undef, align 4 +; CGSCC: @UAA2 = internal addrspace(3) global i32 undef, align 4 ; CGSCC: @str = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1 ;. define void @kernel() "kernel" { @@ -646,6 +652,86 @@ S: ret void } +define void @kernel_unknown_and_aligned1(i1 %c) "kernel" { +; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned1 +; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] +; TUNIT: L: +; TUNIT-NEXT: call void @use1(i32 2) #[[ATTR7]] +; TUNIT-NEXT: ret void +; TUNIT: S: +; TUNIT-NEXT: call void @sync() +; TUNIT-NEXT: call void @barrier() #[[ATTR7]] +; TUNIT-NEXT: call void @sync() +; TUNIT-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned1 +; CGSCC-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] +; CGSCC: L: +; CGSCC-NEXT: call void @use1(i32 2) #[[ATTR6]] +; CGSCC-NEXT: ret void +; CGSCC: S: +; CGSCC-NEXT: call void @sync() +; CGSCC-NEXT: call void @barrier() #[[ATTR6]] +; CGSCC-NEXT: call void @sync() +; CGSCC-NEXT: ret void +; + br i1 %c, label %S, label %L +L: + call void @barrier(); + %v = load i32, ptr addrspace(3) @UAA1 + call void @use1(i32 %v) + ret void +S: + call void @sync(); + store i32 2, ptr addrspace(3) @UAA1 + call void @barrier(); + call void @sync(); + ret void +} + +define void @kernel_unknown_and_aligned2(i1 %c) "kernel" { +; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned2 +; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] +; TUNIT: L: +; TUNIT-NEXT: call void @sync() +; TUNIT-NEXT: call void @use1(i32 2) #[[ATTR7]] +; TUNIT-NEXT: ret void +; TUNIT: S: +; TUNIT-NEXT: call void @sync() +; TUNIT-NEXT: call void @barrier() #[[ATTR7]] +; TUNIT-NEXT: call void @sync() +; TUNIT-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned2 +; CGSCC-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] +; CGSCC: L: +; CGSCC-NEXT: call void @sync() +; CGSCC-NEXT: call void @use1(i32 2) #[[ATTR6]] +; CGSCC-NEXT: ret void +; CGSCC: S: +; CGSCC-NEXT: call void @sync() +; CGSCC-NEXT: call void @barrier() #[[ATTR6]] +; CGSCC-NEXT: call void @sync() +; CGSCC-NEXT: ret void +; + br i1 %c, label %S, label %L +L: + call void @sync(); + %v = load i32, ptr addrspace(3) @UAA2 + call void @use1(i32 %v) + ret void +S: + call void @sync(); + store i32 2, ptr addrspace(3) @UAA2 + call void @barrier(); + call void @sync(); + ret void +} + declare void @sync() declare void @barrier() norecurse nounwind nocallback "llvm.assume"="ompx_aligned_barrier" declare void @use1(i32) nosync norecurse nounwind nocallback @@ -654,7 +740,7 @@ declare void @__kmpc_target_deinit(ptr, i8) nocallback declare void @llvm.assume(i1) !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16} +!nvvm.annotations = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} @@ -673,6 +759,8 @@ declare void @llvm.assume(i1) !14 = !{ptr @kernel4d2, !"kernel", i32 1} !15 = !{ptr @kernel4c3, !"kernel", i32 1} !16 = !{ptr @kernel4d3, !"kernel", i32 1} +!17 = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1} +!18 = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1} ;. ; TUNIT: attributes #[[ATTR0]] = { norecurse "kernel" } @@ -709,4 +797,6 @@ declare void @llvm.assume(i1) ; CHECK: [[META14:![0-9]+]] = !{ptr @kernel4d2, !"kernel", i32 1} ; CHECK: [[META15:![0-9]+]] = !{ptr @kernel4c3, !"kernel", i32 1} ; CHECK: [[META16:![0-9]+]] = !{ptr @kernel4d3, !"kernel", i32 1} +; CHECK: [[META17:![0-9]+]] = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1} +; CHECK: [[META18:![0-9]+]] = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1} ;.