From 523f90a2bad9995e52e2f00dd42836c726076b5a Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Thu, 18 Apr 2019 16:17:35 +0000 Subject: [PATCH] [SDA] Bug fix: Use IPD outside the loop as divergence bound Summary: The immediate post dominator of the loop header may be part of the divergent loop. Since this /was/ the divergence propagation bound, the SDA would not detect joins of divergent paths outside the loop. Reviewers: nhaehnle Reviewed By: nhaehnle Subscribers: mmasten, arsenm, jvesely, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59042 llvm-svn: 358681 --- llvm/lib/Analysis/SyncDependenceAnalysis.cpp | 28 +++++++++----- .../AMDGPU/hidden_diverge.ll | 37 +++++++++++++++++++ 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp index 54b4bb3113baf..3cf248a311428 100644 --- a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp @@ -218,14 +218,9 @@ struct DivergencePropagator { template <typename SuccessorIterable> std::unique_ptr<ConstBlockSet> computeJoinPoints(const BasicBlock &RootBlock, - SuccessorIterable NodeSuccessors, const Loop *ParentLoop) { + SuccessorIterable NodeSuccessors, const Loop *ParentLoop, const BasicBlock * PdBoundBlock) { assert(JoinBlocks); - // immediate post dominator (no join block beyond that block) - const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(&RootBlock)); - const auto *IpdNode = PdNode->getIDom(); - const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - // bootstrap with branch targets for (const auto *SuccBlock : NodeSuccessors) { DefMap.emplace(SuccBlock, SuccBlock); @@ -340,13 +335,23 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) { // already available in cache? 
auto ItCached = CachedLoopExitJoins.find(&Loop); - if (ItCached != CachedLoopExitJoins.end()) + if (ItCached != CachedLoopExitJoins.end()) { return *ItCached->second; + } + + // don't propagate beyond the immediate post dominator of the loop + const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(Loop.getHeader())); + const auto *IpdNode = PdNode->getIDom(); + const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + while (PdBoundBlock && Loop.contains(PdBoundBlock)) { + IpdNode = IpdNode->getIDom(); + PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; + } // compute all join points DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; auto JoinBlocks = Propagator.computeJoinPoints( - *Loop.getHeader(), LoopExits, Loop.getParentLoop()); + *Loop.getHeader(), LoopExits, Loop.getParentLoop(), PdBoundBlock); auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks)); assert(ItInserted.second); @@ -365,11 +370,16 @@ SyncDependenceAnalysis::join_blocks(const Instruction &Term) { if (ItCached != CachedBranchJoins.end()) return *ItCached->second; + // don't propagate beyond the immediate post dominator of the branch + const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(Term.getParent())); + const auto *IpdNode = PdNode->getIDom(); + const auto *PdBoundBlock = IpdNode ? 
IpdNode->getBlock() : nullptr; + // compute all join points DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; const auto &TermBlock = *Term.getParent(); auto JoinBlocks = Propagator.computeJoinPoints( - TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock)); + TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock), PdBoundBlock); auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); assert(ItInserted.second); diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll index 660c4f573f1df..889553d347123 100644 --- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll +++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll @@ -21,6 +21,43 @@ merge: ret void } +define amdgpu_kernel void @hidden_loop_ipd(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_loop_ipd' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cond.var = icmp slt i32 %tid, 0 +; CHECK: DIVERGENT: %cond.var = icmp + %cond.uni = icmp slt i32 %n, 0 +; CHECK-NOT: DIVERGENT: %cond.uni = icmp + br label %for.header +for.header: + br i1 %cond.var, label %A, label %B +A: + br label %C +B: + br label %C +C: + br i1 %cond.uni, label %E, label %D +D: + br i1 %cond.var, label %for.header, label %F + +E: + %e.lcssa.uni = phi i32 [ 0, %C ] +; CHECK-NOT: DIVERGENT: %e.lcssa.uni = phi i32 + br label %G + +F: + %f.lcssa.uni = phi i32 [ 1, %D ] +; CHECK-NOT: DIVERGENT: %f.lcssa.uni = phi i32 + br label %G + +G: + %g.join.var = phi i32 [ %e.lcssa.uni, %E ], [ %f.lcssa.uni, %F ] +; CHECK: DIVERGENT: %g.join.var = phi i32 + ret void +} + + declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone }