Reapply: [AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm (#152161) #166195
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Robert Imschweiler (ro-i)

Changes

Reapply #152161 with fixed 'changed' flags (see second commit).

Patch is 48.54 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/166195.diff

10 Files Affected:
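A quick recap before the diff: callbr is the IR terminator this patch teaches the AMDGPU control-flow passes to handle; it wraps inline asm that either falls through to its default destination or jumps to one of its indirect labels. A minimal form, adapted from the new test llvm/test/CodeGen/AMDGPU/callbr.ll added by this patch:

  callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c)
      to label %fallthrough [label %indirect]

The "r" constraint passes %c in a register, "!i" marks the indirect label operand, and control reaches %fallthrough unless the asm itself branches to %indirect.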
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index ddf9a24eb5230..fe81a5efd9d51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
return NewRetBlock;
}
+static BasicBlock *
+createDummyReturnBlock(Function &F,
+ SmallVector<BasicBlock *, 4> &ReturningBlocks) {
+ BasicBlock *DummyReturnBB =
+ BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
+ ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+ ReturningBlocks.push_back(DummyReturnBB);
+ return DummyReturnBB;
+}
+
+/// Handle conditional branch instructions (-> 2 targets) and callbr
+/// instructions with N targets.
+static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
+ BasicBlock *DummyReturnBB,
+ std::vector<DominatorTree::UpdateType> &Updates) {
+ SmallVector<BasicBlock *, 2> Successors(successors(BB));
+
+ // Create a new transition block to hold the conditional branch.
+ BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
+
+ Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
+
+ // 'Successors' become successors of TransitionBB instead of BB,
+ // and TransitionBB becomes a single successor of BB.
+ Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
+ for (BasicBlock *Successor : Successors) {
+ Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
+ Updates.emplace_back(DominatorTree::Delete, BB, Successor);
+ }
+
+ // Create a branch that will always branch to the transition block and
+ // references DummyReturnBB.
+ BB->getTerminator()->eraseFromParent();
+ BranchInst::Create(TransitionBB, DummyReturnBB,
+ ConstantInt::getTrue(F.getContext()), BB);
+ Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+}
+
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
const PostDominatorTree &PDT,
const UniformityInfo &UA) {
- assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
- !isa<BranchInst>(PDT.getRoot()->getTerminator())))
+ !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
return false;
// Loop over all of the blocks in a function, tracking all of the blocks that
@@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
if (HasDivergentExitBlock)
UnreachableBlocks.push_back(BB);
} else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-
- ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
- if (DummyReturnBB == nullptr) {
- DummyReturnBB =
- BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
- Type *RetTy = F.getReturnType();
- Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
- ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
- ReturningBlocks.push_back(DummyReturnBB);
- }
+ if (!DummyReturnBB)
+ DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
if (BI->isUnconditional()) {
BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
BI->eraseFromParent(); // Delete the unconditional branch.
// Add a new conditional branch with a dummy edge to the return block.
- BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
- Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
- } else { // Conditional branch.
- SmallVector<BasicBlock *, 2> Successors(successors(BB));
-
- // Create a new transition block to hold the conditional branch.
- BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
- Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
- // 'Successors' become successors of TransitionBB instead of BB,
- // and TransitionBB becomes a single successor of BB.
- Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
- for (BasicBlock *Successor : Successors) {
- Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
- Updates.emplace_back(DominatorTree::Delete, BB, Successor);
- }
-
- // Create a branch that will always branch to the transition block and
- // references DummyReturnBB.
- BB->getTerminator()->eraseFromParent();
- BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+ BranchInst::Create(LoopHeaderBB, DummyReturnBB,
+ ConstantInt::getTrue(F.getContext()), BB);
Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+ } else {
+ handleNBranch(F, BB, BI, DummyReturnBB, Updates);
}
Changed = true;
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
+ if (!DummyReturnBB)
+ DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+ handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
+ Changed = true;
+ } else {
+ llvm_unreachable("unsupported block terminator");
}
}
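To visualize what this pass now does for callbr (an illustrative sketch; the block names follow the pass's own choices, but the surrounding IR is invented for the example), a divergent exiting block such as

  bb:
    callbr void asm "", ""() to label %loop []

is rewritten so that every path formally reaches a return:

  bb:
    br i1 true, label %TransitionBlock, label %DummyReturnBlock
  TransitionBlock:
    callbr void asm "", ""() to label %loop []
  DummyReturnBlock:
    ret void

The always-true branch preserves the original semantics, and the same splitting logic (handleNBranch) now serves both conditional branches (two targets) and callbr (N targets). The IR check lines in the infinite-loop.ll hunks below show exactly this shape.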
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 5f6f66a4bc213..0a8f5ea2fdae1 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
} else {
// Test for successors as back edge
BasicBlock *BB = N->getNodeAs<BasicBlock>();
- BranchInst *Term = cast<BranchInst>(BB->getTerminator());
-
- for (BasicBlock *Succ : Term->successors())
- if (Visited.count(Succ))
- Loops[Succ] = BB;
+ if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
+ for (BasicBlock *Succ : Term->successors())
+ if (Visited.count(Succ))
+ Loops[Succ] = BB;
}
}
@@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
for (BasicBlock *P : predecessors(BB)) {
// Ignore it if it's a branch from outside into our region entry
- if (!ParentRegion->contains(P))
+ if (!ParentRegion->contains(P) || !isa<BranchInst>(P->getTerminator()))
continue;
Region *R = RI->getRegionFor(P);
@@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
/// Run the transformation for each region found
bool StructurizeCFG::run(Region *R, DominatorTree *DT,
const TargetTransformInfo *TTI) {
- if (R->isTopLevelRegion())
+ // CallBr and its corresponding direct target blocks are for now ignored by
+ // this pass. This is not a limitation for the currently intended use cases
+ // of callbr in the AMDGPU backend.
+ // Parent and child regions are not affected by this (current) restriction.
+ // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
+ if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator()))
return false;
this->DT = DT;
this->TTI = TTI;
Func = R->getEntry()->getParent();
- assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
ParentRegion = R;
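For intuition on the new bail-out (an illustrative sketch, not taken from the patch; the authoritative cases are in llvm/test/Transforms/StructurizeCFG/callbr.ll): a region whose entry block ends in a callbr is now left unstructurized, while parent and child regions are still processed.

  entry:                   ; region entry ends in callbr -> region is skipped
    callbr void asm "", "!i"() to label %direct [label %indirect]
  direct:
    br label %exit
  indirect:
    br label %exit
  exit:
    ret void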
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 94c5c1709f43e..e86ab13094b15 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix;
// Redirect exiting edges through a control flow hub.
ControlFlowHub CHub;
+ bool Changed = false;
for (unsigned I = 0; I < ExitingBlocks.size(); ++I) {
BasicBlock *BB = ExitingBlocks[I];
@@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
bool UpdatedLI = false;
BasicBlock *NewSucc =
SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI);
+ // SplitCallBrEdge modifies the CFG because it creates an intermediate
+ // block. So we need to set the changed flag no matter what the
+ // ControlFlowHub is going to do later.
+ Changed = true;
// Even if CallBr and Succ do not have a common parent loop, we need to
// add the new target block to the parent loop of the current loop.
if (!UpdatedLI)
@@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
bool ChangedCFG;
std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
&DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue());
+ ChangedCFG |= Changed;
if (!ChangedCFG)
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll
new file mode 100644
index 0000000000000..253a6ec100eae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/callbr.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
+define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) {
+; CHECK-LABEL: callbr_inline_asm:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dword v0, v[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; %bb.1: ; %fallthrough
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[2:3], v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target
+; CHECK-NEXT: ; %indirect
+; CHECK-NEXT: ; Label of block must be emitted
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[4:5], v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %a = load i32, ptr %src, align 4
+ callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect]
+fallthrough:
+ store i32 %a, ptr %dst1, align 4
+ br label %ret
+indirect:
+ store i32 %a, ptr %dst2, align 4
+ br label %ret
+ret:
+ ret void
+}
+
+define void @callbr_self_loop(i1 %c) {
+; CHECK-LABEL: callbr_self_loop:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: .LBB1_1: ; %callbr
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_branch .LBB1_1
+; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target
+; CHECK-NEXT: ; %callbr.target.ret
+; CHECK-NEXT: ; Label of block must be emitted
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ br label %callbr
+callbr:
+ callbr void asm "", "!i"() to label %callbr [label %ret]
+ret:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
index 007e3f0a6bdbc..076a99ff8588f 100644
--- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
+++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
@@ -3,6 +3,7 @@
declare void @foo(ptr)
declare i1 @bar(ptr)
+declare i32 @bar32(ptr)
define void @musttail_call_without_return_value(ptr %p) {
; CHECK-LABEL: define void @musttail_call_without_return_value(
@@ -28,6 +29,31 @@ bb.1:
ret void
}
+define void @musttail_call_without_return_value_callbr(ptr %p) {
+; CHECK-LABEL: define void @musttail_call_without_return_value_callbr(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
+; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
+; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
+; CHECK: [[BB_0]]:
+; CHECK-NEXT: musttail call void @foo(ptr [[P]])
+; CHECK-NEXT: ret void
+; CHECK: [[BB_1:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %load = load i32, ptr %p, align 1
+ callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
+
+bb.0:
+ musttail call void @foo(ptr %p)
+ ret void
+
+bb.1:
+ ret void
+}
+
define i1 @musttail_call_with_return_value(ptr %p) {
; CHECK-LABEL: define i1 @musttail_call_with_return_value(
; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
@@ -51,3 +77,28 @@ bb.0:
bb.1:
ret i1 %load
}
+
+define i32 @musttail_call_with_return_value_callbr(ptr %p) {
+; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
+; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
+; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
+; CHECK: [[BB_0]]:
+; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]])
+; CHECK-NEXT: ret i32 [[RET]]
+; CHECK: [[BB_1:.*:]]
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+entry:
+ %load = load i32, ptr %p, align 1
+ callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
+
+bb.0:
+ %ret = musttail call i32 @bar32(ptr %p)
+ ret i32 %ret
+
+bb.1:
+ ret i32 %load
+}
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 3e2e43faca5aa..df635925b87df 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -36,26 +36,60 @@ loop:
br label %loop
}
+define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loop_callbr:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: ;;#ASMSTART
+; SI-NEXT: ;;#ASMEND
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+; IR-LABEL: @infinite_loop_callbr(
+; IR-NEXT: entry:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[LOOP:%.*]] []
+; IR: loop:
+; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR: TransitionBlock:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[LOOP]] []
+; IR: DummyReturnBlock:
+; IR-NEXT: ret void
+;
+entry:
+ callbr void asm "", ""() to label %loop []
+
+loop:
+ store volatile i32 999, ptr addrspace(1) %out, align 4
+ callbr void asm "", ""() to label %loop []
+}
+
define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) {
; SI-LABEL: infinite_loop_ret:
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; SI-NEXT: s_cbranch_execz .LBB1_3
+; SI-NEXT: s_cbranch_execz .LBB2_3
; SI-NEXT: ; %bb.1: ; %loop.preheader
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_and_b64 vcc, exec, -1
-; SI-NEXT: .LBB1_2: ; %loop
+; SI-NEXT: .LBB2_2: ; %loop
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB1_2
-; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock
+; SI-NEXT: s_cbranch_vccnz .LBB2_2
+; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loop_ret(
; IR-NEXT: entry:
@@ -81,44 +115,93 @@ return:
ret void
}
+define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loop_ret_callbr:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: ;;#ASMSTART
+; SI-NEXT: ;;#ASMEND
+; SI-NEXT: ; %bb.1: ; %loop.preheader
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: .LBB3_2: ; Inline asm indirect target
+; SI-NEXT: ; %UnifiedReturnBlock
+; SI-NEXT: ; Label of block must be emitted
+; SI-NEXT: s_endpgm
+; IR-LABEL: @infinite_loop_ret_callbr(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1
+; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32
+; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]])
+; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock]
+; IR: loop:
+; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR: TransitionBlock:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[LOOP]] []
+; IR: UnifiedReturnBlock:
+; IR-NEXT: ret void
+;
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %cond = icmp eq i32 %tmp, 1
+ %cond32 = zext i1 %cond to i32
+ callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return]
+
+loop:
+ store volatile i32 999, ptr addrspace(1) %out, align 4
+ callbr void asm "", ""() to label %loop []
+
+return:
+ ret void
+}
+
define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {
; SI-LABEL: infinite_loops:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b64 s[2:3], -1
-; SI-NEXT: s_cbranch_scc1 .LBB2_4
+; SI-NEXT: s_cbranch_scc1 .LBB4_4
; SI-NEXT: ; %bb.1:
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x378
; SI-NEXT: s_and_b64 vcc, exec, -1
-; SI-NEXT: .LBB2_2: ; %loop2
+; SI-NEXT: .LBB4_2: ; %loop2
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB2_2
+; SI-NEXT: s_cbranch_vccnz .LBB4_2
; SI-NEXT: ; %bb.3: ; %Flow
; SI-NEXT: s_mov_b64 s[2:3], 0
-; SI-NEXT: .LBB2_4: ; %Flow2
+; SI-NEXT: .LBB4_4: ; %Flow2
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccz .LBB2_7
+; SI-NEXT: s_cbranch_vccz .LBB4_7
; SI-NEXT: ; %bb.5:
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_and_b64 vcc, exec, 0
-; SI-NEXT: .LBB2_6: ; %loop1
+; SI-NEXT: .LBB4_6: ; %loop1
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccz .LBB2_6
-; SI-NEXT: .LBB2_7: ; %DummyReturnBlock
+; SI-NEXT: s_cbranch_vccz .LBB4_6
+; SI-NEXT: .LBB4_7: ; %DummyReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loops(
; IR-NEXT: entry:
@@ -144,24 +227,78 @@ loop2:
br label %loop2
}
+define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loops_callbr:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ;;#ASMSTART
+; SI-NEXT: ;;#ASMEND
+; SI-NEXT: ; %bb.1: ; %loop1
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: ...
[truncated]
@llvm/pr-subscribers-backend-amdgpu

Author: Robert Imschweiler (ro-i)

(Same patch summary and truncated diff as in the @llvm/pr-subscribers-llvm-transforms comment above.)
Reapply #152161 with fixed 'changed' flags (see second commit).