Revert "[AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm" #166186
Conversation
Revert "[AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm"
This reverts commit 332f9b5.
@llvm/pr-subscribers-backend-amdgpu
Author: Robert Imschweiler (ro-i)
Changes
Reverts llvm/llvm-project#152161
This needs to be reverted to fix the changed logic that fails under the expensive checks. Example failure from the buildbot:
Patch is 47.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166186.diff
9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 706237b906cc3..733c5d520fb23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -181,52 +181,14 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
return NewRetBlock;
}
-static BasicBlock *
-createDummyReturnBlock(Function &F,
- SmallVector<BasicBlock *, 4> &ReturningBlocks) {
- BasicBlock *DummyReturnBB =
- BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
- Type *RetTy = F.getReturnType();
- Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
- ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
- ReturningBlocks.push_back(DummyReturnBB);
- return DummyReturnBB;
-}
-
-/// Handle conditional branch instructions (-> 2 targets) and callbr
-/// instructions with N targets.
-static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
- BasicBlock *DummyReturnBB,
- std::vector<DominatorTree::UpdateType> &Updates) {
- SmallVector<BasicBlock *, 2> Successors(successors(BB));
-
- // Create a new transition block to hold the conditional branch.
- BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
- Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
- // 'Successors' become successors of TransitionBB instead of BB,
- // and TransitionBB becomes a single successor of BB.
- Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
- for (BasicBlock *Successor : Successors) {
- Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
- Updates.emplace_back(DominatorTree::Delete, BB, Successor);
- }
-
- // Create a branch that will always branch to the transition block and
- // references DummyReturnBB.
- BB->getTerminator()->eraseFromParent();
- BranchInst::Create(TransitionBB, DummyReturnBB,
- ConstantInt::getTrue(F.getContext()), BB);
- Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-}
-
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
const PostDominatorTree &PDT,
const UniformityInfo &UA) {
+ assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
+
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
- !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
+ !isa<BranchInst>(PDT.getRoot()->getTerminator())))
return false;
// Loop over all of the blocks in a function, tracking all of the blocks that
@@ -260,27 +222,46 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
if (HasDivergentExitBlock)
UnreachableBlocks.push_back(BB);
} else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
- if (!DummyReturnBB)
- DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+ ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+ if (DummyReturnBB == nullptr) {
+ DummyReturnBB = BasicBlock::Create(F.getContext(),
+ "DummyReturnBlock", &F);
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
+ ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+ ReturningBlocks.push_back(DummyReturnBB);
+ }
if (BI->isUnconditional()) {
BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
BI->eraseFromParent(); // Delete the unconditional branch.
// Add a new conditional branch with a dummy edge to the return block.
- BranchInst::Create(LoopHeaderBB, DummyReturnBB,
- ConstantInt::getTrue(F.getContext()), BB);
+ BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
+ Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+ } else { // Conditional branch.
+ SmallVector<BasicBlock *, 2> Successors(successors(BB));
+
+ // Create a new transition block to hold the conditional branch.
+ BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
+
+ Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
+
+ // 'Successors' become successors of TransitionBB instead of BB,
+ // and TransitionBB becomes a single successor of BB.
+ Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
+ for (BasicBlock *Successor : Successors) {
+ Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
+ Updates.emplace_back(DominatorTree::Delete, BB, Successor);
+ }
+
+ // Create a branch that will always branch to the transition block and
+ // references DummyReturnBB.
+ BB->getTerminator()->eraseFromParent();
+ BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
- } else {
- handleNBranch(F, BB, BI, DummyReturnBB, Updates);
}
Changed = true;
- } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
- if (!DummyReturnBB)
- DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
-
- handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
- } else {
- llvm_unreachable("unsupported block terminator");
}
}
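For context on the hunk above: both the reverted handleNBranch helper and the restored inline code record every CFG edge change as a DominatorTree update and apply the whole batch at once. A minimal sketch of that pattern, using only the public DominatorTree API (the helper name here is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include <vector>

using namespace llvm;

// Hypothetical helper illustrating the batched-update pattern used above.
static void rewireThroughTransition(DominatorTree &DT, BasicBlock *BB,
                                    BasicBlock *TransitionBB,
                                    ArrayRef<BasicBlock *> Successors) {
  std::vector<DominatorTree::UpdateType> Updates;
  Updates.reserve(2 * Successors.size() + 1);
  // BB now branches to TransitionBB ...
  Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
  for (BasicBlock *Succ : Successors) {
    // ... and the old successors hang off TransitionBB instead of BB.
    Updates.emplace_back(DominatorTree::Insert, TransitionBB, Succ);
    Updates.emplace_back(DominatorTree::Delete, BB, Succ);
  }
  // One batched application keeps the tree valid without a full rebuild.
  DT.applyUpdates(Updates);
}

Batching the edits lets applyUpdates process them incrementally instead of the pass recomputing the dominator tree after every block split.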
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 0a8f5ea2fdae1..5f6f66a4bc213 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -558,10 +558,11 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
} else {
// Test for successors as back edge
BasicBlock *BB = N->getNodeAs<BasicBlock>();
- if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
- for (BasicBlock *Succ : Term->successors())
- if (Visited.count(Succ))
- Loops[Succ] = BB;
+ BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+ for (BasicBlock *Succ : Term->successors())
+ if (Visited.count(Succ))
+ Loops[Succ] = BB;
}
}
@@ -593,7 +594,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
for (BasicBlock *P : predecessors(BB)) {
// Ignore it if it's a branch from outside into our region entry
- if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))
+ if (!ParentRegion->contains(P))
continue;
Region *R = RI->getRegionFor(P);
@@ -1401,17 +1402,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
/// Run the transformation for each region found
bool StructurizeCFG::run(Region *R, DominatorTree *DT,
const TargetTransformInfo *TTI) {
- // CallBr and its corresponding direct target blocks are for now ignored by
- // this pass. This is not a limitation for the currently intended uses cases
- // of callbr in the AMDGPU backend.
- // Parent and child regions are not affected by this (current) restriction.
- // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
- if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator()))
+ if (R->isTopLevelRegion())
return false;
this->DT = DT;
this->TTI = TTI;
Func = R->getEntry()->getParent();
+ assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
ParentRegion = R;
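The revert also restores the hasOnlySimpleTerminator assertion that the original patch had dropped so callbr terminators could reach the pass. As a rough illustration of what such a predicate checks (an assumed reimplementation for exposition, not LLVM's actual definition):

#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative sketch only; the real hasOnlySimpleTerminator lives in LLVM.
static bool hasOnlySimpleTerminatorSketch(const Function &F) {
  for (const BasicBlock &BB : F) {
    const Instruction *Term = BB.getTerminator();
    // Plain branches, returns, and unreachable are "simple"; callbr,
    // switch, invoke, and the like are not and would fire the assertion.
    if (!isa<BranchInst, ReturnInst, UnreachableInst>(Term))
      return false;
  }
  return true;
}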
diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll
deleted file mode 100644
index 253a6ec100eae..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/callbr.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
-
-define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) {
-; CHECK-LABEL: callbr_inline_asm:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_load_dword v0, v[0:1]
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; %bb.1: ; %fallthrough
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_dword v[2:3], v0
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
-; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target
-; CHECK-NEXT: ; %indirect
-; CHECK-NEXT: ; Label of block must be emitted
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_dword v[4:5], v0
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
- %a = load i32, ptr %src, align 4
- callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect]
-fallthrough:
- store i32 %a, ptr %dst1, align 4
- br label %ret
-indirect:
- store i32 %a, ptr %dst2, align 4
- br label %ret
-ret:
- ret void
-}
-
-define void @callbr_self_loop(i1 %c) {
-; CHECK-LABEL: callbr_self_loop:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: .LBB1_1: ; %callbr
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_branch .LBB1_1
-; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target
-; CHECK-NEXT: ; %callbr.target.ret
-; CHECK-NEXT: ; Label of block must be emitted
-; CHECK-NEXT: s_setpc_b64 s[30:31]
- br label %callbr
-callbr:
- callbr void asm "", "!i"() to label %callbr [label %ret]
-ret:
- ret void
-}
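The deleted test above exercised both kinds of callbr destinations: the fallthrough label after "to" and the indirect labels in brackets. A small sketch of how a pass can query them, assuming only the standard CallBrInst accessors (the function itself is hypothetical):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: counts all possible destinations of a callbr
// terminator like the ones in the deleted test.
static unsigned countCallBrDests(const BasicBlock &BB) {
  const auto *CBI = dyn_cast<CallBrInst>(BB.getTerminator());
  if (!CBI)
    return 0;
  // getDefaultDest() is the fallthrough ("to label %fallthrough" above);
  // the indirect destinations are the bracketed labels ("[label %indirect]").
  return 1 + CBI->getNumIndirectDests();
}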
diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
index 076a99ff8588f..007e3f0a6bdbc 100644
--- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
+++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
@@ -3,7 +3,6 @@
declare void @foo(ptr)
declare i1 @bar(ptr)
-declare i32 @bar32(ptr)
define void @musttail_call_without_return_value(ptr %p) {
; CHECK-LABEL: define void @musttail_call_without_return_value(
@@ -29,31 +28,6 @@ bb.1:
ret void
}
-define void @musttail_call_without_return_value_callbr(ptr %p) {
-; CHECK-LABEL: define void @musttail_call_without_return_value_callbr(
-; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
-; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
-; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
-; CHECK: [[BB_0]]:
-; CHECK-NEXT: musttail call void @foo(ptr [[P]])
-; CHECK-NEXT: ret void
-; CHECK: [[BB_1:.*:]]
-; CHECK-NEXT: ret void
-;
-entry:
- %load = load i32, ptr %p, align 1
- callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
-
-bb.0:
- musttail call void @foo(ptr %p)
- ret void
-
-bb.1:
- ret void
-}
-
define i1 @musttail_call_with_return_value(ptr %p) {
; CHECK-LABEL: define i1 @musttail_call_with_return_value(
; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
@@ -77,28 +51,3 @@ bb.0:
bb.1:
ret i1 %load
}
-
-define i32 @musttail_call_with_return_value_callbr(ptr %p) {
-; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr(
-; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
-; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
-; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
-; CHECK: [[BB_0]]:
-; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]])
-; CHECK-NEXT: ret i32 [[RET]]
-; CHECK: [[BB_1:.*:]]
-; CHECK-NEXT: ret i32 [[LOAD]]
-;
-entry:
- %load = load i32, ptr %p, align 1
- callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
-
-bb.0:
- %ret = musttail call i32 @bar32(ptr %p)
- ret i32 %ret
-
-bb.1:
- ret i32 %load
-}
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index df635925b87df..3e2e43faca5aa 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -36,60 +36,26 @@ loop:
br label %loop
}
-define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) {
-; SI-LABEL: infinite_loop_callbr:
-; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: ;;#ASMSTART
-; SI-NEXT: ;;#ASMEND
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: s_endpgm
-; IR-LABEL: @infinite_loop_callbr(
-; IR-NEXT: entry:
-; IR-NEXT: callbr void asm "", ""()
-; IR-NEXT: to label [[LOOP:%.*]] []
-; IR: loop:
-; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
-; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]]
-; IR: TransitionBlock:
-; IR-NEXT: callbr void asm "", ""()
-; IR-NEXT: to label [[LOOP]] []
-; IR: DummyReturnBlock:
-; IR-NEXT: ret void
-;
-entry:
- callbr void asm "", ""() to label %loop []
-
-loop:
- store volatile i32 999, ptr addrspace(1) %out, align 4
- callbr void asm "", ""() to label %loop []
-}
-
define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) {
; SI-LABEL: infinite_loop_ret:
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; SI-NEXT: s_cbranch_execz .LBB2_3
+; SI-NEXT: s_cbranch_execz .LBB1_3
; SI-NEXT: ; %bb.1: ; %loop.preheader
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_and_b64 vcc, exec, -1
-; SI-NEXT: .LBB2_2: ; %loop
+; SI-NEXT: .LBB1_2: ; %loop
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB2_2
-; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock
+; SI-NEXT: s_cbranch_vccnz .LBB1_2
+; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loop_ret(
; IR-NEXT: entry:
@@ -115,93 +81,44 @@ return:
ret void
}
-define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) {
-; SI-LABEL: infinite_loop_ret_callbr:
-; SI: ; %bb.0: ; %entry
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: ;;#ASMSTART
-; SI-NEXT: ;;#ASMEND
-; SI-NEXT: ; %bb.1: ; %loop.preheader
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: .LBB3_2: ; Inline asm indirect target
-; SI-NEXT: ; %UnifiedReturnBlock
-; SI-NEXT: ; Label of block must be emitted
-; SI-NEXT: s_endpgm
-; IR-LABEL: @infinite_loop_ret_callbr(
-; IR-NEXT: entry:
-; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1
-; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32
-; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]])
-; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock]
-; IR: loop:
-; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
-; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
-; IR: TransitionBlock:
-; IR-NEXT: callbr void asm "", ""()
-; IR-NEXT: to label [[LOOP]] []
-; IR: UnifiedReturnBlock:
-; IR-NEXT: ret void
-;
-entry:
- %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
- %cond = icmp eq i32 %tmp, 1
- %cond32 = zext i1 %cond to i32
- callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return]
-
-loop:
- store volatile i32 999, ptr addrspace(1) %out, align 4
- callbr void asm "", ""() to label %loop []
-
-return:
- ret void
-}
-
define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {
; SI-LABEL: infinite_loops:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b64 s[2:3], -1
-; SI-NEXT: s_cbranch_scc1 .LBB4_4
+; SI-NEXT: s_cbranch_scc1 .LBB2_4
; SI-NEXT: ; %bb.1:
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x378
; SI-NEXT: s_and_b64 vcc, exec, -1
-; SI-NEXT: .LBB4_2: ; %loop2
+; SI-NEXT: .LBB2_2: ; %loop2
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB4_2
+; SI-NEXT: s_cbranch_vccnz .LBB2_2
; SI-NEXT: ; %bb.3: ; %Flow
; SI-NEXT: s_mov_b64 s[2:3], 0
-; SI-NEXT: .LBB4_4: ; %Flow2
+; SI-NEXT: .LBB2_4: ; %Flow2
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccz .LBB4_7
+; SI-NEXT: s_cbranch_vccz .LBB2_7
; SI-NEXT: ; %bb.5:
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_and_b64 vcc, exec, 0
-; SI-NEXT: .LBB4_6: ; %loop1
+; SI-NEXT: .LBB2_6: ; %loop1
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccz .LBB4_6
-; SI-NEXT: .LBB4_7: ; %DummyReturnBlock
+; SI-NEXT: s_cbranch_vccz .LBB2_6
+; SI-NEXT: .LBB2_7: ; %DummyReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loops(
; IR-NEXT: entry:
@@ -227,78 +144,24 @@ loop2:
br label %loop2
}
-define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) {
-; SI-LABEL: infinite_loops_callbr:
-; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: ;;#ASMSTART
-; SI-NEXT: ;;#ASMEND
-; SI-NEXT: ; %bb.1: ; %loop1
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: s_endpgm
-; SI-NEXT: .LBB5_2: ; Inline asm indirect target
-; SI-NEXT: ; %loop2.preheader
-; SI-NEXT: ; Label of block must be emitted
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x378
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: s_endpgm
-; IR-LABEL: @infinite_loops_callbr(
-; IR-NEXT: entry:
-; IR-NEXT: callbr void asm "", "r,!i"(i32 poison)
-; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2]
-; IR: loop1:
-; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
-; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]]
-; IR: TransitionBlock:
-; IR-NEXT: callbr void asm "", ""()
-; IR-NEXT: to label [[LOOP1]] []
-; IR: loop2:
-; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4
-; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]]
-; IR: TransitionBlock1:
-; IR-NEXT: callbr void asm "", ""()
-; IR-NEXT: to label [[LOOP2:%.*]] []
-; IR: DummyReturnBlock:
-; IR-NEXT: ret void
-;
-entry:
- callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2]
-
-loop1:
- store volatile i32 999, ptr addrspace(1) %out, align 4
- callbr void asm "", ""() to label %loop1 []
...
[truncated]
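A side effect visible in infinite-loop.ll: the .LBB label renumbering (for example .LBB2_3 becoming .LBB1_3) is purely mechanical, since llc numbers labels by function index and deleting the interleaved callbr functions lowers every later function's index.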
@llvm/pr-subscribers-llvm-transforms
Author: Robert Imschweiler (ro-i)
✅ With the latest revision this PR passed the C/C++ code formatter. |
Reverts #152161
This needs to be reverted to fix the changed logic that fails under the expensive checks. Example failure from the buildbot: