AMDGPU/SI: Handle infinite loop for the structurizer to work with CFG with infinite loops.

Summary:
  The current StructurizeCFG pass only works on a CFG with a single exit. AMDGPUUnifyDivergentExitNodes combines multiple "return" and/or "unreachable" blocks into one exit block so that the structurizer can work. However, an infinite loop is another kind of special "exit", and if it is not handled, the resulting multiple exits prevent the structurizer from working.

In this work, for each infinite loop we add a dummy edge to the "return" block, so that the AMDGPUUnifyDivergentExitNodes pass also works with infinite loops. As a result, CFGs containing infinite loops can be structurized.
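
To illustrate the dummy edge, here is a sketch based on the infinite-loop.ll tests added in this commit (not part of the committed diff itself): a single-block infinite loop ending in an unconditional branch is rewritten so that its back edge becomes an always-true conditional branch whose false edge targets a new dummy return block.

  ; Before: no path from the loop to any exit.
  loop:
    store i32 999, i32 addrspace(1)* %out, align 4
    br label %loop

  ; After: the false edge of the always-true branch gives the CFG an exit.
  loop:
    store i32 999, i32 addrspace(1)* %out, align 4
    br i1 true, label %loop, label %DummyReturnBlock

  DummyReturnBlock:
    ret void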

Reviewer:
  nhaehnle

Differential Revision:
  https://reviews.llvm.org/D46340

llvm-svn: 332625
Changpeng Fang committed May 17, 2018
1 parent daf5169 commit 391bcf8
Showing 6 changed files with 202 additions and 15 deletions.
32 changes: 32 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -170,13 +170,45 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  SmallVector<BasicBlock *, 4> ReturningBlocks;
  SmallVector<BasicBlock *, 4> UnreachableBlocks;

  // Dummy return block for infinite loop.
  BasicBlock *DummyReturnBB = nullptr;

  for (BasicBlock *BB : PDT.getRoots()) {
    if (isa<ReturnInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        ReturningBlocks.push_back(BB);
    } else if (isa<UnreachableInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        UnreachableBlocks.push_back(BB);
    } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {

      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
      if (DummyReturnBB == nullptr) {
        DummyReturnBB = BasicBlock::Create(F.getContext(),
                                           "DummyReturnBlock", &F);
        Type *RetTy = F.getReturnType();
        Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
        ReturningBlocks.push_back(DummyReturnBB);
      }

      if (BI->isUnconditional()) {
        BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
        BI->eraseFromParent(); // Delete the unconditional branch.
        // Add a new conditional branch with a dummy edge to the return block.
        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
      } else { // Conditional branch.
        // Create a new transition block to hold the conditional branch.
        BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(),
                                                      "TransitionBlock", &F);

        // Move BI from BB to the new transition block.
        BI->removeFromParent();
        TransitionBB->getInstList().push_back(BI);

        // Create a branch that will always branch to the transition block.
        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
      }
    }
  }
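
If the looping block already ends in a conditional branch, the pass instead moves that branch into a new TransitionBlock and gives the original block an always-true branch to the transition block, with the dummy edge on the false side. A sketch of the resulting IR, mirroring the infinite_loop_nest_ret test below (block names shown as they appear after the exit nodes are unified):

  ; Before:
  inner_loop:
    store i32 999, i32 addrspace(1)* %out, align 4
    %cond3 = icmp eq i32 %tmp, 3
    br i1 %cond3, label %inner_loop, label %outer_loop

  ; After: the original conditional branch now lives in TransitionBlock;
  ; the false edge out of inner_loop reaches the unified return block.
  inner_loop:
    store i32 999, i32 addrspace(1)* %out, align 4
    %cond3 = icmp eq i32 %tmp, 3
    br i1 true, label %TransitionBlock, label %UnifiedReturnBlock

  TransitionBlock:
    br i1 %cond3, label %inner_loop, label %outer_loop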

17 changes: 11 additions & 6 deletions llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -431,11 +431,17 @@ endif:
; si_mask_branch

; GCN-LABEL: {{^}}analyze_mask_branch:
; GCN: v_cmp_lt_f32_e32 vcc
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN: v_cmp_nlt_f32_e32 vcc
; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]]
; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]

; GCN: [[FLOW]]: ; %Flow
; GCN-NEXT: s_or_saveexec_b64 [[TEMP_MASK1:s\[[0-9]+:[0-9]+\]]], [[MASK]]
; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]]
; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop_body
; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop_body
; GCN: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
@@ -444,6 +450,7 @@ endif:
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND
; GCN: s_cbranch_vccz [[RET]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
@@ -452,9 +459,7 @@ endif:
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[RET]]: ; %ret
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN: buffer_store_dword
; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @analyze_mask_branch() #0 {
entry:
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -2,10 +2,11 @@
; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s

; GCN-LABEL: {{^}}test_loop:
; GCN: s_and_b64 vcc, exec, -1
; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
; GCN: ds_read_b32
; GCN: ds_write_b32
; GCN: s_branch [[LABEL]]
; GCN: s_cbranch_vccnz [[LABEL]]
; GCN: s_endpgm
define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
161 changes: 155 additions & 6 deletions llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -1,18 +1,167 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify %s | FileCheck -check-prefix=IR %s

; SI-LABEL: {{^}}infinite_loop:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
; SI: BB0_1:
; SI: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
; SI: s_waitcnt lgkmcnt(0)
; SI: buffer_store_dword [[REG]]
; SI: s_branch BB0_1
; SI: s_branch [[LOOP]]
define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
entry:
br label %for.body
br label %loop

for.body: ; preds = %entry, %for.body
loop:
store i32 999, i32 addrspace(1)* %out, align 4
br label %for.body
br label %loop
}


; IR-LABEL: @infinite_loop_ret(
; IR: br i1 %cond, label %loop, label %UnifiedReturnBlock

; IR: loop:
; IR: store i32 999, i32 addrspace(1)* %out, align 4
; IR: br i1 true, label %loop, label %UnifiedReturnBlock

; IR: UnifiedReturnBlock:
; IR: ret void


; SI-LABEL: {{^}}infinite_loop_ret:
; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]]

; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
; SI: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
; SI: s_and_b64 vcc, exec, -1
; SI: s_waitcnt lgkmcnt(0)
; SI: buffer_store_dword [[REG]]
; SI: s_cbranch_vccnz [[LOOP]]

; SI: [[RET]]: ; %UnifiedReturnBlock
; SI: s_endpgm
define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
entry:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%cond = icmp eq i32 %tmp, 1
br i1 %cond, label %loop, label %return

loop:
store i32 999, i32 addrspace(1)* %out, align 4
br label %loop

return:
ret void
}


; IR-LABEL: @infinite_loops(
; IR: br i1 undef, label %loop1, label %loop2

; IR: loop1:
; IR: store i32 999, i32 addrspace(1)* %out, align 4
; IR: br i1 true, label %loop1, label %DummyReturnBlock

; IR: loop2:
; IR: store i32 888, i32 addrspace(1)* %out, align 4
; IR: br i1 true, label %loop2, label %DummyReturnBlock

; IR: DummyReturnBlock:
; IR: ret void


; SI-LABEL: {{^}}infinite_loops:

; SI: v_mov_b32_e32 [[REG1:v[0-9]+]], 0x3e7
; SI: s_and_b64 vcc, exec, -1

; SI: [[LOOP1:BB[0-9]+_[0-9]+]]: ; %loop1
; SI: s_waitcnt lgkmcnt(0)
; SI: buffer_store_dword [[REG1]]
; SI: s_cbranch_vccnz [[LOOP1]]
; SI: s_branch [[RET:BB[0-9]+_[0-9]+]]

; SI: v_mov_b32_e32 [[REG2:v[0-9]+]], 0x378
; SI: s_and_b64 vcc, exec, -1

; SI: [[LOOP2:BB[0-9]+_[0-9]+]]: ; %loop2
; SI: s_waitcnt lgkmcnt(0)
; SI: buffer_store_dword [[REG2]]
; SI: s_cbranch_vccnz [[LOOP2]]

; SI: [[RET]]: ; %DummyReturnBlock
; SI: s_endpgm
define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
entry:
br i1 undef, label %loop1, label %loop2

loop1:
store i32 999, i32 addrspace(1)* %out, align 4
br label %loop1

loop2:
store i32 888, i32 addrspace(1)* %out, align 4
br label %loop2
}



; IR-LABEL: @infinite_loop_nest_ret(
; IR: br i1 %cond1, label %outer_loop, label %UnifiedReturnBlock

; IR: outer_loop:
; IR: br label %inner_loop

; IR: inner_loop:
; IR: store i32 999, i32 addrspace(1)* %out, align 4
; IR: %cond3 = icmp eq i32 %tmp, 3
; IR: br i1 true, label %TransitionBlock, label %UnifiedReturnBlock

; IR: TransitionBlock:
; IR: br i1 %cond3, label %inner_loop, label %outer_loop

; IR: UnifiedReturnBlock:
; IR: ret void

; SI-LABEL: {{^}}infinite_loop_nest_ret:
; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]]

; SI: s_mov_b32
; SI: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %outer_loop

; SI: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %inner_loop
; SI: s_waitcnt expcnt(0)
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
; SI: v_cmp_ne_u32_e32
; SI: s_waitcnt lgkmcnt(0)
; SI: buffer_store_dword [[REG]]

; SI: s_andn2_b64 exec
; SI: s_cbranch_execnz [[INNER_LOOP]]

; SI: s_andn2_b64 exec
; SI: s_cbranch_execnz [[OUTER_LOOP]]

; SI: [[RET]]: ; %UnifiedReturnBlock
; SI: s_endpgm
define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
entry:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%cond1 = icmp eq i32 %tmp, 1
br i1 %cond1, label %outer_loop, label %return

outer_loop:
; %cond2 = icmp eq i32 %tmp, 2
; br i1 %cond2, label %outer_loop, label %inner_loop
br label %inner_loop

inner_loop: ; preds = %LeafBlock, %LeafBlock1
store i32 999, i32 addrspace(1)* %out, align 4
%cond3 = icmp eq i32 %tmp, 3
br i1 %cond3, label %inner_loop, label %outer_loop

return:
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -70,7 +70,7 @@

; GCN: [[BB9:BB[0-9]+_[0-9]+]]: ; %bb9
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_branch [[BB9]]
; GCN-NEXT: s_cbranch_vccnz [[BB9]]
define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* nocapture %arg) #0 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -96,7 +96,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; SI-NEXT: s_cbranch_scc1 [[ENDPGM]]

; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]
; SI: s_branch [[INFLOOP]]
; SI: s_cbranch_vccnz [[INFLOOP]]

; SI: [[ENDPGM]]:
; SI: s_endpgm
