Skip to content

Commit

Permalink
AMDGPU/GlobalISel: Fix masked control flow with fallthrough blocks
Browse files Browse the repository at this point in the history
Unlike SelectionDAGBuilder, IRTranslator omits the unconditional
branch in fallthrough cases. Confusingly, the control flow pseudos
function in the opposite way the intrinsics are used, and the branch
targets always need to be swapped. We're inverting the target blocks,
so we need to figure out the old fallthrough block and insert a branch
to the original unconditional branch target.
  • Loading branch information
arsenm committed May 22, 2020
1 parent 5a230a1 commit 66fe602
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 39 deletions.
52 changes: 32 additions & 20 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2278,22 +2278,30 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineInstr *&Br) {
MachineInstr *&Br,
MachineBasicBlock *&UncondBrTarget) {
Register CondDef = MI.getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDef))
return nullptr;

MachineBasicBlock *Parent = MI.getParent();
MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
if (UseMI.getParent() != MI.getParent() ||
if (UseMI.getParent() != Parent ||
UseMI.getOpcode() != AMDGPU::G_BRCOND)
return nullptr;

// Make sure the cond br is followed by a G_BR
// Make sure the cond br is followed by a G_BR, or is the last instruction.
MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
if (Next != MI.getParent()->end()) {
if (Next == Parent->end()) {
MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
return nullptr;
UncondBrTarget = &*NextMBB;
} else {
if (Next->getOpcode() != AMDGPU::G_BR)
return nullptr;
Br = &*Next;
UncondBrTarget = Br->getOperand(0).getMBB();
}

return &UseMI;
Expand Down Expand Up @@ -4110,33 +4118,37 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else: {
MachineInstr *Br = nullptr;
if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
MachineBasicBlock *UncondBrTarget = nullptr;
if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

B.setInstr(*BrCond);
Register Def = MI.getOperand(1).getReg();
Register Use = MI.getOperand(3).getReg();

MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
if (Br)
BrTarget = Br->getOperand(0).getMBB();

MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
if (IntrID == Intrinsic::amdgcn_if) {
B.buildInstr(AMDGPU::SI_IF)
.addDef(Def)
.addUse(Use)
.addMBB(BrTarget);
.addMBB(UncondBrTarget);
} else {
B.buildInstr(AMDGPU::SI_ELSE)
.addDef(Def)
.addUse(Use)
.addMBB(BrTarget)
.addMBB(UncondBrTarget)
.addImm(0);
}

if (Br)
Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
if (Br) {
Br->getOperand(0).setMBB(CondBrTarget);
} else {
// The IRTranslator skips inserting the G_BR for fallthrough cases, but
// since we're swapping branch targets it needs to be reinserted.
// FIXME: IRTranslator should probably not do this
B.buildBr(*CondBrTarget);
}

MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
Expand All @@ -4149,23 +4161,23 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
}
case Intrinsic::amdgcn_loop: {
MachineInstr *Br = nullptr;
if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
MachineBasicBlock *UncondBrTarget = nullptr;
if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

B.setInstr(*BrCond);

MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
if (Br)
BrTarget = Br->getOperand(0).getMBB();

MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
Register Reg = MI.getOperand(2).getReg();
B.buildInstr(AMDGPU::SI_LOOP)
.addUse(Reg)
.addMBB(BrTarget);
.addMBB(UncondBrTarget);

if (Br)
Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
Br->getOperand(0).setMBB(CondBrTarget);
else
B.buildBr(*CondBrTarget);

MI.eraseFromParent();
BrCond->eraseFromParent();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4714,6 +4714,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
} else {
// Get the target from BR if we don't negate the condition
BR = findUser(BRCOND, ISD::BR);
assert(BR && "brcond missing unconditional branch user");
Target = BR->getOperand(1);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_cbranch_execnz BB1_2
; CHECK-NEXT: s_cbranch_execz BB1_2
; CHECK-NEXT: ; %bb.1: ; %if.true
; CHECK-NEXT: global_load_dword v0, v[0:1], off
; CHECK-NEXT: BB1_2: ; %endif
Expand Down
21 changes: 21 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s

# Make sure there's no crash if there is somehow no successor block.

# ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_no_succ_block)

---
name: brcond_si_if_no_succ_block
body: |
bb.0:
S_NOP 0
bb.1:
successors: %bb.1
liveins: $vgpr0, $vgpr1
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s1) = G_ICMP intpred(ne), %0, %1
%3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
G_BRCOND %3, %bb.1
...
130 changes: 125 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ body: |
; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE64: [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
; WAVE64: [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: G_BR %bb.1
; WAVE64: bb.1:
; WAVE32-LABEL: name: brcond_si_if
; WAVE32: bb.0:
Expand All @@ -116,6 +117,7 @@ body: |
; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE32: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
; WAVE32: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: G_BR %bb.1
; WAVE32: bb.1:
bb.0:
successors: %bb.1
Expand All @@ -139,6 +141,7 @@ body: |
; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE64: [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
; WAVE64: [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: G_BR %bb.1
; WAVE64: bb.1:
; WAVE32-LABEL: name: brcond_si_else
; WAVE32: bb.0:
Expand All @@ -147,6 +150,7 @@ body: |
; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE32: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
; WAVE32: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: G_BR %bb.1
; WAVE32: bb.1:
bb.0:
successors: %bb.1
Expand All @@ -161,32 +165,148 @@ body: |
...

---
name: brcond_si_loop
name: brcond_si_loop_brcond
tracksRegLiveness: true
body: |
; WAVE64-LABEL: name: brcond_si_loop
; WAVE64-LABEL: name: brcond_si_loop_brcond
; WAVE64: bb.0:
; WAVE64: successors: %bb.1(0x80000000)
; WAVE64: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE64: [[COPY2:%[0-9]+]]:sreg_64_xexec(s64) = COPY $sgpr0_sgpr1
; WAVE64: SI_LOOP [[COPY2]](s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: bb.1:
; WAVE32-LABEL: name: brcond_si_loop
; WAVE64: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; WAVE64: S_NOP 0
; WAVE64: SI_LOOP [[COPY2]](s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: G_BR %bb.2
; WAVE64: bb.2:
; WAVE64: S_NOP 0
; WAVE32-LABEL: name: brcond_si_loop_brcond
; WAVE32: bb.0:
; WAVE32: successors: %bb.1(0x80000000)
; WAVE32: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE32: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec(s64) = COPY $sgpr0_sgpr1
; WAVE32: bb.1:
; WAVE32: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; WAVE32: S_NOP 0
; WAVE32: SI_LOOP [[COPY2]](s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: G_BR %bb.2
; WAVE32: bb.2:
; WAVE32: S_NOP 0
bb.0:
liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s64) = COPY $sgpr0_sgpr1
bb.1:
successors: %bb.1, %bb.2
S_NOP 0
%3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
G_BRCOND %3, %bb.2
G_BR %bb.1
bb.2:
S_NOP 0
...

# This usage is backwards from how the intrinsic is supposed to be
# used.
---
name: brcond_si_loop_brcond_back
tracksRegLiveness: true
body: |
; WAVE64-LABEL: name: brcond_si_loop_brcond_back
; WAVE64: bb.0:
; WAVE64: successors: %bb.1(0x80000000)
; WAVE64: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE64: [[COPY2:%[0-9]+]]:sreg_64_xexec(s64) = COPY $sgpr0_sgpr1
; WAVE64: bb.1:
; WAVE64: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; WAVE64: S_NOP 0
; WAVE64: SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: G_BR %bb.1
; WAVE64: bb.2:
; WAVE64: S_NOP 0
; WAVE32-LABEL: name: brcond_si_loop_brcond_back
; WAVE32: bb.0:
; WAVE32: successors: %bb.1(0x80000000)
; WAVE32: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE32: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec(s64) = COPY $sgpr0_sgpr1
; WAVE32: bb.1:
; WAVE32: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; WAVE32: S_NOP 0
; WAVE32: SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: G_BR %bb.1
; WAVE32: bb.2:
; WAVE32: S_NOP 0
bb.0:
successors: %bb.1
liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s64) = COPY $sgpr0_sgpr1
bb.1:
successors: %bb.1, %bb.2
S_NOP 0
%3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
G_BRCOND %3, %bb.1
G_BR %bb.2
bb.2:
S_NOP 0
...

# This usage is backwards from how the intrinsic is supposed to be
# used.
---
name: brcond_si_loop_brcond_back_fallthrough
tracksRegLiveness: true
body: |
; WAVE64-LABEL: name: brcond_si_loop_brcond_back_fallthrough
; WAVE64: bb.0:
; WAVE64: successors: %bb.1(0x80000000)
; WAVE64: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
; WAVE64: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; WAVE64: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE64: [[COPY2:%[0-9]+]]:sreg_64_xexec(s64) = COPY $sgpr0_sgpr1
; WAVE64: bb.1:
; WAVE64: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; WAVE64: S_NOP 0
; WAVE64: SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: G_BR %bb.1
; WAVE64: bb.2:
; WAVE32-LABEL: name: brcond_si_loop_brcond_back_fallthrough
; WAVE32: bb.0:
; WAVE32: successors: %bb.1(0x80000000)
; WAVE32: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
; WAVE32: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; WAVE32: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; WAVE32: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec(s64) = COPY $sgpr0_sgpr1
; WAVE32: bb.1:
; WAVE32: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; WAVE32: S_NOP 0
; WAVE32: SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: G_BR %bb.1
; WAVE32: bb.2:
bb.0:
liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s64) = COPY $sgpr0_sgpr1
bb.1:
successors: %bb.1, %bb.2
S_NOP 0
%3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
G_BRCOND %3, %bb.1
bb.2:
...
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ define void @localize_internal_globals(i1 %cond) {
; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX9-NEXT: s_cbranch_execnz BB2_2
; GFX9-NEXT: s_cbranch_execz BB2_2
; GFX9-NEXT: ; %bb.1: ; %bb1
; GFX9-NEXT: s_getpc_b64 s[6:7]
; GFX9-NEXT: s_add_u32 s6, s6, static.gv2@rel32@lo+4
Expand Down
Loading

0 comments on commit 66fe602

Please sign in to comment.