AMDGPU/GlobalISel: Fix masked control flow with fallthrough blocks

Unlike SelectionDAGBuilder, IRTranslator omits the unconditional branch in fallthrough cases. Confusingly, the control flow pseudos function in the opposite way the intrinsics are used, and the branch targets always need to be swapped. We're inverting the target blocks, so we need to figure out the old fallthrough block and insert a branch to the original unconditional branch target.
llvm · May 22, 2020 · 66fe602 · 66fe602
1 parent 5a230a1
commit 66fe602
Show file tree

Hide file tree

Showing 8 changed files with 193 additions and 39 deletions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2278,22 +2278,30 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
 // Return the use branch instruction, otherwise null if the usage is invalid.
 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
-                                       MachineInstr *&Br) {
+                                       MachineInstr *&Br,
+                                       MachineBasicBlock *&UncondBrTarget) {
   Register CondDef = MI.getOperand(0).getReg();
   if (!MRI.hasOneNonDBGUse(CondDef))
     return nullptr;
 
+  MachineBasicBlock *Parent = MI.getParent();
   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
-  if (UseMI.getParent() != MI.getParent() ||
+  if (UseMI.getParent() != Parent ||
       UseMI.getOpcode() != AMDGPU::G_BRCOND)
     return nullptr;
 
-  // Make sure the cond br is followed by a G_BR
+  // Make sure the cond br is followed by a G_BR, or is the last instruction.
   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
-  if (Next != MI.getParent()->end()) {
+  if (Next == Parent->end()) {
+    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
+    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
+      return nullptr;
+    UncondBrTarget = &*NextMBB;
+  } else {
     if (Next->getOpcode() != AMDGPU::G_BR)
       return nullptr;
     Br = &*Next;
+    UncondBrTarget = Br->getOperand(0).getMBB();
   }
 
   return &UseMI;
@@ -4110,33 +4118,37 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
   case Intrinsic::amdgcn_if:
   case Intrinsic::amdgcn_else: {
     MachineInstr *Br = nullptr;
-    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
+    MachineBasicBlock *UncondBrTarget = nullptr;
+    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
       const SIRegisterInfo *TRI
         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
 
       B.setInstr(*BrCond);
       Register Def = MI.getOperand(1).getReg();
       Register Use = MI.getOperand(3).getReg();
 
-      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
-      if (Br)
-        BrTarget = Br->getOperand(0).getMBB();
-
+      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
       if (IntrID == Intrinsic::amdgcn_if) {
         B.buildInstr(AMDGPU::SI_IF)
           .addDef(Def)
           .addUse(Use)
-          .addMBB(BrTarget);
+          .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
           .addDef(Def)
           .addUse(Use)
-          .addMBB(BrTarget)
+          .addMBB(UncondBrTarget)
           .addImm(0);
       }
 
-      if (Br)
-        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
+      if (Br) {
+        Br->getOperand(0).setMBB(CondBrTarget);
+      } else {
+        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
+        // since we're swapping branch targets it needs to be reinserted.
+        // FIXME: IRTranslator should probably not do this
+        B.buildBr(*CondBrTarget);
+      }
 
       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
@@ -4149,23 +4161,23 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
   }
   case Intrinsic::amdgcn_loop: {
     MachineInstr *Br = nullptr;
-    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
+    MachineBasicBlock *UncondBrTarget = nullptr;
+    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
       const SIRegisterInfo *TRI
         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
 
       B.setInstr(*BrCond);
 
-      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
-      if (Br)
-        BrTarget = Br->getOperand(0).getMBB();
-
+      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
       Register Reg = MI.getOperand(2).getReg();
       B.buildInstr(AMDGPU::SI_LOOP)
         .addUse(Reg)
-        .addMBB(BrTarget);
+        .addMBB(UncondBrTarget);
 
       if (Br)
-        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
+        Br->getOperand(0).setMBB(CondBrTarget);
+      else
+        B.buildBr(*CondBrTarget);
 
       MI.eraseFromParent();
       BrCond->eraseFromParent();

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4714,6 +4714,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   } else {
     // Get the target from BR if we don't negate the condition
     BR = findUser(BRCOND, ISD::BR);
+    assert(BR && "brcond missing unconditional branch user");
     Target = BR->getOperand(1);
   }
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -37,7 +37,7 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CHECK-NEXT:    s_cbranch_execnz BB1_2
+; CHECK-NEXT:    s_cbranch_execz BB1_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off
 ; CHECK-NEXT:  BB1_2: ; %endif

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir
@@ -0,0 +1,21 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
+
+# Make sure there's no crash if there is somehow no successor block.
+
+# ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_no_succ_block)
+
+---
+name: brcond_si_if_no_succ_block
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s1) = G_ICMP intpred(ne), %0, %1
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    G_BRCOND %3, %bb.1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
@@ -108,6 +108,7 @@ body:             |
   ; WAVE64:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE64:   [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
   ; WAVE64:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64:   G_BR %bb.1
   ; WAVE64: bb.1:
   ; WAVE32-LABEL: name: brcond_si_if
   ; WAVE32: bb.0:
@@ -116,6 +117,7 @@ body:             |
   ; WAVE32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE32:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
   ; WAVE32:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32:   G_BR %bb.1
   ; WAVE32: bb.1:
   bb.0:
     successors: %bb.1
@@ -139,6 +141,7 @@ body:             |
   ; WAVE64:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE64:   [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
   ; WAVE64:   [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64:   G_BR %bb.1
   ; WAVE64: bb.1:
   ; WAVE32-LABEL: name: brcond_si_else
   ; WAVE32: bb.0:
@@ -147,6 +150,7 @@ body:             |
   ; WAVE32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE32:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
   ; WAVE32:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, 0, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32:   G_BR %bb.1
   ; WAVE32: bb.1:
   bb.0:
     successors: %bb.1
@@ -161,32 +165,148 @@ body:             |
 ...
 
 ---
-name: brcond_si_loop
+name: brcond_si_loop_brcond
+tracksRegLiveness: true
 body:             |
-  ; WAVE64-LABEL: name: brcond_si_loop
+  ; WAVE64-LABEL: name: brcond_si_loop_brcond
   ; WAVE64: bb.0:
   ; WAVE64:   successors: %bb.1(0x80000000)
+  ; WAVE64:   liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
   ; WAVE64:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE64:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE64:   [[COPY2:%[0-9]+]]:sreg_64_xexec(s64) = COPY $sgpr0_sgpr1
-  ; WAVE64:   SI_LOOP [[COPY2]](s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE64: bb.1:
-  ; WAVE32-LABEL: name: brcond_si_loop
+  ; WAVE64:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; WAVE64:   S_NOP 0
+  ; WAVE64:   SI_LOOP [[COPY2]](s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64:   G_BR %bb.2
+  ; WAVE64: bb.2:
+  ; WAVE64:   S_NOP 0
+  ; WAVE32-LABEL: name: brcond_si_loop_brcond
   ; WAVE32: bb.0:
   ; WAVE32:   successors: %bb.1(0x80000000)
+  ; WAVE32:   liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
   ; WAVE32:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE32:   [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec(s64) = COPY $sgpr0_sgpr1
+  ; WAVE32: bb.1:
+  ; WAVE32:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; WAVE32:   S_NOP 0
   ; WAVE32:   SI_LOOP [[COPY2]](s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32:   G_BR %bb.2
+  ; WAVE32: bb.2:
+  ; WAVE32:   S_NOP 0
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s64) = COPY $sgpr0_sgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    S_NOP 0
+    %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
+    G_BRCOND %3, %bb.2
+    G_BR %bb.1
+
+  bb.2:
+    S_NOP 0
+...
+
+# This usage is backwards from how the intrinsic is supposed to be
+# used.
+---
+name: brcond_si_loop_brcond_back
+tracksRegLiveness: true
+body:             |
+  ; WAVE64-LABEL: name: brcond_si_loop_brcond_back
+  ; WAVE64: bb.0:
+  ; WAVE64:   successors: %bb.1(0x80000000)
+  ; WAVE64:   liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
+  ; WAVE64:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; WAVE64:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; WAVE64:   [[COPY2:%[0-9]+]]:sreg_64_xexec(s64) = COPY $sgpr0_sgpr1
+  ; WAVE64: bb.1:
+  ; WAVE64:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; WAVE64:   S_NOP 0
+  ; WAVE64:   SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64:   G_BR %bb.1
+  ; WAVE64: bb.2:
+  ; WAVE64:   S_NOP 0
+  ; WAVE32-LABEL: name: brcond_si_loop_brcond_back
+  ; WAVE32: bb.0:
+  ; WAVE32:   successors: %bb.1(0x80000000)
+  ; WAVE32:   liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
+  ; WAVE32:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; WAVE32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; WAVE32:   [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec(s64) = COPY $sgpr0_sgpr1
   ; WAVE32: bb.1:
+  ; WAVE32:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; WAVE32:   S_NOP 0
+  ; WAVE32:   SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32:   G_BR %bb.1
+  ; WAVE32: bb.2:
+  ; WAVE32:   S_NOP 0
   bb.0:
-    successors: %bb.1
     liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s64) = COPY $sgpr0_sgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    S_NOP 0
     %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
     G_BRCOND %3, %bb.1
+    G_BR %bb.2
+
+  bb.2:
+    S_NOP 0
+...
+
+# This usage is backwards from how the intrinsic is supposed to be
+# used.
+---
+name: brcond_si_loop_brcond_back_fallthrough
+tracksRegLiveness: true
+body:             |
+  ; WAVE64-LABEL: name: brcond_si_loop_brcond_back_fallthrough
+  ; WAVE64: bb.0:
+  ; WAVE64:   successors: %bb.1(0x80000000)
+  ; WAVE64:   liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
+  ; WAVE64:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; WAVE64:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; WAVE64:   [[COPY2:%[0-9]+]]:sreg_64_xexec(s64) = COPY $sgpr0_sgpr1
+  ; WAVE64: bb.1:
+  ; WAVE64:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; WAVE64:   S_NOP 0
+  ; WAVE64:   SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64:   G_BR %bb.1
+  ; WAVE64: bb.2:
+  ; WAVE32-LABEL: name: brcond_si_loop_brcond_back_fallthrough
+  ; WAVE32: bb.0:
+  ; WAVE32:   successors: %bb.1(0x80000000)
+  ; WAVE32:   liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
+  ; WAVE32:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; WAVE32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; WAVE32:   [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec(s64) = COPY $sgpr0_sgpr1
+  ; WAVE32: bb.1:
+  ; WAVE32:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; WAVE32:   S_NOP 0
+  ; WAVE32:   SI_LOOP [[COPY2]](s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32:   G_BR %bb.1
+  ; WAVE32: bb.2:
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s64) = COPY $sgpr0_sgpr1
 
   bb.1:
+    successors: %bb.1, %bb.2
+    S_NOP 0
+    %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
+    G_BRCOND %3, %bb.1
+
+  bb.2:
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -164,7 +164,7 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
-; GFX9-NEXT:    s_cbranch_execnz BB2_2
+; GFX9-NEXT:    s_cbranch_execz BB2_2
 ; GFX9-NEXT:  ; %bb.1: ; %bb1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, static.gv2@rel32@lo+4