
AMDGPU/SILowerI1Copies process phi incomings in specific order #72375

Merged

Conversation

petar-avramovic
Collaborator

petar-avramovic commented Nov 15, 2023

When merging lane masks, the value from the block that is always visited first
(PrevReg in buildMergeLaneMasks) needs to exist because we do on-the-fly
constant folding. For PrevReg to exist, the basic block that should contain the
PrevReg definition must be processed first. Sort the incomings so that
incoming values that dominate other incoming values are processed first.

Sorting the phi incomings makes no difference for phis created by SDAG,
because SDAG adds phi incomings as it selects basic blocks in reverse
post-order traversal.

This change is required by the upcoming lane-mask merging implementation
for GlobalISel, which leaves phi incomings as they are in IR.
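
For reference, a minimal sketch of the ordering step this patch introduces, using the same Incoming struct and DFS-in comparison as the diff below (it assumes the machine dominator tree's DFS numbers have been refreshed with updateDFSNumbers() beforehand):

```cpp
// Sketch: incomings whose blocks dominate the other incoming blocks get
// smaller DFS-in numbers in the machine dominator tree (the entry block has
// DFSNumIn == 0), so sorting by DFSNumIn processes dominating incomings first.
struct Incoming {
  Register Reg;
  MachineBasicBlock *Block;
  Register UpdatedReg;
};

static void sortIncomings(SmallVectorImpl<Incoming> &Incomings,
                          MachineDominatorTree &DT) {
  // Requires DT.getBase().updateDFSNumbers() to have been called once before.
  llvm::sort(Incomings, [&DT](const Incoming &LHS, const Incoming &RHS) {
    return DT.getNode(LHS.Block)->getDFSNumIn() <
           DT.getNode(RHS.Block)->getDFSNumIn();
  });
}
```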

Stack:
patch2 #72375
patch1 #72374

@llvmbot
Collaborator

llvmbot commented Nov 15, 2023

@llvm/pr-subscribers-backend-amdgpu

Author: None (petar-avramovic)

Changes

When merging lane masks, the value from the block that is always visited first
(PrevReg in buildMergeLaneMasks) needs to exist because we do on-the-fly
constant folding. For PrevReg to exist, the basic block that should contain the
PrevReg definition must be processed first. Sort the incomings so that
incoming values that dominate other incoming values are processed first.

Sorting the phi incomings makes no difference for phis created by SDAG,
because SDAG adds phi incomings as it selects basic blocks in reverse
post-order traversal.

This change is required by the upcoming lane-mask merging implementation
for GlobalISel, which leaves phi incomings as they are in IR.
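
For context on why PrevReg must already exist: the lane-mask merge keeps the previous mask's bits for lanes that are inactive in EXEC and takes the new value's bits for active lanes. Below is a hedged C++-style sketch of that pattern using the wave32 opcodes visible in the test checks; it is an illustration, not the actual buildMergeLaneMasks implementation, which additionally folds inputs known to be all-zeros or all-ones (the folding that requires the dominating block's value to be processed first).

```cpp
// Hedged sketch of the lane-mask merge that consumes PrevReg (wave32 flavor):
//   Dst = (PrevReg & ~EXEC_LO) | (CurReg & EXEC_LO)
// Lanes inactive in EXEC keep the previous mask; active lanes take CurReg.
static Register mergeLaneMasksSketch(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL,
                                     const SIInstrInfo *TII,
                                     MachineRegisterInfo &MRI,
                                     Register PrevReg, Register CurReg) {
  Register PrevMasked = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register CurMasked = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register Dst = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  // PrevMasked = PrevReg & ~exec_lo
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B32), PrevMasked)
      .addReg(PrevReg)
      .addReg(AMDGPU::EXEC_LO);
  // CurMasked = CurReg & exec_lo
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B32), CurMasked)
      .addReg(CurReg)
      .addReg(AMDGPU::EXEC_LO);
  // Dst = PrevMasked | CurMasked
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B32), Dst)
      .addReg(PrevMasked)
      .addReg(CurMasked);
  return Dst;
}
```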


Full diff: https://github.com/llvm/llvm-project/pull/72375.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp (+54-40)
  • (added) llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir (+151)
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index a11f02e7db3504b..68c8f4024e73007 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -39,6 +39,15 @@ static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
 
 namespace {
 
+struct Incoming {
+  Register Reg;
+  MachineBasicBlock *Block;
+  Register UpdatedReg;
+
+  Incoming(Register Reg, MachineBasicBlock *Block, Register UpdatedReg)
+      : Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {}
+};
+
 class SILowerI1Copies : public MachineFunctionPass {
 public:
   static char ID;
@@ -145,8 +154,7 @@ class PhiIncomingAnalysis {
 
   ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
 
-  void analyze(MachineBasicBlock &DefBlock,
-               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
     assert(Stack.empty());
     ReachableMap.clear();
     ReachableOrdered.clear();
@@ -157,7 +165,8 @@ class PhiIncomingAnalysis {
     ReachableMap.try_emplace(&DefBlock, false);
     ReachableOrdered.push_back(&DefBlock);
 
-    for (MachineBasicBlock *MBB : IncomingBlocks) {
+    for (auto Incoming : Incomings) {
+      MachineBasicBlock *MBB = Incoming.Block;
       if (MBB == &DefBlock) {
         ReachableMap[&DefBlock] = true; // self-loop on DefBlock
         continue;
@@ -302,20 +311,20 @@ class LoopFinder {
   /// blocks, so that the SSA updater doesn't have to search all the way to the
   /// function entry.
   void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
-                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
+                      ArrayRef<Incoming> Incomings = {}) {
     assert(LoopLevel < CommonDominators.size());
 
     MachineBasicBlock *Dom = CommonDominators[LoopLevel];
-    for (MachineBasicBlock *MBB : Blocks)
-      Dom = DT.findNearestCommonDominator(Dom, MBB);
+    for (auto &Incoming : Incomings)
+      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);
 
-    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
+    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
       SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
     } else {
       // The dominator is part of the loop or the given blocks, so add the
       // undef value to unreachable predecessors instead.
       for (MachineBasicBlock *Pred : Dom->predecessors()) {
-        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
+        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
           SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
       }
     }
@@ -323,13 +332,14 @@ class LoopFinder {
 
 private:
   bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
-                   ArrayRef<MachineBasicBlock *> Blocks) const {
+                   ArrayRef<Incoming> Incomings) const {
     auto DomIt = Visited.find(&MBB);
     if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
       return true;
 
-    if (llvm::is_contained(Blocks, &MBB))
-      return true;
+    for (auto &Incoming : Incomings)
+      if (Incoming.Block == &MBB)
+        return true;
 
     return false;
   }
@@ -534,9 +544,8 @@ bool SILowerI1Copies::lowerPhis() {
   LoopFinder LF(*DT, *PDT);
   PhiIncomingAnalysis PIA(*PDT, TII);
   SmallVector<MachineInstr *, 4> Vreg1Phis;
-  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
-  SmallVector<unsigned, 4> IncomingRegs;
-  SmallVector<unsigned, 4> IncomingUpdated;
+  SmallVector<Incoming, 4> Incomings;
+
 #ifndef NDEBUG
   DenseSet<unsigned> PhiRegisters;
 #endif
@@ -550,6 +559,7 @@ bool SILowerI1Copies::lowerPhis() {
   if (Vreg1Phis.empty())
     return false;
 
+  DT->getBase().updateDFSNumbers();
   MachineBasicBlock *PrevMBB = nullptr;
   for (MachineInstr *MI : Vreg1Phis) {
     MachineBasicBlock &MBB = *MI->getParent();
@@ -581,10 +591,18 @@ bool SILowerI1Copies::lowerPhis() {
         assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
       }
 
-      IncomingBlocks.push_back(IncomingMBB);
-      IncomingRegs.push_back(IncomingReg);
+      Incomings.emplace_back(IncomingReg, IncomingMBB, Register{});
     }
 
+    // Sort the incomings such that incoming values that dominate other incoming
+    // values are sorted earlier. This allows us to do some amount of on-the-fly
+    // constant folding.
+    // Incoming with smaller DFSNumIn goes first, DFSNumIn is 0 for entry block.
+    llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
+      return DT->getNode(LHS.Block)->getDFSNumIn() <
+             DT->getNode(RHS.Block)->getDFSNumIn();
+    });
+
 #ifndef NDEBUG
     PhiRegisters.insert(DstReg);
 #endif
@@ -607,47 +625,45 @@ bool SILowerI1Copies::lowerPhis() {
     SSAUpdater.Initialize(DstReg);
 
     if (FoundLoopLevel) {
-      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, Incomings);
 
-      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-        IncomingUpdated.push_back(createLaneMaskReg(*MF));
-        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
-                                     IncomingUpdated.back());
+      for (auto &Incoming : Incomings) {
+        Incoming.UpdatedReg = createLaneMaskReg(*MF);
+        SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
       }
 
-      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+      for (auto &Incoming : Incomings) {
+        MachineBasicBlock &IMBB = *Incoming.Block;
         buildMergeLaneMasks(
-            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
-            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
+            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
       }
     } else {
       // The phi is not observed from outside a loop. Use a more accurate
       // lowering.
-      PIA.analyze(MBB, IncomingBlocks);
+      PIA.analyze(MBB, Incomings);
 
       for (MachineBasicBlock *MBB : PIA.predecessors())
         SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
 
-      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+      for (auto &Incoming : Incomings) {
+        MachineBasicBlock &IMBB = *Incoming.Block;
         if (PIA.isSource(IMBB)) {
-          IncomingUpdated.push_back(0);
-          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+          SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
         } else {
-          IncomingUpdated.push_back(createLaneMaskReg(*MF));
-          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
+          Incoming.UpdatedReg = createLaneMaskReg(*MF);
+          SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
         }
       }
 
-      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-        if (!IncomingUpdated[i])
+      for (auto &Incoming : Incomings) {
+        if (!Incoming.UpdatedReg.isValid())
           continue;
 
-        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        MachineBasicBlock &IMBB = *Incoming.Block;
         buildMergeLaneMasks(
-            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
-            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
+            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
       }
     }
 
@@ -657,9 +673,7 @@ bool SILowerI1Copies::lowerPhis() {
       MI->eraseFromParent();
     }
 
-    IncomingBlocks.clear();
-    IncomingRegs.clear();
-    IncomingUpdated.clear();
+    Incomings.clear();
   }
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir
new file mode 100644
index 000000000000000..695beab8dd24dc8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir
@@ -0,0 +1,151 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -run-pass=si-i1-copies -o - %s | FileCheck -check-prefixes=GCN %s
+
+# The order in which SILowerI1Copies builds instructions to merge lane masks
+# should not depend on the order of incoming operands in the phi instruction.
+# SDAG adds phi incomings as it processes basic blocks in reverse post-order
+# traversal. Because of that, the incomings in phis created by SDAG are already
+# ordered, compared to how the phi looked in IR, in a way convenient for lowerPhis.
+
+# Here incomings for %20:vreg_1 = PHI %19, %bb.1, %26, %bb.2 are swapped
+# to verify that SILowerI1Copies sorts incomings from phi appropriately before
+# it starts merging lane masks.
+
+---
+name: phi
+tracksRegLiveness: true
+body: |
+  ; GCN-LABEL: name: phi
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x80000000)
+  ; GCN-NEXT:   liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; GCN-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+  ; GCN-NEXT:   [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $exec_lo
+  ; GCN-NEXT:   [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF5]], %bb.0, %20, %bb.3
+  ; GCN-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[COPY6]], %bb.0, %37, %bb.3
+  ; GCN-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %16, %bb.3
+  ; GCN-NEXT:   [[PHI3:%[0-9]+]]:vreg_64 = PHI [[COPY5]], %bb.0, %18, %bb.3
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[PHI1]]
+  ; GCN-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[PHI]], $exec_lo, implicit-def $scc
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[PHI1]], $exec_lo, implicit-def $scc
+  ; GCN-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_]], [[S_AND_B32_]], implicit-def $scc
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY7]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   S_BRANCH %bb.2
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   successors: %bb.3(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[PHI3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[GLOBAL_LOAD_DWORD]], killed [[S_MOV_B32_2]], implicit $exec
+  ; GCN-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
+  ; GCN-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc
+  ; GCN-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_1]], implicit-def $scc
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.3:
+  ; GCN-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.1, [[S_OR_B32_1]], %bb.2
+  ; GCN-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4
+  ; GCN-NEXT:   [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[PHI3]], killed [[S_MOV_B64_]], implicit-def dead $vcc, implicit $exec
+  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GCN-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[PHI2]], killed [[S_MOV_B32_3]], implicit-def dead $scc
+  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 9
+  ; GCN-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[PHI1]], $exec_lo, implicit-def $scc
+  ; GCN-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[PHI4]], $exec_lo, implicit-def $scc
+  ; GCN-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_2]], [[S_AND_B32_2]], implicit-def $scc
+  ; GCN-NEXT:   S_CMP_GT_I32 [[S_ADD_I32_]], killed [[S_MOV_B32_4]], implicit-def $scc
+  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.4
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.4:
+  ; GCN-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
+  ; GCN-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[PHI1]]
+  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_5]]
+  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_6]], 0, [[COPY9]], [[COPY8]], implicit $exec
+  ; GCN-NEXT:   FLAT_STORE_DWORD [[COPY4]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+  ; GCN-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+    %0:vgpr_32 = COPY $vgpr4
+    %1:vgpr_32 = COPY $vgpr3
+    %2:vgpr_32 = COPY $vgpr2
+    %3:vgpr_32 = COPY $vgpr1
+    %4:sgpr_32 = IMPLICIT_DEF
+    %5:sgpr_32 = IMPLICIT_DEF
+    %6:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
+    %7:sgpr_32 = IMPLICIT_DEF
+    %8:sgpr_32 = IMPLICIT_DEF
+    %9:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
+    %10:sreg_32 = S_MOV_B32 -1
+    %11:sreg_32 = S_MOV_B32 -1
+    %12:vreg_64 = COPY %6
+    %13:vreg_64 = COPY %9
+    %14:vreg_1 = COPY %10, implicit $exec
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+    %15:sreg_32 = PHI %11, %bb.0, %16, %bb.3
+    %17:vreg_64 = PHI %13, %bb.0, %18, %bb.3
+    %19:vreg_1 = PHI %14, %bb.0, %20, %bb.3
+    %21:sreg_32 = COPY %19
+    %22:sreg_32 = SI_IF %21, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+
+    %23:vgpr_32 = GLOBAL_LOAD_DWORD %17, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %24:sreg_32 = S_MOV_B32 0
+    %25:sreg_32 = V_CMP_EQ_U32_e64 killed %23, killed %24, implicit $exec
+    %26:vreg_1 = COPY %25
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+    %20:vreg_1 = PHI %26, %bb.2, %19, %bb.1    ;%20:vreg_1 = PHI %19, %bb.1, %26, %bb.2 - this is original phi created by SDAG
+    SI_END_CF %22, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %27:sreg_64 = S_MOV_B64 4
+    %18:vreg_64 = V_ADD_U64_PSEUDO %17, killed %27, implicit-def dead $vcc, implicit $exec
+    %28:sreg_32 = S_MOV_B32 1
+    %16:sreg_32 = nsw S_ADD_I32 %15, killed %28, implicit-def dead $scc
+    %29:sreg_32 = S_MOV_B32 9
+    S_CMP_GT_I32 %16, killed %29, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+    S_BRANCH %bb.4
+
+  bb.4:
+    %30:vreg_1 = PHI %19, %bb.3
+    %31:sgpr_32 = S_MOV_B32 1065353216
+    %32:sgpr_32 = S_MOV_B32 0
+    %33:sreg_32_xm0_xexec = COPY %30
+    %34:vgpr_32 = COPY killed %31
+    %35:vgpr_32 = V_CNDMASK_B32_e64 0, killed %32, 0, %34, %33, implicit $exec
+    FLAT_STORE_DWORD %12, killed %35, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    SI_RETURN
+...

petar-avramovic merged commit 95dd0b0 into llvm:main Nov 15, 2023
3 checks passed
zahiraam pushed a commit to zahiraam/llvm-project that referenced this pull request Nov 20, 2023
petar-avramovic deleted the phi_incomings_sort_patch2 branch December 8, 2023 14:47