[AMDGPU] Lowering VGPR to SGPR copies to v_readfirstlane_b32 if profitable.

Since divergence-driven instruction selection has been enabled for AMDGPU, all uniform instructions are expected to be selected to SALU form, except those that have no SALU equivalent. VGPR to SGPR copies appear in MIR to connect value producers and consumers. This change implements an algorithm that strikes a reasonable tradeoff between the profit gained from keeping uniform instructions in SALU form and the overhead introduced by transferring data between VGPRs and SGPRs.
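
The profitability decision reduces to a per-copy score computed in needToBeConvertedToVALU() in the diff below. The following minimal C++ sketch restates that cost model under assumed, illustrative names (V2SCopyCost and shouldMoveToVALU are not part of the pass's interface):

// Minimal sketch of the patch's cost model; illustrative names only, the real
// logic lives in SIFixSGPRCopies::lowerVGPR2SGPRCopies() in the diff below.
#include <cstddef>

struct V2SCopyCost {
  std::size_t SChainSize;     // SALU instructions reachable from the copy
  unsigned NumSVCopies;       // SGPR-to-VGPR copies moving results back to VALU
  unsigned NumReadfirstlanes; // v_readfirstlane_b32 count, i.e. copy width / 32
  unsigned SiblingPenalty;    // other V2S copies feeding the same SALU chain
};

// Mirrors needToBeConvertedToVALU(): keep the copy scalar only when the SALU
// chain clearly outweighs the lowering overhead.
inline bool shouldMoveToVALU(const V2SCopyCost &C) {
  unsigned Penalty = C.NumSVCopies + C.SiblingPenalty + C.NumReadfirstlanes;
  unsigned Profit = static_cast<unsigned>(C.SChainSize);
  unsigned Score = Penalty > Profit ? 0 : Profit - Penalty;
  return Score < 3; // threshold used by the patch
}

A copy whose net benefit falls below the threshold is handed to moveToVALU together with its chain; the remaining copies are rewritten to v_readfirstlane_b32, one per 32 bits of copy width.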

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D128252
alex-t committed Jul 14, 2022
1 parent 5b0788f commit 2e29b01
Showing 27 changed files with 3,033 additions and 2,273 deletions.
323 changes: 285 additions & 38 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -67,6 +67,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -81,9 +82,9 @@ static cl::opt<bool> EnableM0Merge(
cl::init(true));

namespace {

class SIFixSGPRCopies : public MachineFunctionPass {
MachineDominatorTree *MDT;
unsigned NextVGPRToSGPRCopyID;

public:
static char ID;
@@ -92,9 +93,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;

SIFixSGPRCopies() : MachineFunctionPass(ID) {}
SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}

bool runOnMachineFunction(MachineFunction &MF) override;
unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
void lowerVGPR2SGPRCopies(MachineFunction &MF);
// Handles copies whose source register is:
// 1. Physical register
// 2. AGPR
// 3. Defined by an instruction that merely moves an immediate
bool lowerSpecialCase(MachineInstr &MI);

MachineBasicBlock *processPHINode(MachineInstr &MI);

@@ -569,6 +577,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();

// We have to lower VGPR to SGPR copies before the main loop
// because the REG_SEQUENCE and PHI lowering in the main loop
// converts the def-use chains to VALU and closes the opportunities
// for keeping them scalar.
// TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
// will address their lowering and unify the processing in one main loop.
lowerVGPR2SGPRCopies(MF);

for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock *MBB = &*BI;
@@ -640,42 +656,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
}

if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
Register SrcReg = MI.getOperand(1).getReg();
if (!SrcReg.isVirtual()) {
MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
if (NewBB && NewBB != MBB) {
MBB = NewBB;
E = MBB->end();
BI = MachineFunction::iterator(MBB);
BE = MF.end();
}
assert((!NewBB || NewBB == I->getParent()) &&
"moveToVALU did not return the right basic block");
break;
}

MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
// s_mov_b32.
if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
MI.getOperand(1).ChangeToImmediate(Imm);
MI.addImplicitDefUseOperands(MF);
MI.setDesc(TII->get(SMovOp));
break;
}
MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
if (NewBB && NewBB != MBB) {
MBB = NewBB;
E = MBB->end();
BI = MachineFunction::iterator(MBB);
BE = MF.end();
}
assert((!NewBB || NewBB == I->getParent()) &&
"moveToVALU did not return the right basic block");
} else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}

@@ -916,3 +897,269 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
return CreatedBB;
}

bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

// We return true to indicate that no further processing is needed
if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
return true;

Register SrcReg = MI.getOperand(1).getReg();
if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
TII->moveToVALU(MI, MDT);
return true;
}

unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
// s_mov_b32.
if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
MI.getOperand(1).ChangeToImmediate(Imm);
MI.addImplicitDefUseOperands(*MBB->getParent());
MI.setDesc(TII->get(SMovOp));
return true;
}
return false;
}

class V2SCopyInfo {
public:
// VGPR to SGPR copy being processed
MachineInstr *Copy;
// All SALU instructions reachable from this copy in the SSA graph
DenseSet<MachineInstr *> SChain;
// Number of SGPR to VGPR copies that are used to put the SALU computation
// results back to VALU.
unsigned NumSVCopies;

unsigned Score;
// Actual count of v_readfirstlane_b32 instructions
// that need to be inserted to keep the SChain in SALU form
unsigned NumReadfirstlanes;
// Current score state, cached to speed up selecting V2SCopyInfos for processing
bool NeedToBeConvertedToVALU = false;
// Unique ID. Used as a key for mapping to keep permanent order.
unsigned ID;

// Count of other VGPR to SGPR copies that contribute to the
// current copy's SChain
unsigned SiblingPenalty = 0;
SetVector<unsigned> Siblings;
V2SCopyInfo() : Copy(nullptr), ID(0){};
V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
: Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump() {
dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
<< "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
<< "\nScore: " << Score << "\n";
}
#endif
};

void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

DenseMap<unsigned, V2SCopyInfo> Copies;
DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;

// The main function that computes the VGPR to SGPR copy score
// and decides how the copy is further lowered: v_readfirstlane_b32 or moveToVALU
auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
if (I->SChain.empty())
return true;
I->Siblings = SiblingPenalty[*std::max_element(
I->SChain.begin(), I->SChain.end(),
[&](MachineInstr *A, MachineInstr *B) -> bool {
return SiblingPenalty[A].size() < SiblingPenalty[B].size();
})];
I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
// The loop below computes the number of other VGPR to SGPR copies
// that contribute to the current copy's SALU chain. We assume that all the
// copies with the same source virtual register will be squashed to one by
// regalloc. We also take care of copies of different subregs of the
// same register.
SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
for (auto J : I->Siblings) {
auto InfoIt = Copies.find(J);
if (InfoIt != Copies.end()) {
MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
if (SiblingCopy->isImplicitDef())
// the COPY has already been moved to VALU
continue;

SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
SiblingCopy->getOperand(1).getSubReg()));
}
}
I->SiblingPenalty = SrcRegs.size();

unsigned Penalty =
I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
unsigned Profit = I->SChain.size();
I->Score = Penalty > Profit ? 0 : Profit - Penalty;
I->NeedToBeConvertedToVALU = I->Score < 3;
return I->NeedToBeConvertedToVALU;
};

auto needProcessing = [](MachineInstr &MI) -> bool {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::WQM:
case AMDGPU::STRICT_WQM:
case AMDGPU::SOFT_WQM:
case AMDGPU::STRICT_WWM:
return true;
default:
return false;
}
};

for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
++BI) {
MachineBasicBlock *MBB = &*BI;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
++I) {
MachineInstr &MI = *I;
if (!needProcessing(MI))
continue;
if (lowerSpecialCase(MI))
continue;

// Compute the COPY width to pass to the V2SCopyInfo constructor
Register DstReg = MI.getOperand(0).getReg();

const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);

V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
TRI->getRegSizeInBits(*DstRC));

SmallVector<MachineInstr *, 8> AnalysisWorklist;
// Needed because the SSA is not a tree but a graph and may have
// forks and joins. We should not go the same way twice.
DenseSet<MachineInstr *> Visited;
AnalysisWorklist.push_back(&MI);
while (!AnalysisWorklist.empty()) {

MachineInstr *Inst = AnalysisWorklist.pop_back_val();

if (!Visited.insert(Inst).second)
continue;

// Copies and REG_SEQUENCE do not contribute to the final assembly,
// so skip them, but keep the bookkeeping for SGPR to VGPR copies.
if (Inst->isCopy() || Inst->isRegSequence()) {
if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
if (!Inst->isCopy() ||
!tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
In.NumSVCopies++;
continue;
}
}
}

SiblingPenalty[Inst].insert(In.ID);

SmallVector<MachineInstr *, 4> Users;
if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
(Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
auto I = Inst->getIterator();
auto E = Inst->getParent()->end();
while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
if (I->readsRegister(AMDGPU::SCC))
Users.push_back(&*I);
}
} else if (Inst->getNumExplicitDefs() != 0) {
Register Reg = Inst->getOperand(0).getReg();
if (TRI->isSGPRReg(*MRI, Reg))
for (auto &U : MRI->use_instructions(Reg))
Users.push_back(&U);
}
for (auto U : Users) {
if (TII->isSALU(*U))
In.SChain.insert(U);
AnalysisWorklist.push_back(U);
}
}
Copies[In.ID] = In;
}
}

SmallVector<unsigned, 8> LoweringWorklist;
for (auto &C : Copies) {
if (needToBeConvertedToVALU(&C.second))
LoweringWorklist.push_back(C.second.ID);
}

while (!LoweringWorklist.empty()) {
unsigned CurID = LoweringWorklist.pop_back_val();
auto CurInfoIt = Copies.find(CurID);
if (CurInfoIt != Copies.end()) {
V2SCopyInfo C = CurInfoIt->getSecond();
LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
for (auto S : C.Siblings) {
auto SibInfoIt = Copies.find(S);
if (SibInfoIt != Copies.end()) {
V2SCopyInfo &SI = SibInfoIt->getSecond();
LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
if (!SI.NeedToBeConvertedToVALU) {
set_subtract(SI.SChain, C.SChain);
if (needToBeConvertedToVALU(&SI))
LoweringWorklist.push_back(SI.ID);
}
SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
}
}
LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
<< " is being turned to VALU\n");
Copies.erase(C.ID);
TII->moveToVALU(*C.Copy, MDT);
}
}

// Now do the actual lowering
for (auto C : Copies) {
MachineInstr *MI = C.second.Copy;
MachineBasicBlock *MBB = MI->getParent();
// We decide to turn the V2S copy into v_readfirstlane_b32;
// remove it from the V2SCopies and from all its siblings
LLVM_DEBUG(dbgs() << "V2S copy " << *MI
<< " is being turned to v_readfirstlane_b32"
<< " Score: " << C.second.Score << "\n");
Register DstReg = MI->getOperand(0).getReg();
Register SrcReg = MI->getOperand(1).getReg();
unsigned SubReg = MI->getOperand(1).getSubReg();
const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
if (SrcSize == 16) {
// HACK to handle a possible 16-bit VGPR source
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
} else if (SrcSize == 32) {
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
MIB.addReg(SrcReg, 0, SubReg);
} else {
auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::REG_SEQUENCE), DstReg);
int N = TRI->getRegSizeInBits(*SrcRC) / 32;
for (int i = 0; i < N; i++) {
Register PartialSrc = TII->buildExtractSubReg(
Result, *MRI, MI->getOperand(1), SrcRC,
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
Register PartialDst =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, *Result, Result->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
.addReg(PartialSrc);
Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
}
}
MI->eraseFromParent();
}
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/add3.ll
@@ -223,7 +223,7 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
; VI-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-NEXT: v_add_f32_e32 v2, s4, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: add3_uniform_vgpr:
