Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,9 @@ extern char &AMDGPUPreloadKernArgPrologLegacyID;
void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
extern char &AMDGPUPreloadKernelArgumentsLegacyID;

void initializeSIRestoreNormalEpilogLegacyPass(PassRegistry &);
extern char &SIRestoreNormalEpilogLegacyID;

// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
Expand Down
11 changes: 8 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIOptimizeExecMaskingLegacyPass(*PR);
initializeSIPreAllocateWWMRegsLegacyPass(*PR);
initializeSIFormMemoryClausesLegacyPass(*PR);
initializeSIRestoreNormalEpilogLegacyPass(*PR);
initializeSIPostRABundlerLegacyPass(*PR);
initializeGCNCreateVOPDLegacyPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
Expand Down Expand Up @@ -1563,7 +1564,7 @@ void GCNPassConfig::addFastRegAlloc() {
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
//insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

Expand All @@ -1586,13 +1587,17 @@ void GCNPassConfig::addOptimizedRegAlloc() {
if (OptVGPRLiveRange)
insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);

insertPass(&SIOptimizeVGPRLiveRangeLegacyID, &SILowerControlFlowLegacyID);

// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
//insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

if (EnableRewritePartialRegUses)
insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

insertPass(&RenameIndependentSubregsID,&SIRestoreNormalEpilogLegacyID);

if (isPassEnabled(EnablePreRAOptimizations))
insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);
Expand Down Expand Up @@ -2256,7 +2261,7 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass<PHIEliminationPass>(SILowerControlFlowPass());
//insertPass<PHIEliminationPass>(SILowerControlFlowPass());

if (EnableRewritePartialRegUses)
insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ add_llvm_target(AMDGPUCodeGen
SIPreEmitPeephole.cpp
SIProgramInfo.cpp
SIRegisterInfo.cpp
SIRestoreNormalEpilog.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp

Expand Down
245 changes: 245 additions & 0 deletions llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
#pragma once

#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/ErrorHandling.h"

#include "SIInstrInfo.h"

#include <cassert>
#include <unordered_set>

using namespace llvm;

using std::unordered_set;
using std::vector;

/// Find the branch instruction in \p BranchingMBB whose destination is
/// \p DestMBB. Branches live at the bottom of a block, so the scan runs from
/// the end. Must only be called when such a branch is known to exist.
static inline MachineInstr &getBranchWithDest(MachineBasicBlock &BranchingMBB,
                                              MachineBasicBlock &DestMBB) {
  const auto *TII =
      BranchingMBB.getParent()->getSubtarget<GCNSubtarget>().getInstrInfo();
  for (auto It = BranchingMBB.instr_rbegin(), End = BranchingMBB.instr_rend();
       It != End; ++It) {
    MachineInstr &CandidateMI = *It;
    if (CandidateMI.isBranch() &&
        TII->getBranchDestBlock(CandidateMI) == &DestMBB)
      return CandidateMI;
  }

  llvm_unreachable("Don't call this if there's no branch to the destination.");
}

/// Move \p MI so that its definition is visible before the PHIs at the top of
/// its block.
///
/// If the block has no PHIs, \p MI is simply re-inserted at the top of the
/// block. Otherwise a new PHI is built in front of the first existing PHI, and
/// a clone of \p MI (defining a fresh virtual register) is bundled onto the
/// terminating branch of every predecessor; the PHI merges those clones.
/// The original \p MI is then erased.
///
/// NOTE(review): assumes operand 0 of \p MI is its (sole) register def, and
/// that every predecessor ends in an explicit branch to this block (see
/// getBranchWithDest) — confirm fall-through predecessors cannot occur here.
static inline void moveInsBeforePhis(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  auto &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
  auto &MRI = MF.getRegInfo();

  // Locate the first PHI in the block, if any.
  bool PhiSeen = false;
  MachineBasicBlock::iterator FirstPhi;
  for (FirstPhi = MBB.begin(); FirstPhi != MBB.end(); FirstPhi++)
    if (FirstPhi->getOpcode() == AMDGPU::PHI) {
      PhiSeen = true;
      break;
    }

  if (!PhiSeen) {
    // No PHIs: hoisting MI to the top of the block is sufficient.
    MI.removeFromParent();
    MBB.insert(MBB.begin(), &MI);
  } else {
    // PHIs present: MI cannot simply be placed above them (PHIs must stay
    // first). Instead, replicate MI into each predecessor and merge the
    // per-predecessor results with a new PHI defining MI's original register.
    auto Phi = BuildMI(MBB, FirstPhi, MI.getDebugLoc(), TII.get(AMDGPU::PHI),
                       MI.getOperand(0).getReg());
    for (auto *PredMBB : MBB.predecessors()) {
      // Each clone defines a fresh virtual register of the same class.
      Register ClonedReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
      MachineInstr &BranchMI = getBranchWithDest(*PredMBB, MBB);
      MachineInstr *ClonedMI = MF.CloneMachineInstr(&MI);
      ClonedMI->getOperand(0).setReg(ClonedReg);
      Phi.addReg(ClonedReg).addMBB(PredMBB);
      // Bundle the clone onto the predecessor's branch so later passes treat
      // it as part of that branch's epilog.
      PredMBB->insertAfterBundle(BranchMI.getIterator(), ClonedMI);
      ClonedMI->bundleWithPred();
    }
    MI.eraseFromParent();
  }
}

/// Forward iterator over a branch "epilog": the instructions bundled after a
/// branch, up to (but not including) the next branch or the end of the block.
/// Incrementing onto another branch snaps the iterator to the block's
/// instruction-list end, so each epilog belongs to exactly one branch.
struct EpilogIterator {
  MachineBasicBlock::instr_iterator InternalIt;
  EpilogIterator(MachineBasicBlock::instr_iterator I) : InternalIt(I) {}

  // Comparison does not mutate; const-qualified so const iterators compare.
  bool operator==(const EpilogIterator &Other) const {
    return InternalIt == Other.InternalIt;
  }
  bool operator!=(const EpilogIterator &Other) const {
    return !(*this == Other);
  }
  /// True once the iterator has run past the last epilog instruction.
  bool isEnd() const { return InternalIt.isEnd(); }
  MachineInstr &operator*() { return *InternalIt; }
  MachineBasicBlock::instr_iterator operator->() { return InternalIt; }
  EpilogIterator &operator++() {
    ++InternalIt;
    // A subsequent branch starts a new epilog; treat it as the end of ours.
    if (!InternalIt.isEnd() && InternalIt->isBranch())
      InternalIt = InternalIt->getParent()->instr_end();
    return *this;
  }
  EpilogIterator operator++(int) {
    EpilogIterator ToReturn = *this;
    ++*this;
    return ToReturn;
  }
};

/// Return an iterator to the first epilog instruction bundled after the
/// branch in \p PredMBB that targets \p SuccMBB.
///
/// Fall-through is legal MIR at this stage, so \p PredMBB may reach
/// \p SuccMBB without an explicit branch; in that case there is no branch to
/// carry an epilog, and an end iterator (an empty epilog) is returned rather
/// than asserting.
static inline EpilogIterator getEpilogForSuccessor(MachineBasicBlock &PredMBB,
                                                   MachineBasicBlock &SuccMBB) {
  MachineFunction &MF = *PredMBB.getParent();
  auto &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();

  for (MachineInstr &BranchMI : reverse(PredMBB.instrs()))
    if (BranchMI.isBranch() && TII.getBranchDestBlock(BranchMI) == &SuccMBB)
      return ++EpilogIterator(BranchMI.getIterator());

  // No explicit branch to SuccMBB: PredMBB falls through, so its epilog for
  // this edge is empty.
  return EpilogIterator(PredMBB.instr_end());
}

/// Return true if \p Left and \p Right contain the same number of
/// instructions and corresponding instructions are pairwise identical.
///
/// \p SuccMBB is currently unused; it is kept so the epilog helpers share a
/// uniform signature. The vectors are taken by const reference — the
/// previous by-value parameters copied each epilog on every comparison.
static inline bool epilogsAreIdentical(const vector<MachineInstr *> &Left,
                                       const vector<MachineInstr *> &Right,
                                       const MachineBasicBlock &SuccMBB) {
  if (Left.size() != Right.size())
    return false;

  for (unsigned I = 0, E = Left.size(); I != E; ++I)
    if (!Left[I]->isIdenticalTo(*Right[I]))
      return false;
  return true;
}

/// Detach every instruction in \p Body from its bundle and place the whole
/// sequence, in order, at the top of \p DestMBB.
static inline void moveBody(vector<MachineInstr *> &Body,
                            MachineBasicBlock &DestMBB) {
  // Inserting each instruction before the block's original first instruction
  // preserves Body's order without iterating in reverse.
  MachineBasicBlock::iterator InsertPt = DestMBB.begin();
  for (MachineInstr *BodyIns : Body) {
    BodyIns->removeFromBundle();
    DestMBB.insert(InsertPt, BodyIns);
  }
}

/// Normalize the CFG after PHI elimination by lowering branch "epilogs"
/// (instructions bundled after branches, see EpilogIterator) back into real
/// block bodies.
///
/// For each block, predecessors whose epilogs for this block are identical
/// are grouped. If every predecessor shares one identical epilog, that epilog
/// is moved directly into the top of the block and erased from each
/// predecessor's bundle. Otherwise, a rewrite entry is journaled and later
/// materialized as a new "mezzanine" block holding the epilog: each grouped
/// predecessor is re-targeted to the mezzanine, which branches on to the
/// original successor.
static inline void normalizeIrPostPhiElimination(MachineFunction &MF) {
  auto &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();

  // One journaled rewrite: a set of predecessors sharing the epilog `Body`
  // on their edges into `SuccMBB`.
  struct CFGRewriteEntry {
    unordered_set<MachineBasicBlock *> PredMBBs;
    MachineBasicBlock *SuccMBB;
    vector<MachineInstr *> Body;
  };

  vector<CFGRewriteEntry> CfgRewriteEntries;
  for (MachineBasicBlock &MBB : MF) {
    CFGRewriteEntry ToInsert = {{}, &MBB, {}};
    for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
      EpilogIterator EpIt = getEpilogForSuccessor(*PredMBB, MBB);

      // Materialize this predecessor's epilog for comparison.
      vector<MachineInstr *> Epilog;
      while (!EpIt.isEnd())
        Epilog.push_back(&*EpIt++);

      if (!epilogsAreIdentical(ToInsert.Body, Epilog, MBB)) {
        if (ToInsert.PredMBBs.size() && ToInsert.Body.size()) {
          // Potentially, we need to insert a new entry. But first see if we
          // can find an existing entry with the same epilog.
          bool ExistingEntryFound = false;
          for (auto RevIt = CfgRewriteEntries.rbegin();
               RevIt != CfgRewriteEntries.rend() && RevIt->SuccMBB == &MBB;
               RevIt++)
            if (epilogsAreIdentical(RevIt->Body, Epilog, MBB)) {
              RevIt->PredMBBs.insert(PredMBB);
              ExistingEntryFound = true;
              break;
            }

          if (!ExistingEntryFound)
            CfgRewriteEntries.push_back(ToInsert);
        }
        // Start accumulating a new group around this epilog.
        ToInsert.PredMBBs.clear();
        ToInsert.Body = Epilog;
      }

      ToInsert.PredMBBs.insert(PredMBB);
    }

    // Handle the last potential rewrite entry. Lower instead of journaling a
    // rewrite entry if all predecessor MBBs are in this single entry.
    if (ToInsert.PredMBBs.size() == MBB.pred_size()) {
      // An empty shared epilog means there is nothing to move or erase; skip
      // the per-predecessor pass entirely (this also avoids requiring an
      // explicit branch in predecessors that contribute no epilog).
      if (!ToInsert.Body.empty()) {
        moveBody(ToInsert.Body, MBB);
        for (MachineBasicBlock *PredMBB : ToInsert.PredMBBs) {
          // Delete instructions that were lowered from epilog
          MachineInstr &BranchIns =
              getBranchWithDest(*PredMBB, *ToInsert.SuccMBB);
          auto EpilogIt = ++EpilogIterator(BranchIns.getIterator());
          while (!EpilogIt.isEnd())
            EpilogIt++->eraseFromBundle();
        }
      }

    } else if (ToInsert.Body.size())
      CfgRewriteEntries.push_back(ToInsert);
  }

  // Perform the journaled rewrites.
  for (auto &Entry : CfgRewriteEntries) {
    MachineBasicBlock *MezzanineMBB = MF.CreateMachineBasicBlock();
    MF.insert(MF.end(), MezzanineMBB);

    // Deal with mezzanine to successor succession.
    BuildMI(MezzanineMBB, DebugLoc(), TII.get(AMDGPU::S_BRANCH))
        .addMBB(Entry.SuccMBB);
    MezzanineMBB->addSuccessor(Entry.SuccMBB);

    // Move instructions to mezzanine block.
    moveBody(Entry.Body, *MezzanineMBB);

    for (MachineBasicBlock *PredMBB : Entry.PredMBBs) {
      // Deal with predecessor to mezzanine succession.
      MachineInstr &BranchIns = getBranchWithDest(*PredMBB, *Entry.SuccMBB);
      assert(BranchIns.getOperand(0).isMBB() && "Branch instruction isn't.");
      BranchIns.getOperand(0).setMBB(MezzanineMBB);
      PredMBB->replaceSuccessor(Entry.SuccMBB, MezzanineMBB);

      // Delete instructions that were lowered from epilog
      auto EpilogIt = ++EpilogIterator(BranchIns.getIterator());
      while (!EpilogIt.isEnd())
        EpilogIt++->eraseFromBundle();
    }
  }
}

// std::hash specialization so llvm::Register can be used as the key of the
// std::unordered_set in hoistUnrelatedCopies. Relies on Register's implicit
// conversion to its underlying unsigned id.
namespace std {
template <> struct hash<Register> {
  std::size_t operator()(const Register &R) const {
    return hash<unsigned>()(R);
  }
};
} // namespace std

/// Hoist "unrelated" instructions out of every branch epilog in \p MF.
///
/// A COPY in a branch's epilog whose source register is not written by a
/// non-COPY instruction of the same epilog, and any IMPLICIT_DEF, is moved
/// out of the bundle to just before the branch.
///
/// NOTE(review): the first pass records getOperand(0) of every non-COPY
/// epilog instruction — assumes operand 0 is its register def; confirm all
/// epilog instructions satisfy this.
static inline void hoistUnrelatedCopies(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &BranchMI : MBB) {
      if (!BranchMI.isBranch())
        continue;

      // Pass 1: collect registers defined by non-COPY epilog instructions.
      // COPYs reading those are "related" to the epilog and must stay.
      unordered_set<Register> RelatedCopySources;
      EpilogIterator EpilogIt = BranchMI.getIterator();
      EpilogIterator CopyMoveIt = ++EpilogIt;
      while (!EpilogIt.isEnd()) {
        if (EpilogIt->getOpcode() != AMDGPU::COPY)
          RelatedCopySources.insert(EpilogIt->getOperand(0).getReg());
        ++EpilogIt;
      }

      // Pass 2: move unrelated COPYs and IMPLICIT_DEFs ahead of the branch.
      // Capture the next iterator before moving, since the move detaches the
      // current instruction from the bundle. Inserting before BranchMI is
      // safe while ranging over MBB's intrusive list.
      while (!CopyMoveIt.isEnd()) {
        EpilogIterator Next = CopyMoveIt;
        ++Next;
        // Explicit parentheses: hoist (unrelated COPY) OR (IMPLICIT_DEF).
        // Same grouping &&/|| precedence already gave, now warning-clean.
        if ((CopyMoveIt->getOpcode() == AMDGPU::COPY &&
             !RelatedCopySources.count(CopyMoveIt->getOperand(1).getReg())) ||
            CopyMoveIt->getOpcode() == AMDGPU::IMPLICIT_DEF) {
          MachineInstr &MIToMove = *CopyMoveIt;
          MIToMove.removeFromBundle();
          MBB.insert(BranchMI.getIterator(), &MIToMove);
        }

        CopyMoveIt = Next;
      }
    }
}
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
/// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "SICustomBranchBundles.h"
#include "SILowerControlFlow.h"
#include "AMDGPU.h"
#include "AMDGPULaneMaskUtils.h"
Expand Down Expand Up @@ -152,6 +153,14 @@ class SILowerControlFlowLegacy : public MachineFunctionPass {
return "SI Lower control flow pseudo instructions";
}

MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().setIsSSA();
}

MachineFunctionProperties getClearedProperties() const override {
return MachineFunctionProperties().setNoPHIs();
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
// Should preserve the same set that TwoAddressInstructions does.
Expand Down Expand Up @@ -323,6 +332,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
if (LV)
LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);

moveInsBeforePhis(*OrSaveExec);

MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

MachineBasicBlock::iterator ElsePt(MI);
Expand Down Expand Up @@ -840,6 +851,12 @@ bool SILowerControlFlow::run(MachineFunction &MF) {
LoweredIf.clear();
KillBlocks.clear();

if (Changed)
for (MachineBasicBlock &MBB : MF)
for (MachineInstr &MI : MBB)
if (MI.isBundled())
MI.unbundleFromSucc();

return Changed;
}

Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ class SILowerControlFlowPass : public PassInfoMixin<SILowerControlFlowPass> {
public:
PreservedAnalyses run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM);

MachineFunctionProperties getRequiredProperties() const {
return MachineFunctionProperties().setIsSSA();
}

MachineFunctionProperties getClearedProperties() const {
return MachineFunctionProperties().setNoPHIs();
}
};
} // namespace llvm

Expand Down
Loading
Loading