Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,9 @@ extern char &AMDGPUPreloadKernArgPrologLegacyID;
void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
extern char &AMDGPUPreloadKernelArgumentsLegacyID;

void initializeSIRestoreNormalEpilogLegacyPass(PassRegistry &);
extern char &SIRestoreNormalEpilogLegacyID;

// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
Expand Down
11 changes: 8 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIOptimizeExecMaskingLegacyPass(*PR);
initializeSIPreAllocateWWMRegsLegacyPass(*PR);
initializeSIFormMemoryClausesLegacyPass(*PR);
initializeSIRestoreNormalEpilogLegacyPass(*PR);
initializeSIPostRABundlerLegacyPass(*PR);
initializeGCNCreateVOPDLegacyPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
Expand Down Expand Up @@ -1563,7 +1564,7 @@ void GCNPassConfig::addFastRegAlloc() {
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
//insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

Expand All @@ -1586,13 +1587,17 @@ void GCNPassConfig::addOptimizedRegAlloc() {
if (OptVGPRLiveRange)
insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);

insertPass(&SIOptimizeVGPRLiveRangeLegacyID, &SILowerControlFlowLegacyID);

// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
//insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

if (EnableRewritePartialRegUses)
insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

insertPass(&RenameIndependentSubregsID,&SIRestoreNormalEpilogLegacyID);

if (isPassEnabled(EnablePreRAOptimizations))
insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);
Expand Down Expand Up @@ -2256,7 +2261,7 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass<PHIEliminationPass>(SILowerControlFlowPass());
//insertPass<PHIEliminationPass>(SILowerControlFlowPass());

if (EnableRewritePartialRegUses)
insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ add_llvm_target(AMDGPUCodeGen
SIPreEmitPeephole.cpp
SIProgramInfo.cpp
SIRegisterInfo.cpp
SIRestoreNormalEpilog.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp

Expand Down
245 changes: 245 additions & 0 deletions llvm/lib/Target/AMDGPU/SICustomBranchBundles.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
#pragma once

#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/ErrorHandling.h"

#include "SIInstrInfo.h"

#include <cassert>
#include <unordered_set>

using namespace llvm;

using std::unordered_set;
using std::vector;

/// Find the branch instruction in \p BranchingMBB whose destination is
/// \p DestMBB. Branches live at the bottom of a block, so the scan runs from
/// the end. Must only be called when such a branch is known to exist.
static inline MachineInstr &getBranchWithDest(MachineBasicBlock &BranchingMBB,
                                              MachineBasicBlock &DestMBB) {
  const auto *TII =
      BranchingMBB.getParent()->getSubtarget<GCNSubtarget>().getInstrInfo();
  for (auto It = BranchingMBB.instr_rbegin(), End = BranchingMBB.instr_rend();
       It != End; ++It) {
    MachineInstr &CandidateMI = *It;
    if (CandidateMI.isBranch() &&
        TII->getBranchDestBlock(CandidateMI) == &DestMBB)
      return CandidateMI;
  }

  llvm_unreachable("Don't call this if there's no branch to the destination.");
}

/// Move \p MI so that its definition is visible before the PHIs at the top of
/// its block.
///
/// If the block has no PHIs, \p MI is simply re-inserted at the top of the
/// block. Otherwise a new PHI is built in front of the first existing PHI, and
/// a clone of \p MI (defining a fresh virtual register) is bundled onto the
/// terminating branch of every predecessor; the PHI merges those clones.
/// The original \p MI is then erased.
///
/// NOTE(review): assumes operand 0 of \p MI is its (sole) register def, and
/// that every predecessor ends in an explicit branch to this block (see
/// getBranchWithDest) — confirm fall-through predecessors cannot occur here.
static inline void moveInsBeforePhis(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  auto &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
  auto &MRI = MF.getRegInfo();

  // Locate the first PHI in the block, if any.
  bool PhiSeen = false;
  MachineBasicBlock::iterator FirstPhi;
  for (FirstPhi = MBB.begin(); FirstPhi != MBB.end(); FirstPhi++)
    if (FirstPhi->getOpcode() == AMDGPU::PHI) {
      PhiSeen = true;
      break;
    }

  if (!PhiSeen) {
    // No PHIs: hoisting MI to the top of the block is sufficient.
    MI.removeFromParent();
    MBB.insert(MBB.begin(), &MI);
  } else {
    // PHIs present: MI cannot simply be placed above them (PHIs must stay
    // first). Instead, replicate MI into each predecessor and merge the
    // per-predecessor results with a new PHI defining MI's original register.
    auto Phi = BuildMI(MBB, FirstPhi, MI.getDebugLoc(), TII.get(AMDGPU::PHI),
                       MI.getOperand(0).getReg());
    for (auto *PredMBB : MBB.predecessors()) {
      // Each clone defines a fresh virtual register of the same class.
      Register ClonedReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
      MachineInstr &BranchMI = getBranchWithDest(*PredMBB, MBB);
      MachineInstr *ClonedMI = MF.CloneMachineInstr(&MI);
      ClonedMI->getOperand(0).setReg(ClonedReg);
      Phi.addReg(ClonedReg).addMBB(PredMBB);
      // Bundle the clone onto the predecessor's branch so later passes treat
      // it as part of that branch's epilog.
      PredMBB->insertAfterBundle(BranchMI.getIterator(), ClonedMI);
      ClonedMI->bundleWithPred();
    }
    MI.eraseFromParent();
  }
}

/// Forward iterator over a branch "epilog": the instructions bundled after a
/// branch, up to (but not including) the next branch or the end of the block.
/// Incrementing onto another branch snaps the iterator to the block's
/// instruction-list end, so each epilog belongs to exactly one branch.
struct EpilogIterator {
  MachineBasicBlock::instr_iterator InternalIt;
  EpilogIterator(MachineBasicBlock::instr_iterator I) : InternalIt(I) {}

  // Comparison does not mutate; const-qualified so const iterators compare.
  bool operator==(const EpilogIterator &Other) const {
    return InternalIt == Other.InternalIt;
  }
  bool operator!=(const EpilogIterator &Other) const {
    return !(*this == Other);
  }
  /// True once the iterator has run past the last epilog instruction.
  bool isEnd() const { return InternalIt.isEnd(); }
  MachineInstr &operator*() { return *InternalIt; }
  MachineBasicBlock::instr_iterator operator->() { return InternalIt; }
  EpilogIterator &operator++() {
    ++InternalIt;
    // A subsequent branch starts a new epilog; treat it as the end of ours.
    if (!InternalIt.isEnd() && InternalIt->isBranch())
      InternalIt = InternalIt->getParent()->instr_end();
    return *this;
  }
  EpilogIterator operator++(int) {
    EpilogIterator ToReturn = *this;
    ++*this;
    return ToReturn;
  }
};

/// Return an iterator to the first epilog instruction bundled after the
/// branch in \p PredMBB that targets \p SuccMBB.
///
/// Fall-through is legal MIR at this stage, so \p PredMBB may reach
/// \p SuccMBB without an explicit branch; in that case there is no branch to
/// carry an epilog, and an end iterator (an empty epilog) is returned rather
/// than asserting.
static inline EpilogIterator getEpilogForSuccessor(MachineBasicBlock &PredMBB,
                                                   MachineBasicBlock &SuccMBB) {
  MachineFunction &MF = *PredMBB.getParent();
  auto &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();

  for (MachineInstr &BranchMI : reverse(PredMBB.instrs()))
    if (BranchMI.isBranch() && TII.getBranchDestBlock(BranchMI) == &SuccMBB)
      return ++EpilogIterator(BranchMI.getIterator());

  // No explicit branch to SuccMBB: PredMBB falls through, so its epilog for
  // this edge is empty.
  return EpilogIterator(PredMBB.instr_end());
}

/// Return true if \p Left and \p Right contain the same number of
/// instructions and corresponding instructions are pairwise identical.
///
/// \p SuccMBB is currently unused; it is kept so the epilog helpers share a
/// uniform signature. The vectors are taken by const reference — the
/// previous by-value parameters copied each epilog on every comparison.
static inline bool epilogsAreIdentical(const vector<MachineInstr *> &Left,
                                       const vector<MachineInstr *> &Right,
                                       const MachineBasicBlock &SuccMBB) {
  if (Left.size() != Right.size())
    return false;

  for (unsigned I = 0, E = Left.size(); I != E; ++I)
    if (!Left[I]->isIdenticalTo(*Right[I]))
      return false;
  return true;
}

/// Detach every instruction in \p Body from its bundle and place the whole
/// sequence, in order, at the top of \p DestMBB.
static inline void moveBody(vector<MachineInstr *> &Body,
                            MachineBasicBlock &DestMBB) {
  // Inserting each instruction before the block's original first instruction
  // preserves Body's order without iterating in reverse.
  MachineBasicBlock::iterator InsertPt = DestMBB.begin();
  for (MachineInstr *BodyIns : Body) {
    BodyIns->removeFromBundle();
    DestMBB.insert(InsertPt, BodyIns);
  }
}

/// Normalize the CFG after PHI elimination by lowering branch "epilogs"
/// (instructions bundled after branches, see EpilogIterator) back into real
/// block bodies.
///
/// For each block, predecessors whose epilogs for this block are identical
/// are grouped. If every predecessor shares one identical epilog, that epilog
/// is moved directly into the top of the block and erased from each
/// predecessor's bundle. Otherwise, a rewrite entry is journaled and later
/// materialized as a new "mezzanine" block holding the epilog: each grouped
/// predecessor is re-targeted to the mezzanine, which branches on to the
/// original successor.
static inline void normalizeIrPostPhiElimination(MachineFunction &MF) {
  auto &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();

  // One journaled rewrite: a set of predecessors sharing the epilog `Body`
  // on their edges into `SuccMBB`.
  struct CFGRewriteEntry {
    unordered_set<MachineBasicBlock *> PredMBBs;
    MachineBasicBlock *SuccMBB;
    vector<MachineInstr *> Body;
  };

  vector<CFGRewriteEntry> CfgRewriteEntries;
  for (MachineBasicBlock &MBB : MF) {
    CFGRewriteEntry ToInsert = {{}, &MBB, {}};
    for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
      EpilogIterator EpIt = getEpilogForSuccessor(*PredMBB, MBB);

      // Materialize this predecessor's epilog for comparison.
      vector<MachineInstr *> Epilog;
      while (!EpIt.isEnd())
        Epilog.push_back(&*EpIt++);

      if (!epilogsAreIdentical(ToInsert.Body, Epilog, MBB)) {
        if (ToInsert.PredMBBs.size() && ToInsert.Body.size()) {
          // Potentially, we need to insert a new entry. But first see if we
          // can find an existing entry with the same epilog.
          bool ExistingEntryFound = false;
          for (auto RevIt = CfgRewriteEntries.rbegin();
               RevIt != CfgRewriteEntries.rend() && RevIt->SuccMBB == &MBB;
               RevIt++)
            if (epilogsAreIdentical(RevIt->Body, Epilog, MBB)) {
              RevIt->PredMBBs.insert(PredMBB);
              ExistingEntryFound = true;
              break;
            }

          if (!ExistingEntryFound)
            CfgRewriteEntries.push_back(ToInsert);
        }
        // Start accumulating a new group around this epilog.
        ToInsert.PredMBBs.clear();
        ToInsert.Body = Epilog;
      }

      ToInsert.PredMBBs.insert(PredMBB);
    }

    // Handle the last potential rewrite entry. Lower instead of journaling a
    // rewrite entry if all predecessor MBBs are in this single entry.
    if (ToInsert.PredMBBs.size() == MBB.pred_size()) {
      // An empty shared epilog means there is nothing to move or erase; skip
      // the per-predecessor pass entirely (this also avoids requiring an
      // explicit branch in predecessors that contribute no epilog).
      if (!ToInsert.Body.empty()) {
        moveBody(ToInsert.Body, MBB);
        for (MachineBasicBlock *PredMBB : ToInsert.PredMBBs) {
          // Delete instructions that were lowered from epilog
          MachineInstr &BranchIns =
              getBranchWithDest(*PredMBB, *ToInsert.SuccMBB);
          auto EpilogIt = ++EpilogIterator(BranchIns.getIterator());
          while (!EpilogIt.isEnd())
            EpilogIt++->eraseFromBundle();
        }
      }

    } else if (ToInsert.Body.size())
      CfgRewriteEntries.push_back(ToInsert);
  }

  // Perform the journaled rewrites.
  for (auto &Entry : CfgRewriteEntries) {
    MachineBasicBlock *MezzanineMBB = MF.CreateMachineBasicBlock();
    MF.insert(MF.end(), MezzanineMBB);

    // Deal with mezzanine to successor succession.
    BuildMI(MezzanineMBB, DebugLoc(), TII.get(AMDGPU::S_BRANCH))
        .addMBB(Entry.SuccMBB);
    MezzanineMBB->addSuccessor(Entry.SuccMBB);

    // Move instructions to mezzanine block.
    moveBody(Entry.Body, *MezzanineMBB);

    for (MachineBasicBlock *PredMBB : Entry.PredMBBs) {
      // Deal with predecessor to mezzanine succession.
      MachineInstr &BranchIns = getBranchWithDest(*PredMBB, *Entry.SuccMBB);
      assert(BranchIns.getOperand(0).isMBB() && "Branch instruction isn't.");
      BranchIns.getOperand(0).setMBB(MezzanineMBB);
      PredMBB->replaceSuccessor(Entry.SuccMBB, MezzanineMBB);

      // Delete instructions that were lowered from epilog
      auto EpilogIt = ++EpilogIterator(BranchIns.getIterator());
      while (!EpilogIt.isEnd())
        EpilogIt++->eraseFromBundle();
    }
  }
}

// std::hash specialization so llvm::Register can be used as the key of the
// std::unordered_set in hoistUnrelatedCopies. Relies on Register's implicit
// conversion to its underlying unsigned id.
namespace std {
template <> struct hash<Register> {
  std::size_t operator()(const Register &R) const {
    return hash<unsigned>()(R);
  }
};
} // namespace std

/// Hoist "unrelated" instructions out of every branch epilog in \p MF.
///
/// A COPY in a branch's epilog whose source register is not written by a
/// non-COPY instruction of the same epilog, and any IMPLICIT_DEF, is moved
/// out of the bundle to just before the branch.
///
/// NOTE(review): the first pass records getOperand(0) of every non-COPY
/// epilog instruction — assumes operand 0 is its register def; confirm all
/// epilog instructions satisfy this.
static inline void hoistUnrelatedCopies(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &BranchMI : MBB) {
      if (!BranchMI.isBranch())
        continue;

      // Pass 1: collect registers defined by non-COPY epilog instructions.
      // COPYs reading those are "related" to the epilog and must stay.
      unordered_set<Register> RelatedCopySources;
      EpilogIterator EpilogIt = BranchMI.getIterator();
      EpilogIterator CopyMoveIt = ++EpilogIt;
      while (!EpilogIt.isEnd()) {
        if (EpilogIt->getOpcode() != AMDGPU::COPY)
          RelatedCopySources.insert(EpilogIt->getOperand(0).getReg());
        ++EpilogIt;
      }

      // Pass 2: move unrelated COPYs and IMPLICIT_DEFs ahead of the branch.
      // Capture the next iterator before moving, since the move detaches the
      // current instruction from the bundle. Inserting before BranchMI is
      // safe while ranging over MBB's intrusive list.
      while (!CopyMoveIt.isEnd()) {
        EpilogIterator Next = CopyMoveIt;
        ++Next;
        // Explicit parentheses: hoist (unrelated COPY) OR (IMPLICIT_DEF).
        // Same grouping &&/|| precedence already gave, now warning-clean.
        if ((CopyMoveIt->getOpcode() == AMDGPU::COPY &&
             !RelatedCopySources.count(CopyMoveIt->getOperand(1).getReg())) ||
            CopyMoveIt->getOpcode() == AMDGPU::IMPLICIT_DEF) {
          MachineInstr &MIToMove = *CopyMoveIt;
          MIToMove.removeFromBundle();
          MBB.insert(BranchMI.getIterator(), &MIToMove);
        }

        CopyMoveIt = Next;
      }
    }
}
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
/// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "SICustomBranchBundles.h"
#include "SILowerControlFlow.h"
#include "AMDGPU.h"
#include "AMDGPULaneMaskUtils.h"
Expand Down Expand Up @@ -152,6 +153,14 @@ class SILowerControlFlowLegacy : public MachineFunctionPass {
return "SI Lower control flow pseudo instructions";
}

MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().setIsSSA();
}

MachineFunctionProperties getClearedProperties() const override {
return MachineFunctionProperties().setNoPHIs();
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
// Should preserve the same set that TwoAddressInstructions does.
Expand Down Expand Up @@ -323,6 +332,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
if (LV)
LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);

moveInsBeforePhis(*OrSaveExec);

MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

MachineBasicBlock::iterator ElsePt(MI);
Expand Down Expand Up @@ -840,6 +851,12 @@ bool SILowerControlFlow::run(MachineFunction &MF) {
LoweredIf.clear();
KillBlocks.clear();

if (Changed)
for (MachineBasicBlock &MBB : MF)
for (MachineInstr &MI : MBB)
if (MI.isBundled())
MI.unbundleFromSucc();

return Changed;
}

Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ class SILowerControlFlowPass : public PassInfoMixin<SILowerControlFlowPass> {
public:
PreservedAnalyses run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM);

MachineFunctionProperties getRequiredProperties() const {
return MachineFunctionProperties().setIsSSA();
}

MachineFunctionProperties getClearedProperties() const {
return MachineFunctionProperties().setNoPHIs();
}
};
} // namespace llvm

Expand Down
Loading
Loading