[AMDGPU] Add Early Register Spilling #171523
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Konstantina Mitropoulou (kmitropoulou)

Changes

Patch is 477.39 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/171523.diff

23 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..b46ee685f674f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -45,6 +45,8 @@ FunctionPass *createSIWholeQuadModeLegacyPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIOptimizeVGPRLiveRangeLegacyPass();
+FunctionPass *createAMDGPUNextUseAnalysisPass();
+FunctionPass *createAMDGPUEarlyRegisterSpillingPass();
FunctionPass *createSIFixSGPRCopiesLegacyPass();
FunctionPass *createLowerWWMCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
@@ -191,6 +193,12 @@ extern char &SIFixSGPRCopiesLegacyID;
void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
+void initializeAMDGPUNextUseAnalysisPassPass(PassRegistry &);
+extern char &AMDGPUNextUseAnalysisID;
+
+void initializeAMDGPUEarlyRegisterSpillingPass(PassRegistry &);
+extern char &AMDGPUEarlyRegisterSpillingID;
+
void initializeSILowerWWMCopiesLegacyPass(PassRegistry &);
extern char &SILowerWWMCopiesLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.cpp b/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.cpp
new file mode 100644
index 0000000000000..905f788906613
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.cpp
@@ -0,0 +1,940 @@
+//===------------------- AMDGPUEarlyRegisterSpilling.cpp -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUEarlyRegisterSpilling.h"
+#include "AMDGPU.h"
+#include "AMDGPUNextUseAnalysis.h"
+#include "GCNSubtarget.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-early-register-spilling"
+
+STATISTIC(NumOfERSSpills, "Number of ERS spills");
+
+static cl::opt<bool> EarlyRegisterSpilling("early-register-spilling",
+ cl::init(true), cl::Hidden);
+static cl::opt<unsigned>
+ VGPRMaxNums("max-vgprs", cl::init(0), cl::Hidden,
+ cl::desc("The maximum number of VGPRs per wave."));
+
+static bool isVCCRegister(Register Reg) {
+ return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
+}
+
+static bool isSpillSaveInstr(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case AMDGPU::SI_SPILL_S32_SAVE:
+ case AMDGPU::SI_SPILL_S64_SAVE:
+ case AMDGPU::SI_SPILL_S96_SAVE:
+ case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S160_SAVE:
+ case AMDGPU::SI_SPILL_S192_SAVE:
+ case AMDGPU::SI_SPILL_S224_SAVE:
+ case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S288_SAVE:
+ case AMDGPU::SI_SPILL_S320_SAVE:
+ case AMDGPU::SI_SPILL_S352_SAVE:
+ case AMDGPU::SI_SPILL_S384_SAVE:
+ case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S1024_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V96_SAVE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V160_SAVE:
+ case AMDGPU::SI_SPILL_V192_SAVE:
+ case AMDGPU::SI_SPILL_V224_SAVE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V288_SAVE:
+ case AMDGPU::SI_SPILL_V320_SAVE:
+ case AMDGPU::SI_SPILL_V352_SAVE:
+ case AMDGPU::SI_SPILL_V384_SAVE:
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V1024_SAVE:
+ return true;
+ default:
+ return false;
+ }
+ return false;
+}
+
+static bool isSpillRestoreInstr(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case AMDGPU::SI_SPILL_S32_RESTORE:
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
+ case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S160_RESTORE:
+ case AMDGPU::SI_SPILL_S192_RESTORE:
+ case AMDGPU::SI_SPILL_S224_RESTORE:
+ case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S288_RESTORE:
+ case AMDGPU::SI_SPILL_S320_RESTORE:
+ case AMDGPU::SI_SPILL_S352_RESTORE:
+ case AMDGPU::SI_SPILL_S384_RESTORE:
+ case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_S1024_RESTORE:
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_V96_RESTORE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_V160_RESTORE:
+ case AMDGPU::SI_SPILL_V192_RESTORE:
+ case AMDGPU::SI_SPILL_V224_RESTORE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
+ case AMDGPU::SI_SPILL_V288_RESTORE:
+ case AMDGPU::SI_SPILL_V320_RESTORE:
+ case AMDGPU::SI_SPILL_V352_RESTORE:
+ case AMDGPU::SI_SPILL_V384_RESTORE:
+ case AMDGPU::SI_SPILL_V512_RESTORE:
+ case AMDGPU::SI_SPILL_V1024_RESTORE:
+ return true;
+ default:
+ return false;
+ }
+ return false;
+}
+
+void AMDGPUEarlyRegisterSpilling::updateIndexes(MachineInstr *MI) {
+ if (Indexes->hasIndex(*MI))
+ Indexes->removeMachineInstrFromMaps(*MI);
+ Indexes->insertMachineInstrInMaps(*MI);
+}
+
+void AMDGPUEarlyRegisterSpilling::updateLiveness(Register Reg) {
+ if (LIS->hasInterval(Reg))
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+}
+
+void AMDGPUEarlyRegisterSpilling::updateLiveness(MachineInstr *MI) {
+ for (auto &MO : MI->operands()) {
+ if (MO.isReg()) {
+ auto Reg = MO.getReg();
+ if (Reg.isVirtual()) {
+ if (LIS->hasInterval(Reg))
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
+ }
+}
+
+// We need this because it does not make sense to spill a def which has a use in
+// a phi at the beginning of a basic block and it is defined a bit later.
+bool AMDGPUEarlyRegisterSpilling::hasPHIUseInSameBB(Register Reg,
+ MachineBasicBlock *CurMBB) {
+ for (auto &UseMI : MRI->use_nodbg_instructions(Reg))
+ if (UseMI.isPHI() && UseMI.getParent() == CurMBB)
+ return true;
+ return false;
+}
+
+MachineInstr *
+AMDGPUEarlyRegisterSpilling::emitRestore(Register DefRegToSpill,
+ MachineInstr *DefRegUseInstr, int FI) {
+ const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, DefRegToSpill);
+ Register NewReg = MRI->createVirtualRegister(RC);
+ RestoredRegs.insert(NewReg);
+ MachineBasicBlock *DefRegUseInstrBB = DefRegUseInstr->getParent();
+ MachineInstr *Restore = nullptr;
+ assert(DefRegUseInstr->getOpcode() != AMDGPU::PHI &&
+ "We cannot emit a restore instruction before a phi node");
+ TII->loadRegFromStackSlot(*DefRegUseInstrBB, DefRegUseInstr->getIterator(),
+ NewReg, FI, RC, 0);
+ Restore = DefRegUseInstr->getPrevNode();
+ DefRegUseInstr->substituteRegister(DefRegToSpill, NewReg, 0, *TRI);
+ LIS->InsertMachineInstrInMaps(*Restore);
+ LLVM_DEBUG(dbgs() << "Emit restore before use: " << *DefRegUseInstr);
+ LLVM_DEBUG(dbgs() << "Restore instruction = " << *Restore);
+ LLVM_DEBUG(dbgs() << "Restore block = " << Restore->getParent()->getName()
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Register to replace spilled register = "
+ << printReg(NewReg, TRI) << "\n");
+ return Restore;
+}
+
+MachineInstr *
+AMDGPUEarlyRegisterSpilling::emitRestore(Register DefRegToSpill,
+ MachineBasicBlock *InsertBB, int FI) {
+ assert(InsertBB && "The value of the block is nullptr.");
+ const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, DefRegToSpill);
+ Register NewReg = MRI->createVirtualRegister(RC);
+ RestoredRegs.insert(NewReg);
+ auto It = InsertBB->getFirstTerminator();
+ if (It == InsertBB->end())
+ It = InsertBB->instr_end();
+ TII->loadRegFromStackSlot(*InsertBB, It, NewReg, FI, RC, 0);
+ MachineInstr *Restore = &*(std::prev(It));
+ LIS->InsertMachineInstrInMaps(*Restore);
+ LLVM_DEBUG(dbgs() << "Restore instruction = " << *Restore);
+ LLVM_DEBUG(dbgs() << "Emit restore at the end of basic block: = "
+ << Restore->getParent()->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "Register to replace spilled register = "
+ << printReg(NewReg, TRI) << "\n");
+ return Restore;
+}
+
+// TODO: Enable spill spill and restore instructions.
+bool AMDGPUEarlyRegisterSpilling::isLegalToSpill(Register CandidateReg) {
+ assert(MRI->hasOneDef(CandidateReg) &&
+ "The Register does not have one definition");
+ MachineInstr *CandidateMI = MRI->getOneDef(CandidateReg)->getParent();
+ return !hasPHIUseInSameBB(CandidateReg, CandidateMI->getParent()) &&
+ !MRI->use_nodbg_empty(CandidateReg) &&
+ !isSpillSaveInstr(CandidateMI) && !isSpillRestoreInstr(CandidateMI) &&
+ !isSpilledReg(CandidateReg) && !isRestoredReg(CandidateReg) &&
+ !CandidateReg.isPhysical() && !TRI->isAGPR(*MRI, CandidateReg) &&
+ !isVCCRegister(CandidateReg) && !CandidateMI->isTerminator() &&
+ TRI->isVGPR(*MRI, CandidateReg);
+}
+
+SmallVector<Register> AMDGPUEarlyRegisterSpilling::getRegistersToSpill(
+ MachineInstr *CurMI, GCNDownwardRPTracker &RPTracker) {
+ MachineBasicBlock *CurMBB = CurMI->getParent();
+ bool IsCurMIInLoop = MLI->getLoopFor(CurMI->getParent());
+ SmallVector<std::pair<Register, uint64_t>> RegCandidates;
+ MachineLoop *OutermostLoop = nullptr;
+ uint64_t LoopDistance = 0;
+ LLVM_DEBUG(dbgs() << "===========================================\n");
+ if (IsCurMIInLoop) {
+ OutermostLoop = MLI->getLoopFor(CurMI->getParent())->getOutermostLoop();
+ auto [DistanceFromHeaderToExitingLatch, ExitingLatch] =
+ NUA.getLoopDistanceAndExitingLatch(OutermostLoop->getHeader());
+ LoopDistance =
+ (DistanceFromHeaderToExitingLatch + ExitingLatch->size()) * LoopWeight;
+ }
+
+ for (auto [CandidateReg, Mask] : RPTracker.getLiveRegs()) {
+
+ if (!isLegalToSpill(CandidateReg))
+ continue;
+
+ assert(MRI->hasOneDef(CandidateReg) &&
+ "The Register does not have one definition");
+ MachineInstr *CandidateMI = MRI->getOneDef(CandidateReg)->getParent();
+ MachineLoop *CandidateLoop = MLI->getLoopFor(CandidateMI->getParent());
+ bool IsLoopCandidate =
+ IsCurMIInLoop &&
+ (!CandidateLoop || (CandidateLoop && OutermostLoop &&
+ ((CandidateLoop != OutermostLoop) ||
+ !OutermostLoop->contains(CandidateLoop))));
+
+ if (IsCurMIInLoop && !IsLoopCandidate)
+ continue;
+
+ SmallVector<MachineInstr *> Uses;
+ for (auto &UseMI : MRI->use_nodbg_instructions(CandidateReg)) {
+ MachineBasicBlock *UseMBB = UseMI.getParent();
+ if (isReachable(CurMBB, UseMBB) ||
+ (CurMBB == UseMBB && DT->dominates(CurMI, &UseMI)))
+ Uses.push_back(&UseMI);
+ }
+
+ if (Uses.empty())
+ continue;
+
+ auto NextUseDist = NUA.getNextUseDistance(CandidateReg, CurMI, Uses);
+
+ if (!IsCurMIInLoop) {
+ // If CurMI is not in a loop, then we collect the registers that we
+ // can spill based on their next-use-distance from CurMI in
+ // 'RegCandidates'.
+ RegCandidates.push_back(std::make_pair(CandidateReg, *NextUseDist));
+ LLVM_DEBUG(dbgs() << "Candidate register to spill = "
+ << printReg(CandidateReg, TRI)
+ << " with distance = " << *NextUseDist << "\n");
+ } else if (IsLoopCandidate && (NextUseDist > LoopDistance)) {
+ // Collect only the live-through values.
+ RegCandidates.push_back(std::make_pair(CandidateReg, *NextUseDist));
+ LLVM_DEBUG(dbgs() << "Candidate register to spill = "
+ << printReg(CandidateReg, TRI)
+ << " with distance = " << *NextUseDist << "\n");
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "==========================================\n");
+ if (RegCandidates.empty())
+ return {};
+
+ // Return the registers with the longest next-use distance.
+ llvm::sort(RegCandidates, [](auto &Pair1, auto &Pair2) {
+ return Pair1.second > Pair2.second;
+ });
+
+ SmallVector<Register> RegistersToSpill;
+ RegistersToSpill.reserve(RegCandidates.size());
+ for (auto P : RegCandidates)
+ RegistersToSpill.push_back(P.first);
+
+ return RegistersToSpill;
+}
+
+// Helper function for finding the incoming blocks that are related to
+// DefRegToSpill
+static SmallVector<MachineBasicBlock *>
+getPhiBlocksOfSpillReg(MachineInstr *UseMI, Register DefRegToSpill) {
+ assert(UseMI->isPHI() && "The use is not phi instruction");
+ SmallVector<MachineBasicBlock *> Blocks;
+ auto Ops = UseMI->operands();
+ for (auto It = std::next(Ops.begin()), ItE = Ops.end(); It != ItE;
+ It = std::next(It, 2)) {
+ auto &RegMO = *It;
+ if (RegMO.isUndef())
+ continue;
+ auto &MBBMO = *std::next(It);
+ assert(RegMO.isReg() && "Expected register operand of PHI");
+ assert(MBBMO.isMBB() && "Expected MBB operand of PHI");
+ if (RegMO.getReg() == DefRegToSpill)
+ Blocks.push_back(MBBMO.getMBB());
+ }
+ return Blocks;
+}
+
+// TODO: check if the common dominator of restores is profitable
+bool AMDGPUEarlyRegisterSpilling::shouldEmitRestoreInCommonDominator(
+ MachineBasicBlock *SpillBlock, MachineBasicBlock *CurMBB,
+ MachineBasicBlock *CommonDominatorToRestore) {
+ if (SpillBlock == CommonDominatorToRestore)
+ return false;
+ else if (CurMBB == CommonDominatorToRestore)
+ return false;
+ else if (DT->dominates(CommonDominatorToRestore, SpillBlock))
+ return false;
+ else if (isReachable(CommonDominatorToRestore, SpillBlock))
+ return false;
+ else if (!DT->dominates(SpillBlock, CommonDominatorToRestore))
+ return false;
+ else if (MLI->getLoopFor(CommonDominatorToRestore))
+ return false;
+ return true;
+}
+
+/// Helper data structure for grouping together uses where the head of the group
+/// dominates all the other uses in the group.
+class DomGroup {
+ SmallVector<MachineInstr *> Uses;
+ SmallVector<MachineBasicBlock *> UseBlocks;
+ MachineBasicBlock *CommonDominator = nullptr;
+ bool Deleted = false;
+
+public:
+ DomGroup(MachineInstr *MI, MachineBasicBlock *RestoreBlock) {
+ Uses.push_back(MI);
+ UseBlocks.push_back(RestoreBlock);
+ }
+ MachineInstr *getHead() const { return Uses.front(); }
+ bool isDeleted() const { return Deleted; }
+ void merge(DomGroup &Other) {
+ for (auto *MI : Other.Uses)
+ Uses.push_back(MI);
+
+ for (auto *UseMBB : Other.UseBlocks)
+ UseBlocks.push_back(UseMBB);
+
+ Other.Deleted = true;
+ }
+ const auto &getUses() const { return Uses; }
+ const auto &getUseBlocks() const { return UseBlocks; }
+ size_t size() const { return Uses.size(); }
+ void setCommonDominator(MachineBasicBlock *CD) { CommonDominator = CD; }
+ MachineBasicBlock *getCommonDominator() const { return CommonDominator; }
+ bool hasCommonDominator() const { return CommonDominator != nullptr; }
+ MachineBasicBlock *getRestoreBlock() const { return UseBlocks.front(); }
+};
+
+void AMDGPUEarlyRegisterSpilling::emitRestoreInstrsForDominatedUses(
+ Register DefRegToSpill, MachineInstr *SpillInstruction, MachineInstr *CurMI,
+ SetVectorType &DominatedUses, SmallVector<MachineInstr *> &RestoreInstrs,
+ SmallVector<MachineInstr *> &RestoreUses, int FI) {
+ MachineBasicBlock *SpillBlock = SpillInstruction->getParent();
+ MachineBasicBlock *CurMBB = CurMI->getParent();
+ MachineLoop *SpillLoop = MLI->getLoopFor(SpillBlock);
+ assert(!SpillLoop && "There should not be a spill loop.");
+
+ std::vector<DomGroup> Groups;
+ for (auto *Use : DominatedUses) {
+ MachineLoop *UseLoop = MLI->getLoopFor(Use->getParent());
+ if (UseLoop) {
+ // If a use is in a loop then the restore instruction is emitted in the
+ // outermost loop's preheader.
+ MachineLoop *OutermostLoop = UseLoop->getOutermostLoop();
+ MachineBasicBlock *OutermostLoopPreheader =
+ OutermostLoop->getLoopPreheader();
+ Groups.emplace_back(Use, OutermostLoopPreheader);
+ } else if (Use->isPHI()) {
+ // In case of phi nodes, the restore instructions are emitted at the
+ // bottom of the incoming blocks.
+ for (MachineBasicBlock *PhiOpMBB :
+ getPhiBlocksOfSpillReg(Use, DefRegToSpill)) {
+ Groups.emplace_back(Use, PhiOpMBB);
+ }
+ } else {
+ // Emit restore before Use.
+ Groups.emplace_back(Use, Use->getParent());
+ }
+ }
+
+ // Our goal is to emit as few restores as possible by avoiding emitting
+ // restore instructions if an earlier restore can be reused.
+ //
+ // Create groups of instructions where the group head dominates the rest in
+ // the group. In addition, we check if we can find an eligible common
+ // dominator that we can emit the restore isntruction.
+ //
+ // In the following example, there are two groups. The first group consists of
+ // the uses in BB3 and BB5 and the second group consists of the uses in BB4
+ // and BB6. The head of the first group is the use in BB3 and the head of the
+ // second group is the use in BB4.
+ //
+ // BB1
+ // r1 = ...
+ // |
+ // BB2
+ // spill r1 <-- high register pressure block
+ // / \
+ // BB3 BB4
+ // r2 = restore r1 r3 = restore r1
+ // ... = r2 ... = r3
+ // | |
+ // BB5 BB6
+ // ... = r2 ... = r3
+ //
+ // In the following example, we emit the restore instruction in the common
+ // dominator of the two uses in BB4 and BB5.
+ // BB1
+ // r1 = ...
+ // |
+ // BB2
+ // spill r1 <-- high register pressure block
+ // |
+ // BB3
+ // r2 = restore r1
+ // / \
+ // BB4 BB5
+ // ... = r2 ... = r2
+ //
+ for (unsigned Idx1 = 0, E = Groups.size(); Idx1 != E; ++Idx1) {
+ auto &G1 = Groups[Idx1];
+ if (G1.isDeleted())
+ continue;
+ for (unsigned Idx2 = Idx1 + 1; Idx2 < E; ++Idx2) {
+ auto &G2 = Groups[Idx2];
+ if (G2.isDeleted())
+ continue;
+
+ MachineInstr *Head1 = G1.getHead();
+ MachineInstr *Head2 = G2.getHead();
+ MachineBasicBlock *RestoreBlock1 = G1.getRestoreBlock();
+ MachineBasicBlock *RestoreBlock2 = G2.getRestoreBlock();
+ SmallVector<MachineBasicBlock *> UseBlocks;
+ for (auto *Block : G1.getUseBlocks())
+ UseBlocks.push_back(Block);
+
+ for (auto *Block : G2.getUseBlocks())
+ UseBlocks.push_back(Block);
+
+ MachineBasicBlock *CommonDom = DT->findNearestCommonDominator(
+ make_range(UseBlocks.begin(), UseBlocks.end()));
+
+ if ((RestoreBlock1 != RestoreBlock2) &&
+ shouldEmitRestoreInCommonDominator(SpillBlock, CurMBB, CommonDom)) {
+ // Set a common dominator if the two restore blocks are different.
+ G1.merge(G2);
+ G1.setCommonDominator(CommonDom);
+ } else if (DT->dominates(Head1, Head2) && !G1.getCommonDominator() &&
+ !G2.getCommonDominator()) {
+ // If there is no common dominator and one Head dominates the other,
+ // then we can merge the two groups.
+ G1.merge(G2);
+ }
+ }
+ }
+
+ // For each group emit one restore for the group header in the parent block of
+ // the group header or the common dominator. The rest of the uses in the group
+ // will reuse the value loaded by the restore of the header.
+ for (auto &G1 : Groups) {
+ if (G1.isDeleted())
+ continue;
+ MachineInstr *Head = G1.getHead();
+ MachineBasicBlock *HeadMBB = G1.getRestoreBlock();
+ MachineInstr *Restore = nullptr;
+ if (G1.hasCommonDominator()) {
+ MachineBasicBlock *CommonDominator = G1.getCommonDominator();
+ MachineInstr *UseInCommonDominator = nullptr;
+ for (auto *U : G1.getUses()) {
+ if (U->getParent() == CommonDominator) {
+ if (UseInCommonDominator) {
+ if (DT->dominates(U, UseInCommonDominator))
+ Us...
[truncated]
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.cpp llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.h llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.h llvm/test/CodeGen/AMDGPU/test_ers_basic_case.ll llvm/test/CodeGen/AMDGPU/test_ers_do_not_spill_restore_inside_loop.ll llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_common_dominator.ll llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader1.ll llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader2.ll llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader3.ll llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader4.ll llvm/test/CodeGen/AMDGPU/test_ers_keep_spilled_reg_live.ll llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills1.ll llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills2.ll llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills3.ll llvm/test/CodeGen/AMDGPU/test_ers_nested_loops.ll llvm/test/CodeGen/AMDGPU/test_ers_spill_in_common_dominator_and_optimize_restores.ll llvm/test/CodeGen/AMDGPU/test_ers_spill_loop_livethrough_reg.ll llvm/test/CodeGen/AMDGPU/test_ers_spill_loop_value_in_exit_block.ll llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

The following files introduce new uses of undef:

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef and having tests that trigger undefined behavior. For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
rovka left a comment:
(I'm still making my way through this, but I'm off for lunch and I don't particularly trust github to preserve these comments).
    AMDGPUNextUseAnalysis NUA;
    /// Keep the registers that are spilled.
    DenseSet<Register> SpilledRegs;
    /// keep the output registers of the restored instructions.

Suggested change:
-  /// keep the output registers of the restored instructions.
+  /// Keep the output registers of the restored instructions.

Reply: Done
    DenseSet<Register> SpilledRegs;
    /// keep the output registers of the restored instructions.
    DenseSet<Register> RestoredRegs;
    /// Similar to next-use distance analysis, ee assume an approximate trip count

Suggested change:
-  /// Similar to next-use distance analysis, ee assume an approximate trip count
+  /// Similar to next-use distance analysis, we assume an approximate trip count

Reply: Done
    const SetVectorType &UnreachableUses,
    const TargetRegisterClass *RC, int FI);

    /// Main spill functions that emits the spill and restore code.

Suggested change:
-  /// Main spill functions that emits the spill and restore code.
+  /// Main spill function that emits the spill and restore code.

Reply: Done
    void spill(MachineInstr *CurMI, GCNDownwardRPTracker &RPTracker,
               unsigned NumOfSpills);

    /// Emit restore instructions where it is needed

Suggested change:
-  /// Emit restore instructions where it is needed
+  /// Emit restore instruction where it is needed.

Reply: Done
    /// Emit restore instructions where it is needed
    MachineInstr *emitRestore(Register SpillReg, MachineInstr *UseMI, int FI);
    /// Emit restore instruction at the end of a basic block

Suggested change:
-  /// Emit restore instruction at the end of a basic block
+  /// Emit restore instruction at the end of a basic block.

Reply: Done
    void AMDGPUEarlyRegisterSpilling::updateIndexes(MachineInstr *MI) {
      if (Indexes->hasIndex(*MI))
        Indexes->removeMachineInstrFromMaps(*MI);
      Indexes->insertMachineInstrInMaps(*MI);

Comment: Oh, interesting! I don't think this was marked as preserved in getAnalysisUsage though... (Ditto for Liveness)

Reply: Let's add a TODO for now and I will remove it later.
    MachineInstr *
    AMDGPUEarlyRegisterSpilling::emitRestore(Register DefRegToSpill,
                                             MachineBasicBlock *InsertBB, int FI) {
      assert(InsertBB && "The value of the block is nullptr.");

Comment: Make it a reference?

Reply: Done
    !isSpillSaveInstr(CandidateMI) && !isSpillRestoreInstr(CandidateMI) &&
    !isSpilledReg(CandidateReg) && !isRestoredReg(CandidateReg) &&
    !CandidateReg.isPhysical() && !TRI->isAGPR(*MRI, CandidateReg) &&
    !isVCCRegister(CandidateReg) && !CandidateMI->isTerminator() &&

Comment: Isn't VCC already covered by the check for physical registers?

Reply: Good catch!
    return Restore;
    }

    // TODO: Enable spill spill and restore instructions.

Comment: Did you mean SGPR spill?

Reply: No, I updated the comment. I meant that this function needs more tuning.
    return Pair1.second > Pair2.second;
    });

    SmallVector<Register> RegistersToSpill;

Suggested change:
-  SmallVector<Register> RegistersToSpill;
+  SmallVector<Register> RegistersToSpill(llvm::make_first_range(RegCandidates));

Reply: Thank you!
    MachineBasicBlock *CommonDominatorToRestore) {
      if (SpillBlock == CommonDominatorToRestore)
        return false;
      else if (CurMBB == CommonDominatorToRestore)

Comment: No need for else after return. (Maybe a matter of taste, but this whole function can just be one return statement.)

Reply: I like your taste :)
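For reference, a single-return version along the lines of this suggestion could look like the sketch below (names and checks are taken from the patch as posted; the committed form may differ):

bool AMDGPUEarlyRegisterSpilling::shouldEmitRestoreInCommonDominator(
    MachineBasicBlock *SpillBlock, MachineBasicBlock *CurMBB,
    MachineBasicBlock *CommonDominatorToRestore) {
  // Emit the restore in the common dominator only if it is a distinct,
  // non-loop block that the spill block strictly dominates and that
  // cannot reach the spill block back.
  return SpillBlock != CommonDominatorToRestore &&
         CurMBB != CommonDominatorToRestore &&
         !DT->dominates(CommonDominatorToRestore, SpillBlock) &&
         !isReachable(CommonDominatorToRestore, SpillBlock) &&
         DT->dominates(SpillBlock, CommonDominatorToRestore) &&
         !MLI->getLoopFor(CommonDominatorToRestore);
}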
    //
    // Create groups of instructions where the group head dominates the rest in
    // the group. In addition, we check if we can find an eligible common
    // dominator that we can emit the restore isntruction.

Suggested change:
-  // dominator that we can emit the restore isntruction.
+  // dominator where we can emit the restore instruction.

Reply: Done
    FI);
    }
    // Reset the tracker because it has already read the next instruction which
    // we might have modified by a emitting a spill or restore instruction.

Suggested change:
-  // we might have modified by a emitting a spill or restore instruction.
+  // we might have modified by emitting a spill or restore instruction.

Reply: Done
jmmartinez left a comment:
Some more comments (sorry it's a big PR)
    MachineLoop *LoopBB = MLI->getLoopFor(BB);
    MachineLoop *LoopTo = MLI->getLoopFor(ToMBB);
    if (LoopBB && LoopTo &&
        (LoopTo->contains(LoopBB) && (LoopTo != LoopBB))) {
      return BB->size() * LoopWeight *
             (MLI->getLoopDepth(BB) - MLI->getLoopDepth(ToMBB));
    }
    if ((LoopBB && LoopTo && LoopBB->contains(LoopTo))) {
      return BB->size();
    }
    if ((!LoopTo && LoopBB) ||
        (LoopBB && LoopTo && !LoopTo->contains(LoopBB))) {
      return BB->size() * LoopWeight * MLI->getLoopDepth(BB);
    } else
      return BB->size();
Comment: Maybe use some intermediate boolean variables to carry the intent of the condition:

    MachineLoop *LoopBB = MLI->getLoopFor(BB);
    MachineLoop *LoopTo = MLI->getLoopFor(ToMBB);
    bool LoopBBIsStrictSubloopOfLoopTo = LoopBB && LoopTo &&
        LoopTo != LoopBB && LoopTo->contains(LoopBB);
    if (LoopBBIsStrictSubloopOfLoopTo) {
      return BB->size() * LoopWeight *
             (MLI->getLoopDepth(BB) - MLI->getLoopDepth(ToMBB));
    }
    bool LoopBBIsSubloopOfLoopTo = LoopBB && LoopTo && LoopBB->contains(LoopTo);
    if (LoopBBIsSubloopOfLoopTo) {
      return BB->size();
    }
    bool LoopBBInLoopUnrelatedToLoopTo = (!LoopTo && LoopBB) ||
        (LoopBB && LoopTo && !LoopTo->contains(LoopBB));
    if (LoopBBInLoopUnrelatedToLoopTo) {
      return BB->size() * LoopWeight * MLI->getLoopDepth(BB);
    } else
      return BB->size();
For the LoopBBIsSubloopOfLoopTo case, since !LoopBBIsStrictSubloopOfLoopTo is implied, I wonder if the simpler condition bool InTheSameLoop = LoopBB && LoopTo == LoopBB would be enough.

One thing I find odd is that the score formula doesn't behave consistently if BB and ToMBB are in the same loop. Consider this:

BB->size() * LoopWeight * (MLI->getLoopDepth(BB) - MLI->getLoopDepth(ToMBB)) != BB->size();

If I get the intention of LoopWeight, it tries to represent the number of iterations that a loop would make. If we have the following CFG:

FromBB -> LoopBB -> ToBB
          ^_|

The distance from FromBB to ToBB would be:

size(FromBB) + size(LoopBB) * LoopWeight

If we have a depth-2 loop:

FromBB -> OuterLoopBB -> InnerLoopBB -> ToBB
          ^_________|    ^_|

The distance would be:

size(FromBB) + size(OuterLoopBB) * LoopWeight + size(InnerLoopBB) * 2 * LoopWeight

But if we consider that the outer and inner loops may execute LoopWeight times, a more consistent distance would be:

size(FromBB) + (size(OuterLoopBB) + size(InnerLoopBB) * LoopWeight) * LoopWeight

Perhaps a more consistent formula for nested loops would be:

BB->size() * pow(LoopWeight, (MLI->getLoopDepth(BB) - MLI->getLoopDepth(ToMBB))); // LoopWeight may have to be adjusted

Then, if BB and ToMBB are in the same loop we return BB->size(), and every loop depth contributes.

The goal is to end up with this code for the distance:

bool LoopBBIsSubloop = LoopBB && LoopTo && LoopBB->contains(LoopTo);
unsigned LoopBBDepth = LoopBB ? MLI->getLoopDepth(BB) : 0;
unsigned LoopToDepth = LoopTo ? MLI->getLoopDepth(ToMBB) : 0;
unsigned DepthDiff = LoopBBIsSubloop ? LoopBBDepth - LoopToDepth : LoopBBDepth;
return BB->size() * pow(LoopWeight, DepthDiff);
return BB->size() * pow(LoopWeight, DepthDiff)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If FromBB and ToBB are in the same loop, then you do not need to take into consideration the loopWeight.
In addition, you should take into consideration that the nested loops have overlapping basic blocks.
Let's assume that we have the following GFG. To calculate the next-use distance of r1, we should first find the distance of the loop nest as folllows:
loop1_header * loopDepth * loopWeight = 10 * 1 * 1000 = 10000
loop1_bb * loopDepth * loopWeight = 10 * 1 * 1000 = 10000
loop2_header * loopDepth *loopWeight = 10 * 2 * 1000 = 20000
loop3 * loopDepth *loopWeight = 10 * 3 * 1000 = 30000
loop2_latch * loopDepth *loopWeight = 10 * 2 * 1000 = 20000
loop1_latch * loopDepth *loopWeight = 10 * 1 * 1000 = 10000
Hence, the loop distance is 100000.
Next, you add the distance from r1 to end of bb1 and the distance from the beginning of bb2 to r1.
+------------------+
| bb1 |
| r1 = ... |
+------------------+
|
+<-----------------+
+------------------+ |
| loop1_header | |
| 10 instrs | |
+------------------+ |
| |
+------------------+ |
| loop1_bb | |
| 10 instrs | |
+------------------+ |
| |
+<--------------+ |
+------------------+ | |
| loop2_header | | |
| 10 instrs | | |
+------------------+ | |
| | |
+<-----------+ | |
+------------------+ | | |
| loop3 | | | |
| 10 instrs | | | |
+------------------+ | | |
| | | |
+------------+ | |
+------------------+ | |
| loop2_latch | | |
| 10 instrs | | |
+------------------+ | |
| | |
+---------------+ |
+------------------+ |
| loop1_latch | |
| 10 instrs | |
+------------------+ |
| |
+------------------+
+------------------+
| bb2 |
| ... = r1 |
+------------------+
|
Reply: @jmmartinez PR #171520 prints the distances for each register of each test; you can double-check there.
Reply:

> If FromBB and ToBB are in the same loop, then you do not need to take the LoopWeight into consideration.

Yes. In the formula I propose we'd end up with BB->size() * pow(LoopWeight, 0) since they are in the same loop, which simplifies to BB->size().

What I wanted to point out is that the weight given to a basic block should grow exponentially with the nesting, not linearly. Here bb1 is executed n times, but bb2 is executed n*n times:
r1 = define()
for 0 ... n
// bb1
for 0 ... n
// bb2
use(r1)
Between r1 definition and its use we get n*#bb1 + n*n*#bb2. If we replace n by the loop weight we get the formula I proposed. Using n*#bb1 + 2*n*#bb2 underapproximates the weight of nested loops by an order of magnitude.
Is this a bad thing? I'm not sure since in any case both formulas are rough approximations.
Reply: Yes, I will update the patch soon. This has been on my TODO list for a long time.

Reply: Done.
    MachineInstr *InstrOfDefRegToSpill =
        MRI->getOneDef(DefRegToSpill)->getParent();

    // Collect the uses that are dominated by SpillInstruction

Comment: The comment is confusing. It should explicitly mention that it excludes the uses in UnreachableUses.

Reply: I agree. I changed the name of the function to collectUsesThatNeedRestoreInstrs().
    if (SpillBlock == nullptr)
      continue;
    // The next step is to check if there are any uses which are reachable
    // from the SpillBlock. In this case, we have to emit the spill in the

Comment: I don't see why we have to. You can still do the reload in the post-dominator of SpillBlock and construct a PHI in the successor block to merge the value kept in a register with the reloaded one.

Reply: This creates correctness issues in divergent execution.

Comment: Could you help explain why?

Reply: In the following example, the spill and the restore will be executed under different execution masks:
+--------------+
| BB1 |
| r1 = ... |
+--------------+
/ |
+--------------+ |
| BB2 | |
| spill r1 | |
+--------------+ |
\ |
+--------------+
| BB3 |
| |
+--------------+
|
+--------------+
| BB4 |
|r1'=restore r1|
| ... = r1' |
+--------------+
Comment: That's a weird selection of spill/restore points. For this case, why do you spill r1 in BB2? I think this usually happens when you try to reduce the register pressure in BB2, right? If BB2 is the only high-register-pressure region, we could keep r1 on the stack only in BB2, while in the other live ranges it is still kept in a register. Like:
+--------------+
| BB1 |
| r1 = ... |
+--------------+
/ |
+--------------+ |
| BB2 | |
| spill r1 | |
| high pressure| |
|r1'= reload r1| |
+--------------+ |
\ |
+--------------+
| BB3 |
|r1''=phi(r1,r1')|
| |
+--------------+
|
+--------------+
| BB4 |
| |
| ... = r1'' |
+--------------+
The point is that we really don't need to find the dominator of the spill block and the reachable uses to be correct.
Reply: I agree that we have to tune the algorithm, so we have to explore more options. Spill and restore instructions are expensive, and we should find ways to reduce their overhead. Can we fix the problem by adding a spill and restore inside the high register pressure block?
Reply: As the paper says, if we spill registers with a long next-use distance, then other high register pressure blocks might benefit from it.
    // This is due to the fact that some unreachable uses might become reachable if
    // we spill in common dominator.
    std::pair<SetVectorType, SetVectorType>
    AMDGPUEarlyRegisterSpilling::collectReachableAndUnreachableUses(

Comment: I think this is a misuse of "reachable". A more accurate description would be ReachableUsesExcludingDominated or NonDominatedReachableUses. The dominated uses are reachable too.

Reply: I agree. I had something similar in the past, but it got lost in a refactoring. I replaced ReachableUses with NonDominatedReachableUses.
…ops at each loop level.

Previously, the loop weight was the loop depth multiplied by 1000 (which represents the loop iterations). Now, the loop weight is 1000 to the power of the loop depth.
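To see why this matters, here is a small standalone sketch (illustrative numbers only: 10-instruction blocks at depths 1 and 2, LoopWeight = 1000 as in the patch) contrasting the old linear weighting with the new exponential one:

#include <cmath>
#include <cstdio>

int main() {
  const double LoopWeight = 1000.0; // approximate trip count per loop level
  const double BlockSize = 10.0;    // instructions per block
  // Old weighting: size * LoopWeight * depth.
  double Linear = BlockSize * LoopWeight * 1    // depth-1 block
                + BlockSize * LoopWeight * 2;   // depth-2 block
  // New weighting: size * LoopWeight^depth.
  double Exponential = BlockSize * std::pow(LoopWeight, 1) +
                       BlockSize * std::pow(LoopWeight, 2);
  // Prints: linear=30000 exponential=10010000
  std::printf("linear=%.0f exponential=%.0f\n", Linear, Exponential);
  return 0;
}

With the exponential weight, an inner-loop block counts as three orders of magnitude "farther" than an outer-loop block, matching the n*n execution-count argument above.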
You can test this locally with the following command:

git-clang-format --diff origin/main HEAD --extensions cpp,h -- llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.cpp llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.h llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.h llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --diff_from_common_commit

View the diff from clang-format here.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp
index a413b097b..b9428be3a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp
@@ -468,7 +468,6 @@ void AMDGPUNextUseAnalysis::printAllDistances(MachineFunction &MF) {
}
}
-
// TODO: Remove it. It is only used for testing.
std::optional<double>
AMDGPUNextUseAnalysis::getNextUseDistance(Register DefReg) {
ERS (Early Register Spilling) tries to spill close to high register pressure. In this way, the spill overhead will impact only the high register pressure blocks. It uses the Register Pressure Tracker to find the parts of the code that have high register pressure. It uses next-use distance analysis to find which register to spill. ERS chooses to spill the live registers with the longest next-use distances (uses that are far away).
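In outline, the candidate selection (getRegistersToSpill in the patch) boils down to the following sketch (simplified: the legality checks, the collection of the reachable uses into 'Uses', and the loop-specific handling are elided):

// At a high-register-pressure instruction CurMI, rank the live registers
// by next-use distance and spill the farthest ones first.
SmallVector<std::pair<Register, uint64_t>> RegCandidates;
for (auto [CandidateReg, Mask] : RPTracker.getLiveRegs()) {
  if (!isLegalToSpill(CandidateReg))
    continue;
  // getNextUseDistance returns an optional distance for the candidate.
  if (auto NextUseDist = NUA.getNextUseDistance(CandidateReg, CurMI, Uses))
    RegCandidates.push_back({CandidateReg, *NextUseDist});
}
// Longest next-use distance first.
llvm::sort(RegCandidates,
           [](auto &P1, auto &P2) { return P1.second > P2.second; });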
We emit the spill instruction as follows:
- If all the remaining uses of the chosen register are dominated by the high register pressure point, the spill is emitted right there.
- Otherwise, the spill is emitted in a common dominator of the high register pressure point and the reachable uses.

We optimize the number of the restore instructions as follows:
- A restore is normally emitted right before a use; for a use inside a loop it is emitted in the outermost loop's preheader, and for a PHI use it is emitted at the bottom of the incoming block.
- Uses are grouped so that the head of each group dominates the rest of the group; one restore is emitted per group (in the head's block or in an eligible common dominator), and the remaining uses reuse the restored value.
ERS is based on ideas from the paper by Matthias Braun and Sebastian Hack: Register Spilling and Live-Range Splitting for SSA-Form Programs (https://pp.ipd.kit.edu/publication.php?id=braun09cc).