-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU] Add register usage diagnostics at the point of maximum register pressure. #161850
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Valery Pykhtin (vpykhtin) ChangesBasically this allows to analyze "why so many VGPRs used?". It prints all live registers at the point of maximum register pressure and for each register its defs/uses are dumped. Currently can be run before and after the scheduler but would be nice if it can be ran inbetween any passes (not sure this is possible with legacy pass-manager). Requires debug or built with asserts compiler. Highly recommended to run with debug info to have debug locations for instructions. Example output:
Full diff: https://github.com/llvm/llvm-project/pull/161850.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index ef63acc6355d2..114ae3d12ae1c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -14,6 +14,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind) {
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = Register::index2VirtReg(I);
+ if (RegKind != GCNRegPressure::TOTAL_KINDS &&
+ GCNRegPressure::getRegKind(Reg, MRI) != RegKind)
+ continue;
if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
#undef PFX
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI) {
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ auto &OS = dbgs();
+ const char *RegName = GCNRegPressure::getName(Kind);
+
+ unsigned MaxNumRegs = 0;
+ MachineInstr *MaxPressureMI = nullptr;
+ GCNUpwardRPTracker RPT(LIS);
+ for (auto &MBB : MF) {
+ RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot());
+ for (auto &MI : reverse(MBB)) {
+ RPT.recede(MI);
+ unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind);
+ if (NumRegs > MaxNumRegs) {
+ MaxNumRegs = NumRegs;
+ MaxPressureMI = &MI;
+ }
+ }
+ }
+
+ SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI);
+
+ // Max pressure can occur at either the early-clobber or register slot.
+ // Choose the maximum liveset between both slots. This is ugly but this is
+ // diagnostic code.
+ SlotIndex ECSlot = MISlot.getRegSlot(true);
+ SlotIndex RSlot = MISlot.getRegSlot(false);
+ GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind);
+ GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind);
+ unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind);
+ unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind);
+ GCNRPTracker::LiveRegSet *LiveSet =
+ ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet;
+ SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot;
+ assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs);
+
+ // Split live registers into single-def and multi-def sets.
+ GCNRegPressure SDefPressure, MDefPressure;
+ SmallVector<Register, 16> SDefRegs, MDefRegs;
+ for (auto [Reg, LaneMask] : *LiveSet) {
+ assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind);
+ LiveInterval &LI = LIS.getInterval(Reg);
+ if (LI.getNumValNums() == 1 ||
+ (LI.hasSubRanges() &&
+ llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) {
+ return SR.getNumValNums() == 1;
+ }))) {
+ SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+ SDefRegs.push_back(Reg);
+ } else {
+ MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+ MDefRegs.push_back(Reg);
+ }
+ }
+ unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind);
+ unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind);
+ assert(SDefNumRegs + MDefNumRegs == MaxNumRegs);
+
+ auto printLoc = [&](MachineBasicBlock *MBB, SlotIndex SI) {
+ return Printable([&, MBB, SI](raw_ostream &OS) {
+ OS << SI << "@BB." << MBB->getNumber();
+ if (MLI)
+ if (const MachineLoop *ML = MLI->getLoopFor(MBB))
+ OS << " (LoopHdr BB." << ML->getHeader()->getNumber() << ", Depth "
+ << ML->getLoopDepth() << ")";
+ });
+ };
+
+ auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) {
+ GCNRegPressure RegPressure;
+ RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI);
+ OS << " " << printReg(Reg, TRI) << ':'
+ << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask "
+ << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' '
+ << RegName << "s)\n";
+
+ // Use std::map to sort def/uses by SlotIndex.
+ std::map<SlotIndex, MachineInstr *> Instrs;
+ for (auto &MI : MRI.reg_nodbg_instructions(Reg)) {
+ Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI;
+ }
+
+ for (const auto &[SI, MI] : Instrs) {
+ OS << " ";
+ if (MI->definesRegister(Reg, TRI))
+ OS << "def ";
+ if (MI->readsRegister(Reg, TRI))
+ OS << "use ";
+ OS << printLoc(MI->getParent(), SI) << ": " << *MI;
+ }
+ };
+
+ OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName()
+ << " ***\n";
+ OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at "
+ << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": "
+ << *MaxPressureMI;
+
+ OS << "\nLive registers with single definition (" << SDefNumRegs << ' '
+ << RegName << "s):\n";
+
+ // Sort OneDefRegs by number of uses (smallest first)
+ llvm::sort(SDefRegs, [&](Register A, Register B) {
+ return std::distance(MRI.use_begin(A), MRI.use_end()) <
+ std::distance(MRI.use_begin(B), MRI.use_end());
+ });
+
+ for (const auto Reg : SDefRegs) {
+ PrintRegInfo(Reg, LiveSet->lookup(Reg));
+ }
+
+ OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' '
+ << RegName << "s):\n";
+ for (const auto Reg : MDefRegs) {
+ PrintRegInfo(Reg, LiveSet->lookup(Reg));
+ }
+}
+#endif
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index a9c58bb90ef03..0cf5ed0513fae 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -41,6 +41,11 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ unsigned getNumRegs(RegKind Kind) const {
+ assert(Kind < TOTAL_KINDS);
+ return Value[Kind];
+ }
+
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
/// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
@@ -138,6 +143,18 @@ struct GCNRegPressure {
void dump() const;
+ static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI);
+ return (RegKind)getRegKind(MRI.getRegClass(Reg), STI);
+ }
+
+ static const char *getName(RegKind Kind) {
+ const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"};
+ assert(Kind < TOTAL_KINDS);
+ return Names[Kind];
+ }
+
private:
static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2;
@@ -294,8 +311,10 @@ class GCNRPTracker {
}
};
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
+GCNRPTracker::LiveRegSet
+getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS);
////////////////////////////////////////////////////////////////////////////////
// GCNUpwardRPTracker
@@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
const MachineRegisterInfo &MRI,
LaneBitmask LaneMaskFilter = LaneBitmask::getAll());
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
-
/// creates a map MachineInstr -> LiveRegSet
/// R - range of iterators on instructions
/// After - upon entry or exit of every instruction
@@ -524,6 +540,11 @@ struct GCNRegPressurePrinter : public MachineFunctionPass {
}
};
+LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bdc08101c7119..c29d65e0671dc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+#define DUMP_MAX_REG_PRESSURE
+static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
+ "amdgpu-print-maxrp-regusage-before-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure before scheduling."),
+ cl::init(false));
+
+static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
+ "amdgpu-print-maxrp-regusage-after-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure after scheduling."),
+ cl::init(false));
+#endif
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -960,6 +975,16 @@ void GCNScheduleDAGMILive::runSchedStages() {
RegionLiveOuts.buildLiveRegMap();
}
+#ifdef DUMP_MAX_REG_PRESSURE
+ auto dumpRegUsage = [this]() {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ };
+ if (PrintMaxRPRegUsageBeforeScheduler)
+ dumpRegUsage();
+#endif
+
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
@@ -995,6 +1020,11 @@ void GCNScheduleDAGMILive::runSchedStages() {
Stage->finalizeGCNSchedStage();
}
+
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageAfterScheduler)
+ dumpRegUsage();
+#endif
}
#ifndef NDEBUG
|
3ae2438
to
28c8172
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A couple of non-show stopper minor comments, but LGTM.
unsigned MaxNumRegs = 0; | ||
MachineInstr *MaxPressureMI = nullptr; | ||
GCNUpwardRPTracker RPT(LIS); | ||
for (auto &MBB : MF) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just a suggestion for potentially increased utility. Most of the time the highest register pressure in the program unit is what you are interested in, but not always. It could be useful to have this dump be more targeted. That is, provide an option to make it specific to a single basic block or even a scheduling region. This isn't something that needs to be done now, of course.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a very nice feature. I think we can specify slot index to print the info. Also it could print all slot indexes where register pressure reaches maximum.
dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); | ||
dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); | ||
LIS->dump(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lambda not doing anything, just inline it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But it called twice, before and after scheduling.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's still trivial enough to just duplicate
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe fix title, this is debug printing not actual diagnostics
dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); | ||
dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); | ||
LIS->dump(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's still trivial enough to just duplicate
✅ With the latest revision this PR passed the C/C++ code formatter. |
…ter pressure. (llvm#161850) Basically this allows to analyze "why so many VGPRs used?". It prints all live registers at the point of maximum register pressure and for each register its defs/uses are dumped. Currently can be run before and after the scheduler but would be nice if it can be ran inbetween any passes (not sure this is possible with legacy pass-manager). Requires debug or built with asserts compiler. Highly recommended to run with debug info to have debug locations for instructions. Example output: ``` *** Register pressure info (VGPRs) for _ZN7ck_tile6ken.... *** Max pressure is 256 VGPRs at 41780e@BB.18 (LoopHdr BB.16, Depth 1): %9858:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10137:vreg_128_align2, %10141:vreg_128_align2, %9858:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec Live registers with single definition (123 VGPRs): %10126:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs) def 41600r@BB.18 (LoopHdr BB.16, Depth 1): undef %10126.sub0_sub1:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 15232, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1314, !noalias !60, addrspace 3) def 41608r@BB.18 (LoopHdr BB.16, Depth 1): %10126.sub2_sub3:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 16320, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1315, !noalias !60, addrspace 3) use 41848r@BB.18 (LoopHdr BB.16, Depth 1): %9856:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10126:vreg_128_align2, %10138:vreg_128_align2, %9856:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec %10136:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs) def 41264r@BB.18 (LoopHdr BB.16, Depth 1): undef %10136.sub0_sub1:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 2176, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1294, !noalias !60, addrspace 3) def 41272r@BB.18 (LoopHdr BB.16, Depth 1): %10136.sub2_sub3:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 3264, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1295, !noalias !60, addrspace 3) use 41788r@BB.18 (LoopHdr BB.16, Depth 1): %9858:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10136:vreg_128_align2, %10140:vreg_128_align2, %9858:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec %10129:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs) ... Live registers with multiple definitions (133 VGPRs): %9856:VReg_512_Align2, LiveMask 00000000FFFFFFFF (16 VGPRs) def 16544r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def undef %9856.sub0_sub1:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16592r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub2_sub3:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16608r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub4_sub5:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16656r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub6_sub7:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16672r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub8_sub9:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16720r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub10_sub11:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16736r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub12_sub13:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16784r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub14_sub15:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def use 41828r@BB.18 (LoopHdr BB.16, Depth 1): %9856:vreg_512_align2 = contract V_MFMA_F32_... ... ********** INTERVALS ********** ... ********** MACHINEINSTRS ********** # Machine code for function _ZN7ck_tile6kentr... ```
Basically this allows to analyze "why so many VGPRs used?".
It prints all live registers at the point of maximum register pressure and for each register its defs/uses are dumped.
Currently can be run before and after the scheduler but would be nice if it can be ran inbetween any passes (not sure this is possible with legacy pass-manager). Requires debug or built with asserts compiler.
Highly recommended to run with debug info to have debug locations for instructions.
Example output: