Skip to content

Commit 8823efe

Browse files
authored
[AMDGPU] Add debug printing of register usage at the point of maximum register pressure. (#161850)
This makes it possible to analyze "why are so many VGPRs used?". It prints all live registers at the point of maximum register pressure and, for each register, dumps its defs and uses. Currently it can be run before and after the scheduler; it would be nice if it could be run between any passes (not sure this is possible with the legacy pass manager). Requires a debug compiler or one built with asserts. It is highly recommended to run with debug info so that instructions have debug locations. Example output: ``` *** Register pressure info (VGPRs) for _ZN7ck_tile6ken.... *** Max pressure is 256 VGPRs at 41780e@BB.18 (LoopHdr BB.16, Depth 1): %9858:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10137:vreg_128_align2, %10141:vreg_128_align2, %9858:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec Live registers with single definition (123 VGPRs): %10126:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs) def 41600r@BB.18 (LoopHdr BB.16, Depth 1): undef %10126.sub0_sub1:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 15232, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1314, !noalias !60, addrspace 3) def 41608r@BB.18 (LoopHdr BB.16, Depth 1): %10126.sub2_sub3:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 16320, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1315, !noalias !60, addrspace 3) use 41848r@BB.18 (LoopHdr BB.16, Depth 1): %9856:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10126:vreg_128_align2, %10138:vreg_128_align2, %9856:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec %10136:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs) def 41264r@BB.18 (LoopHdr BB.16, Depth 1): undef %10136.sub0_sub1:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 2176, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1294, !noalias !60, addrspace 3) def 41272r@BB.18 (LoopHdr BB.16, Depth 1): %10136.sub2_sub3:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 3264, 0, implicit $exec :: (load (s64) from 
%ir.sunkaddr1295, !noalias !60, addrspace 3) use 41788r@BB.18 (LoopHdr BB.16, Depth 1): %9858:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10136:vreg_128_align2, %10140:vreg_128_align2, %9858:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec %10129:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs) ... Live registers with multiple definitions (133 VGPRs): %9856:VReg_512_Align2, LiveMask 00000000FFFFFFFF (16 VGPRs) def 16544r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def undef %9856.sub0_sub1:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16592r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub2_sub3:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16608r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub4_sub5:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16656r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub6_sub7:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16672r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub8_sub9:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16720r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub10_sub11:vreg_512_align2, $1:[reguse:VReg_64_Align2], 
%4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16736r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub12_sub13:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def 16784r@BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub14_sub15:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52 def use 41828r@BB.18 (LoopHdr BB.16, Depth 1): %9856:vreg_512_align2 = contract V_MFMA_F32_... ... ********** INTERVALS ********** ... ********** MACHINEINSTRS ********** # Machine code for function _ZN7ck_tile6kentr... ```
1 parent a80a6b3 commit 8823efe

File tree

3 files changed

+188
-6
lines changed

3 files changed

+188
-6
lines changed

llvm/lib/Target/AMDGPU/GCNRegPressure.cpp

Lines changed: 131 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "GCNRegPressure.h"
1515
#include "AMDGPU.h"
1616
#include "SIMachineFunctionInfo.h"
17+
#include "llvm/CodeGen/MachineLoopInfo.h"
1718
#include "llvm/CodeGen/RegisterPressure.h"
1819

1920
using namespace llvm;
@@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
459460

460461
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
461462
const LiveIntervals &LIS,
462-
const MachineRegisterInfo &MRI) {
463+
const MachineRegisterInfo &MRI,
464+
GCNRegPressure::RegKind RegKind) {
463465
GCNRPTracker::LiveRegSet LiveRegs;
464466
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
465467
auto Reg = Register::index2VirtReg(I);
468+
if (RegKind != GCNRegPressure::TOTAL_KINDS &&
469+
GCNRegPressure::getRegKind(Reg, MRI) != RegKind)
470+
continue;
466471
if (!LIS.hasInterval(Reg))
467472
continue;
468473
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
986991

987992
#undef PFX
988993
}
994+
995+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
996+
/// Dump to dbgs() the live registers of kind \p Kind at the point of maximum
/// register pressure in \p MF, together with every non-debug instruction that
/// defines or reads each of them.
///
/// The point of maximum pressure is located by walking every basic block
/// bottom-up with a GCNUpwardRPTracker. Live registers are reported in two
/// groups: those whose live interval has a single value number (or whose
/// subranges all have a single value number), and all the others. When
/// \p MLI is non-null, each printed location is annotated with the header and
/// depth of the enclosing loop, if any.
///
/// If no instruction in \p MF ever raises the pressure for \p Kind above zero
/// (including the case of a function without instructions), only the header
/// and a short notice are printed.
LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF,
                                               GCNRegPressure::RegKind Kind,
                                               LiveIntervals &LIS,
                                               const MachineLoopInfo *MLI) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
  auto &OS = dbgs();
  const char *RegName = GCNRegPressure::getName(Kind);

  // Locate the instruction at which the tracked maximum for Kind first
  // exceeds everything seen so far.
  unsigned MaxNumRegs = 0;
  const MachineInstr *MaxPressureMI = nullptr;
  GCNUpwardRPTracker RPT(LIS);
  for (const MachineBasicBlock &MBB : MF) {
    RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot());
    for (const MachineInstr &MI : reverse(MBB)) {
      RPT.recede(MI);
      unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind);
      if (NumRegs > MaxNumRegs) {
        MaxNumRegs = NumRegs;
        MaxPressureMI = &MI;
      }
    }
  }

  OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName()
     << " ***\n";

  // MaxPressureMI is only set when some instruction raises the pressure above
  // zero. Without this guard an empty function, or one that never has a live
  // register of this kind, would dereference a null pointer below.
  if (!MaxPressureMI) {
    OS << "No live " << RegName << "s found.\n";
    return;
  }

  SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI);

  // Max pressure can occur at either the early-clobber or register slot.
  // Choose the maximum liveset between both slots. This is ugly but this is
  // diagnostic code.
  SlotIndex ECSlot = MISlot.getRegSlot(true);
  SlotIndex RSlot = MISlot.getRegSlot(false);
  GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind);
  GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind);
  unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind);
  unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind);
  GCNRPTracker::LiveRegSet *LiveSet =
      ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet;
  SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot;
  assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs);

  // Split live registers into single-def and multi-def sets.
  GCNRegPressure SDefPressure, MDefPressure;
  SmallVector<Register, 16> SDefRegs, MDefRegs;
  for (auto [Reg, LaneMask] : *LiveSet) {
    assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind);
    LiveInterval &LI = LIS.getInterval(Reg);
    if (LI.getNumValNums() == 1 ||
        (LI.hasSubRanges() &&
         llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) {
           return SR.getNumValNums() == 1;
         }))) {
      SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
      SDefRegs.push_back(Reg);
    } else {
      MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
      MDefRegs.push_back(Reg);
    }
  }
  unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind);
  unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind);
  assert(SDefNumRegs + MDefNumRegs == MaxNumRegs);

  // Prints "<slot>:<block>", followed by the loop header and depth when loop
  // info is available and the block belongs to a loop.
  auto PrintLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) {
    return Printable([&, MBB, SI](raw_ostream &OS) {
      OS << SI << ':' << printMBBReference(*MBB);
      if (MLI)
        if (const MachineLoop *ML = MLI->getLoopFor(MBB))
          OS << " (LoopHdr " << printMBBReference(*ML->getHeader())
             << ", Depth " << ML->getLoopDepth() << ")";
    });
  };

  // Prints one register: its class, live lane mask and register count for
  // Kind, followed by its non-debug defs/uses.
  auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) {
    GCNRegPressure RegPressure;
    RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI);
    OS << " " << printReg(Reg, TRI) << ':'
       << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask "
       << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' '
       << RegName << "s)\n";

    // Use std::map to sort def/uses by SlotIndex.
    std::map<SlotIndex, const MachineInstr *> Instrs;
    for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg))
      Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI;

    for (const auto &[SI, MI] : Instrs) {
      OS << " ";
      if (MI->definesRegister(Reg, TRI))
        OS << "def ";
      if (MI->readsRegister(Reg, TRI))
        OS << "use ";
      OS << PrintLoc(MI->getParent(), SI) << ": " << *MI;
    }
  };

  OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at "
     << PrintLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": "
     << *MaxPressureMI;

  OS << "\nLive registers with single definition (" << SDefNumRegs << ' '
     << RegName << "s):\n";

  // Sort SDefRegs by number of uses (smallest first)
  llvm::sort(SDefRegs, [&](Register A, Register B) {
    return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) <
           std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end());
  });

  for (const Register Reg : SDefRegs)
    PrintRegInfo(Reg, LiveSet->lookup(Reg));

  OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' '
     << RegName << "s):\n";
  for (const Register Reg : MDefRegs)
    PrintRegInfo(Reg, LiveSet->lookup(Reg));
}
1118+
#endif

llvm/lib/Target/AMDGPU/GCNRegPressure.h

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ class SlotIndex;
3131
struct GCNRegPressure {
3232
enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
3333

34+
/// \returns the printable name ("SGPR", "VGPR", "AGPR" or "AVGPR") of
/// register kind \p Kind. \p Kind must be less than TOTAL_KINDS.
static constexpr const char *getName(RegKind Kind) {
  assert(Kind < TOTAL_KINDS);
  // Entries are listed in the same order as the RegKind enumerators.
  constexpr const char *KindNames[] = {"SGPR", "VGPR", "AGPR", "AVGPR"};
  return KindNames[Kind];
}
39+
3440
GCNRegPressure() {
3541
clear();
3642
}
@@ -41,6 +47,11 @@ struct GCNRegPressure {
4147

4248
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
4349

50+
/// \returns the entry of Value indexed by \p Kind, i.e. the same slot that
/// getSGPRNum() reads for SGPR. \p Kind must be less than TOTAL_KINDS
/// (asserted), so TOTAL_KINDS itself is not a valid argument.
unsigned getNumRegs(RegKind Kind) const {
51+
assert(Kind < TOTAL_KINDS);
52+
return Value[Kind];
53+
}
54+
4455
/// \returns the SGPR32 pressure
4556
unsigned getSGPRNum() const { return Value[SGPR]; }
4657
/// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
@@ -138,6 +149,12 @@ struct GCNRegPressure {
138149

139150
void dump() const;
140151

152+
/// \returns the RegKind of register \p Reg. Looks up the register class of
/// \p Reg in \p MRI and forwards it, with the target register info viewed as
/// SIRegisterInfo, to the register-class overload of getRegKind.
static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) {
  const auto *SIRI =
      static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
  return static_cast<RegKind>(getRegKind(MRI.getRegClass(Reg), SIRI));
}
157+
141158
private:
142159
static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2;
143160

@@ -294,8 +311,10 @@ class GCNRPTracker {
294311
}
295312
};
296313

297-
GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
298-
const MachineRegisterInfo &MRI);
314+
/// Builds a LiveRegSet for slot \p SI from the virtual registers known to
/// \p MRI that have a live interval in \p LIS.
/// When \p RegKind names a specific kind, virtual registers of any other kind
/// are skipped; the default, TOTAL_KINDS, applies no filtering.
GCNRPTracker::LiveRegSet
315+
getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
316+
const MachineRegisterInfo &MRI,
317+
GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS);
299318

300319
////////////////////////////////////////////////////////////////////////////////
301320
// GCNUpwardRPTracker
@@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
428447
const MachineRegisterInfo &MRI,
429448
LaneBitmask LaneMaskFilter = LaneBitmask::getAll());
430449

431-
GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
432-
const MachineRegisterInfo &MRI);
433-
434450
/// creates a map MachineInstr -> LiveRegSet
435451
/// R - range of iterators on instructions
436452
/// After - upon entry or exit of every instruction
@@ -524,6 +540,11 @@ struct GCNRegPressurePrinter : public MachineFunctionPass {
524540
}
525541
};
526542

543+
/// Prints to dbgs() the live registers of kind \p Kind at the point of
/// maximum register pressure in \p MF, along with their defs and uses.
/// \p MLI may be null; when provided it is used to annotate locations with
/// loop header and depth.
/// NOTE: the definition is only compiled when NDEBUG is not defined or
/// LLVM_ENABLE_DUMP is defined, so callers must be guarded the same way.
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF,
544+
GCNRegPressure::RegKind Kind,
545+
LiveIntervals &LIS,
546+
const MachineLoopInfo *MLI);
547+
527548
} // end namespace llvm
528549

529550
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers(
6969
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
7070
cl::init(false));
7171

72+
// The max-register-pressure dump options exist only in builds with asserts
// enabled or with LLVM_ENABLE_DUMP defined. DUMP_MAX_REG_PRESSURE is the
// macro the rest of this file tests before referencing these options.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
73+
#define DUMP_MAX_REG_PRESSURE
74+
// Hidden flag, off by default: dump register usage at the point of maximum
// register pressure before scheduling.
static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
75+
"amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
76+
cl::desc("Print a list of live registers along with their def/uses at the "
77+
"point of maximum register pressure before scheduling."),
78+
cl::init(false));
79+
80+
// Hidden flag, off by default: same dump, taken after scheduling.
static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
81+
"amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
82+
cl::desc("Print a list of live registers along with their def/uses at the "
83+
"point of maximum register pressure after scheduling."),
84+
cl::init(false));
85+
#endif
86+
7287
const unsigned ScheduleMetrics::ScaleFactor = 100;
7388

7489
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -960,6 +975,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
960975
RegionLiveOuts.buildLiveRegMap();
961976
}
962977

978+
#ifdef DUMP_MAX_REG_PRESSURE
979+
if (PrintMaxRPRegUsageBeforeScheduler) {
980+
dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
981+
dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
982+
LIS->dump();
983+
}
984+
#endif
985+
963986
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
964987
while (S.advanceStage()) {
965988
auto Stage = createSchedStage(S.getCurrentStage());
@@ -995,6 +1018,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
9951018

9961019
Stage->finalizeGCNSchedStage();
9971020
}
1021+
1022+
#ifdef DUMP_MAX_REG_PRESSURE
1023+
if (PrintMaxRPRegUsageAfterScheduler) {
1024+
dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
1025+
dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
1026+
LIS->dump();
1027+
}
1028+
#endif
9981029
}
9991030

10001031
#ifndef NDEBUG

0 commit comments

Comments
 (0)