-
Notifications
You must be signed in to change notification settings - Fork 15k
[AMDGPU] GCNRegPressure printing pass for testing. #70031
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
* Added printing of live-in, live-out and live-through sets. * Empty BB aren't skipped now.
@llvm/pr-subscribers-backend-amdgpu Author: Valery Pykhtin (vpykhtin) ChangesTake a look at the tests to see the output. This probably should be also enabled for llc tests, I haven't tried it yet. Patch is 48.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70031.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 97a413296c55e55..2c29710f8c8cb46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -191,6 +191,9 @@ extern char &AMDGPUImageIntrinsicOptimizerID;
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;
+void initializeGCNRegPressurePrinterPass(PassRegistry &);
+extern char &GCNRegPressurePrinterID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dc7321cd5de9fcd..375df27206f7b41 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -428,6 +428,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGCNPreRAOptimizationsPass(*PR);
initializeGCNPreRALongBranchRegPass(*PR);
initializeGCNRewritePartialRegUsesPass(*PR);
+ initializeGCNRegPressurePrinterPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 1ca0f3b6e06b823..1db4e9dd151d847 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
+#include "AMDGPU.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -487,3 +488,77 @@ LLVM_DUMP_METHOD
void GCNRegPressure::dump() const { dbgs() << print(*this); }
#endif
+
+char llvm::GCNRegPressurePrinter::ID = 0;
+char &llvm::GCNRegPressurePrinterID = GCNRegPressurePrinter::ID;
+
+INITIALIZE_PASS(GCNRegPressurePrinter, "amdgpu-print-rp", "", true, true)
+
+bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ GCNUpwardRPTracker RPT(LIS);
+
+ auto &OS = dbgs();
+
+// Leading spaces are important for YAML syntax.
+#define PFX " "
+
+ OS << "---\nname: " << MF.getName() << "\nbody: |\n";
+
+ auto printRP = [](const GCNRegPressure &RP) {
+ return Printable([&RP](raw_ostream &OS) {
+ OS << format(PFX " %-5d", RP.getSGPRNum())
+ << format(" %-5d", RP.getVGPRNum(false));
+ });
+ };
+
+ // Register pressure before and at an instruction (in program order).
+ SmallVector<std::pair<GCNRegPressure, GCNRegPressure>, 16> RP;
+
+ for (auto &MBB : MF) {
+ OS << PFX;
+ MBB.printName(OS);
+ OS << ":\n";
+
+ if (MBB.empty()) {
+ SlotIndex MBBSI = LIS.getSlotIndexes()->getMBBStartIdx(&MBB);
+ GCNRPTracker::LiveRegSet LRThrough = getLiveRegs(MBBSI, LIS, MRI);
+ GCNRegPressure RP = getRegPressure(MRI, LRThrough);
+ OS << PFX " Live-through:" << llvm::print(LRThrough, MRI);
+ OS << PFX " SGPR VGPR\n" << printRP(RP) << '\n';
+ continue;
+ }
+
+ RPT.reset(MBB.instr_back());
+ RPT.moveMaxPressure(); // Clear max pressure.
+
+ GCNRPTracker::LiveRegSet LRAtMBBEnd = RPT.getLiveRegs();
+ GCNRegPressure RPAtMBBEnd = RPT.getPressure();
+
+ RP.clear();
+ RP.reserve(MBB.size());
+ for (auto &MI : reverse(MBB)) {
+ RPT.recede(MI);
+ RP.emplace_back(RPT.getPressure(), RPT.moveMaxPressure());
+ }
+
+ OS << PFX " Live-in:" << llvm::print(RPT.getLiveRegs(), MRI);
+ OS << PFX " SGPR VGPR\n";
+ auto I = RP.rbegin();
+ for (auto &MI : MBB) {
+ auto &[RPBeforeInstr, RPAtInstr] = *I++;
+ OS << printRP(RPBeforeInstr) << '\n' << printRP(RPAtInstr) << " ";
+ MI.print(OS);
+ }
+ OS << printRP(RPAtMBBEnd) << '\n';
+ OS << PFX " Live-out:" << llvm::print(LRAtMBBEnd, MRI);
+ }
+ OS << "...\n";
+ return false;
+
+#undef PFX
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 72e18acc1b8e494..f2256f68c2c7037 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -128,6 +128,8 @@ class GCNRPTracker {
void clearMaxPressure() { MaxPressure.clear(); }
+ GCNRegPressure getPressure() const { return CurPressure; }
+
// returns MaxPressure, resetting it
decltype(MaxPressure) moveMaxPressure() {
auto Res = MaxPressure;
@@ -277,6 +279,21 @@ Printable reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
const GCNRPTracker::LiveRegSet &TrackedL,
const TargetRegisterInfo *TRI);
+struct GCNRegPressurePrinter : public MachineFunctionPass {
+ static char ID;
+
+public:
+ GCNRegPressurePrinter() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
index e4f56cc328e4782..138c8e785dec280 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=null --run-pass=amdgpu-print-rp %s 2>&1 >/dev/null | FileCheck %s --check-prefix=RP
# The sequence of DBG_VALUEs forms a scheduling region with 0 real
# instructions. The RegPressure tracker would end up skipping over any
@@ -27,33 +28,33 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
- ; CHECK-NEXT: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
- ; CHECK-NEXT: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[V_ADD_F32_e32_:%[0-9]+]].sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead undef [[V_ADD_F32_e32_:%[0-9]+]].sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec
- ; CHECK-NEXT: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
- ; CHECK-NEXT: undef %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
- ; CHECK-NEXT: %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, implicit $exec
- ; CHECK-NEXT: undef %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
+ ; CHECK-NEXT: undef [[V_ADD_F32_e32_1:%[0-9]+]].sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]].sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[V_ADD_F32_e32_1]], [[V_MOV_B32_e32_]], 32, 0, implicit $exec
+ ; CHECK-NEXT: undef [[GLOBAL_LOAD_DWORD2:%[0-9]+]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec
- ; CHECK-NEXT: %11.sub1:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD2:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, implicit $exec
- ; CHECK-NEXT: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, implicit $exec
- ; CHECK-NEXT: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, implicit $exec
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[GLOBAL_LOAD_DWORD2]], 0, 0, implicit $exec
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, implicit $exec
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, implicit $exec
; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF2]], implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF2]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_2]], 0, 0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
@@ -65,9 +66,114 @@ body: |
; CHECK-NEXT: S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: S_NOP 0, implicit [[DEF8]]
; CHECK-NEXT: S_ENDPGM 0
+ ;
+ ; RP-LABEL: name: only_dbg_value_sched_region
+ ; RP: bb.0:
+ ; RP-NEXT: Live-in:
+ ; RP-NEXT: SGPR VGPR
+ ; RP-NEXT: 0 0
+ ; RP-NEXT: 0 1 %0:vgpr_32 = COPY $vgpr0
+ ; RP-NEXT: 0 1
+ ; RP-NEXT: 0 3 %1:vreg_64 = IMPLICIT_DEF
+ ; RP-NEXT: 0 3
+ ; RP-NEXT: 0 5 %2:vreg_64 = GLOBAL_LOAD_DWORDX2 %1:vreg_64, 0, 0, implicit $exec
+ ; RP-NEXT: 0 5
+ ; RP-NEXT: 0 6 %3:vgpr_32 = GLOBAL_LOAD_DWORD %1:vreg_64, 8, 0, implicit $exec
+ ; RP-NEXT: 0 6
+ ; RP-NEXT: 0 7 undef %4.sub1:vreg_64 = V_ADD_U32_e32 %0:vgpr_32, %0:vgpr_32, implicit $exec
+ ; RP-NEXT: 0 7
+ ; RP-NEXT: 0 8 %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
+ ; RP-NEXT: 0 8
+ ; RP-NEXT: 0 10 %5:vreg_64 = COPY %2:vreg_64
+ ; RP-NEXT: 0 9
+ ; RP-NEXT: 0 9 undef %6.sub0:vreg_64 = V_ADD_F32_e32 %1.sub0:vreg_64, %5.sub0:vreg_64, implicit $mode, implicit $exec
+ ; RP-NEXT: 0 8
+ ; RP-NEXT: 0 8 dead %6.sub1:vreg_64 = V_ADD_F32_e32 %1.sub1:vreg_64, %5.sub0:vreg_64, implicit $mode, implicit $exec
+ ; RP-NEXT: 0 7
+ ; RP-NEXT: 0 8 %7:vgpr_32 = GLOBAL_LOAD_DWORD %5:vreg_64, 0, 0, implicit $exec
+ ; RP-NEXT: 0 6
+ ; RP-NEXT: 0 7 %8:vreg_64 = IMPLICIT_DEF
+ ; RP-NEXT: 0 7
+ ; RP-NEXT: 0 9 %9:vreg_64 = IMPLICIT_DEF
+ ; RP-NEXT: 0 9
+ ; RP-NEXT: 0 11 %10:vreg_64 = IMPLICIT_DEF
+ ; RP-NEXT: 0 11
+ ; RP-NEXT: 0 12 undef %11.sub1:vreg_64 = IMPLICIT_DEF
+ ; RP-NEXT: 0 12
+ ; RP-NEXT: 0 13 %12:vgpr_32 = IMPLICIT_DEF
+ ; RP-NEXT: 0 13
+ ; RP-NEXT: 0 14 %13:vgpr_32 = IMPLICIT_DEF
+ ; RP-NEXT: 0 14
+ ; RP-NEXT: 0 16 %14:vreg_64 = IMPLICIT_DEF
+ ; RP-NEXT: 0 16
+ ; RP-NEXT: 0 18 %15:vreg_64 = IMPLICIT_DEF
+ ; RP-NEXT: 0 18
+ ; RP-NEXT: 0 19 %16:vgpr_32 = IMPLICIT_DEF
+ ; RP-NEXT: 0 19
+ ; RP-NEXT: 0 20 %17:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; RP-NEXT: 0 20
+ ; RP-NEXT: 0 21 %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; RP-NEXT: 0 21
+ ; RP-NEXT: 0 22 undef %19.sub0:vreg_64 = V_ADD_F32_e32 %7:vgpr_32, %2.sub0:vreg_64, implicit $mode, implicit $exec
+ ; RP-NEXT: 0 20
+ ; RP-NEXT: 0 21 %19.sub1:vreg_64 = V_ADD_F32_e32 %3:vgpr_32, %3:vgpr_32, implicit $mode, implicit $exec
+ ; RP-NEXT: 0 20
+ ; RP-NEXT: 0 20 GLOBAL_STORE_DWORDX2 %19:vreg_64, %4:vreg_64, 32, 0, implicit $exec
+ ; RP-NEXT: 0 16
+ ; RP-NEXT: 0 17 %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD %9:vreg_64, 0, 0, implicit $exec
+ ; RP-NEXT: 0 15
+ ; RP-NEXT: 0 16 %8.sub0:vreg_64 = GLOBAL_LOAD_DWORD %10:vreg_64, 0, 0, implicit $exec
+ ; RP-NEXT: 0 14
+ ; RP-NEXT: 0 14 dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11:vreg_64, 0, 0, implicit $exec
+ ; RP-NEXT: 0 12
+ ; RP-NEXT: 0 12 dead %21:vgpr_32 = GLOBAL_LOAD_DWORD %14:vreg_64, 0, 0, implicit $exec
+ ; RP-NEXT: 0 10
+ ; RP-NEXT: 0 10 dead %22:vgpr_32 = GLOBAL_LOAD_DWORD %15:vreg_64, 0, 0, implicit $exec
+ ; RP-NEXT: 0 10
+ ; RP-NEXT: 0 11 %23:vreg_64 = V_LSHLREV_B64_e64 2, %8:vreg_64, implicit $exec
+ ; RP-NEXT: 0 9
+ ; RP-NEXT: 0 9 S_NOP 0, implicit %13:vgpr_32, implicit %23.sub0:vreg_64, implicit %12:vgpr_32, implicit %17:vgpr_32
+ ; RP-NEXT: 0 5
+ ; RP-NEXT: 0 5 GLOBAL_STORE_DWORD %15:vreg_64, %18:vgpr_32, 0, 0, implicit $exec
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: Live-out: %0:0000000000000003 %16:0000000000000003
+ ; RP-NEXT: bb.1:
+ ; RP-NEXT: Live-in: %0:0000000000000003 %16:0000000000000003
+ ; RP-NEXT: SGPR VGPR
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: 0 2 S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: 0 0 DBG_VALUE
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: 0 0 DBG_VALUE
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: 0 0 DBG_VALUE
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: 0 2 S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: Live-out: %0:0000000000000003 %16:0000000000000003
+ ; RP-NEXT: bb.2:
+ ; RP-NEXT: Live-through: %0:0000000000000003 %16:0000000000000003
+ ; RP-NEXT: SGPR VGPR
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: bb.3:
+ ; RP-NEXT: Live-in: %0:0000000000000003 %16:0000000000000003
+ ; RP-NEXT: SGPR VGPR
+ ; RP-NEXT: 0 2
+ ; RP-NEXT: 0 2 S_NOP 0, implicit %0:vgpr_32
+ ; RP-NEXT: 0 1
+ ; RP-NEXT: 0 1 S_NOP 0, implicit %16:vgpr_32
+ ; RP-NEXT: 0 0
+ ; RP-NEXT: 0 0 S_ENDPGM 0
+ ; RP-NEXT: 0 0
+ ; RP-NEXT: Live-out:
bb.0:
liveins: $vgpr0
@@ -111,6 +217,8 @@ body: |
DBG_VALUE
S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
+ bb.3:
+
bb.2:
S_NOP 0, implicit %0
S_NOP 0, implicit %16
diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
index f8c7be8e414ca15..3f33e795e1e6ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@@ -1,4 +1,6 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=machine-scheduler -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa --filetype=null --run-pass=amdgpu-print-rp %s 2>&1 >/dev/null | FileCheck %s --check-prefix=RP
--- |
%struct.widget.0 = type { float, i32, i32 }
@@ -171,8 +173,6 @@
...
---
-# CHECK: name: sched_dbg_value_crash
-# CHECK: DBG_VALUE %99, $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
name: sched_dbg_value_crash
alignment: 1
@@ -197,7 +197,420 @@ constants:
body: |
bb.0.bb:
liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr32, $sgpr101
-
+ ; CHECK-LABEL: name: sched_dbg_value_crash
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr32, $sgpr101
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 16, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 24, 0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[DEF]], [[COPY2]], implicit-def dead $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, dead [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_64 = V_MAD_I64_I32_e64 [[V_ADD_CO_U32_e32_]], 12, [[S_LOAD_DWORDX2_IMM]], 0, implicit $exec
+ ; CHECK-NEXT: dead [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 32, 0
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM3:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY1]], 4, 0
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[V_MAD_I64_I32_e64_]], 4, 0, implicit $exec
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM4:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`, addrspace 4)
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_MAD_I64_I32_e64_2:%[0-9]+]]:vreg_64, dead [[V_MAD_I64_I32_e64_3:%[0-9]+]]:sreg_64 = V_MAD_I64_I32_e64 [[GLOBAL_LOAD_DWORD]], [[DEF1]], 0, 0, implicit $exec
+ ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sreg_64_xexec = S_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM4]], 0, 0
+ ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM1:%[0-9]+]].sub0:sreg_64_xexec = S_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM4]], 4, 0
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[V_MAD_I64_I32_e64_2]], 32, 0, implicit $exec
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM5:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sreg_64_xexec = S_MOV_B32 0
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32_xm0 = S_LSHR_B32 [[S_LOAD_DWORDX2_IMM3]].sub0, 16, implicit-def d...
[truncated]
|
|
||
const MachineRegisterInfo &MRI = MF.getRegInfo(); | ||
const LiveIntervals &LIS = getAnalysis<LiveIntervals>(); | ||
GCNUpwardRPTracker RPT(LIS); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought we are using GCNDownwardRPTracker in the scheduling.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What kind of difference are we expecting between the two?
Would it makes sense if it was configurable?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I had to provide more context about this.
Piotr has found the problem with upward tracker D33289 and Jay suggested to make a debug pass to test our RP values before fixing the tracker.
I think we should test both trackers and in theory they should give the same results so we can do an upward and downward runs and check their output with a single Filecheck prefix. If they don't match test update tool will warn about this and we can add a second and third prefixes to see the difference.
I've faced a situation when the trackers disagree, let me add the downward tracker option to this pass and I'll try to illustrate how it works.
Very briefly:
undef %0.sub1:sgpr_64 = ...
use %0
LiveIntervals creates only a subrange for sub1 subreg but not a subrange for undefined sub0, but sub0 is used when %0 is accessed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
They disagree from the very first attempt and it is very inconvenient to have two Filecheck prefixes for upward and downward trackers because it's not easy to compare. We can track RP both ways and print the difference at each position if it exists.
MBB.printName(OS); | ||
OS << ":\n"; | ||
|
||
if (MBB.empty()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this could be tweaked to avoid duplication of the printing logic.
It would also be helpful to have live-through output for all blocks, even if this can be inferred from live-in versus live-out, it is helpful to human readers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, good idea to add live-through for every block.
The logic cannot be fully deduplicate though, because trackers require an instruction to start with and we don't have it for empty blocks, so I used SlotIndex instead.
|
||
const MachineRegisterInfo &MRI = MF.getRegInfo(); | ||
const LiveIntervals &LIS = getAnalysis<LiveIntervals>(); | ||
GCNUpwardRPTracker RPT(LIS); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What kind of difference are we expecting between the two?
Would it makes sense if it was configurable?
I've added downward tracker as a separate pass to show how it looks. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
Improved version:
I decided to leave separate passes for upward and downward trackers. This is close to the final version but I need to try running this pass with llc lit tests. |
…nes. * Fixed formatting.
I believe my printing pass is inapplicable for .ll tests because I cannot run separate passes there? I think I'm done with this patch. P.S. If I understood correctly -print<...> syntax is only available with the new pass manager which isn't available for our target yet? |
Yeah, it does not support MachineIR passes (for any target) yet. |
GCNRPTracker::LiveRegSet LRThr = getIntersection(LRAtMBBBegin, LRAtMBBEnd); | ||
OS << PFX " Live-thr:" << llvm::print(LRThr, MRI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this what people normally mean by "live through"? I wondered if it should only include things that have no defs or uses in the block.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought "live through" means that a register isn't redefined in the block. However I forgot that we can have redefinitions after register coalescer and using only intersection isn't enough to tell if a register keeps its value through the block. I can use the result of intersection to check if a register is redefined (or used?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I mean using registers from the intersection I need to perform additional pass on def/used for a given block.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMHO it shall mean 'no defs or kills in the block'. Uses itself are included.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be enough to test whether an MBB is in one liverange segment of reg or subreg.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed live-through register set calculation, assuming live-through register is in live-in and live-out sets, has no redefinitions but may have uses in the block.
# This testcase shows the problem with LiveIntervals: it doesn't create | ||
# subranges for undefined but used subregisters. Upward tracker is able to see | ||
# the use of undefined subregister and tracks it correctly. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would say that undef-read values should not contribute to register pressure, because the register allocator does not have to assign them a distinct register. So LIS is "right" and RPU is "wrong".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But it cannot allocate half of register if it's required by an instruction?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Between the first V_MOV and the last S_NOP, only half of %0:vreg_64 is live so the allocator only needs to dedicate one VGPR to it. All other VGPRs can be reused for other things, so they should not contribute to the register pressure.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, given that both trackers are already optimistic in that they believe the allocator can reuse for example, unused subregs of %0.sub0:sgpr_128 use, so this is the same situation here.
; RPU-NEXT: 0 7 | ||
; RPU-NEXT: 0 8 %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec | ||
; RPU-NEXT: 0 8 | ||
; RPU-NEXT: 0 10 %5:vreg_64 = COPY %2:vreg_64 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
RPU and RPD are both wrong here. Pressure "at" this instruction should be 9.
; RPD-NEXT: 0 8 | ||
; RPD-NEXT: 0 10 %5:vreg_64 = COPY %2:vreg_64 | ||
; RPD-NEXT: 0 9 | ||
; RPD-NEXT: 0 10 undef %6.sub0:vreg_64 = V_ADD_F32_e32 %1.sub0:vreg_64, %5.sub0:vreg_64, implicit $mode, implicit $exec |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
RPD seems particularly wrong here. Pressure does not go as high as 10 here.
I'm really sorry for spam here, wrong move within VSCode. |
Should we move on and submit this patch? @jayfoad do you have concerns about live-through register set computation or others? I believe fixing trackers should go to another PR. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we move on and submit this patch?
Yes!
@jayfoad do you have concerns about live-through register set computation or others?
I personally have no interest in the live-through part. You could remove it from this patch, but I don't mind if others want to keep it.
I believe fixing trackers should go to another PR.
Agreed.
I've removed live-through registers printing from this PR and will submit it separately. |
Take a look at the tests to see the output.
This probably should be also enabled for llc tests, I haven't tried it yet.