-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Support wide register or subregister access when emitting s_singleuse_vdst instructions. #88520
[AMDGPU] Support wide register or subregister access when emitting s_singleuse_vdst instructions. #88520
Conversation
…singleuse_vdst instructions. Both single use producer and consumer instructions using wide/sub registers are now correctly tracked and eligible for being marked as single use.
@llvm/pr-subscribers-backend-amdgpu Author: Scott Egerton (ScottEgerton) ChangesBoth single use producer and consumer instructions using wide/sub Full diff: https://github.com/llvm/llvm-project/pull/88520.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index 93ed77bb6f7efe..dcae8c60429ef9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -64,13 +64,15 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
bool InstructionEmitted = false;
for (MachineBasicBlock &MBB : MF) {
- DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits
+ DenseMap<MCRegUnit, unsigned> RegisterUseCount;
// Handle boundaries at the end of basic block separately to avoid
// false positives. If they are live at the end of a basic block then
// assume it has more uses later on.
- for (const auto &Liveouts : MBB.liveouts())
- RegisterUseCount[Liveouts.PhysReg] = 2;
+ for (const auto &Liveout : MBB.liveouts()) {
+ for (const MCRegUnit &Unit : TRI->regunits(Liveout.PhysReg))
+ RegisterUseCount[Unit] = 2;
+ }
for (MachineInstr &MI : reverse(MBB.instrs())) {
// All registers in all operands need to be single use for an
@@ -84,7 +86,8 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
// Count the number of times each register is read.
if (Operand.readsReg())
- RegisterUseCount[Reg]++;
+ for (const MCRegUnit &Unit : TRI->regunits(Reg))
+ RegisterUseCount[Unit]++;
// Do not attempt to optimise across exec mask changes.
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
@@ -96,10 +99,16 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
// check if the operands are single use.
if (!MI.modifiesRegister(Reg, TRI))
continue;
- if (RegisterUseCount[Reg] > 1)
+
+ const auto RegUnits = TRI->regunits(Reg);
+ if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit &Unit) {
+ return RegisterUseCount[Unit] > 1;
+ }))
AllProducerOperandsAreSingleUse = false;
+
// Reset uses count when a register is no longer live.
- RegisterUseCount.erase(Reg);
+ for (const MCRegUnit &Unit : RegUnits)
+ RegisterUseCount[Unit] = 0;
}
if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
// TODO: Replace with candidate logging for instruction grouping
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index 833699b4656b64..065226e0a42f0d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -398,7 +398,6 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec = COPY $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -424,7 +423,6 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec_lo = COPY $sgpr0
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -450,7 +448,6 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec_hi = COPY $sgpr0
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -521,9 +518,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -582,6 +577,32 @@ body: |
liveins: $vgpr1
...
+# Write low 16-bits and then read 32-bit vgpr twice.
+---
+name: write_lo_read_full_twice
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: write_lo_read_full_twice
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ bb.0:
+ liveins: $vgpr0
+ $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ bb.1:
+ liveins: $vgpr1, $vgpr2
+...
+
# Write high 16-bits and then read 32-bit vgpr.
---
name: write_hi_read_full
@@ -605,3 +626,81 @@ body: |
bb.1:
liveins: $vgpr1
...
+
+# Write high 16-bits and then read 32-bit vgpr twice.
+---
+name: write_hi_read_full_twice
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: write_hi_read_full_twice
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ bb.0:
+ liveins: $vgpr0
+ $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ bb.1:
+ liveins: $vgpr1, $vgpr2
+...
+
+# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr.
+---
+name: write_both_read_full
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: write_both_read_full
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_SINGLEUSE_VDST 1
+ ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: S_SINGLEUSE_VDST 1
+ ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ bb.0:
+ $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ bb.1:
+ liveins: $vgpr1
+...
+
+# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr twice.
+---
+name: write_both_read_full_twice
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: write_both_read_full_twice
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ bb.0:
+ $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ bb.1:
+ liveins: $vgpr1, $vgpr2
+...
|
…ting s_singleuse_vdst instructions. Change-Id: If41aa496d852d0ac1fddfc75f19acbfa20d2309e
Changing erasing unused registers to setting them to 0 fixes an issue where single use values were being incorrectly detected across exec mask changes. However this is not relevant to wise/sub register changes and will be included in a different patch.
This allows checking that liveout registers are enabled in Liveout.LaneMask before excluding them from single use optimisations.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Both single use producer and consumer instructions using wide/sub
registers are now correctly tracked and eligible for being marked as
single use.