AMDGPU: Try to constrain av registers to VGPR to enable ds_write2 formation #156400
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

In future changes we will have more AV_ virtual registers, which currently block the formation of write2. Most of the time these registers can simply be constrained to VGPR, so do that.

Also relaxes the constraint in the flat merging case. We already have the necessary code to insert copies to the original result registers, so there's no point in avoiding it.

Addresses the easy half of #155769

Patch is 22.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156400.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 263f7127fbf10..69d02e7c2934c 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -119,7 +119,7 @@ class SILoadStoreOptimizer {
unsigned DMask;
InstClassEnum InstClass;
unsigned CPol = 0;
- bool IsAGPR;
+ const TargetRegisterClass *DataRC;
bool UseST64;
int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];
@@ -203,6 +203,7 @@ class SILoadStoreOptimizer {
using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
private:
+ MachineFunction *MF = nullptr;
const GCNSubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
@@ -245,6 +246,8 @@ class SILoadStoreOptimizer {
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
+ unsigned getWrite2Opcode(const CombineInfo &CI) const;
+
MachineBasicBlock::iterator
mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore);
@@ -846,7 +849,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
if (InstClass == UNKNOWN)
return;
- IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
+ DataRC = LSO.getDataRegClass(*MI);
switch (InstClass) {
case DS_READ:
@@ -1313,6 +1316,50 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
// have already been confirmed to be mergeable.
if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
offsetsCanBeCombined(CI, *STM, Paired, true);
+
+ if (CI.InstClass == DS_WRITE) {
+ // Both data operands must be AGPR or VGPR, so the data registers need to
+ // be constrained to one or the other. We expect to only emit the VGPR form
+ // here for now.
+ //
+ // FIXME: There is currently a hack in getRegClass to report that the write2
+ // operands are VGPRs. In the future we should have separate agpr
+ // instruction definitions.
+ const MachineOperand *Data0 =
+ TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+ const MachineOperand *Data1 =
+ TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
+
+ const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+ AMDGPU::OpName::data0);
+ int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+ AMDGPU::OpName::data1);
+
+ const TargetRegisterClass *DataRC0 =
+ TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF);
+
+ const TargetRegisterClass *DataRC1 =
+ TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF);
+
+ if (unsigned SubReg = Data0->getSubReg()) {
+ DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
+ DataRC0, SubReg);
+ }
+
+ if (unsigned SubReg = Data1->getSubReg()) {
+ DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
+ DataRC1, SubReg);
+ }
+
+ if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
+ !MRI->constrainRegClass(Data1->getReg(), DataRC1))
+ return nullptr;
+
+ // TODO: If one register can be constrained, and not the other, insert a
+ // copy.
+ }
+
return Where;
}
@@ -1462,6 +1509,10 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
: AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
+unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
+ return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+}
+
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
@@ -1478,8 +1529,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
unsigned NewOffset0 = CI.Offset;
unsigned NewOffset1 = Paired.Offset;
- unsigned Opc =
- CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+ unsigned Opc = getWrite2Opcode(CI);
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -2032,6 +2082,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
}
}
+ // FIXME: This should compute the instruction to use, and then use the result
+ // of TII->getRegClass.
unsigned BitWidth = 32 * (CI.Width + Paired.Width);
return TRI->isAGPRClass(getDataRegClass(*CI.I))
? TRI->getAGPRClassForBitWidth(BitWidth)
@@ -2400,7 +2452,6 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const {
for (std::list<CombineInfo> &AddrList : MergeableInsts) {
if (AddrList.front().InstClass == CI.InstClass &&
- AddrList.front().IsAGPR == CI.IsAGPR &&
AddrList.front().hasSameBaseAddress(CI)) {
AddrList.emplace_back(CI);
return;
@@ -2465,19 +2516,6 @@ SILoadStoreOptimizer::collectMergeableInsts(
if (!CI.hasMergeableAddress(*MRI))
continue;
- if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
- LLVM_DEBUG(
- dbgs() << "cannot merge ds writes with mixed AGPR and VGPR data\n");
-
- // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
- // operands. However we are reporting that ds_write2 shall have
- // only VGPR data so that machine copy propagation does not
- // create an illegal instruction with a VGPR and AGPR sources.
- // Consequenctially if we create such instruction the verifier
- // will complain.
- continue;
- }
-
LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
addInstToMergeableList(CI, MergeableInsts);
@@ -2650,6 +2688,7 @@ bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
}
bool SILoadStoreOptimizer::run(MachineFunction &MF) {
+ this->MF = &MF;
STM = &MF.getSubtarget<GCNSubtarget>();
if (!STM->loadStoreOptEnabled())
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir
new file mode 100644
index 0000000000000..33f210533e10b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir
@@ -0,0 +1,210 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -o - %s | FileCheck %s
+
+---
+name: ds_write_b32__av32_x2
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: ds_write_b32__av32_x2
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_32 = COPY $vgpr1
+ %2:av_32 = COPY $vgpr2
+ DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+ DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__av32_x2_subregs_different_reg
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+ ; CHECK-LABEL: name: ds_write_b32__av32_x2_subregs_different_reg
+ ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY2]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_64_align2 = COPY $vgpr2_vgpr3
+ %2:av_64_align2 = COPY $vgpr4_vgpr5
+ DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+ DS_WRITE_B32_gfx9 %0, %2.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__unaligned_av64_subregs
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
+
+ ; CHECK-LABEL: name: ds_write_b32__unaligned_av64_subregs
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
+ ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY2]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_64 = COPY $vgpr1_vgpr2
+ %2:av_64 = COPY $vgpr3_vgpr4
+ DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+ DS_WRITE_B32_gfx9 %0, %2.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__av32_x2_subregs_same_reg
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+ ; CHECK-LABEL: name: ds_write_b32__av32_x2_subregs_same_reg
+ ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY1]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_64_align2 = COPY $vgpr2_vgpr3
+ DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+ DS_WRITE_B32_gfx9 %0, %1.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__av32__vgpr32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: ds_write_b32__av32__vgpr32
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+ DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__vgpr32__av32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: ds_write_b32__vgpr32__av32
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:av_32 = COPY $vgpr2
+ DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+ DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b64__av64_x2
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+ ; CHECK-LABEL: name: ds_write_b64__av64_x2
+ ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: DS_WRITE2_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 5, 12, 0, implicit $exec :: (store (s64), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_64_align2 = COPY $vgpr2_vgpr3
+ %2:av_64_align2 = COPY $vgpr4_vgpr5
+ DS_WRITE_B64_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s64), addrspace 3)
+ DS_WRITE_B64_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...
+
+---
+name: ds_write_b64__av64_x2_subregs
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8_vgpr9
+
+ ; CHECK-LABEL: name: ds_write_b64__av64_x2_subregs
+ ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8_vgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY $vgpr6_vgpr7_vgpr8_vgpr9
+ ; CHECK-NEXT: DS_WRITE2_B64_gfx9 [[COPY]], [[COPY1]].sub2_sub3, [[COPY2]].sub2_sub3, 5, 12, 0, implicit $exec :: (store (s64), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_128_align2 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+ %2:av_128_align2 = COPY $vgpr6_vgpr7_vgpr8_vgpr9
+ DS_WRITE_B64_gfx9 %0, %1.sub2_sub3, 40, 0, implicit $exec :: (store (s64), addrspace 3)
+ DS_WRITE_B64_gfx9 %0, %2.sub2_sub3, 96, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...
+
+---
+name: ds_writest64_b32__av32_x2
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: ds_writest64_b32__av32_x2
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: DS_WRITE2ST64_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 1, 3, 0, implicit $exec :: (store (s32), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_32 = COPY $vgpr1
+ %2:av_32 = COPY $vgpr2
+ DS_WRITE_B32_gfx9 %0, %1, 256, 0, implicit $exec :: (store (s32), addrspace 3)
+ DS_WRITE_B32_gfx9 %0, %2, 768, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_writest64_b64__av64_x2
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+ ; CHECK-LABEL: name: ds_writest64_b64__av64_x2
+ ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: DS_WRITE2ST64_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 1, 3, 0, implicit $exec :: (store (s64), addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:av_64_align2 = COPY $vgpr2_vgpr3
+ %2:av_64_align2 = COPY $vgpr4_vgpr5
+ DS_WRITE_B64_gfx9 %0, %1, 512, 0, implicit $exec :: (store (s64), addrspace 3)
+ DS_WRITE_B64_gfx9 %0, %2, 1536, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
index 31ff5bd841f86..09aae9152c4ee 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
@@ -172,9 +172,10 @@ body: |
; GCN-LABEL: name: no_merge_flat_load_dword_agpr_with_vgpr
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
- ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
- ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:agpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
- ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
+ ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr poison`, align 4)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
+ ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`, align 4)
%2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`, align 4)
@@ -398,8 +399,8 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
- ; GCN-NEXT: FLAT_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+ ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr poison`, align 4)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:agpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
index 6071caf07011d..cd4610eed41a1 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
@@ -172,9 +172,10 @@ body: |
; GCN-LABEL: name: no_merge_global_load_dword_agpr_with_vgpr
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
- ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, addrspace 1)
- ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:agpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, addrspace 1)
- ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) poison`, align 4, addrspace 1)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1
+ ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, align 4, addrspace 1)
%2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, align 4, addrspace 1)
@@ -604,8 +605,8 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: GLOBAL_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; GCN-NEXT: GLOBAL_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+ ; GCN-NEXT: GLOBAL_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 4, addrspace 1)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:agpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
index 0e9c02113e441..e8fc734b126c9 100644
--- a/llvm/test/C...
[truncated]
LGTM modulo test names.
In future changes we will have more AV_ virtual registers, which currently
block the formation of write2. Most of the time these registers can simply
be constrained to VGPR, so do that.
Also relaxes the constraint in the flat merging case. We already have the necessary
code to insert copies to the original result registers, so there's no point
in avoiding it.
Addresses the easy half of #155769
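For a concrete picture of what the change enables, the ds_write_b32__av32_x2 case from the new MIR test above is representative; the before/after below is lifted from that test's input and CHECK lines rather than newly invented. Two ds_write_b32 instructions whose data operands are av_32 virtual registers were previously skipped by the merger; with this patch the data registers are constrained to vgpr_32 and the pair becomes a single ds_write2.

Input to si-load-store-opt:

  %0:vgpr_32 = COPY $vgpr0
  %1:av_32 = COPY $vgpr1
  %2:av_32 = COPY $vgpr2
  DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
  DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)

Output after the pass (the byte offsets 40 and 96 become the dword offsets 10 and 24 of the write2):

  %0:vgpr_32 = COPY $vgpr0
  %1:vgpr_32 = COPY $vgpr1
  %2:vgpr_32 = COPY $vgpr2
  DS_WRITE2_B32_gfx9 %0, %1, %2, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)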