From b0c453633f83a8e3f031c9e3dd8ba11e38a9d7de Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 1 Sep 2025 13:23:52 +0900
Subject: [PATCH 1/2] AMDGPU: Try to constrain av registers to VGPR to enable
 ds_write2 formation

In future changes we will have more AV_ virtual registers, which
currently block the formation of write2. Most of the time these
registers can simply be constrained to VGPR, so do that.

Also relaxes the constraint in the flat merging case. We already have
the necessary code to insert copies to the original result registers,
so there's no point in avoiding it.

Addresses the easy half of #155769
---
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  75 +++++--
 .../load-store-opt-ds-regclass-constrain.mir  | 210 ++++++++++++++++++
 .../CodeGen/AMDGPU/merge-flat-load-store.mir  |  11 +-
 .../AMDGPU/merge-global-load-store.mir        |  11 +-
 .../CodeGen/AMDGPU/merge-load-store-agpr.mir  |  10 +-
 5 files changed, 285 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 263f7127fbf10..69d02e7c2934c 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -119,7 +119,7 @@ class SILoadStoreOptimizer {
     unsigned DMask;
     InstClassEnum InstClass;
     unsigned CPol = 0;
-    bool IsAGPR;
+    const TargetRegisterClass *DataRC;
     bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
@@ -203,6 +203,7 @@ class SILoadStoreOptimizer {
   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
 
 private:
+  MachineFunction *MF = nullptr;
   const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
@@ -245,6 +246,8 @@ class SILoadStoreOptimizer {
   unsigned write2Opcode(unsigned EltSize) const;
   unsigned write2ST64Opcode(unsigned EltSize) const;
 
+  unsigned getWrite2Opcode(const CombineInfo &CI) const;
+
   MachineBasicBlock::iterator
   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                   MachineBasicBlock::iterator InsertBefore);
@@ -846,7 +849,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
   if (InstClass == UNKNOWN)
     return;
 
-  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
+  DataRC = LSO.getDataRegClass(*MI);
 
   switch (InstClass) {
   case DS_READ:
@@ -1313,6 +1316,50 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
   // have already been confirmed to be mergeable.
   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
     offsetsCanBeCombined(CI, *STM, Paired, true);
+
+  if (CI.InstClass == DS_WRITE) {
+    // Both data operands must be AGPR or VGPR, so the data registers need to
+    // be constrained to one or the other. We expect to only emit the VGPR form
+    // here for now.
+    //
+    // FIXME: There is currently a hack in getRegClass to report that the write2
+    // operands are VGPRs. In the future we should have separate agpr
+    // instruction definitions.
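+    //
+    // Note the data operands may be subregister uses (e.g. %reg.sub0). In
+    // that case the required operand class is first translated back to a
+    // class on the full super-register with getMatchingSuperRegClass, and
+    // the merge is abandoned if either data register cannot be constrained.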
+    const MachineOperand *Data0 =
+        TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+    const MachineOperand *Data1 =
+        TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
+
+    const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
+    int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+                                              AMDGPU::OpName::data0);
+    int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+                                              AMDGPU::OpName::data1);
+
+    const TargetRegisterClass *DataRC0 =
+        TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF);
+
+    const TargetRegisterClass *DataRC1 =
+        TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF);
+
+    if (unsigned SubReg = Data0->getSubReg()) {
+      DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
+                                              DataRC0, SubReg);
+    }
+
+    if (unsigned SubReg = Data1->getSubReg()) {
+      DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
+                                              DataRC1, SubReg);
+    }
+
+    if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
+        !MRI->constrainRegClass(Data1->getReg(), DataRC1))
+      return nullptr;
+
+    // TODO: If one register can be constrained but not the other, insert a
+    // copy.
+  }
+
   return Where;
 }
@@ -1462,6 +1509,10 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
                           : AMDGPU::DS_WRITE2ST64_B64_gfx9;
 }
 
+unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
+  return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+}
+
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
@@ -1478,8 +1529,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   unsigned NewOffset0 = CI.Offset;
   unsigned NewOffset1 = Paired.Offset;
-  unsigned Opc =
-      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+  unsigned Opc = getWrite2Opcode(CI);
 
   if (NewOffset0 > NewOffset1) {
     // Canonicalize the merged instruction so the smaller offset comes first.
@@ -2032,6 +2082,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
     }
   }
 
+  // FIXME: This should compute the instruction to use, and then use the result
+  // of TII->getRegClass.
   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
   return TRI->isAGPRClass(getDataRegClass(*CI.I))
              ? TRI->getAGPRClassForBitWidth(BitWidth)
@@ -2400,7 +2452,6 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
     std::list<std::list<CombineInfo>> &MergeableInsts) const {
   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
     if (AddrList.front().InstClass == CI.InstClass &&
-        AddrList.front().IsAGPR == CI.IsAGPR &&
         AddrList.front().hasSameBaseAddress(CI)) {
       AddrList.emplace_back(CI);
       return;
@@ -2465,19 +2516,6 @@ SILoadStoreOptimizer::collectMergeableInsts(
     if (!CI.hasMergeableAddress(*MRI))
       continue;
 
-    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
-      LLVM_DEBUG(
-          dbgs() << "cannot merge ds writes with mixed AGPR and VGPR data\n");
-
-      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
-      //        operands. However we are reporting that ds_write2 shall have
-      //        only VGPR data so that machine copy propagation does not
-      //        create an illegal instruction with a VGPR and AGPR sources.
-      //        Consequenctially if we create such instruction the verifier
-      //        will complain.
-      continue;
-    }
-
     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
 
     addInstToMergeableList(CI, MergeableInsts);
@@ -2650,6 +2688,7 @@ bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
 }
 
 bool SILoadStoreOptimizer::run(MachineFunction &MF) {
+  this->MF = &MF;
   STM = &MF.getSubtarget<GCNSubtarget>();
   if (!STM->loadStoreOptEnabled())
     return false;
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir
new file mode 100644
index 0000000000000..33f210533e10b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir
@@ -0,0 +1,210 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -o - %s | FileCheck %s
+
+---
+name: ds_write_b32__av32_x2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ds_write_b32__av32_x2
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_32 = COPY $vgpr1
+    %2:av_32 = COPY $vgpr2
+    DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__av32_x2_subregs_different_reg
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: ds_write_b32__av32_x2_subregs_different_reg
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY2]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64_align2 = COPY $vgpr2_vgpr3
+    %2:av_64_align2 = COPY $vgpr4_vgpr5
+    DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__unaligned_av64_subregs
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
+
+    ; CHECK-LABEL: name: ds_write_b32__unaligned_av64_subregs
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY2]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64 = COPY $vgpr1_vgpr2
+    %2:av_64 = COPY $vgpr3_vgpr4
+    DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
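+
+# A note on the immediates checked above: write2 offset0/offset1 are encoded
+# in element-size units, so the byte offsets 40 and 96 used by the b32 tests
+# become 40/4 = 10 and 96/4 = 24 on the merged DS_WRITE2_B32_gfx9.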
+
+---
+name: ds_write_b32__av32_x2_subregs_same_reg
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: ds_write_b32__av32_x2_subregs_same_reg
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY1]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64_align2 = COPY $vgpr2_vgpr3
+    DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %1.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__av32__vgpr32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ds_write_b32__av32__vgpr32
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+    DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__vgpr32__av32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ds_write_b32__vgpr32__av32
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:av_32 = COPY $vgpr2
+    DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b64__av64_x2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: ds_write_b64__av64_x2
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+    ; CHECK-NEXT: DS_WRITE2_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 5, 12, 0, implicit $exec :: (store (s64), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64_align2 = COPY $vgpr2_vgpr3
+    %2:av_64_align2 = COPY $vgpr4_vgpr5
+    DS_WRITE_B64_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s64), addrspace 3)
+    DS_WRITE_B64_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...
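+
+# The b64 tests use the same byte offsets with an 8-byte element, so 40 and
+# 96 encode as 40/8 = 5 and 96/8 = 12 on DS_WRITE2_B64_gfx9.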
+
+---
+name: ds_write_b64__av64_x2_subregs
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8_vgpr9
+
+    ; CHECK-LABEL: name: ds_write_b64__av64_x2_subregs
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8_vgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY $vgpr6_vgpr7_vgpr8_vgpr9
+    ; CHECK-NEXT: DS_WRITE2_B64_gfx9 [[COPY]], [[COPY1]].sub2_sub3, [[COPY2]].sub2_sub3, 5, 12, 0, implicit $exec :: (store (s64), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_128_align2 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    %2:av_128_align2 = COPY $vgpr6_vgpr7_vgpr8_vgpr9
+    DS_WRITE_B64_gfx9 %0, %1.sub2_sub3, 40, 0, implicit $exec :: (store (s64), addrspace 3)
+    DS_WRITE_B64_gfx9 %0, %2.sub2_sub3, 96, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...
+
+---
+name: ds_writest64_b32__av32_x2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ds_writest64_b32__av32_x2
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: DS_WRITE2ST64_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 1, 3, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_32 = COPY $vgpr1
+    %2:av_32 = COPY $vgpr2
+    DS_WRITE_B32_gfx9 %0, %1, 256, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2, 768, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_writest64_b64__av64_x2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: ds_writest64_b64__av64_x2
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+    ; CHECK-NEXT: DS_WRITE2ST64_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 1, 3, 0, implicit $exec :: (store (s64), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64_align2 = COPY $vgpr2_vgpr3
+    %2:av_64_align2 = COPY $vgpr4_vgpr5
+    DS_WRITE_B64_gfx9 %0, %1, 512, 0, implicit $exec :: (store (s64), addrspace 3)
+    DS_WRITE_B64_gfx9 %0, %2, 1536, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...
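+
+# The ST64 forms step in units of 64 elements, so the byte offsets must be
+# multiples of 64 * EltSize: 256 and 768 encode as 1 and 3 on
+# DS_WRITE2ST64_B32_gfx9, and 512 and 1536 encode as 1 and 3 on
+# DS_WRITE2ST64_B64_gfx9.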
diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
index 31ff5bd841f86..09aae9152c4ee 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
@@ -172,9 +172,10 @@ body: |
 
     ; GCN-LABEL: name: no_merge_flat_load_dword_agpr_with_vgpr
     ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
-    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:agpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
-    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr poison`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`, align 4)
     %2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`, align 4)
@@ -398,8 +399,8 @@ body: |
     ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
-    ; GCN-NEXT: FLAT_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr poison`, align 4)
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:agpr_32 = IMPLICIT_DEF
     %2:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
index 6071caf07011d..cd4610eed41a1 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
@@ -172,9 +172,10 @@ body: |
 
     ; GCN-LABEL: name: no_merge_global_load_dword_agpr_with_vgpr
    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, addrspace 1)
-    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:agpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, addrspace 1)
-    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) poison`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, align 4, addrspace 1)
     %2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, align 4, addrspace 1)
@@ -604,8 +605,8 @@ body: |
     ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: GLOBAL_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
-    ; GCN-NEXT: GLOBAL_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: GLOBAL_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 4, addrspace 1)
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:agpr_32 = IMPLICIT_DEF
     %2:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
index 0e9c02113e441..e8fc734b126c9 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
@@ -37,8 +37,9 @@ body: |
 
     ; GCN-LABEL: name: ds_read_b32_v_a
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
-    ; GCN-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:agpr_32 = DS_READ_B32_gfx9 [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
+    ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64_align2 = DS_READ2_B32_gfx9 [[DEF]], 0, 2, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[DS_READ2_B32_gfx9_]].sub1
     %0:vgpr_32 = IMPLICIT_DEF
     %1:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
     %2:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
@@ -51,8 +52,9 @@ body: |
 
     ; GCN-LABEL: name: ds_read_b32_a_v
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:agpr_32 = DS_READ_B32_gfx9 [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
-    ; GCN-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
+    ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:areg_64_align2 = DS_READ2_B32_gfx9 [[DEF]], 0, 2, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[DS_READ2_B32_gfx9_]].sub0
     %0:vgpr_32 = IMPLICIT_DEF
     %1:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
     %2:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)

From beb9b6cbba9df94bd53c5644f92788804bd50366 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 3 Sep 2025 09:15:37 +0900
Subject: [PATCH 2/2] rename test functions

---
llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir index cd4610eed41a1..0817694295f86 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir @@ -166,11 +166,11 @@ body: | ... --- -name: no_merge_global_load_dword_agpr_with_vgpr +name: merge_global_load_dword_agpr_with_vgpr body: | bb.0.entry: - ; GCN-LABEL: name: no_merge_global_load_dword_agpr_with_vgpr + ; GCN-LABEL: name: merge_global_load_dword_agpr_with_vgpr ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) poison`, align 4, addrspace 1) ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0 @@ -597,11 +597,11 @@ body: | ... --- -name: no_merge_global_store_dword_agpr_with_vgpr +name: merge_global_store_dword_agpr_with_vgpr body: | bb.0.entry: - ; GCN-LABEL: name: no_merge_global_store_dword_agpr_with_vgpr + ; GCN-LABEL: name: merge_global_store_dword_agpr_with_vgpr ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
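
For readers working through the diffs above, the new DS_WRITE handling in
checkAndPrepareMerge reduces to the following sketch. The helper name and the
pared-down signature are illustrative only; the real logic is inline in
SILoadStoreOptimizer.cpp (PATCH 1/2) and derives the required classes from the
write2 MCInstrDesc:

    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"

    using namespace llvm;

    // Try to constrain the virtual register behind one write2 data operand to
    // the class the merged instruction requires (currently always VGPR).
    // Returns false if the register is AGPR-only and cannot be narrowed, in
    // which case the caller abandons the merge.
    static bool constrainWrite2DataOperand(MachineRegisterInfo &MRI,
                                           const TargetRegisterInfo &TRI,
                                           const MachineOperand &Data,
                                           const TargetRegisterClass *RequiredRC) {
      if (unsigned SubReg = Data.getSubReg()) {
        // A use like %reg.sub0 only constrains one lane; map the operand's
        // class back to a class covering the whole super-register.
        RequiredRC = TRI.getMatchingSuperRegClass(
            MRI.getRegClass(Data.getReg()), RequiredRC, SubReg);
        if (!RequiredRC)
          return false;
      }
      // constrainRegClass returns the updated class on success, null on failure.
      return MRI.constrainRegClass(Data.getReg(), RequiredRC) != nullptr;
    }

As the TODO in PATCH 1/2 notes, a later improvement could insert a copy when
only one of the two data registers can be constrained, rather than giving up
on the pair entirely.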