diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 263f7127fbf10..69d02e7c2934c 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -119,7 +119,7 @@ class SILoadStoreOptimizer { unsigned DMask; InstClassEnum InstClass; unsigned CPol = 0; - bool IsAGPR; + const TargetRegisterClass *DataRC; bool UseST64; int AddrIdx[MaxAddressRegs]; const MachineOperand *AddrReg[MaxAddressRegs]; @@ -203,6 +203,7 @@ class SILoadStoreOptimizer { using MemInfoMap = DenseMap; private: + MachineFunction *MF = nullptr; const GCNSubtarget *STM = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; @@ -245,6 +246,8 @@ class SILoadStoreOptimizer { unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; + unsigned getWrite2Opcode(const CombineInfo &CI) const; + MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore); @@ -846,7 +849,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if (InstClass == UNKNOWN) return; - IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); + DataRC = LSO.getDataRegClass(*MI); switch (InstClass) { case DS_READ: @@ -1313,6 +1316,50 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // have already been confirmed to be mergeable. if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) offsetsCanBeCombined(CI, *STM, Paired, true); + + if (CI.InstClass == DS_WRITE) { + // Both data operands must be AGPR or VGPR, so the data registers need to + // be constrained to one or the other. We expect to only emit the VGPR form + // here for now. + // + // FIXME: There is currently a hack in getRegClass to report that the write2 + // operands are VGPRs. In the future we should have separate agpr + // instruction definitions. 
+ const MachineOperand *Data0 = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); + const MachineOperand *Data1 = + TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); + + const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI)); + int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), + AMDGPU::OpName::data0); + int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), + AMDGPU::OpName::data1); + + const TargetRegisterClass *DataRC0 = + TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF); + + const TargetRegisterClass *DataRC1 = + TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF); + + if (unsigned SubReg = Data0->getSubReg()) { + DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), + DataRC0, SubReg); + } + + if (unsigned SubReg = Data1->getSubReg()) { + DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()), + DataRC1, SubReg); + } + + if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) || + !MRI->constrainRegClass(Data1->getReg(), DataRC1)) + return nullptr; + + // TODO: If one register can be constrained, and not the other, insert a + // copy. + } + return Where; } @@ -1462,6 +1509,10 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { : AMDGPU::DS_WRITE2ST64_B64_gfx9; } +unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const { + return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); +} + MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { @@ -1478,8 +1529,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( unsigned NewOffset0 = CI.Offset; unsigned NewOffset1 = Paired.Offset; - unsigned Opc = - CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); + unsigned Opc = getWrite2Opcode(CI); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. 
@@ -2032,6 +2082,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, } } + // FIXME: This should compute the instruction to use, and then use the result + // of TII->getRegClass. unsigned BitWidth = 32 * (CI.Width + Paired.Width); return TRI->isAGPRClass(getDataRegClass(*CI.I)) ? TRI->getAGPRClassForBitWidth(BitWidth) @@ -2400,7 +2452,6 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, std::list > &MergeableInsts) const { for (std::list &AddrList : MergeableInsts) { if (AddrList.front().InstClass == CI.InstClass && - AddrList.front().IsAGPR == CI.IsAGPR && AddrList.front().hasSameBaseAddress(CI)) { AddrList.emplace_back(CI); return; @@ -2465,19 +2516,6 @@ SILoadStoreOptimizer::collectMergeableInsts( if (!CI.hasMergeableAddress(*MRI)) continue; - if (CI.InstClass == DS_WRITE && CI.IsAGPR) { - LLVM_DEBUG( - dbgs() << "cannot merge ds writes with mixed AGPR and VGPR data\n"); - - // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data - // operands. However we are reporting that ds_write2 shall have - // only VGPR data so that machine copy propagation does not - // create an illegal instruction with a VGPR and AGPR sources. - // Consequenctially if we create such instruction the verifier - // will complain. 
- continue; - } - LLVM_DEBUG(dbgs() << "Mergeable: " << MI); addInstToMergeableList(CI, MergeableInsts); @@ -2650,6 +2688,7 @@ bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) { } bool SILoadStoreOptimizer::run(MachineFunction &MF) { + this->MF = &MF; STM = &MF.getSubtarget(); if (!STM->loadStoreOptEnabled()) return false; diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir new file mode 100644 index 0000000000000..33f210533e10b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-ds-regclass-constrain.mir @@ -0,0 +1,210 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -o - %s | FileCheck %s + +--- +name: ds_write_b32__av32_x2 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: ds_write_b32__av32_x2 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_32 = COPY $vgpr1 + %2:av_32 = COPY $vgpr2 + DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3) + DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3) + +... 
+ +--- +name: ds_write_b32__av32_x2_subregs_different_reg +body: | + bb.0: + liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; CHECK-LABEL: name: ds_write_b32__av32_x2_subregs_different_reg + ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY2]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_64_align2 = COPY $vgpr2_vgpr3 + %2:av_64_align2 = COPY $vgpr4_vgpr5 + DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3) + DS_WRITE_B32_gfx9 %0, %2.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3) + +... + +--- +name: ds_write_b32__unaligned_av64_subregs +body: | + bb.0: + liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + + ; CHECK-LABEL: name: ds_write_b32__unaligned_av64_subregs + ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY2]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_64 = COPY $vgpr1_vgpr2 + %2:av_64 = COPY $vgpr3_vgpr4 + DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3) + DS_WRITE_B32_gfx9 %0, %2.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3) + +... 
+ +--- +name: ds_write_b32__av32_x2_subregs_same_reg +body: | + bb.0: + liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; CHECK-LABEL: name: ds_write_b32__av32_x2_subregs_same_reg + ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY1]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_64_align2 = COPY $vgpr2_vgpr3 + DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3) + DS_WRITE_B32_gfx9 %0, %1.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3) + +... + +--- +name: ds_write_b32__av32__vgpr32 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: ds_write_b32__av32__vgpr32 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3) + DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3) + +... 
+ +--- +name: ds_write_b32__vgpr32__av32 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: ds_write_b32__vgpr32__av32 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:av_32 = COPY $vgpr2 + DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3) + DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3) + +... + +--- +name: ds_write_b64__av64_x2 +body: | + bb.0: + liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; CHECK-LABEL: name: ds_write_b64__av64_x2 + ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 + ; CHECK-NEXT: DS_WRITE2_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 5, 12, 0, implicit $exec :: (store (s64), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_64_align2 = COPY $vgpr2_vgpr3 + %2:av_64_align2 = COPY $vgpr4_vgpr5 + DS_WRITE_B64_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s64), addrspace 3) + DS_WRITE_B64_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s64), addrspace 3) + +... 
+ +--- +name: ds_write_b64__av64_x2_subregs +body: | + bb.0: + liveins: $vgpr0, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8_vgpr9 + + ; CHECK-LABEL: name: ds_write_b64__av64_x2_subregs + ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8_vgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY $vgpr6_vgpr7_vgpr8_vgpr9 + ; CHECK-NEXT: DS_WRITE2_B64_gfx9 [[COPY]], [[COPY1]].sub2_sub3, [[COPY2]].sub2_sub3, 5, 12, 0, implicit $exec :: (store (s64), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_128_align2 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + %2:av_128_align2 = COPY $vgpr6_vgpr7_vgpr8_vgpr9 + DS_WRITE_B64_gfx9 %0, %1.sub2_sub3, 40, 0, implicit $exec :: (store (s64), addrspace 3) + DS_WRITE_B64_gfx9 %0, %2.sub2_sub3, 96, 0, implicit $exec :: (store (s64), addrspace 3) + +... + +--- +name: ds_writest64_b32__av32_x2 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: ds_writest64_b32__av32_x2 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: DS_WRITE2ST64_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 1, 3, 0, implicit $exec :: (store (s32), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_32 = COPY $vgpr1 + %2:av_32 = COPY $vgpr2 + DS_WRITE_B32_gfx9 %0, %1, 256, 0, implicit $exec :: (store (s32), addrspace 3) + DS_WRITE_B32_gfx9 %0, %2, 768, 0, implicit $exec :: (store (s32), addrspace 3) + +... 
+ +--- +name: ds_writest64_b64__av64_x2 +body: | + bb.0: + liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; CHECK-LABEL: name: ds_writest64_b64__av64_x2 + ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5 + ; CHECK-NEXT: DS_WRITE2ST64_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 1, 3, 0, implicit $exec :: (store (s64), addrspace 3) + %0:vgpr_32 = COPY $vgpr0 + %1:av_64_align2 = COPY $vgpr2_vgpr3 + %2:av_64_align2 = COPY $vgpr4_vgpr5 + DS_WRITE_B64_gfx9 %0, %1, 512, 0, implicit $exec :: (store (s64), addrspace 3) + DS_WRITE_B64_gfx9 %0, %2, 1536, 0, implicit $exec :: (store (s64), addrspace 3) + +... diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir index 31ff5bd841f86..09aae9152c4ee 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir @@ -172,9 +172,10 @@ body: | ; GCN-LABEL: name: no_merge_flat_load_dword_agpr_with_vgpr ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF - ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`) - ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:agpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`) - ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]] + ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr poison`, align 4) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], 
implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`, align 4) %2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`, align 4) @@ -398,8 +399,8 @@ body: | ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) - ; GCN-NEXT: FLAT_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1 + ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr poison`, align 4) %0:vreg_64_align2 = IMPLICIT_DEF %1:agpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir index 6071caf07011d..0817694295f86 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir @@ -166,15 +166,16 @@ body: | ... 
--- -name: no_merge_global_load_dword_agpr_with_vgpr +name: merge_global_load_dword_agpr_with_vgpr body: | bb.0.entry: - ; GCN-LABEL: name: no_merge_global_load_dword_agpr_with_vgpr + ; GCN-LABEL: name: merge_global_load_dword_agpr_with_vgpr ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF - ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:agpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) poison`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, align 4, addrspace 1) %2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison`, align 4, addrspace 1) @@ -596,16 +597,16 @@ body: | ... 
--- -name: no_merge_global_store_dword_agpr_with_vgpr +name: merge_global_store_dword_agpr_with_vgpr body: | bb.0.entry: - ; GCN-LABEL: name: no_merge_global_store_dword_agpr_with_vgpr + ; GCN-LABEL: name: merge_global_store_dword_agpr_with_vgpr ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: GLOBAL_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: GLOBAL_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1 + ; GCN-NEXT: GLOBAL_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 4, addrspace 1) %0:vreg_64_align2 = IMPLICIT_DEF %1:agpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir index 0e9c02113e441..e8fc734b126c9 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir @@ -37,8 +37,9 @@ body: | ; GCN-LABEL: name: ds_read_b32_v_a ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3) - ; GCN-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:agpr_32 = DS_READ_B32_gfx9 [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64_align2 = DS_READ2_B32_gfx9 [[DEF]], 0, 2, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY 
[[DS_READ2_B32_gfx9_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[DS_READ2_B32_gfx9_]].sub1 %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) %2:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) @@ -51,8 +52,9 @@ body: | ; GCN-LABEL: name: ds_read_b32_a_v ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:agpr_32 = DS_READ_B32_gfx9 [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3) - ; GCN-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:areg_64_align2 = DS_READ2_B32_gfx9 [[DEF]], 0, 2, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[DS_READ2_B32_gfx9_]].sub0 %0:vgpr_32 = IMPLICIT_DEF %1:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) %2:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)