diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 91e0c86cd365c..060fb66d38f7b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -828,6 +828,12 @@ def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans", "Has Pseudo Scalar Transcendental instructions" >; +def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset", + "HasRestrictedSOffset", + "true", + "Has restricted SOffset (immediate not supported)." +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1474,6 +1480,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, FeaturePseudoScalarTrans, + FeatureHasRestrictedSOffset, FeatureVGPRSingleUseHintInsts, FeatureMADIntraFwdBug, FeatureScalarDwordx3Loads]>; @@ -1787,6 +1794,11 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>; +def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">, + AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>; +def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">, + AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>; + def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 489b4f5a8d86a..f3a59109b4821 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -460,8 +460,8 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, return false; // If we have 96-bit memory operations, we shouldn't touch them. Note we may - // end up widening these for a scalar load during RegBankSelect, since there - // aren't 96-bit scalar loads. + // end up widening these for a scalar load during RegBankSelect, if we don't + // have 96-bit scalar loads. if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) return false; @@ -6467,10 +6467,10 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( MemSize, MemAlign); MI.addMemOperand(MF, MMO); - // There are no 96-bit result scalar loads, but widening to 128-bit should + // If we don't have 96-bit result scalar loads, widening to 128-bit should // always be legal. We may need to restore this to a 96-bit result if it turns // out this needs to be converted to a vector load during RegBankSelect. - if (!isPowerOf2_32(Size)) { + if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) { if (Ty.isVector()) Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); else diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b2f4c114dcbb3..b47fafb273442 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1061,7 +1061,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad( if (DstBank == &AMDGPU::SGPRRegBank) { // There are some special cases that we need to look at for 32 bit and 96 // bit SGPR loads otherwise we have nothing to do. 
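+    // (Added illustrative note, an assumption about the rationale rather than
+    // part of the original change description: on subtargets that do provide
+    // scalar dwordx3 loads, e.g. GFX12 via FeatureScalarDwordx3Loads above, a
+    // 96-bit SGPR load can be selected directly, so it no longer needs the
+    // special widening handled below and we can return with nothing to do.)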
- if (LoadSize != 32 && LoadSize != 96) + if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads())) return false; MachineMemOperand *MMO = *MI.memoperands_begin(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a675da8da3398..31a5a99e51bb3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -199,6 +199,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasSALUFloatInsts = false; bool HasVGPRSingleUseHintInsts = false; bool HasPseudoScalarTrans = false; + bool HasRestrictedSOffset = false; bool HasVcmpxPermlaneHazard = false; bool HasVMEMtoScalarWriteHazard = false; @@ -1163,6 +1164,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } + bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 708f212e204ac..f4d2c695e317c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1438,11 +1438,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; - } else { + } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) { // On GFX9 the offset is signed 21-bit in bytes (but must not be negative // for S_BUFFER_* instructions). if (!isInt<21>(AM.BaseOffs)) return false; + } else { + // On GFX12, all offsets are signed 24-bit in bytes. + if (!isInt<24>(AM.BaseOffs)) + return false; } if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. @@ -7497,7 +7501,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, }; // Widen vec3 load to vec4. - if (VT.isVector() && VT.getVectorNumElements() == 3) { + if (VT.isVector() && VT.getVectorNumElements() == 3 && + !Subtarget->hasScalarDwordx3Loads()) { EVT WidenedVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); auto WidenedOp = DAG.getMemIntrinsicNode( @@ -7913,6 +7918,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } +// On targets not supporting constant in soffset field, turn zero to +// SGPR_NULL to avoid generating an extra s_mov with zero. 
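+// For example (illustrative sketch, not from the original patch comment):
+// lowering @llvm.amdgcn.raw.buffer.load with a constant-zero soffset would
+// otherwise have to materialize that zero first, roughly
+//   s_mov_b32 s4, 0        ; hypothetical register, only to feed soffset
+// whereas selecting SGPR_NULL lets the buffer instruction encode "null" as
+// its soffset operand directly.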
+static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, + const GCNSubtarget *Subtarget) { + if (Subtarget->hasRestrictedSOffset()) + if (auto SOffsetConst = dyn_cast(SOffset)) { + if (SOffsetConst->isZero()) { + return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32); + } + } + return SOffset; +} + SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { @@ -7921,13 +7939,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, SDValue VData = Op.getOperand(2); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain VData, // vdata Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -7954,13 +7973,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, SDValue VData = Op.getOperand(2); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain VData, // vdata Rsrc, // rsrc Op.getOperand(4), // vindex Offsets.first, // voffset - Op.getOperand(6), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -8116,12 +8136,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); + auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(4), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(5), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -8140,12 +8161,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Rsrc, // rsrc Op.getOperand(3), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -8157,21 +8179,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, MemSDNode *M = cast(Op); EVT LoadVT = Op.getValueType(); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); unsigned Dfmt = cast(Op.getOperand(7))->getZExtValue(); unsigned Nfmt = cast(Op.getOperand(8))->getZExtValue(); unsigned Glc = cast(Op.getOperand(9))->getZExtValue(); unsigned Slc = cast(Op.getOperand(10))->getZExtValue(); unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - Op.getOperand(5), // soffset - Op.getOperand(6), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | 
(Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + SOffset, // soffset + Op.getOperand(6), // offset + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -8187,13 +8210,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT LoadVT = Op.getValueType(); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); + auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(4), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(5), // format Op.getOperand(6), // cachepolicy, swizzled buffer @@ -8213,13 +8237,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT LoadVT = Op.getValueType(); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Rsrc, // rsrc Op.getOperand(3), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // format Op.getOperand(7), // cachepolicy, swizzled buffer @@ -8432,6 +8457,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // src @@ -8439,7 +8465,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(6), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -8454,6 +8480,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG); + auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // src @@ -8461,7 +8488,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Rsrc, // rsrc Op.getOperand(5), // vindex Offsets.first, // voffset - Op.getOperand(7), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(8), // cachepolicy DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -8893,13 +8920,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, VData = handleD16VData(VData, DAG); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); SDValue Ops[] = { Chain, VData, // vdata Rsrc, // rsrc Op.getOperand(4), // vindex Offsets.first, // voffset - Op.getOperand(6), // soffset + SOffset, // 
soffset Offsets.second, // offset Op.getOperand(7), // format Op.getOperand(8), // cachepolicy, swizzled buffer @@ -8920,13 +8948,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, VData = handleD16VData(VData, DAG); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Chain, VData, // vdata Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // format Op.getOperand(7), // cachepolicy, swizzled buffer @@ -9000,13 +9029,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Chain, VData, Rsrc, DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -9050,13 +9080,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); SDValue Ops[] = { Chain, VData, Rsrc, Op.getOperand(4), // vindex Offsets.first, // voffset - Op.getOperand(6), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -9404,8 +9435,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, return; } } + + SDValue SOffsetZero = Subtarget->hasRestrictedSOffset() + ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32) + : DAG.getConstant(0, DL, MVT::i32); + Offsets[0] = CombinedOffset; - Offsets[1] = DAG.getConstant(0, DL, MVT::i32); + Offsets[1] = SOffsetZero; Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); } @@ -9663,7 +9699,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) { - if (MemVT.isPow2VectorType()) + if (MemVT.isPow2VectorType() || + (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) return SDValue(); return WidenOrSplitVectorLoad(Op, DAG); } @@ -9679,7 +9716,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) && Alignment >= Align(4) && NumElements < 32) { - if (MemVT.isPow2VectorType()) + if (MemVT.isPow2VectorType() || + (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) return SDValue(); return WidenOrSplitVectorLoad(Op, DAG); } diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 17105965471f6..dfd84b66bec72 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -161,8 +161,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass { if (!AddrOp->isReg()) return false; - // TODO: We should be able to merge physical reg addresses. 
- if (AddrOp->getReg().isPhysical()) + // TODO: We should be able to merge instructions with other physical reg + // addresses too. + if (AddrOp->getReg().isPhysical() && + AddrOp->getReg() != AMDGPU::SGPR_NULL) return false; // If an address has only one use then there will be no other @@ -350,6 +352,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::FLAT_LOAD_DWORDX2: case AMDGPU::FLAT_STORE_DWORDX2: return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::GLOBAL_LOAD_DWORDX3: case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX3: @@ -443,16 +448,19 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { return UNKNOWN; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return S_BUFFER_LOAD_IMM; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: return S_BUFFER_LOAD_SGPR_IMM; case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: return S_LOAD_IMM; @@ -524,16 +532,19 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { return Opc; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: return AMDGPU::S_LOAD_DWORD_IMM; @@ -631,16 +642,19 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { return Result; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: Result.SOffset = true; [[fallthrough]]; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: Result.SBase = true; @@ -967,6 +981,17 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, return false; if (CI.CPol != Paired.CPol) return false; + if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || + CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { + // Reject cases like: + // dword + dwordx2 -> dwordx3 + // dword + dwordx3 -> dwordx4 + // If we tried to combine these cases, we would fail to extract a subreg + // for the result of the second load due to SGPR alignment requirements. 
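+ // Concrete illustration (added note, assuming the usual even-alignment rule
+ // for 64-bit SGPR tuples): merging an s_load_dword at the lower offset with
+ // an s_load_dwordx2 just above it would yield an s_load_dwordx3 whose
+ // dwordx2 value occupies the upper two dwords of the 96-bit result, and no
+ // properly aligned 64-bit subregister exists to extract it from there.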
+ if (CI.Width != Paired.Width && + (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) + return false; + } return true; } @@ -1046,6 +1071,8 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, case 4: case 8: return true; + case 3: + return STM.hasScalarDwordx3Loads(); } } } @@ -1674,6 +1701,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return 0; case 2: return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + case 3: + return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; case 8: @@ -1685,6 +1714,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return 0; case 2: return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; + case 3: + return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; case 8: @@ -1696,6 +1727,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return 0; case 2: return AMDGPU::S_LOAD_DWORDX2_IMM; + case 3: + return AMDGPU::S_LOAD_DWORDX3_IMM; case 4: return AMDGPU::S_LOAD_DWORDX4_IMM; case 8: @@ -1817,6 +1850,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, return nullptr; case 2: return &AMDGPU::SReg_64_XEXECRegClass; + case 3: + return &AMDGPU::SGPR_96RegClass; case 4: return &AMDGPU::SGPR_128RegClass; case 8: diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index d24bfd535d4dd..231c897390e5c 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -826,7 +826,7 @@ def SMRDBufferImm : ComplexPattern; def SMRDBufferImm32 : ComplexPattern; def SMRDBufferSgprImm : ComplexPattern; -multiclass SMRD_Pattern { +multiclass SMRD_Pattern { // 1. IMM offset def : GCNPat < @@ -835,7 +835,7 @@ multiclass SMRD_Pattern { >; // 2. 32-bit IMM offset on CI - def : GCNPat < + if immci then def : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), (vt (!cast(Instr#"_IMM_ci") $sbase, $offset, 0))> { let OtherPredicates = [isGFX7Only]; @@ -867,7 +867,7 @@ multiclass SMRD_Pattern { >; } -multiclass SMLoad_Pattern { +multiclass SMLoad_Pattern { // 1. Offset as an immediate def : GCNPat < (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy), @@ -876,7 +876,7 @@ multiclass SMLoad_Pattern { } // 2. 
32-bit IMM offset on CI - def : GCNPat < + if immci then def : GCNPat < (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)), (!cast(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset, (extract_cpol $cachepolicy))> { @@ -919,6 +919,10 @@ foreach vt = SReg_64.RegTypes in { defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>; } +foreach vt = SReg_96.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX3", vt, false>; +} + foreach vt = SReg_128.RegTypes in { defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>; } @@ -935,12 +939,14 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX3", v3i32, false>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX3", v3f32, false>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir index 420c55f8f6da2..fb2a548cd7945 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s --- name: s_buffer_load_s32 @@ -28,15 +29,23 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-LABEL: name: s_buffer_load_v3s32 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX67-LABEL: name: s_buffer_load_v3s32 + ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: {{ $}} + ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: 
(dereferenceable invariant load (s96), align 4) + ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GFX67-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) + ; GFX67-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX12-LABEL: name: s_buffer_load_v3s32 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -50,16 +59,25 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-LABEL: name: s_buffer_load_v3p3 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GCN-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<3 x p3>) + ; GFX67-LABEL: name: s_buffer_load_v3p3 + ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: {{ $}} + ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GFX67-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) + ; GFX67-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX67-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<3 x p3>) + ; + ; GFX12-LABEL: name: s_buffer_load_v3p3 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<3 x p3>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<3 x p3>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -73,16 +91,25 @@ body: | bb.0: liveins: 
$sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-LABEL: name: s_buffer_load_v6s16 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GCN-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>) + ; GFX67-LABEL: name: s_buffer_load_v6s16 + ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: {{ $}} + ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GFX67-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) + ; GFX67-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX67-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>) + ; + ; GFX12-LABEL: name: s_buffer_load_v6s16 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -140,52 +167,92 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-LABEL: name: s_buffer_load_v12s8 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) - ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GCN-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) - ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; GCN-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32) - ; GCN-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) - ; GCN-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR 
[[UV1]], [[C2]](s32) - ; GCN-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) - ; GCN-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) - ; GCN-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) - ; GCN-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32) - ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C4]] - ; GCN-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]] - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[C2]](s32) - ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL1]] - ; GCN-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; GCN-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C4]] - ; GCN-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C4]] - ; GCN-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) - ; GCN-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL2]] - ; GCN-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; GCN-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR5]], [[C2]](s32) - ; GCN-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR4]], [[SHL3]] - ; GCN-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) - ; GCN-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C4]] - ; GCN-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] - ; GCN-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) - ; GCN-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] - ; GCN-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) - ; GCN-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LSHR8]], [[C2]](s32) - ; GCN-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[SHL5]] - ; GCN-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; GCN-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>) + ; GFX67-LABEL: name: s_buffer_load_v12s8 + ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: {{ $}} + ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GFX67-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX67-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; GFX67-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX67-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; GFX67-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX67-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32) + ; GFX67-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; GFX67-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; GFX67-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) + ; GFX67-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; GFX67-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], 
[[C2]](s32) + ; GFX67-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32) + ; GFX67-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX67-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C4]] + ; GFX67-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]] + ; GFX67-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX67-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX67-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX67-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[C2]](s32) + ; GFX67-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL1]] + ; GFX67-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; GFX67-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C4]] + ; GFX67-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C4]] + ; GFX67-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX67-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL2]] + ; GFX67-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GFX67-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR5]], [[C2]](s32) + ; GFX67-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR4]], [[SHL3]] + ; GFX67-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GFX67-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV2]], [[C4]] + ; GFX67-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] + ; GFX67-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX67-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL4]] + ; GFX67-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GFX67-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LSHR8]], [[C2]](s32) + ; GFX67-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[SHL5]] + ; GFX67-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GFX67-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; GFX67-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>) + ; + ; GFX12-LABEL: name: s_buffer_load_v12s8 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX12-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32) + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX12-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; GFX12-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; GFX12-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) + ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = 
G_TRUNC [[UV1]](s32) + ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX12-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; GFX12-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; GFX12-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; GFX12-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; GFX12-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32) + ; GFX12-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32) + ; GFX12-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; GFX12-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; GFX12-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX12-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX12-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX12-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16) + ; GFX12-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -200,15 +267,23 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-LABEL: name: s_buffer_load_s96 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX67-LABEL: name: s_buffer_load_s96 + ; GFX67: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: {{ $}} + ; GFX67-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX67-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX67-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX67-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) + ; GFX67-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) + ; GFX67-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX12-LABEL: name: s_buffer_load_s96 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY 
$sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index b621185d83edd..d31570e47db77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s ; FIXME: Merge with regbankselect, which mostly overlaps when all types supported. @@ -54,6 +55,22 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret i32 %val } @@ -106,6 +123,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_glc + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = 
COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 1 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 1) ret i32 %val } @@ -173,6 +206,27 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX12-LABEL: name: s_buffer_load_v2i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX2_SGPR_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR_IMM]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR_IMM]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <2 x i32> %val } @@ -255,6 +309,31 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 + ; + ; GFX12-LABEL: name: s_buffer_load_v3i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: 
[[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM:%[0-9]+]]:sgpr_96 = S_BUFFER_LOAD_DWORDX3_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x i32> %val } @@ -394,6 +473,51 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; GFX8-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub3 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub5 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub6 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY 
[[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub7 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX12-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX12-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX12-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX12-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x i32> %val } @@ -629,6 +753,83 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; GFX8-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec ; GFX8-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 + ; + ; GFX12-LABEL: name: s_buffer_load_v16i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s512), align 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY 
[[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub3 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub5 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub6 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub7 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub8 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub9 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub10 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub11 + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub12 + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub13 + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub14 + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub15 + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX12-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec + ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX12-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX12-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX12-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] + ; GFX12-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec + ; GFX12-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX12-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX12-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX12-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX12-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX12-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec + ; GFX12-NEXT: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]] + ; GFX12-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY30]], implicit $exec + ; GFX12-NEXT: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX12-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX12-NEXT: $sgpr10 = COPY [[V_READFIRSTLANE_B32_10]] + ; GFX12-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX12-NEXT: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]] + ; GFX12-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec + ; GFX12-NEXT: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]] + ; GFX12-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec + ; GFX12-NEXT: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]] + ; GFX12-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec + ; GFX12-NEXT: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]] + ; GFX12-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec + ; GFX12-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x i32> %val } @@ -680,6 +881,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_1 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1, i32 0) ret i32 %val } @@ -729,6 +945,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: 
SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_glc_4 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 4, 1 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 4, i32 1) ret i32 %val } @@ -780,6 +1011,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_255 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 255, i32 0) ret i32 %val } @@ -829,6 +1075,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_256 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32)) + ; 
GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 256, i32 0) ret i32 %val } @@ -878,6 +1139,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_1020 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1020, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1020, i32 0) ret i32 %val } @@ -929,6 +1205,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_1023 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1023, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1023, i32 0) ret i32 %val } @@ -979,6 +1270,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_1024 + ; 
GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1024, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1024, i32 0) ret i32 %val } @@ -1030,6 +1336,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_1025 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1025, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1025, i32 0) ret i32 %val } @@ -1082,6 +1403,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg1 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; 
GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0) ret i32 %load } @@ -1133,6 +1470,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg4 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0) ret i32 %load } @@ -1184,6 +1537,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg8 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0) ret i32 %load } @@ -1235,6 +1604,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit31 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0) ret i32 %load } @@ -1286,6 +1671,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc) ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_glc_bit30 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 1 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 1) ret i32 %load } @@ -1337,6 +1738,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit29 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + 
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0) ret i32 %load } @@ -1388,6 +1805,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit21 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 2097152, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0) ret i32 %load } @@ -1439,6 +1871,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit20 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1048576, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = 
call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0) ret i32 %load } @@ -1490,6 +1937,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc) ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg_bit20 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0) ret i32 %load } @@ -1540,6 +2003,21 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_bit19 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0) ret i32 %load } @@ -1591,6 +2069,22 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc) ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32_offset_neg_bit19 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0) ret i32 %load } @@ -1641,6 +2135,21 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val } @@ -1699,6 +2208,24 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r ; GFX8-NEXT: $vgpr0 = COPY [[COPY5]] ; GFX8-NEXT: $vgpr1 = COPY [[COPY6]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; + ; GFX12-LABEL: name: s_buffer_load_v2f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec 
:: (dereferenceable invariant load (s64), align 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <2 x float> %val } @@ -1766,6 +2293,26 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r ; GFX8-NEXT: $vgpr1 = COPY [[COPY6]] ; GFX8-NEXT: $vgpr2 = COPY [[COPY7]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; + ; GFX12-LABEL: name: s_buffer_load_v3f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x float> %val } @@ -1836,6 +2383,28 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r ; GFX8-NEXT: $vgpr2 = COPY [[COPY7]] ; GFX8-NEXT: $vgpr3 = COPY [[COPY8]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; + ; GFX12-LABEL: name: s_buffer_load_v4f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 + ; 
GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub3 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <4 x float> %val } @@ -1936,6 +2505,38 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r ; GFX8-NEXT: $vgpr6 = COPY [[COPY11]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY12]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } @@ -2090,6 +2691,56 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg ; GFX8-NEXT: $vgpr14 = COPY [[COPY19]] ; GFX8-NEXT: $vgpr15 = COPY 
[[COPY20]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + ; + ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub8 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub9 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub10 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub11 + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub12 + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub13 + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub14 + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub15 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]] + ; 
GFX12-NEXT: $vgpr4 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr8 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr9 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr10 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr11 = COPY [[COPY16]] + ; GFX12-NEXT: $vgpr12 = COPY [[COPY17]] + ; GFX12-NEXT: $vgpr13 = COPY [[COPY18]] + ; GFX12-NEXT: $vgpr14 = COPY [[COPY19]] + ; GFX12-NEXT: $vgpr15 = COPY [[COPY20]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x float> %val } @@ -2139,6 +2790,21 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg % ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -2189,6 +2855,21 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg % ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = 
S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -2239,6 +2920,21 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg % ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -2341,6 +3037,38 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> ; GFX8-NEXT: $vgpr6 = COPY [[COPY11]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY12]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE 
[[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -2443,6 +3171,38 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> ; GFX8-NEXT: $vgpr6 = COPY [[COPY11]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY12]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = 
COPY [[REG_SEQUENCE1]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -2598,6 +3358,56 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3 ; GFX8-NEXT: $vgpr14 = COPY [[COPY19]] ; GFX8-NEXT: $vgpr15 = COPY [[COPY20]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + ; + ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5 + ; GFX12-NEXT: 
[[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub8 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub9 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub10 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub11 + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub12 + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub13 + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub14 + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub15 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr8 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr9 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr10 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr11 = COPY [[COPY16]] + ; GFX12-NEXT: $vgpr12 = COPY [[COPY17]] + ; GFX12-NEXT: $vgpr13 = COPY [[COPY18]] + ; GFX12-NEXT: $vgpr14 = COPY [[COPY19]] + ; GFX12-NEXT: $vgpr15 = COPY [[COPY20]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %soffset = add i32 %soffset.base, 4032 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x float> %val @@ -2753,6 +3563,56 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3 ; GFX8-NEXT: $vgpr14 = COPY [[COPY19]] ; GFX8-NEXT: $vgpr15 = COPY [[COPY20]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + ; + ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN 
[[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub2 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub3 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub5 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub6 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub7 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub8 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub9 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub10 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub11 + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub12 + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub13 + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub14 + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub15 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY5]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY6]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY7]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY8]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr8 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr9 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr10 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr11 = COPY [[COPY16]] + ; GFX12-NEXT: $vgpr12 = COPY [[COPY17]] + ; GFX12-NEXT: $vgpr13 = COPY [[COPY18]] + ; GFX12-NEXT: $vgpr14 = COPY [[COPY19]] + ; GFX12-NEXT: $vgpr15 = COPY [[COPY20]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %soffset = add i32 %soffset.base, 4036 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x float> %val @@ -2903,6 +3763,54 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
$vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val } @@ -3046,6 +3954,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -3202,6 +4156,56 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_2]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -3346,6 +4350,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; 
GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0) ret float %val } @@ -3493,6 +4543,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: 
SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4096) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) ret float %val } @@ -3688,6 +4784,69 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX8-NEXT: $vgpr6 = COPY [[COPY15]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY16]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit 
$vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], 
%subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY16]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -3896,6 +5055,73 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX8-NEXT: $vgpr6 = COPY [[COPY16]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY17]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 + ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_2]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6 + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY16]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY17]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -4102,6 +5328,73 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX8-NEXT: $vgpr6 = COPY [[COPY16]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY17]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: 
%bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_2]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE 
[[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6 + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY16]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY17]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4096 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -4299,6 +5592,70 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX8-NEXT: $vgpr6 = COPY [[COPY15]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY16]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4064 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: 
[[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 936, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 952, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY16]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 5000 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -4496,6 +5853,70 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX8-NEXT: $vgpr6 = COPY [[COPY15]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY16]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; 
GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; GFX12-NEXT: 
[[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY16]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4076 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -4693,6 +6114,70 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX8-NEXT: $vgpr6 = COPY [[COPY15]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY16]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 
[[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY15]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY16]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4080 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -4887,6 +6372,69 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX8-NEXT: $vgpr6 = COPY [[COPY14]] ; GFX8-NEXT: $vgpr7 = COPY [[COPY15]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3 + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub3 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub4 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub5 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub6 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub7 + ; GFX12-NEXT: $vgpr0 = COPY [[COPY8]] + ; 
GFX12-NEXT: $vgpr1 = COPY [[COPY9]] + ; GFX12-NEXT: $vgpr2 = COPY [[COPY10]] + ; GFX12-NEXT: $vgpr3 = COPY [[COPY11]] + ; GFX12-NEXT: $vgpr4 = COPY [[COPY12]] + ; GFX12-NEXT: $vgpr5 = COPY [[COPY13]] + ; GFX12-NEXT: $vgpr6 = COPY [[COPY14]] + ; GFX12-NEXT: $vgpr7 = COPY [[COPY15]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0) ret <8 x float> %val } @@ -4936,6 +6484,21 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg % ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.v, %offset.s %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) ret float %val @@ -4986,6 +6549,21 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg % ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.s, %offset.v %val = call float 
@llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) ret float %val @@ -5045,6 +6623,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, %offset.s %offset = add i32 %offset.base, 1024 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -5105,6 +6701,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG 
implicit $vgpr0 %offset.base = add i32 %offset.s, %offset.v %offset = add i32 %offset.base, 1024 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -5166,6 +6780,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, 1024 %offset = add i32 %offset.base, %offset.v %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -5226,6 +6858,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) 
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024 %offset = add i32 %offset.base, %offset.s %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index c28d204784d4b..4853bb309c1bb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -1,4 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s @@ -7,6 +9,53 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { +; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NOUNALIGNED-NEXT: s_clause 0xb +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v5, v[0:1], off offset:3 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v6, v[0:1], off offset:4 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v7, v[0:1], off offset:5 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v9, v[0:1], off offset:7 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; 
GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1 +; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4 +; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7 +; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -166,6 +215,31 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { } define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) { +; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align2: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NOUNALIGNED-NEXT: s_clause 0x5 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v2, v[0:1], off +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v3, v[0:1], off offset:2 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v4, v[0:1], off offset:4 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v5, v[0:1], off offset:6 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v6, v[0:1], off offset:8 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v7, v[0:1], off offset:10 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6 +; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -256,6 +330,13 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) { } define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) { +; GFX12-LABEL: v_load_constant_v3i32_align4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_load_constant_v3i32_align4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -291,6 +372,13 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) { } define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) { +; GFX12-LABEL: v_load_constant_i96_align8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_load_constant_i96_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -326,6 +414,13 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) { } define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) { +; GFX12-LABEL: v_load_constant_v3i32_align8: +; GFX12: ; %bb.0: +; 
GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_load_constant_v3i32_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -361,6 +456,13 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) { } define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) { +; GFX12-LABEL: v_load_constant_v6i16_align8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_load_constant_v6i16_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -405,6 +507,25 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) { } define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) { +; GFX12-LABEL: v_load_constant_v12i8_align8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b32_e32 v13, 8, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13 +; GFX12-NEXT: v_mov_b32_e32 v8, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v12 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_load_constant_v12i8_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -475,6 +596,13 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) { } define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) { +; GFX12-LABEL: v_load_constant_v3i32_align16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_load_constant_v3i32_align16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -506,6 +634,60 @@ define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) { } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg %ptr) { +; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align1: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1] +; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NOUNALIGNED-NEXT: s_clause 0xb +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v1, v0, s[0:1] +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v2, v0, s[0:1] offset:1 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v3, v0, s[0:1] offset:2 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v4, v0, s[0:1] offset:3 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v5, v0, s[0:1] offset:4 +; 
GFX12-NOUNALIGNED-NEXT: global_load_u8 v6, v0, s[0:1] offset:5 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v7, v0, s[0:1] offset:6 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v8, v0, s[0:1] offset:7 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v9, v0, s[0:1] offset:8 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v10, v0, s[0:1] offset:9 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v0, s[0:1] offset:11 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v0, v0, s[0:1] offset:10 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v10, 8, v9 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v11 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v5, v6, v4 +; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v0, v8, v0, v7 +; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 @@ -674,6 +856,38 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg %ptr) { +; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align2: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1] +; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NOUNALIGNED-NEXT: s_clause 0x5 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v3, v0, s[0:1] offset:4 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v4, v0, s[0:1] offset:6 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v5, v0, s[0:1] offset:8 +; GFX12-NOUNALIGNED-NEXT: global_load_u16 v0, v0, s[0:1] offset:10 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, 
v5 +; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 @@ -773,6 +987,12 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) { +; GFX12-LABEL: s_load_constant_v3i32_align4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_load_constant_v3i32_align4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -804,6 +1024,12 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg } define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { +; GFX12-LABEL: s_load_constant_i96_align8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_load_constant_i96_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -835,6 +1061,12 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) { +; GFX12-LABEL: s_load_constant_v3i32_align8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_load_constant_v3i32_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -866,6 +1098,12 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) { +; GFX12-LABEL: s_load_constant_v6i16_align8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_load_constant_v6i16_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -898,6 +1136,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg } define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) { +; GFX12-LABEL: s_load_constant_v12i8_align8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s13, s0, 8 +; GFX12-NEXT: s_lshr_b32 s12, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s0, 24 +; GFX12-NEXT: s_lshr_b32 s5, s1, 8 +; GFX12-NEXT: s_lshr_b32 s6, s1, 16 +; GFX12-NEXT: s_lshr_b32 s7, s1, 24 +; GFX12-NEXT: s_lshr_b32 s9, s2, 8 +; GFX12-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-NEXT: s_lshr_b32 s11, s2, 24 +; GFX12-NEXT: s_mov_b32 s4, s1 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s1, s13 +; GFX12-NEXT: s_mov_b32 s2, s12 +; GFX12-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_load_constant_v12i8_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 @@ -956,6 +1213,12 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr 
addrspace(4) inreg %ptr) { +; GFX12-LABEL: s_load_constant_v3i32_align16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; ; GCN-LABEL: s_load_constant_v3i32_align16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll index 2c84b7ccea401..80bd85d16f357 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-fast -o - %s | FileCheck %s -check-prefix=GFX7 ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-greedy -o - %s | FileCheck %s -check-prefix=GFX7 +; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -simplify-mir -stop-after=amdgpu-regbankselect -o - %s | FileCheck %s -check-prefix=GFX12 ; Natural mapping define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { @@ -19,6 +20,22 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret i32 %val } @@ -43,6 +60,26 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX12-LABEL: name: s_buffer_load_v2i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ 
$}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <2 x i32> %val } @@ -70,6 +107,29 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) ; GFX7-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 + ; + ; GFX12-LABEL: name: s_buffer_load_v3i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32) + ; 
GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x i32> %val } @@ -112,6 +172,44 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) ; GFX7-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32) + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32) + ; GFX12-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32) + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; GFX12-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32) + ; GFX12-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32) + ; GFX12-NEXT: $sgpr6 = COPY 
[[INTRINSIC_CONVERGENT6]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) + ; GFX12-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x i32> %val } @@ -178,6 +276,68 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; GFX7-NEXT: [[INTRINSIC_CONVERGENT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32) ; GFX7-NEXT: $sgpr15 = COPY [[INTRINSIC_CONVERGENT15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 + ; + ; GFX12-LABEL: name: s_buffer_load_v16i32 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512), align 4) + ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32) + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32) + ; GFX12-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32) + ; GFX12-NEXT: 
[[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; GFX12-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32) + ; GFX12-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32) + ; GFX12-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) + ; GFX12-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32) + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32) + ; GFX12-NEXT: $sgpr8 = COPY [[INTRINSIC_CONVERGENT8]](s32) + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32) + ; GFX12-NEXT: $sgpr9 = COPY [[INTRINSIC_CONVERGENT9]](s32) + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32) + ; GFX12-NEXT: $sgpr10 = COPY [[INTRINSIC_CONVERGENT10]](s32) + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32) + ; GFX12-NEXT: $sgpr11 = COPY [[INTRINSIC_CONVERGENT11]](s32) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32) + ; GFX12-NEXT: $sgpr12 = COPY [[INTRINSIC_CONVERGENT12]](s32) + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32) + ; GFX12-NEXT: $sgpr13 = COPY [[INTRINSIC_CONVERGENT13]](s32) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32) + ; GFX12-NEXT: $sgpr14 = COPY [[INTRINSIC_CONVERGENT14]](s32) + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32) + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32) + ; GFX12-NEXT: $sgpr15 = COPY [[INTRINSIC_CONVERGENT15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 %val = call <16 x i32> 
@llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x i32> %val } @@ -199,6 +359,22 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val } @@ -221,6 +397,24 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; + ; GFX12-LABEL: name: s_buffer_load_v2f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <2 x float> %val } @@ -244,6 +438,25 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; + ; GFX12-LABEL: name: s_buffer_load_v3f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: 
liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<3 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<3 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x float> %val } @@ -268,6 +481,26 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; + ; GFX12-LABEL: name: s_buffer_load_v4f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <4 x float> %val } @@ -298,6 +531,32 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, 
$sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val } @@ -338,6 +597,42 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + ; + ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x 
s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x float> %val } @@ -360,6 +655,24 @@ define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32 ; GFX7-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128) ; GFX7-NEXT: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_i96_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) 
= G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s96) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE [[AMDGPU_BUFFER_LOAD]](s96), [[COPY5]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0) store i96 %val, ptr addrspace(1) undef ret void @@ -389,6 +702,31 @@ define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; GFX7-NEXT: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_i256_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE [[UV]](s128), [[COPY5]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1) + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1) + ; GFX12-NEXT: G_STORE [[UV1]](s128), [[COPY6]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0) store i256 %val, ptr addrspace(1) undef ret void @@ -426,6 +764,41 @@ define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) ; GFX7-NEXT: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 48, align 8, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_i512_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; 
GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE [[UV]](s128), [[COPY5]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1) + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1) + ; GFX12-NEXT: G_STORE [[UV1]](s128), [[COPY6]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1) + ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1) + ; GFX12-NEXT: G_STORE [[UV2]](s128), [[COPY7]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 32, align 8, addrspace 1) + ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1) + ; GFX12-NEXT: G_STORE [[UV3]](s128), [[COPY8]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 48, align 8, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0) store i512 %val, ptr addrspace(1) undef ret void @@ -455,6 +828,31 @@ define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; GFX7-NEXT: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_v16i16_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: 
$sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE [[UV]](<8 x s16>), [[COPY5]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 32, addrspace 1) + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1) + ; GFX12-NEXT: G_STORE [[UV1]](<8 x s16>), [[COPY6]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0) store <16 x i16> %val, ptr addrspace(1) undef ret void @@ -492,6 +890,41 @@ define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) ; GFX7-NEXT: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_v32i16_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 
16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE [[UV]](<8 x s16>), [[COPY5]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 64, addrspace 1) + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1) + ; GFX12-NEXT: G_STORE [[UV1]](<8 x s16>), [[COPY6]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1) + ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1) + ; GFX12-NEXT: G_STORE [[UV2]](<8 x s16>), [[COPY7]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1) + ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1) + ; GFX12-NEXT: G_STORE [[UV3]](<8 x s16>), [[COPY8]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0) store <32 x i16> %val, ptr addrspace(1) undef ret void @@ -521,6 +954,31 @@ define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i3 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; GFX7-NEXT: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_v4i64_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x 
s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE [[UV]](<2 x s64>), [[COPY5]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 32, addrspace 1) + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1) + ; GFX12-NEXT: G_STORE [[UV1]](<2 x s64>), [[COPY6]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0) store <4 x i64> %val, ptr addrspace(1) undef ret void @@ -558,6 +1016,41 @@ define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i3 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) ; GFX7-NEXT: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_v8i64_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), 
[[AMDGPU_BUFFER_LOAD3]](<2 x s64>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE [[UV]](<2 x s64>), [[COPY5]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 64, addrspace 1) + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1) + ; GFX12-NEXT: G_STORE [[UV1]](<2 x s64>), [[COPY6]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1) + ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1) + ; GFX12-NEXT: G_STORE [[UV2]](<2 x s64>), [[COPY7]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1) + ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1) + ; GFX12-NEXT: G_STORE [[UV3]](<2 x s64>), [[COPY8]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0) store <8 x i64> %val, ptr addrspace(1) undef ret void @@ -587,6 +1080,31 @@ define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; GFX7-NEXT: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_v4p1_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE 
[[UV]](<2 x p1>), [[COPY5]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 32, addrspace 1) + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1) + ; GFX12-NEXT: G_STORE [[UV1]](<2 x p1>), [[COPY6]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32> %rsrc, i32 %soffset, i32 0) store <4 x ptr addrspace(1)> %val, ptr addrspace(1) undef ret void @@ -624,6 +1142,41 @@ define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) ; GFX7-NEXT: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: s_buffer_load_v8p1_vgpr_offset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GFX12-NEXT: G_STORE [[UV]](<2 x p1>), [[COPY5]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 64, addrspace 1) + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1) + ; GFX12-NEXT: G_STORE [[UV1]](<2 x p1>), [[COPY6]](p1) :: (store (<2 x p1>) into `ptr 
addrspace(1) undef` + 16, basealign 64, addrspace 1) + ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1) + ; GFX12-NEXT: G_STORE [[UV2]](<2 x p1>), [[COPY7]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1) + ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1) + ; GFX12-NEXT: G_STORE [[UV3]](<2 x p1>), [[COPY8]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 %val = call <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32> %rsrc, i32 %soffset, i32 0) store <8 x ptr addrspace(1)> %val, ptr addrspace(1) undef ret void @@ -648,6 +1201,25 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg % ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -672,6 +1244,25 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg % ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -695,6 +1286,25 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg % ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -730,6 +1340,35 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR 
[[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -764,6 +1403,35 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), 
[[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -808,6 +1476,45 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3 ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + ; + ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: 
[[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %soffset = add i32 %soffset.base, 4032 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x float> %val @@ -851,6 +1558,45 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3 ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + ; + ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: 
[[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 %soffset = add i32 %soffset.base, 4036 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x float> %val @@ -903,6 +1649,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = 
G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val } @@ -955,6 +1747,53 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) 
= COPY $sgpr2 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 + ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -1009,6 +1848,54 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; 
GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %16, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret float %val @@ -1061,6 +1948,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; 
GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, 
i32 0) ret float %val } @@ -1112,6 +2045,52 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4096) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; 
GFX12-NEXT: bb.5: + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) ret float %val } @@ -1175,6 +2154,63 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 
x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4064 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1240,6 +2276,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 + ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), 
implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4068 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1303,6 +2397,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, 
implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: 
SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %soffset.base, 4096 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1365,6 +2517,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) 
= G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 936, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 952, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 5000 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1427,6 +2637,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: 
[[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), 
[[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4076 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1489,6 +2757,64 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; 
GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32) + ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %soffset = add i32 %offset.base, 4080 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x float> %val @@ -1550,6 +2876,62 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32) ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: 
[[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3 + ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] + ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] + ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1) + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4, %bb.2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32) + ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32) + ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32) + ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32) + ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32) + ; GFX12-NEXT: 
$vgpr5 = COPY [[UV13]](s32) + ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32) + ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0) ret <8 x float> %val } @@ -1572,6 +2954,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg % ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; GFX12-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.v, %offset.s %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) ret float %val @@ -1595,6 +2995,24 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg % ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; GFX12-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable 
invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.s, %offset.v %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) ret float %val @@ -1622,6 +3040,28 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, %offset.s %offset = add i32 %offset.base, 1024 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -1650,6 +3090,28 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; 
GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, %offset.v %offset = add i32 %offset.base, 1024 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -1679,6 +3141,28 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, 1024 %offset = add i32 %offset.base, %offset.v %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) @@ -1707,6 +3191,28 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GFX12-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024 %offset = add i32 %offset.base, %offset.s %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir index 9a42745e76f64..442902c9fc8f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -check-prefixes=GCN,GFX7 +# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s -check-prefixes=GCN,GFX7 +# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12 --- | define amdgpu_kernel void @load_global_v8i32_non_uniform(ptr addrspace(1) %in) { @@ -113,16 +114,16 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_global_v8i32_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32 + 16, basealign 32, addrspace 1) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GCN-LABEL: name: load_global_v8i32_non_uniform + ; GCN: 
liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32 + 16, basealign 32, addrspace 1) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from %ir.global.not.uniform.v8i32) ... @@ -135,16 +136,16 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_global_v4i64_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64 + 16, basealign 32, addrspace 1) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) + ; GCN-LABEL: name: load_global_v4i64_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64 + 16, basealign 32, addrspace 1) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from %ir.global.not.uniform.v4i64) ... 
@@ -156,22 +157,22 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_global_v16i32_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 16, basealign 64, addrspace 1) - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 1) - ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 48, basealign 64, addrspace 1) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; GCN-LABEL: name: load_global_v16i32_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 16, basealign 64, addrspace 1) + ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 1) + ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 48, basealign 64, addrspace 1) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from %ir.global.not.uniform.v16i32) ... 
@@ -183,22 +184,22 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_global_v8i64_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 16, basealign 64, addrspace 1) - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 1) - ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 48, basealign 64, addrspace 1) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) + ; GCN-LABEL: name: load_global_v8i64_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 16, basealign 64, addrspace 1) + ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 1) + ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 48, basealign 64, addrspace 1) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from %ir.global.not.uniform.v8i64) ... 
@@ -210,11 +211,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_global_v8i32_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s32>), addrspace 1) + ; GCN-LABEL: name: load_global_v8i32_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s32>), addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (invariant load (<8 x s32>), addrspace 1) ... @@ -226,11 +227,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_global_v4i64_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<4 x s64>), addrspace 1) + ; GCN-LABEL: name: load_global_v4i64_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<4 x s64>), addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (invariant load (<4 x s64>), addrspace 1) ... @@ -242,11 +243,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_global_v16i32_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<16 x s32>), addrspace 1) + ; GCN-LABEL: name: load_global_v16i32_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load (<16 x s32>), addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (invariant load (<16 x s32>), addrspace 1) ... @@ -258,11 +259,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_global_v8i64_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s64>), addrspace 1) + ; GCN-LABEL: name: load_global_v8i64_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load (<8 x s64>), addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (invariant load (<8 x s64>), addrspace 1) ... 
@@ -274,16 +275,16 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v8i32_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32 + 16, basealign 32, addrspace 4) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GCN-LABEL: name: load_constant_v8i32_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32 + 16, basealign 32, addrspace 4) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from %ir.constant.not.uniform.v8i32) ... @@ -295,16 +296,16 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_i256_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load (s128) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4) - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128) + ; GCN-LABEL: name: load_constant_i256_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load (s128) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4) + ; GCN-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s256) = G_LOAD %0 :: (load (s256) from %ir.constant.not.uniform) ... 
@@ -317,16 +318,16 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v16i16_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>) + ; GCN-LABEL: name: load_constant_v16i16_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<16 x s16>) = G_LOAD %0 :: (load (<16 x s16>) from %ir.constant.not.uniform) ... @@ -338,16 +339,16 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v4i64_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64 + 16, basealign 32, addrspace 4) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) + ; GCN-LABEL: name: load_constant_v4i64_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64 + 16, basealign 32, addrspace 4) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from %ir.constant.not.uniform.v4i64) ... 
@@ -359,22 +360,22 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v16i32_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 16, basealign 64, addrspace 4) - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 4) - ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 48, basealign 64, addrspace 4) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; GCN-LABEL: name: load_constant_v16i32_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 16, basealign 64, addrspace 4) + ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 4) + ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 48, basealign 64, addrspace 4) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from %ir.constant.not.uniform.v16i32) ... 
@@ -386,22 +387,22 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v8i64_non_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 16, basealign 64, addrspace 4) - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 4) - ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 48, basealign 64, addrspace 4) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) + ; GCN-LABEL: name: load_constant_v8i64_non_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 16, basealign 64, addrspace 4) + ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 4) + ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 48, basealign 64, addrspace 4) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from %ir.constant.not.uniform.v8i64) ... 
@@ -413,11 +414,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v8i32_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load (<8 x s32>), addrspace 4) + ; GCN-LABEL: name: load_constant_v8i32_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load (<8 x s32>), addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), addrspace 4) ... @@ -429,11 +430,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v16i16_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s16>) = G_LOAD [[COPY]](p4) :: (load (<16 x s16>), addrspace 4) + ; GCN-LABEL: name: load_constant_v16i16_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s16>) = G_LOAD [[COPY]](p4) :: (load (<16 x s16>), addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<16 x s16>) = G_LOAD %0 :: (load (<16 x s16>), addrspace 4) ... @@ -445,11 +446,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v4i64_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load (<4 x s64>), addrspace 4) + ; GCN-LABEL: name: load_constant_v4i64_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load (<4 x s64>), addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>), addrspace 4) ... @@ -461,11 +462,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v16i32_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load (<16 x s32>), addrspace 4) + ; GCN-LABEL: name: load_constant_v16i32_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load (<16 x s32>), addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), addrspace 4) ... 
@@ -477,11 +478,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v8i64_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p4) :: (load (<8 x s64>), addrspace 4) + ; GCN-LABEL: name: load_constant_v8i64_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p4) :: (load (<8 x s64>), addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), addrspace 4) ... @@ -493,12 +494,12 @@ body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: load_local_uniform - ; CHECK: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 3) + ; GCN-LABEL: name: load_local_uniform + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 3) %0:_(p3) = COPY $sgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 3) @@ -510,12 +511,12 @@ body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: load_region_uniform - ; CHECK: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 5) + ; GCN-LABEL: name: load_region_uniform + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 5) %0:_(p3) = COPY $sgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 5) @@ -528,12 +529,12 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extload_constant_i8_to_i32_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4) + ; GCN-LABEL: name: extload_constant_i8_to_i32_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 4, align 1) ... 
@@ -546,12 +547,12 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extload_global_i8_to_i32_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 1) + ; GCN-LABEL: name: extload_global_i8_to_i32_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 1) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 1, align 1) ... @@ -564,12 +565,12 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extload_constant_i16_to_i32_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4) + ; GCN-LABEL: name: extload_constant_i16_to_i32_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 4, align 2) ... @@ -582,12 +583,12 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extload_global_i16_to_i32_uniform - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 1) + ; GCN-LABEL: name: extload_global_i16_to_i32_uniform + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 1) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 1, align 2) ... @@ -599,11 +600,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_i32_uniform_align4 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), addrspace 4) + ; GCN-LABEL: name: load_constant_i32_uniform_align4 + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 4) ... 
@@ -616,12 +617,12 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_i32_uniform_align2 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 2, addrspace 4) + ; GCN-LABEL: name: load_constant_i32_uniform_align2 + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 2, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 2) ... @@ -634,12 +635,12 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_i32_uniform_align1 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 1, addrspace 4) + ; GCN-LABEL: name: load_constant_i32_uniform_align1 + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 1, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 1) ... @@ -652,12 +653,12 @@ body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: load_private_uniform_sgpr_i32 - ; CHECK: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p5) = COPY [[COPY]](p5) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p5) :: (load (s32), addrspace 5) + ; GCN-LABEL: name: load_private_uniform_sgpr_i32 + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p5) = COPY [[COPY]](p5) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p5) :: (load (s32), addrspace 5) %0:_(p5) = COPY $sgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 5, align 4) ... 
@@ -671,15 +672,15 @@ body: | bb.0: liveins: $vgpr0_vgpr1 - ; CHECK-LABEL: name: load_constant_v8i32_vgpr_crash - ; CHECK: liveins: $vgpr0_vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>), align 32, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GCN-LABEL: name: load_constant_v8i32_vgpr_crash + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>), align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), addrspace 4) ... @@ -690,26 +691,26 @@ legalized: true tracksRegLiveness: true body: | - ; CHECK-LABEL: name: load_constant_v8i32_vgpr_crash_loop_phi - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p4) = COPY $sgpr2_sgpr3 - ; CHECK-NEXT: G_BR %bb.1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(p4) = G_PHI [[COPY]](p4), %bb.0, %3(p4), %bb.1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PHI]](p4) :: (load (<4 x s32>), align 32, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PHI]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p4) = COPY [[COPY1]](p4) - ; CHECK-NEXT: G_BR %bb.1 + ; GCN-LABEL: name: load_constant_v8i32_vgpr_crash_loop_phi + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: G_BR %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr(p4) = G_PHI [[COPY]](p4), %bb.0, %3(p4), %bb.1 + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PHI]](p4) :: (load (<4 x s32>), align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PHI]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD 
[[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4) + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr(p4) = COPY [[COPY1]](p4) + ; GCN-NEXT: G_BR %bb.1 bb.0: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 @@ -732,17 +733,24 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v3i32_align4 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), align 4, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4) - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX7-LABEL: name: load_constant_v3i32_align4 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), align 4, addrspace 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX12-LABEL: name: load_constant_v3i32_align4 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 4, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 4) S_ENDPGM 0, implicit %1 @@ -755,17 +763,24 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v3i32_align8 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4) - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX7-LABEL: name: load_constant_v3i32_align8 + 
; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<2 x s32>), addrspace 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX12-LABEL: name: load_constant_v3i32_align8 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 8, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 8) S_ENDPGM 0, implicit %1 @@ -778,14 +793,21 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v3i32_align16 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s32>), addrspace 4) - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX7-LABEL: name: load_constant_v3i32_align16 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s32>), addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>) + ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX12-LABEL: name: load_constant_v3i32_align16 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (invariant load (<3 x s32>), align 16, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<3 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<3 x s32>) = G_LOAD %0 :: (invariant load (<3 x s32>), addrspace 4, align 16) S_ENDPGM 0, implicit %1 @@ -798,18 +820,25 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v6i16_align4 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), align 4, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: 
[[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, addrspace 4) - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) - ; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; GFX7-LABEL: name: load_constant_v6i16_align4 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), align 4, addrspace 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) + ; GFX7-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>) + ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; + ; GFX12-LABEL: name: load_constant_v6i16_align4 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 4, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 4) S_ENDPGM 0, implicit %1 @@ -822,18 +851,25 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v6i16_align8 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, align 8, addrspace 4) - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) - ; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; GFX7-LABEL: name: load_constant_v6i16_align8 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; 
GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<4 x s16>), addrspace 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, align 8, addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) + ; GFX7-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>) + ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; + ; GFX12-LABEL: name: load_constant_v6i16_align8 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 8, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 8) S_ENDPGM 0, implicit %1 @@ -846,14 +882,21 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_v6i16_align16 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<8 x s16>), addrspace 4) - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16), [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16), [[UV6:%[0-9]+]]:sgpr(s16), [[UV7:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; GFX7-LABEL: name: load_constant_v6i16_align16 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<8 x s16>), addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16), [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16), [[UV6:%[0-9]+]]:sgpr(s16), [[UV7:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>) + ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>) + ; + ; GFX12-LABEL: name: load_constant_v6i16_align16 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<6 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<6 x s16>), align 16, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](<6 x s16>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<6 x s16>) = G_LOAD %0 :: (invariant load (<6 x s16>), addrspace 4, align 16) S_ENDPGM 0, implicit %1 @@ -866,17 +909,24 @@ legalized: true body: | bb.0: liveins: 
$sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_i96_align4 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), align 4, addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4) - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64) - ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s96) + ; GFX7-LABEL: name: load_constant_i96_align4 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), align 4, addrspace 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64) + ; GFX7-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[MV]](s96) + ; + ; GFX12-LABEL: name: load_constant_i96_align4 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 4, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 4) S_ENDPGM 0, implicit %1 @@ -889,17 +939,24 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_i96_align8 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), addrspace 4) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4) - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64) - ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s96) + ; GFX7-LABEL: name: load_constant_i96_align8 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s64) = G_LOAD [[COPY]](p4) :: (invariant load (s64), addrspace 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (s32) from unknown-address + 8, align 8, addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = 
G_UNMERGE_VALUES [[LOAD]](s64) + ; GFX7-NEXT: [[MV:%[0-9]+]]:sgpr(s96) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[MV]](s96) + ; + ; GFX12-LABEL: name: load_constant_i96_align8 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 8, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 8) S_ENDPGM 0, implicit %1 @@ -912,13 +969,20 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_i96_align16 - ; CHECK: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sgpr(s128) = G_LOAD [[COPY]](p4) :: (invariant load (s128), addrspace 4) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[LOAD]](s128) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s96) + ; GFX7-LABEL: name: load_constant_i96_align16 + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(s128) = G_LOAD [[COPY]](p4) :: (invariant load (s128), addrspace 4) + ; GFX7-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[LOAD]](s128) + ; GFX7-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s96) + ; + ; GFX12-LABEL: name: load_constant_i96_align16 + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(s96) = G_LOAD [[COPY]](p4) :: (invariant load (s96), align 16, addrspace 4) + ; GFX12-NEXT: S_ENDPGM 0, implicit [[LOAD]](s96) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s96) = G_LOAD %0 :: (invariant load (s96), addrspace 4, align 16) S_ENDPGM 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir index 0ac2dc42b969c..949ed7946a6b1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=amdgpu-regbankselect %s -o - | FileCheck -check-prefix=GFX7 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=amdgpu-regbankselect %s -o - | FileCheck -check-prefix=GFX12 %s --- | @@ -28,17 +29,23 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; SI-LABEL: name: split_smrd_load_range - ; SI: liveins: $sgpr0_sgpr1 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; SI-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4) - ; SI-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 - ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; SI-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, addrspace 4) - ; SI-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = 
G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) - ; SI-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; GFX7-LABEL: name: split_smrd_load_range + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) + ; GFX7-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-LABEL: name: split_smrd_load_range + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (load (<3 x s32>), align 8, !range !0, addrspace 4) + ; GFX12-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[LOAD]](<3 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 8, addrspace 4, !range !0) $sgpr0_sgpr1_sgpr2 = COPY %1 @@ -52,17 +59,23 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; SI-LABEL: name: split_smrd_load_tbaa - ; SI: liveins: $sgpr0_sgpr1 - ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; SI-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), !tbaa !2, addrspace 4) - ; SI-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 - ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) - ; SI-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, !tbaa !2, addrspace 4) - ; SI-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) - ; SI-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; GFX7-LABEL: name: split_smrd_load_tbaa + ; GFX7: liveins: $sgpr0_sgpr1 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), !tbaa !2, addrspace 4) + ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s32) from unknown-address + 8, align 8, !tbaa !2, addrspace 4) + ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[LOAD1]](s32) + ; GFX7-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-LABEL: name: split_smrd_load_tbaa + ; GFX12: liveins: $sgpr0_sgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_LOAD [[COPY]](p4) :: (load (<3 x s32>), align 8, !tbaa !2, addrspace 4) + ; GFX12-NEXT: $sgpr0_sgpr1_sgpr2 = COPY [[LOAD]](<3 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<3 x s32>) = G_LOAD %0 
:: (load (<3 x s32>), align 8, addrspace 4, !tbaa !1) $sgpr0_sgpr1_sgpr2 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll index bfd5dcaa143c1..f5846c3d6db73 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GFX6789,GFX678,GFX67,GFX7 ; RUN: llc -march=amdgcn -mcpu=gfx801 < %s | FileCheck %s -check-prefixes=GFX6789,GFX678,GFX689,GFX89 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX6789,GFX689,GFX89,GFX9 +; RUN: llc -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12 define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i32 inreg %val) { ; GFX67-LABEL: test_sink_smem_offset_400: @@ -28,6 +29,19 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3 ; GFX89-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX89-NEXT: ; %bb.2: ; %end ; GFX89-NEXT: s_endpgm +; +; GFX12-LABEL: test_sink_smem_offset_400: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: .LBB0_1: ; %loop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x190 +; GFX12-NEXT: s_add_co_i32 s2, s2, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB0_1 +; GFX12-NEXT: ; %bb.2: ; %end +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i64 400 br label %loop @@ -81,6 +95,19 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i ; GFX89-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX89-NEXT: ; %bb.2: ; %end ; GFX89-NEXT: s_endpgm +; +; GFX12-LABEL: test_sink_smem_offset_4000: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: .LBB1_1: ; %loop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0xfa0 +; GFX12-NEXT: s_add_co_i32 s2, s2, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX12-NEXT: ; %bb.2: ; %end +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i64 4000 br label %loop @@ -122,6 +149,19 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr ; GFX7-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %end ; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: test_sink_smem_offset_4000000: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: .LBB2_1: ; %loop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x3d0900 +; GFX12-NEXT: s_add_co_i32 s2, s2, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX12-NEXT: ; %bb.2: ; %end +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i64 4000000 br label %loop @@ -137,20 +177,92 @@ end: ret void } +define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %ptr, i32 inreg %val) { +; GFX689-LABEL: test_sink_smem_offset_40000000: +; GFX689: ; %bb.0: ; %entry +; GFX689-NEXT: s_add_u32 s0, s0, 0x2625a00 +; GFX689-NEXT: s_addc_u32 s1, s1, 0 +; GFX689-NEXT: .LBB3_1: ; %loop +; GFX689-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX689-NEXT: s_waitcnt lgkmcnt(0) +; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0 +; 
GFX689-NEXT: s_add_i32 s2, s2, -1 +; GFX689-NEXT: s_cmp_lg_u32 s2, 0 +; GFX689-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX689-NEXT: ; %bb.2: ; %end +; GFX689-NEXT: s_endpgm +; +; GFX7-LABEL: test_sink_smem_offset_40000000: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: .LBB3_1: ; %loop +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[0:1], 0x989680 +; GFX7-NEXT: s_add_i32 s2, s2, -1 +; GFX7-NEXT: s_cmp_lg_u32 s2, 0 +; GFX7-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX7-NEXT: ; %bb.2: ; %end +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: test_sink_smem_offset_40000000: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x2625a00 +; GFX12-NEXT: .LBB3_1: ; %loop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX12-NEXT: s_add_co_i32 s2, s2, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX12-NEXT: ; %bb.2: ; %end +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) %ptr, i64 40000000 + br label %loop + +loop: + %count = phi i32 [ %dec, %loop ], [ %val, %entry ] + %dec = sub i32 %count, 1 + %load = load volatile i32, ptr addrspace(4) %gep + %cond = icmp eq i32 %dec, 0 + br i1 %cond, label %end, label %loop + +end: + ret void +} + define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg %ptr, i32 inreg %val) { ; GFX6789-LABEL: test_sink_smem_offset_40000000000: ; GFX6789: ; %bb.0: ; %entry ; GFX6789-NEXT: s_add_u32 s0, s0, 0x502f9000 ; GFX6789-NEXT: s_addc_u32 s1, s1, 9 -; GFX6789-NEXT: .LBB3_1: ; %loop +; GFX6789-NEXT: .LBB4_1: ; %loop ; GFX6789-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6789-NEXT: s_waitcnt lgkmcnt(0) ; GFX6789-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX6789-NEXT: s_add_i32 s2, s2, -1 ; GFX6789-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6789-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX6789-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX6789-NEXT: ; %bb.2: ; %end ; GFX6789-NEXT: s_endpgm +; +; GFX12-LABEL: test_sink_smem_offset_40000000000: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_mov_b32 s4, 0x502f9000 +; GFX12-NEXT: s_mov_b32 s5, 9 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: .LBB4_1: ; %loop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX12-NEXT: s_add_co_i32 s2, s2, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX12-NEXT: ; %bb.2: ; %end +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i64 40000000000 br label %loop @@ -171,27 +283,40 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr, ; GFX678: ; %bb.0: ; %entry ; GFX678-NEXT: s_add_u32 s0, s0, 0xfffffe70 ; GFX678-NEXT: s_addc_u32 s1, s1, -1 -; GFX678-NEXT: .LBB4_1: ; %loop +; GFX678-NEXT: .LBB5_1: ; %loop ; GFX678-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX678-NEXT: s_add_i32 s2, s2, -1 ; GFX678-NEXT: s_cmp_lg_u32 s2, 0 -; GFX678-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX678-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX678-NEXT: ; %bb.2: ; %end ; GFX678-NEXT: s_endpgm ; ; GFX9-LABEL: test_sink_smem_offset_neg400: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: .LBB4_1: ; %loop +; GFX9-NEXT: .LBB5_1: ; %loop ; GFX9-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190 ; GFX9-NEXT: s_add_i32 s2, s2, -1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %end ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_sink_smem_offset_neg400: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: .LBB5_1: ; %loop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190 +; GFX12-NEXT: s_add_co_i32 s2, s2, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %end +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i64 -400 br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 3c0b8f7712e19..b95231fd8880f 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -4024,14 +4024,12 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) -; GFX12-NEXT: s_add_f32 s3, s4, s5 -; GFX12-NEXT: s_add_f32 s2, s4, s2 +; GFX12-NEXT: s_add_f32 s2, s4, s5 +; GFX12-NEXT: s_add_f32 s3, s4, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX12-NEXT: s_max_num_f32 s2, s3, s2 +; GFX12-NEXT: s_max_num_f32 s2, s2, s3 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] offset:12 ; GFX12-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index f202326b873b1..d630ba946dca3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -85,9 +85,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-LABEL: v_permlane16_b32_vii: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -133,9 +131,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-LABEL: v_permlane16_b32_vll: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -207,9 +203,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -227,9 +221,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -629,9 +621,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src ; ; GFX12-LABEL: v_permlanex16_b32_vii: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -677,9 +667,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src ; ; GFX12-LABEL: v_permlanex16_b32_vll: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -751,9 +739,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -771,9 +757,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index 131a3951b2bf2..f865418befed7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -41,9 +41,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -56,9 +54,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr 
addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -76,9 +72,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -91,9 +85,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -111,9 +103,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -125,9 +115,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -270,9 +258,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -285,9 +271,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -305,9 +289,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -320,9 +302,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -340,9 +320,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -354,9 +332,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 4e65b37633949..818e8eb946395 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -91,9 +91,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT4-LABEL: test_barrier: ; VARIANT4: ; %bb.0: ; %entry -; VARIANT4-NEXT: s_clause 0x1 -; VARIANT4-NEXT: s_load_b32 s2, s[0:1], 0x2c -; VARIANT4-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT4-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 @@ -115,9 +113,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT5-LABEL: test_barrier: ; VARIANT5: ; %bb.0: ; %entry -; VARIANT5-NEXT: s_clause 0x1 -; VARIANT5-NEXT: s_load_b32 s2, s[0:1], 0x2c -; VARIANT5-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT5-NEXT: 
s_waitcnt lgkmcnt(0) ; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2 @@ -139,9 +135,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT6-LABEL: test_barrier: ; VARIANT6: ; %bb.0: ; %entry -; VARIANT6-NEXT: s_clause 0x1 -; VARIANT6-NEXT: s_load_b32 s2, s[0:1], 0x2c -; VARIANT6-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; VARIANT6-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 1ad3e58ce7fc3..220002ce4f6c4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -482,9 +482,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test1_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -501,9 +499,7 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -530,9 +526,7 @@ entry: define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test2_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -549,9 +543,7 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -578,9 +570,7 @@ entry: define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test3_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -597,9 +587,7 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: 
test3_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -820,37 +808,33 @@ entry: define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_barrier_join_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_b32 v3, v1, s[2:3] -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: global_store_b32 v3, v1, s[0:1] ; GCN-NEXT: s_barrier_join m0 -; GCN-NEXT: global_store_b32 v3, v0, s[2:3] +; GCN-NEXT: global_store_b32 v3, v0, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3] -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0 +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_barrier_join m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3] +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm @@ -1062,37 +1046,33 @@ entry: define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_wakeup_barrier_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_b32 v3, v1, s[2:3] -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: global_store_b32 v3, v1, s[0:1] ; GCN-NEXT: s_wakeup_barrier m0 -; GCN-NEXT: global_store_b32 v3, v0, s[2:3] +; GCN-NEXT: global_store_b32 v3, v0, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, 
v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3] -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0 +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] ; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3] +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm @@ -1238,36 +1218,32 @@ entry: define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_get_barrier_state_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_b32 v0, v1, s[2:3] -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_get_barrier_state s0, m0 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] +; GCN-NEXT: s_get_barrier_state s2, m0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_b32 v0, v1, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0 -; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, m0 ; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll index 970c2c1c0456e..ace70aedc33d9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX6789,GFX78910,GFX789,GFX8910,GFX89,GFX910,GFX9 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX678910,GFX78910,GFX8910,GFX910,GFX10 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck 
%s -check-prefixes=GFX12 define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) { ; GFX67-LABEL: s_buffer_load_imm: @@ -30,6 +31,14 @@ define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) { ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_load_imm: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x4 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) %bitcast = bitcast i32 %load to float @@ -61,6 +70,14 @@ define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %ind ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_load_index: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0) %bitcast = bitcast i32 %load to float @@ -82,6 +99,13 @@ define amdgpu_ps void @s_buffer_load_index_divergent(<4 x i32> inreg %desc, i32 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_load_index_divergent: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0) %bitcast = bitcast i32 %load to float @@ -116,6 +140,15 @@ define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) { ; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx2_imm: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x40 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0) %bitcast = bitcast <2 x i32> %load to <2 x float> @@ -152,6 +185,15 @@ define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %i ; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx2_index: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0) %bitcast = bitcast <2 x i32> %load to <2 x float> @@ -175,6 +217,13 @@ define amdgpu_ps void @s_buffer_loadx2_index_divergent(<4 x i32> inreg %desc, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx2_index_divergent: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen +; 
GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0) %bitcast = bitcast <2 x i32> %load to <2 x float> @@ -214,6 +263,16 @@ define amdgpu_ps void @s_buffer_loadx3_imm(<4 x i32> inreg %desc) { ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx3_imm: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b96 s[0:2], s[0:3], 0x40 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: export mrt0 v0, v1, v2, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 64, i32 0) %bitcast = bitcast <3 x i32> %load to <3 x float> @@ -254,6 +313,16 @@ define amdgpu_ps void @s_buffer_loadx3_index(<4 x i32> inreg %desc, i32 inreg %i ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx3_index: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b96 s[0:2], s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: export mrt0 v0, v1, v2, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0) %bitcast = bitcast <3 x i32> %load to <3 x float> @@ -285,6 +354,13 @@ define amdgpu_ps void @s_buffer_loadx3_index_divergent(<4 x i32> inreg %desc, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: exp mrt0 v0, v1, v2, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx3_index_divergent: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: buffer_load_b96 v[0:2], v0, s[0:3], null offen +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: export mrt0 v0, v1, v2, v0 done +; GFX12-NEXT: s_endpgm main_body: %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0) %bitcast = bitcast <3 x i32> %load to <3 x float> @@ -328,6 +404,17 @@ define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) { ; GFX11-NEXT: v_mov_b32_e32 v3, s3 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx4_imm: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0xc8 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done +; GFX12-NEXT: s_endpgm main_body: %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0) %bitcast = bitcast <4 x i32> %load to <4 x float> @@ -372,6 +459,17 @@ define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %i ; GFX11-NEXT: v_mov_b32_e32 v3, s3 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx4_index: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done +; GFX12-NEXT: 
s_endpgm main_body: %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0) %bitcast = bitcast <4 x i32> %load to <4 x float> @@ -397,6 +495,13 @@ define amdgpu_ps void @s_buffer_loadx4_index_divergent(<4 x i32> inreg %desc, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_loadx4_index_divergent: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done +; GFX12-NEXT: s_endpgm main_body: %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0) %bitcast = bitcast <4 x i32> %load to <4 x float> @@ -435,6 +540,15 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) { ; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_load_imm_mergex2: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x4 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done +; GFX12-NEXT: s_endpgm main_body: %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0) %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) @@ -477,6 +591,17 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) { ; GFX11-NEXT: v_mov_b32_e32 v3, s3 ; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_load_imm_mergex4: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0x8 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done +; GFX12-NEXT: s_endpgm main_body: %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0) %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0) @@ -594,6 +719,24 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_load_index_across_bb: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_add_co_u32 s4, s4, gv@gotpcrel32@lo+4 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+12 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_or_b32_e32 v0, 8, v0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm main_body: %tmp = shl i32 %index, 4 store i32 %tmp, ptr addrspace(1) @gv @@ -623,6 +766,14 @@ define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %des ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: exp mrt0 v0, v1, v0, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_buffer_load_index_across_bb_merged: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, 
s[0:3], 0/*Invalid immediate*/ offen offset:8 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: export mrt0 v0, v1, v0, v0 done +; GFX12-NEXT: s_endpgm main_body: %tmp = shl i32 %index, 4 br label %bb1 @@ -667,6 +818,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg1(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_neg1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0) ret i32 %load } @@ -706,6 +864,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg4(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_neg4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, -4 +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0) ret i32 %load } @@ -745,6 +910,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg8(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_neg8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, -8 +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0) ret i32 %load } @@ -784,6 +956,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit31(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_bit31: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_brev_b32 s4, 1 +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0) ret i32 %load } @@ -823,6 +1002,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit30(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_bit30: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, 2.0 +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 0) ret i32 %load } @@ -862,6 +1048,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit29(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_bit29: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_brev_b32 s4, 4 +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> 
%desc, i32 536870912, i32 0) ret i32 %load } @@ -901,6 +1094,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit21(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_bit21: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x200000 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0) ret i32 %load } @@ -940,6 +1139,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit20(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_bit20: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100000 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0) ret i32 %load } @@ -979,6 +1184,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg_bit20(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_neg_bit20: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, 0xfff00000 +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0) ret i32 %load } @@ -1009,6 +1221,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit19(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_bit19: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x80000 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0) ret i32 %load } @@ -1048,6 +1266,13 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg_bit19(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_neg_bit19: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, 0xfff80000 +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0) ret i32 %load } @@ -1079,6 +1304,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_255(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0xff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_255: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0xff +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 255, i32 0) ret i32 %load } @@ -1101,6 +1332,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_256(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: 
s_buffer_load_imm_256: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x100 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 256, i32 0) ret i32 %load } @@ -1123,6 +1360,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1016(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3f8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_1016: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3f8 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1016, i32 0) ret i32 %load } @@ -1145,6 +1388,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1020(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fc ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_1020: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fc +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1020, i32 0) ret i32 %load } @@ -1176,6 +1425,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1021(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fd ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_1021: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x3fd +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1021, i32 0) ret i32 %load } @@ -1206,6 +1461,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1024(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_1024: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0) ret i32 %load } @@ -1237,6 +1498,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1025(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x401 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_1025: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x401 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1025, i32 0) ret i32 %load } @@ -1267,6 +1534,12 @@ define amdgpu_ps i32 @s_buffer_load_imm_1028(<4 x i32> inreg %desc) { ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_buffer_load_imm_1028: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], 0x400 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: ; return to shader part epilog %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0) ret i32 %load } diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll index b5f38c641da74..d299e760b8774 100644 --- 
a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s ; Tests whether a load chain of 8 constants gets vectorized into a wider load. define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { @@ -53,6 +54,31 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur ; EG-NEXT: ADD T0.X, T1.W, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v8f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x0 +; GFX12-NEXT: s_load_b256 s[0:7], s[8:9], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_f32 s0, s0, s12 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_add_f32 s0, s1, s0 +; GFX12-NEXT: s_add_f32 s0, s2, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_add_f32 s0, s3, s0 +; GFX12-NEXT: s_add_f32 s0, s4, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_add_f32 s0, s5, s0 +; GFX12-NEXT: s_add_f32 s0, s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_add_f32 s0, s7, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[10:11] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %out_ptr.promoted = load float, ptr addrspace(1) %out_ptr, align 4 %tmp = load float, ptr addrspace(4) %weights, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 09ef53f330308..d00044c6ac1ab 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6-NOHSA %s ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s ; FUNC-LABEL: {{^}}constant_load_f64: define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { @@ -43,6 +44,19 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load double, ptr addrspace(4) %in store double %ld, ptr addrspace(1) %out ret void @@ -119,6 +133,31 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[14:15], v[0:1] ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NOHSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_2v4f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[20:21], s[18:19], 0x0 +; GFX12-NEXT: s_load_b512 s[0:15], s[16:17], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[20:21] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[2:3], v[0:1] +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[4:5], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[6:7], v[0:1] +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[8:9], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[10:11], v[0:1] +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[12:13], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[14:15], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[18:19] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %out_ptr.promoted = load double, ptr addrspace(1) %out_ptr, align 4 %tmp = load double, ptr addrspace(4) %weights, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index d100cadb8ee57..4ed4034a0348f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_i1: @@ -61,6 +62,19 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load i1, ptr addrspace(4) %in store i1 %load, ptr addrspace(1) %out ret void @@ -122,6 +136,18 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v2i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in store <2 x i1> %load, ptr addrspace(1) %out ret void @@ -182,6 +208,18 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v3i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in store <3 x i1> %load, ptr addrspace(1) %out ret void @@ -243,6 +281,18 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v4i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in store <4 x i1> %load, ptr addrspace(1) %out ret void @@ -304,6 +354,18 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v8i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in store <8 x i1> %load, ptr addrspace(1) %out ret void @@ -365,6 +427,18 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v16i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in store <16 x i1> %load, ptr addrspace(1) %out ret void @@ -410,6 +484,18 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, 
KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v32i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in store <32 x i1> %load, ptr addrspace(1) %out ret void @@ -457,6 +543,19 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v64i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in store <64 x i1> %load, ptr addrspace(1) %out ret void @@ -508,6 +607,18 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_i1_to_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i1, ptr addrspace(4) %in %ext = zext i1 %a to i32 store i32 %ext, ptr addrspace(1) %out @@ -563,6 +674,19 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_i1_to_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i1, ptr addrspace(4) %in %ext = sext i1 %a to i32 store i32 %ext, ptr addrspace(1) %out @@ -615,6 +739,18 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v1i1_to_v1i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load 
= load <1 x i1>, ptr addrspace(4) %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(1) %out @@ -670,6 +806,19 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v1i1_to_v1i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i1>, ptr addrspace(4) %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(1) %out @@ -729,6 +878,23 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; EG-NEXT: AND_INT T0.X, T0.X, 1, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v2i1_to_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(1) %out @@ -789,6 +955,22 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: BFE_INT * T1.Y, PV.W, 0.0, 1, +; +; GFX12-LABEL: constant_sextload_v2i1_to_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(1) %out @@ -858,6 +1040,27 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: LSHR * T3.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v3i1_to_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, ptr addrspace(1) %out @@ -928,6 +1131,24 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; EG-NEXT: LSHR T0.X, PS, literal.x, ; EG-NEXT: BFE_INT * T3.Y, PV.W, 0.0, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v3i1_to_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v1, v4, 0, 1 +; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, ptr addrspace(1) %out @@ -999,6 +1220,31 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; EG-NEXT: AND_INT T0.X, T0.X, 1, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v4i1_to_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v3, 3, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX12-NEXT: v_and_b32_e32 v5, 1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(1) %out @@ -1071,6 +1317,27 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, ; EG-NEXT: BFE_INT * T1.Y, PV.W, 0.0, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v4i1_to_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] 
+; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 +; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_bfe_i32 v3, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_bfe_i32 v1, v5, 0, 1 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(1) %out @@ -1170,6 +1437,38 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; EG-NEXT: 4(5.605194e-45), 16(2.242078e-44) ; EG-NEXT: LSHR * T8.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v8i1_to_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v2, 5, v0 +; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v6, 3, v0 +; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0 +; GFX12-NEXT: v_lshrrev_b16 v3, 6, v0 +; GFX12-NEXT: v_and_b32_e32 v9, 1, v2 +; GFX12-NEXT: v_lshrrev_b16 v4, 7, v0 +; GFX12-NEXT: v_lshrrev_b16 v7, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v10, 1, v5 +; GFX12-NEXT: v_and_b32_e32 v5, 1, v6 +; GFX12-NEXT: v_and_b32_e32 v6, 1, v3 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v7 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 1, v1 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(1) %out @@ -1273,6 +1572,35 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; EG-NEXT: LSHR T5.X, PS, literal.x, ; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v8i1_to_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0 +; GFX12-NEXT: v_lshrrev_b16 v5, 6, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 3, v0 +; GFX12-NEXT: v_lshrrev_b16 v6, 2, v0 +; GFX12-NEXT: v_lshrrev_b16 v7, 7, v0 +; GFX12-NEXT: v_lshrrev_b16 v9, 1, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GFX12-NEXT: v_bfe_i32 v2, v6, 0, 1 +; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 1 +; GFX12-NEXT: v_bfe_i32 v6, v5, 0, 1 +; GFX12-NEXT: v_bfe_i32 v5, v4, 0, 1 +; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v1, v9, 0, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(1) %out @@ -1433,6 +1761,60 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; EG-NEXT: 12(1.681558e-44), 48(6.726233e-44) ; EG-NEXT: LSHR * T14.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v16i1_to_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v16, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v2, 13, v0 +; GFX12-NEXT: v_lshrrev_b16 v13, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v15, 3, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 9, v0 +; GFX12-NEXT: v_lshrrev_b16 v6, 11, v0 +; GFX12-NEXT: v_and_b32_e32 v17, 1, v2 +; GFX12-NEXT: v_lshrrev_b16 v10, 5, v0 +; GFX12-NEXT: v_lshrrev_b16 v12, 7, v0 +; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0 +; GFX12-NEXT: v_lshrrev_b16 v3, 14, v0 +; GFX12-NEXT: v_lshrrev_b16 v5, 15, v0 +; GFX12-NEXT: v_lshrrev_b16 v14, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v22, 1, v13 +; GFX12-NEXT: v_and_b32_e32 v13, 1, v15 +; GFX12-NEXT: v_lshrrev_b16 v7, 8, v0 +; GFX12-NEXT: v_lshrrev_b16 v8, 10, v0 +; GFX12-NEXT: v_lshrrev_b16 v9, 4, v0 +; GFX12-NEXT: v_lshrrev_b16 v11, 6, v0 +; GFX12-NEXT: v_and_b32_e32 v18, 1, v4 +; GFX12-NEXT: v_and_b32_e32 v19, 1, v6 +; GFX12-NEXT: v_and_b32_e32 v20, 1, v10 +; GFX12-NEXT: v_and_b32_e32 v21, 1, v12 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v14 +; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v5 +; GFX12-NEXT: v_and_b32_e32 v14, 1, v3 +; GFX12-NEXT: v_and_b32_e32 v12, 1, v1 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v17 +; GFX12-NEXT: v_and_b32_e32 v6, 1, v11 +; GFX12-NEXT: v_and_b32_e32 v4, 1, v9 +; GFX12-NEXT: v_and_b32_e32 v10, 1, v8 +; GFX12-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v19 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v21 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(1) %out @@ -1602,6 +1984,53 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; EG-NEXT: LSHR T7.X, PS, literal.x, ; EG-NEXT: BFE_INT * T13.Y, PV.W, 0.0, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v16i1_to_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v16, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 13, v0 +; GFX12-NEXT: v_lshrrev_b16 v8, 14, v0 +; GFX12-NEXT: v_lshrrev_b16 v12, 15, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 3, v0 +; GFX12-NEXT: v_lshrrev_b16 v7, 2, v0 +; GFX12-NEXT: v_lshrrev_b16 v13, 7, v0 
+; GFX12-NEXT: v_lshrrev_b16 v17, 8, v0 +; GFX12-NEXT: v_lshrrev_b16 v9, 9, v0 +; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0 +; GFX12-NEXT: v_lshrrev_b16 v11, 11, v0 +; GFX12-NEXT: v_lshrrev_b16 v18, 4, v0 +; GFX12-NEXT: v_lshrrev_b16 v5, 5, v0 +; GFX12-NEXT: v_lshrrev_b16 v6, 6, v0 +; GFX12-NEXT: v_lshrrev_b16 v19, 1, v0 +; GFX12-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GFX12-NEXT: v_bfe_i32 v2, v7, 0, 1 +; GFX12-NEXT: v_bfe_i32 v7, v13, 0, 1 +; GFX12-NEXT: v_bfe_i32 v15, v12, 0, 1 +; GFX12-NEXT: v_bfe_i32 v14, v8, 0, 1 +; GFX12-NEXT: v_bfe_i32 v13, v4, 0, 1 +; GFX12-NEXT: v_bfe_i32 v12, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1 +; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1 +; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1 +; GFX12-NEXT: v_bfe_i32 v8, v17, 0, 1 +; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1 +; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1 +; GFX12-NEXT: v_bfe_i32 v4, v18, 0, 1 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: v_bfe_i32 v1, v19, 0, 1 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(1) %out @@ -1914,6 +2343,92 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; EG-NEXT: 28(3.923636e-44), 112(1.569454e-43) ; EG-NEXT: LSHR * T26.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v32i1_to_v32i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s3, s2, 24 +; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2 +; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2 +; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2 +; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2 +; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2 +; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2 +; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3 +; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3 +; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3 +; GFX12-NEXT: v_lshrrev_b16 v10, 1, s2 +; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v33, 1, v1 +; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 +; GFX12-NEXT: v_lshrrev_b16 v12, 14, s2 +; GFX12-NEXT: v_lshrrev_b16 v20, 15, s2 +; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v3, 10, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 4, s2 +; GFX12-NEXT: v_lshrrev_b16 v7, 6, s2 +; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 +; GFX12-NEXT: s_and_b32 s5, s2, 1 +; GFX12-NEXT: v_lshrrev_b16 v15, 4, s3 +; GFX12-NEXT: v_lshrrev_b16 v16, 6, s3 +; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3 +; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3 +; GFX12-NEXT: v_and_b32_e32 v25, 1, v14 +; GFX12-NEXT: v_and_b32_e32 v26, 1, v18 +; GFX12-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013 +; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13 +; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011 +; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010 +; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017 +; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6 +; GFX12-NEXT: s_bfe_u32 
s10, s2, 0x10016 +; GFX12-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10014 +; GFX12-NEXT: v_and_b32_e32 v23, 1, v4 +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015 +; GFX12-NEXT: v_and_b32_e32 v22, 1, v2 +; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10 +; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11 +; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7 +; GFX12-NEXT: v_and_b32_e32 v4, 1, v5 +; GFX12-NEXT: v_and_b32_e32 v10, 1, v3 +; GFX12-NEXT: v_and_b32_e32 v14, 1, v19 +; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX12-NEXT: v_and_b32_e32 v18, 1, v16 +; GFX12-NEXT: v_and_b32_e32 v16, 1, v15 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_and_b32 v13, 0xffff, v26 +; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v23 +; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_and_b32 v23, 0xffff, v20 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; GFX12-NEXT: v_and_b32_e32 v20, 1, v0 +; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_and_b32 v17, 0xffff, v25 +; GFX12-NEXT: v_mov_b32_e32 v25, s2 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; GFX12-NEXT: v_and_b32_e32 v22, 1, v12 +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_and_b32 v15, 0xffff, v21 +; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v33 +; GFX12-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:96 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(1) %out @@ -2249,6 +2764,89 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; EG-NEXT: LSHR T11.X, PS, literal.x, ; EG-NEXT: BFE_INT * T25.Y, PV.W, 0.0, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v32i1_to_v32i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 +; GFX12-NEXT: v_lshrrev_b16 v12, 13, s2 +; GFX12-NEXT: v_lshrrev_b16 v13, 14, s2 +; GFX12-NEXT: v_lshrrev_b16 v14, 15, s2 +; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v9, 9, s2 +; GFX12-NEXT: v_lshrrev_b16 v10, 10, s2 +; GFX12-NEXT: v_lshrrev_b16 v11, 11, s2 +; GFX12-NEXT: s_lshr_b32 s3, s2, 24 +; GFX12-NEXT: v_lshrrev_b16 v4, 4, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2 +; GFX12-NEXT: v_lshrrev_b16 v6, 6, s2 +; GFX12-NEXT: v_lshrrev_b16 v7, 7, s2 +; GFX12-NEXT: v_lshrrev_b16 v1, 1, s2 +; GFX12-NEXT: v_lshrrev_b16 v2, 2, s2 +; GFX12-NEXT: v_lshrrev_b16 v3, 3, s2 +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018 +; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000 +; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013 +; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012 +; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3 +; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3 +; GFX12-NEXT: 
v_lshrrev_b16 v21, 6, s3 +; GFX12-NEXT: v_lshrrev_b16 v22, 7, s3 +; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3 +; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3 +; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10011 +; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10010 +; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10017 +; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016 +; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2 +; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1 +; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9 +; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s3 +; GFX12-NEXT: v_bfe_i32 v14, v13, 0, 1 +; GFX12-NEXT: v_bfe_i32 v13, v12, 0, 1 +; GFX12-NEXT: v_bfe_i32 v12, v0, 0, 1 +; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1 +; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1 +; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1 +; GFX12-NEXT: v_bfe_i32 v8, v8, 0, 1 +; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v31, s6 +; GFX12-NEXT: v_mov_b32_e32 v30, s7 +; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 1 +; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1 +; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1 +; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1 +; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1 +; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_bfe_i32 v23, v22, 0, 1 +; GFX12-NEXT: v_bfe_i32 v22, v21, 0, 1 +; GFX12-NEXT: v_bfe_i32 v21, v20, 0, 1 +; GFX12-NEXT: v_bfe_i32 v20, v16, 0, 1 +; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1 +; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1 +; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v16, s4 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(1) %out @@ -2840,6 +3438,173 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: 28(3.923636e-44), 240(3.363116e-43) ; EG-NEXT: LSHR * T50.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v64i1_to_v64i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v2, 13, s2 +; GFX12-NEXT: s_lshr_b32 s4, s3, 24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: v_lshrrev_b16 v10, 13, s3 +; GFX12-NEXT: v_lshrrev_b16 v3, 9, s2 +; GFX12-NEXT: v_and_b32_e32 v45, 1, v2 +; GFX12-NEXT: v_lshrrev_b16 v2, 1, s4 +; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2 +; GFX12-NEXT: v_lshrrev_b16 v7, 1, s2 +; GFX12-NEXT: v_lshrrev_b16 v8, 3, s2 +; GFX12-NEXT: v_lshrrev_b16 v12, 11, s3 +; GFX12-NEXT: v_lshrrev_b16 v14, 7, s3 +; GFX12-NEXT: v_lshrrev_b16 v18, 5, s4 +; GFX12-NEXT: s_lshr_b32 
s5, s2, 24 +; GFX12-NEXT: s_and_b32 s6, s3, 1 +; GFX12-NEXT: s_bfe_u32 s14, s3, 0x10012 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v55, s14 :: v_dual_and_b32 v36, 1, v10 +; GFX12-NEXT: v_and_b32_e32 v10, 1, v2 +; GFX12-NEXT: v_lshrrev_b16 v2, 3, s4 +; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2 +; GFX12-NEXT: v_lshrrev_b16 v11, 9, s3 +; GFX12-NEXT: v_lshrrev_b16 v13, 5, s3 +; GFX12-NEXT: v_lshrrev_b16 v15, 1, s3 +; GFX12-NEXT: v_lshrrev_b16 v16, 3, s3 +; GFX12-NEXT: v_and_b32_e32 v43, 1, v4 +; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5 +; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10014 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v49, s19 :: v_dual_and_b32 v42, 1, v3 +; GFX12-NEXT: v_lshrrev_b16 v3, 5, s5 +; GFX12-NEXT: s_bfe_u32 s13, s3, 0x10013 +; GFX12-NEXT: v_lshrrev_b16 v29, 12, s3 +; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3 +; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3 +; GFX12-NEXT: v_lshrrev_b16 v25, 8, s3 +; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3 +; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3 +; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3 +; GFX12-NEXT: v_dual_mov_b32 v56, s13 :: v_dual_and_b32 v27, 1, v12 +; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3 +; GFX12-NEXT: v_and_b32_e32 v12, 1, v2 +; GFX12-NEXT: v_lshrrev_b16 v2, 1, s5 +; GFX12-NEXT: s_and_b32 s7, s2, 1 +; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10011 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v54, s15 :: v_dual_and_b32 v35, 1, v8 +; GFX12-NEXT: v_lshrrev_b16 v8, 7, s5 +; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10010 +; GFX12-NEXT: v_dual_mov_b32 v53, s16 :: v_dual_and_b32 v40, 1, v7 +; GFX12-NEXT: v_lshrrev_b16 v7, 2, s5 +; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10017 +; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10016 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v51, s18 :: v_dual_and_b32 v44, 1, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 4, s5 +; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10015 +; GFX12-NEXT: v_and_b32_e32 v23, 1, v14 +; GFX12-NEXT: v_and_b32_e32 v14, 1, v18 +; GFX12-NEXT: v_lshrrev_b16 v18, 6, s5 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 +; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015 +; GFX12-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_and_b32 v39, 1, v6 +; GFX12-NEXT: v_and_b32_e32 v32, 1, v11 +; GFX12-NEXT: v_lshrrev_b16 v11, 2, s4 +; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012 +; GFX12-NEXT: v_and_b32_e32 v20, 1, v16 +; GFX12-NEXT: v_lshrrev_b16 v16, 7, s4 +; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010 +; GFX12-NEXT: v_and_b32_e32 v24, 1, v15 +; GFX12-NEXT: v_lshrrev_b16 v15, 6, s4 +; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017 +; GFX12-NEXT: v_mov_b32_e32 v50, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016 +; GFX12-NEXT: v_and_b32_e32 v28, 1, v13 +; GFX12-NEXT: v_lshrrev_b16 v13, 4, s4 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 +; GFX12-NEXT: v_and_b32_e32 v6, 1, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 1, v4 +; GFX12-NEXT: v_lshrrev_b16 v17, 15, s2 +; GFX12-NEXT: v_lshrrev_b16 v1, 12, s2 +; GFX12-NEXT: v_lshrrev_b16 v9, 14, s2 +; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v41, 10, s2 +; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2 +; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2 +; GFX12-NEXT: v_lshrrev_b16 v37, 4, s2 +; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013 +; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011 +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10014 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX12-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] 
offset:208 +; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:192 +; GFX12-NEXT: v_mov_b32_e32 v52, s12 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX12-NEXT: v_dual_mov_b32 v54, s10 :: v_dual_and_b32 v3, 1, v7 +; GFX12-NEXT: v_dual_mov_b32 v56, s8 :: v_dual_and_b32 v7, 1, v18 +; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s13 +; GFX12-NEXT: v_mov_b32_e32 v51, s3 +; GFX12-NEXT: v_dual_mov_b32 v53, s11 :: v_dual_and_b32 v18, 0xffff, v24 +; GFX12-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX12-NEXT: v_and_b32_e32 v23, 1, v22 +; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v28 +; GFX12-NEXT: v_and_b32_e32 v28, 0xffff, v27 +; GFX12-NEXT: v_and_b32_e32 v27, 1, v26 +; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v32 +; GFX12-NEXT: v_and_b32_e32 v32, 0xffff, v31 +; GFX12-NEXT: v_and_b32_e32 v31, 1, v30 +; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v36 +; GFX12-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX12-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX12-NEXT: v_dual_mov_b32 v55, s9 :: v_dual_and_b32 v48, 0xffff, v17 +; GFX12-NEXT: v_dual_mov_b32 v17, s6 :: v_dual_and_b32 v20, 0xffff, v20 +; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX12-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX12-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v35 +; GFX12-NEXT: v_and_b32_e32 v35, 1, v34 +; GFX12-NEXT: v_and_b32_e32 v34, 0xffff, v40 +; GFX12-NEXT: v_and_b32_e32 v40, 0xffff, v39 +; GFX12-NEXT: v_and_b32_e32 v39, 1, v38 +; GFX12-NEXT: v_and_b32_e32 v38, 0xffff, v44 +; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v43 +; GFX12-NEXT: v_and_b32_e32 v43, 1, v41 +; GFX12-NEXT: v_and_b32_e32 v47, 1, v9 +; GFX12-NEXT: v_and_b32_e32 v46, 0xffff, v45 +; GFX12-NEXT: v_and_b32_e32 v45, 1, v1 +; GFX12-NEXT: v_and_b32_e32 v41, 1, v33 +; GFX12-NEXT: v_dual_mov_b32 v33, s7 :: v_dual_and_b32 v14, 0xffff, v14 +; GFX12-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_and_b32 v42, 0xffff, v42 +; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX12-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX12-NEXT: v_and_b32_e32 v37, 1, v37 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v6, 0xffff, v6 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: s_clause 0xd +; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1] +; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:176 +; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:160 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:144 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:128 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:96 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(1) %out @@ -3473,6 +4238,160 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: LSHR T50.X, PS, literal.x, ; 
EG-NEXT: BFE_INT * T19.Y, PV.W, 0.0, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v64i1_to_v64i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s5, s2, 24 +; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3 +; GFX12-NEXT: v_lshrrev_b16 v29, 13, s3 +; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3 +; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3 +; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3 +; GFX12-NEXT: v_lshrrev_b16 v25, 9, s3 +; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3 +; GFX12-NEXT: v_lshrrev_b16 v27, 11, s3 +; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3 +; GFX12-NEXT: v_lshrrev_b16 v21, 5, s3 +; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3 +; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3 +; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3 +; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3 +; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3 +; GFX12-NEXT: s_lshr_b32 s4, s3, 24 +; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5 +; GFX12-NEXT: v_lshrrev_b16 v5, 5, s5 +; GFX12-NEXT: v_lshrrev_b16 v6, 6, s5 +; GFX12-NEXT: v_lshrrev_b16 v1, 3, s5 +; GFX12-NEXT: v_lshrrev_b16 v2, 2, s5 +; GFX12-NEXT: v_lshrrev_b16 v7, 1, s5 +; GFX12-NEXT: v_lshrrev_b16 v44, 7, s5 +; GFX12-NEXT: s_bfe_i32 s5, s3, 0x10018 +; GFX12-NEXT: s_bfe_i32 s6, s3, 0x10000 +; GFX12-NEXT: s_bfe_i32 s13, s3, 0x10013 +; GFX12-NEXT: s_bfe_i32 s14, s3, 0x10012 +; GFX12-NEXT: s_bfe_i32 s15, s3, 0x10011 +; GFX12-NEXT: s_bfe_i32 s16, s3, 0x10010 +; GFX12-NEXT: s_bfe_i32 s17, s3, 0x10017 +; GFX12-NEXT: s_bfe_i32 s18, s3, 0x10016 +; GFX12-NEXT: s_bfe_i32 s19, s3, 0x10014 +; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v56, 0 :: v_dual_mov_b32 v49, s3 +; GFX12-NEXT: v_dual_mov_b32 v48, s19 :: v_dual_mov_b32 v51, s17 +; GFX12-NEXT: v_dual_mov_b32 v50, s18 :: v_dual_mov_b32 v53, s15 +; GFX12-NEXT: v_lshrrev_b16 v16, 14, s2 +; GFX12-NEXT: v_dual_mov_b32 v52, s16 :: v_dual_mov_b32 v55, s13 +; GFX12-NEXT: s_bfe_i32 s13, s2, 0x10015 +; GFX12-NEXT: v_mov_b32_e32 v54, s14 +; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 +; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2 +; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2 +; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4 +; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4 +; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4 +; GFX12-NEXT: v_lshrrev_b16 v15, 7, s4 +; GFX12-NEXT: v_lshrrev_b16 v40, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v41, 9, s2 +; GFX12-NEXT: v_lshrrev_b16 v42, 10, s2 +; GFX12-NEXT: v_lshrrev_b16 v43, 11, s2 +; GFX12-NEXT: v_lshrrev_b16 v36, 4, s2 +; GFX12-NEXT: v_lshrrev_b16 v37, 5, s2 +; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2 +; GFX12-NEXT: v_lshrrev_b16 v39, 7, s2 +; GFX12-NEXT: v_lshrrev_b16 v33, 1, s2 +; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2 +; GFX12-NEXT: v_lshrrev_b16 v35, 3, s2 +; GFX12-NEXT: v_lshrrev_b16 v9, 1, s4 +; GFX12-NEXT: v_lshrrev_b16 v10, 2, s4 +; GFX12-NEXT: v_lshrrev_b16 v11, 3, s4 +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018 +; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10000 +; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10013 +; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10012 +; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10011 +; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10010 +; GFX12-NEXT: s_bfe_i32 s12, s2, 0x10017 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10016 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10014 +; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1 +; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1 +; GFX12-NEXT: v_bfe_i32 v21, v21, 0, 1 +; GFX12-NEXT: v_bfe_i32 v20, v20, 0, 1 +; GFX12-NEXT: v_bfe_i32 v31, v31, 0, 1 +; GFX12-NEXT: 
v_bfe_i32 v30, v30, 0, 1 +; GFX12-NEXT: v_bfe_i32 v29, v29, 0, 1 +; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v49, s13 :: v_dual_mov_b32 v48, s2 +; GFX12-NEXT: v_dual_mov_b32 v51, s12 :: v_dual_mov_b32 v50, s3 +; GFX12-NEXT: v_mov_b32_e32 v53, s10 +; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1 +; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1 +; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1 +; GFX12-NEXT: v_bfe_i32 v27, v27, 0, 1 +; GFX12-NEXT: v_bfe_i32 v26, v26, 0, 1 +; GFX12-NEXT: v_bfe_i32 v25, v25, 0, 1 +; GFX12-NEXT: v_bfe_i32 v24, v24, 0, 1 +; GFX12-NEXT: v_bfe_i32 v46, v16, 0, 1 +; GFX12-NEXT: v_dual_mov_b32 v52, s11 :: v_dual_mov_b32 v55, s8 +; GFX12-NEXT: v_mov_b32_e32 v54, s9 +; GFX12-NEXT: v_mov_b32_e32 v16, s6 +; GFX12-NEXT: v_bfe_i32 v3, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v1, v7, 0, 1 +; GFX12-NEXT: v_bfe_i32 v7, v44, 0, 1 +; GFX12-NEXT: v_bfe_i32 v15, v15, 0, 1 +; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1 +; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 1 +; GFX12-NEXT: v_bfe_i32 v12, v12, 0, 1 +; GFX12-NEXT: v_bfe_i32 v47, v32, 0, 1 +; GFX12-NEXT: v_bfe_i32 v45, v8, 0, 1 +; GFX12-NEXT: v_bfe_i32 v44, v0, 0, 1 +; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1 +; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1 +; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1 +; GFX12-NEXT: v_bfe_i32 v43, v43, 0, 1 +; GFX12-NEXT: v_bfe_i32 v42, v42, 0, 1 +; GFX12-NEXT: v_bfe_i32 v41, v41, 0, 1 +; GFX12-NEXT: v_bfe_i32 v40, v40, 0, 1 +; GFX12-NEXT: v_mov_b32_e32 v8, s5 +; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1 +; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1 +; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1 +; GFX12-NEXT: v_bfe_i32 v39, v39, 0, 1 +; GFX12-NEXT: v_bfe_i32 v38, v38, 0, 1 +; GFX12-NEXT: v_bfe_i32 v37, v37, 0, 1 +; GFX12-NEXT: v_bfe_i32 v36, v36, 0, 1 +; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX12-NEXT: v_bfe_i32 v35, v35, 0, 1 +; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1 +; GFX12-NEXT: v_bfe_i32 v33, v33, 0, 1 +; GFX12-NEXT: v_mov_b32_e32 v32, s7 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v56, v[44:47], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v56, v[40:43], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v56, v[36:39], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v56, v[32:35], s[0:1] +; GFX12-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:176 +; GFX12-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:160 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:144 +; GFX12-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:128 +; GFX12-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(1) %out @@ -3530,6 +4449,19 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; EG-NEXT: MOV * T0.Y, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: 
constant_zextload_i1_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i1, ptr addrspace(4) %in %ext = zext i1 %a to i64 store i64 %ext, ptr addrspace(1) %out @@ -3588,6 +4520,21 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV * T0.Y, PV.X, +; +; GFX12-LABEL: constant_sextload_i1_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i1, ptr addrspace(4) %in %ext = sext i1 %a to i64 store i64 %ext, ptr addrspace(1) %out @@ -3645,6 +4592,19 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; EG-NEXT: MOV * T0.Y, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v1i1_to_v1i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i1>, ptr addrspace(4) %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -3703,6 +4663,21 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV * T0.Y, PV.X, +; +; GFX12-LABEL: constant_sextload_v1i1_to_v1i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i1>, ptr addrspace(4) %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -3768,6 +4743,23 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; EG-NEXT: MOV T0.W, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v2i1_to_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v1, 
s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -3835,6 +4827,25 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, ; EG-NEXT: MOV * T1.W, T1.Z, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v2i1_to_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -3918,6 +4929,29 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: LSHR * T3.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v3i1_to_v3i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v5, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v5, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v3, 2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX12-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v6, 1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v3, v5 :: v_dual_and_b32 v4, 0xffff, v3 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, ptr addrspace(1) %out @@ -4005,6 +5039,31 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; EG-NEXT: MOV T0.Y, PV.X, ; EG-NEXT: MOV * T1.W, T1.Z, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v3i1_to_v3i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v6, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v6, s[2:3] +; GFX12-NEXT: s_waitcnt 
vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, ptr addrspace(1) %out @@ -4097,6 +5156,34 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: LSHR * T3.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v4i1_to_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v6, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: v_lshrrev_b16 v0, 3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v8, 1, v2 +; GFX12-NEXT: v_and_b32_e32 v9, 1, v4 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v4, 0xffff, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -4193,6 +5280,35 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; EG-NEXT: MOV T1.W, T1.Z, ; EG-NEXT: MOV * T2.W, T2.Z, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v4i1_to_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 +; GFX12-NEXT: v_lshrrev_b16 v3, 1, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_bfe_i32 v6, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v4, v2, 0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_bfe_i32 v2, v3, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
+; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -4335,6 +5451,41 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) ; EG-NEXT: LSHR * T12.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v8i1_to_v8i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v12, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0 +; GFX12-NEXT: v_lshrrev_b16 v8, 3, v0 +; GFX12-NEXT: v_lshrrev_b16 v14, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 7, v0 +; GFX12-NEXT: v_lshrrev_b16 v6, 6, v0 +; GFX12-NEXT: v_lshrrev_b16 v10, 4, v0 +; GFX12-NEXT: v_and_b32_e32 v17, 1, v4 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v18, 1, v8 +; GFX12-NEXT: v_lshrrev_b16 v16, 2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v14, 1, v14 +; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v0, 1, v6 +; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v8, 1, v16 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v4, 1, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v6, 0xffff, v17 +; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v18 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(1) %out @@ -4489,6 +5640,45 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; EG-NEXT: MOV T5.W, T5.Z, ; EG-NEXT: MOV * T8.W, T8.Z, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v8i1_to_v8i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v16, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v16, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v3, 6, v1 +; GFX12-NEXT: v_lshrrev_b16 v5, 7, v1 +; GFX12-NEXT: v_lshrrev_b16 v7, 4, v1 +; GFX12-NEXT: v_lshrrev_b16 v4, 3, v1 +; GFX12-NEXT: v_lshrrev_b16 v8, 2, v1 +; GFX12-NEXT: v_lshrrev_b16 v9, 5, v1 +; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1 +; GFX12-NEXT: v_bfe_i32 v14, v5, 0, 1 +; GFX12-NEXT: v_bfe_i32 v12, v3, 0, 1 +; GFX12-NEXT: v_bfe_i32 v6, v4, 0, 1 +; GFX12-NEXT: v_bfe_i32 v4, v8, 0, 1 +; GFX12-NEXT: v_bfe_i32 v10, v9, 0, 1 +; GFX12-NEXT: v_bfe_i32 v8, v7, 0, 1 +; GFX12-NEXT: v_bfe_i32 v0, v1, 0, 1 +; GFX12-NEXT: 
v_bfe_i32 v2, v2, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(1) %out @@ -4738,6 +5928,67 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) ; EG-NEXT: LSHR * T22.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v16i1_to_v16i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v28, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0 +; GFX12-NEXT: v_lshrrev_b16 v8, 9, v0 +; GFX12-NEXT: v_lshrrev_b16 v12, 13, v0 +; GFX12-NEXT: v_lshrrev_b16 v16, 7, v0 +; GFX12-NEXT: v_lshrrev_b16 v2, 15, v0 +; GFX12-NEXT: v_lshrrev_b16 v6, 14, v0 +; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0 +; GFX12-NEXT: v_lshrrev_b16 v20, 5, v0 +; GFX12-NEXT: v_lshrrev_b16 v24, 3, v0 +; GFX12-NEXT: v_lshrrev_b16 v32, 1, v0 +; GFX12-NEXT: v_and_b32_e32 v33, 1, v4 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v34, 1, v8 +; GFX12-NEXT: v_lshrrev_b16 v14, 8, v0 +; GFX12-NEXT: v_lshrrev_b16 v18, 12, v0 +; GFX12-NEXT: v_and_b32_e32 v35, 1, v12 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v36, 1, v16 +; GFX12-NEXT: v_lshrrev_b16 v22, 6, v0 +; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v38, 1, v24 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v32, 1, v32 +; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v4, 1, v10 +; GFX12-NEXT: v_mov_b32_e32 v23, v1 +; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX12-NEXT: v_mov_b32_e32 v31, v1 +; GFX12-NEXT: v_lshrrev_b16 v26, 4, v0 +; GFX12-NEXT: v_lshrrev_b16 v30, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v37, 1, v20 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v6, 0xffff, v33 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v24, 1, v30 +; GFX12-NEXT: v_and_b32_e32 v8, 1, v14 +; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v10, 0xffff, v34 +; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v20, 1, v26 +; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v16, 1, v22 +; GFX12-NEXT: v_and_b32_e32 v12, 1, v18 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v18, 0xffff, v36 +; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v32 +; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38 +; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 
v1, v[8:11], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -5010,6 +6261,73 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; EG-NEXT: MOV T20.W, T20.Z, ; EG-NEXT: MOV * T14.W, T14.Z, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v16i1_to_v16i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v32, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v32, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v3, 14, v1 +; GFX12-NEXT: v_lshrrev_b16 v5, 15, v1 +; GFX12-NEXT: v_lshrrev_b16 v7, 12, v1 +; GFX12-NEXT: v_lshrrev_b16 v9, 13, v1 +; GFX12-NEXT: v_lshrrev_b16 v11, 10, v1 +; GFX12-NEXT: v_lshrrev_b16 v13, 11, v1 +; GFX12-NEXT: v_lshrrev_b16 v15, 8, v1 +; GFX12-NEXT: v_lshrrev_b16 v16, 9, v1 +; GFX12-NEXT: v_lshrrev_b16 v12, 6, v1 +; GFX12-NEXT: v_lshrrev_b16 v14, 7, v1 +; GFX12-NEXT: v_lshrrev_b16 v8, 4, v1 +; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1 +; GFX12-NEXT: v_lshrrev_b16 v4, 3, v1 +; GFX12-NEXT: v_lshrrev_b16 v10, 2, v1 +; GFX12-NEXT: v_lshrrev_b16 v17, 5, v1 +; GFX12-NEXT: v_bfe_i32 v30, v5, 0, 1 +; GFX12-NEXT: v_bfe_i32 v28, v3, 0, 1 +; GFX12-NEXT: v_bfe_i32 v26, v9, 0, 1 +; GFX12-NEXT: v_bfe_i32 v24, v7, 0, 1 +; GFX12-NEXT: v_bfe_i32 v22, v13, 0, 1 +; GFX12-NEXT: v_bfe_i32 v20, v11, 0, 1 +; GFX12-NEXT: v_bfe_i32 v18, v16, 0, 1 +; GFX12-NEXT: v_bfe_i32 v16, v15, 0, 1 +; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1 +; GFX12-NEXT: v_bfe_i32 v12, v12, 0, 1 +; GFX12-NEXT: v_bfe_i32 v0, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX12-NEXT: v_bfe_i32 v6, v4, 0, 1 +; GFX12-NEXT: v_bfe_i32 v4, v10, 0, 1 +; GFX12-NEXT: v_bfe_i32 v10, v17, 0, 1 +; GFX12-NEXT: v_bfe_i32 v8, v8, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GFX12-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX12-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 +; 
GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -5459,6 +6777,119 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; EG-NEXT: LSHR * T42.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v32i1_to_v32i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2 +; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2 +; GFX12-NEXT: s_lshr_b32 s3, s2, 24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_lshrrev_b16 v2, 12, s2 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2 +; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2 +; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3 +; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3 +; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3 +; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3 +; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3 +; GFX12-NEXT: v_lshrrev_b16 v22, 2, s3 +; GFX12-NEXT: v_lshrrev_b16 v23, 1, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016 +; GFX12-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10017 +; GFX12-NEXT: v_lshrrev_b16 v11, 5, s2 +; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2 +; GFX12-NEXT: v_and_b32_e32 v24, 1, v4 +; GFX12-NEXT: v_and_b32_e32 v25, 1, v8 +; GFX12-NEXT: v_and_b32_e32 v28, 1, v21 +; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2 +; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0 +; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 +; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10015 +; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v15, 1, s2 +; GFX12-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX12-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012 +; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10013 +; GFX12-NEXT: v_lshrrev_b16 v6, 10, s2 +; GFX12-NEXT: v_and_b32_e32 v26, 1, v15 +; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v15, 1, v9 +; GFX12-NEXT: v_and_b32_e32 v9, 1, v17 +; GFX12-NEXT: v_and_b32_e32 v29, 1, v23 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2 +; GFX12-NEXT: v_lshrrev_b16 v7, 14, s2 +; GFX12-NEXT: v_lshrrev_b16 v10, 6, s2 +; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2 +; GFX12-NEXT: v_lshrrev_b16 v14, 2, s2 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 +; GFX12-NEXT: s_and_b32 s5, s2, 1 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011 +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010 +; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v19, 1, v6 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1 +; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26 +; GFX12-NEXT: v_and_b32_e32 v4, 1, v14 +; GFX12-NEXT: v_and_b32_e32 
v8, 1, v12 +; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v29 +; GFX12-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v35, 1, v18 +; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v16 +; GFX12-NEXT: v_and_b32_e32 v39, 1, v7 +; GFX12-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_and_b32 v41, 0xffff, v5 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_and_b32 v29, 0xffff, v9 +; GFX12-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v23, 1, v22 +; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v27, 1, v20 +; GFX12-NEXT: v_mov_b32_e32 v20, v1 +; GFX12-NEXT: v_mov_b32_e32 v22, v1 +; GFX12-NEXT: v_mov_b32_e32 v18, v1 +; GFX12-NEXT: v_and_b32_e32 v12, 1, v10 +; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v25 +; GFX12-NEXT: v_mov_b32_e32 v24, v1 +; GFX12-NEXT: s_clause 0x4 +; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v1, v[19:22], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v1, v[15:18], s[0:1] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v15, v1 +; GFX12-NEXT: v_mov_b32_e32 v11, v1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, v44 :: v_dual_mov_b32 v9, v1 +; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v25, 0xffff, v28 +; GFX12-NEXT: v_mov_b32_e32 v28, v1 +; GFX12-NEXT: s_clause 0x4 +; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out @@ -6011,6 +7442,132 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; EG-NEXT: MOV T40.W, T40.Z, ; EG-NEXT: MOV * T26.W, T26.Z, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v32i1_to_v32i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v26, 6, s2 +; GFX12-NEXT: v_lshrrev_b16 v28, 7, s2 +; GFX12-NEXT: v_lshrrev_b16 v4, 2, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 3, s2 +; GFX12-NEXT: s_lshr_b32 s22, s2, 24 +; GFX12-NEXT: s_lshr_b32 s12, s2, 22 +; GFX12-NEXT: s_lshr_b32 s14, s2, 23 +; GFX12-NEXT: v_lshrrev_b16 v6, 4, s2 +; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2 +; GFX12-NEXT: v_lshrrev_b16 v3, 1, s2 +; GFX12-NEXT: s_lshr_b32 s16, s2, 20 +; GFX12-NEXT: s_lshr_b32 s18, s2, 21 +; GFX12-NEXT: v_lshrrev_b16 v1, 14, s2 +; GFX12-NEXT: v_lshrrev_b16 v2, 15, s2 +; GFX12-NEXT: v_lshrrev_b16 v12, 6, s22 +; GFX12-NEXT: v_lshrrev_b16 v14, 7, s22 +; GFX12-NEXT: v_lshrrev_b16 v9, 12, s2 +; GFX12-NEXT: v_lshrrev_b16 v10, 13, s2 +; GFX12-NEXT: v_lshrrev_b16 v16, 4, s22 +; GFX12-NEXT: v_lshrrev_b16 v17, 5, s22 +; GFX12-NEXT: s_bfe_i64 
s[12:13], s[12:13], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX12-NEXT: s_lshr_b32 s4, s2, 18 +; GFX12-NEXT: v_lshrrev_b16 v37, 10, s2 +; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2 +; GFX12-NEXT: v_lshrrev_b16 v13, 2, s22 +; GFX12-NEXT: v_lshrrev_b16 v15, 3, s22 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v49, s12 +; GFX12-NEXT: v_lshrrev_b16 v30, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v32, 9, s2 +; GFX12-NEXT: v_lshrrev_b16 v11, 1, s22 +; GFX12-NEXT: v_bfe_i32 v7, v5, 0, 1 +; GFX12-NEXT: v_bfe_i32 v5, v4, 0, 1 +; GFX12-NEXT: v_bfe_i32 v31, v28, 0, 1 +; GFX12-NEXT: v_bfe_i32 v29, v26, 0, 1 +; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s14 +; GFX12-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s16 +; GFX12-NEXT: s_lshr_b32 s6, s2, 19 +; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1 +; GFX12-NEXT: v_bfe_i32 v27, v8, 0, 1 +; GFX12-NEXT: v_bfe_i32 v25, v6, 0, 1 +; GFX12-NEXT: v_dual_mov_b32 v54, s17 :: v_dual_mov_b32 v55, s18 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX12-NEXT: v_mov_b32_e32 v56, s19 +; GFX12-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-NEXT: s_lshr_b32 s20, s2, 17 +; GFX12-NEXT: v_bfe_i32 v23, v14, 0, 1 +; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 1 +; GFX12-NEXT: v_bfe_i32 v47, v2, 0, 1 +; GFX12-NEXT: v_bfe_i32 v45, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v19, v17, 0, 1 +; GFX12-NEXT: v_bfe_i32 v17, v16, 0, 1 +; GFX12-NEXT: v_bfe_i32 v43, v10, 0, 1 +; GFX12-NEXT: v_bfe_i32 v41, v9, 0, 1 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[2:3], 0x10000 +; GFX12-NEXT: v_bfe_i32 v15, v15, 0, 1 +; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 1 +; GFX12-NEXT: v_bfe_i32 v39, v34, 0, 1 +; GFX12-NEXT: v_bfe_i32 v37, v37, 0, 1 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX12-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX12-NEXT: v_bfe_i32 v35, v32, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31 +; GFX12-NEXT: v_bfe_i32 v33, v30, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:176 +; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:160 +; GFX12-NEXT: v_dual_mov_b32 v49, s4 :: v_dual_mov_b32 v50, s5 +; GFX12-NEXT: v_dual_mov_b32 v51, s6 :: v_dual_mov_b32 v52, s7 +; GFX12-NEXT: v_mov_b32_e32 v53, s10 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[22:23], 0x10000 +; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GFX12-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX12-NEXT: v_dual_mov_b32 v54, s11 :: v_dual_mov_b32 v55, s20 +; GFX12-NEXT: v_dual_mov_b32 v56, s21 :: v_dual_mov_b32 v1, s8 +; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v9, s2 +; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX12-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX12-NEXT: v_ashrrev_i32_e32 v48, 31, v47 +; GFX12-NEXT: v_ashrrev_i32_e32 v46, 31, v45 +; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX12-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43 +; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41 +; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX12-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39 +; GFX12-NEXT: v_ashrrev_i32_e32 v38, 31, v37 +; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX12-NEXT: 
v_ashrrev_i32_e32 v36, 31, v35 +; GFX12-NEXT: v_ashrrev_i32_e32 v34, 31, v33 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:144 +; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:128 +; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v10, s3 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:192 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out @@ -6871,6 +8428,220 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: 2(2.802597e-45), 496(6.950440e-43) ; EG-NEXT: LSHR * T82.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v64i1_to_v64i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2 +; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2 +; GFX12-NEXT: s_lshr_b32 s4, s3, 24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3 +; GFX12-NEXT: v_and_b32_e32 v34, 1, v4 +; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v28, v1 :: v_dual_and_b32 v41, 1, v6 +; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4 +; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4 +; GFX12-NEXT: s_lshr_b32 s5, s2, 24 +; GFX12-NEXT: v_lshrrev_b16 v3, 13, s2 +; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2 +; GFX12-NEXT: v_lshrrev_b16 v10, 3, s2 +; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3 +; GFX12-NEXT: v_lshrrev_b16 v24, 5, s3 +; GFX12-NEXT: v_lshrrev_b16 v25, 3, s3 +; GFX12-NEXT: v_and_b32_e32 v50, 1, v14 +; GFX12-NEXT: v_and_b32_e32 v47, 1, v18 +; GFX12-NEXT: v_and_b32_e32 v18, 1, v4 +; GFX12-NEXT: v_and_b32_e32 v14, 1, v6 +; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5 +; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5 +; GFX12-NEXT: v_lshrrev_b16 v0, 15, s2 +; GFX12-NEXT: v_lshrrev_b16 v2, 14, s2 +; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 9, s2 +; GFX12-NEXT: v_lshrrev_b16 v12, 1, s2 +; GFX12-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX12-NEXT: v_and_b32_e32 v42, 1, v8 +; GFX12-NEXT: v_and_b32_e32 v52, 1, v10 +; GFX12-NEXT: v_and_b32_e32 v40, 1, v23 +; GFX12-NEXT: v_dual_mov_b32 v44, v1 :: v_dual_and_b32 v43, 1, v24 +; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5 +; GFX12-NEXT: v_lshrrev_b16 v10, 2, s5 +; GFX12-NEXT: v_lshrrev_b16 v24, 4, s5 +; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10014 +; GFX12-NEXT: v_and_b32_e32 v33, 1, v25 +; GFX12-NEXT: v_and_b32_e32 v25, 1, v6 +; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10015 +; GFX12-NEXT: v_and_b32_e32 v23, 1, v4 +; GFX12-NEXT: v_lshrrev_b16 v11, 8, s2 +; GFX12-NEXT: 
v_lshrrev_b16 v16, 11, s3 +; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v35, 1, v5 +; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v5, 1, v12 +; GFX12-NEXT: v_lshrrev_b16 v36, 7, s5 +; GFX12-NEXT: v_lshrrev_b16 v37, 6, s5 +; GFX12-NEXT: v_and_b32_e32 v56, 1, v8 +; GFX12-NEXT: v_and_b32_e32 v4, 1, v10 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; GFX12-NEXT: v_and_b32_e32 v8, 1, v24 +; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v25 +; GFX12-NEXT: v_and_b32_e32 v23, 1, v2 +; GFX12-NEXT: v_dual_mov_b32 v24, v1 :: v_dual_and_b32 v25, 0xffff, v0 +; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v3 +; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013 +; GFX12-NEXT: v_and_b32_e32 v27, 1, v7 +; GFX12-NEXT: v_lshrrev_b16 v9, 10, s2 +; GFX12-NEXT: v_lshrrev_b16 v13, 6, s2 +; GFX12-NEXT: v_and_b32_e32 v22, 1, v16 +; GFX12-NEXT: v_lshrrev_b16 v54, 1, s3 +; GFX12-NEXT: v_lshrrev_b16 v55, 1, s4 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:96 +; GFX12-NEXT: v_and_b32_e32 v23, 1, v37 +; GFX12-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX12-NEXT: v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v28, 0xffff, v34 +; GFX12-NEXT: v_dual_mov_b32 v59, v1 :: v_dual_and_b32 v34, 1, v11 +; GFX12-NEXT: v_dual_mov_b32 v35, v1 :: v_dual_and_b32 v36, 0xffff, v35 +; GFX12-NEXT: v_dual_mov_b32 v37, v1 :: v_dual_and_b32 v26, 1, v9 +; GFX12-NEXT: v_mov_b32_e32 v27, v1 +; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v0, 1, v55 +; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:64 +; GFX12-NEXT: v_and_b32_e32 v34, 1, v13 +; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v41 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v54 +; GFX12-NEXT: global_store_b128 v1, v[26:29], s[0:1] offset:80 +; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s7 +; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:48 +; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v2 +; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10016 +; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10017 +; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3 +; GFX12-NEXT: v_lshrrev_b16 v21, 14, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 +; GFX12-NEXT: v_mov_b32_e32 v0, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, s8 +; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10012 +; GFX12-NEXT: v_lshrrev_b16 v19, 12, s3 +; GFX12-NEXT: v_lshrrev_b16 v32, 8, s3 +; GFX12-NEXT: v_lshrrev_b16 v38, 6, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432 +; GFX12-NEXT: v_mov_b32_e32 v0, s8 +; GFX12-NEXT: v_mov_b32_e32 v2, s9 +; GFX12-NEXT: v_lshrrev_b16 v39, 4, s3 +; GFX12-NEXT: v_lshrrev_b16 v31, 2, s3 +; GFX12-NEXT: v_lshrrev_b16 v28, 10, s3 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 +; GFX12-NEXT: s_and_b32 s6, s3, 1 +; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10011 +; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s8 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016 +; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10017 +; GFX12-NEXT: v_lshrrev_b16 v15, 4, s2 +; GFX12-NEXT: v_and_b32_e32 v31, 1, v31 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384 +; GFX12-NEXT: v_mov_b32_e32 v2, s8 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 +; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10015 +; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v43 +; GFX12-NEXT: v_and_b32_e32 v41, 1, v15 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] 
offset:176 +; GFX12-NEXT: v_mov_b32_e32 v2, s8 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012 +; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013 +; GFX12-NEXT: v_lshrrev_b16 v17, 2, s2 +; GFX12-NEXT: v_lshrrev_b16 v46, 7, s4 +; GFX12-NEXT: v_lshrrev_b16 v49, 6, s4 +; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v43, 0xffff, v42 +; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v45, 1, v32 +; GFX12-NEXT: v_and_b32_e32 v47, 0xffff, v47 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX12-NEXT: v_mov_b32_e32 v2, s8 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_lshrrev_b16 v16, 4, s4 +; GFX12-NEXT: v_lshrrev_b16 v12, 2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 +; GFX12-NEXT: s_and_b32 s7, s2, 1 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011 +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010 +; GFX12-NEXT: v_and_b32_e32 v51, 1, v17 +; GFX12-NEXT: v_dual_mov_b32 v54, v1 :: v_dual_and_b32 v53, 0xffff, v52 +; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v5 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: v_mov_b32_e32 v52, v1 +; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:32 +; GFX12-NEXT: v_and_b32_e32 v41, 1, v49 +; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v46 +; GFX12-NEXT: v_mov_b32_e32 v13, v1 +; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v56 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 +; GFX12-NEXT: v_mov_b32_e32 v0, s7 +; GFX12-NEXT: v_mov_b32_e32 v46, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, v37 +; GFX12-NEXT: v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v16, 1, v16 +; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v48, 1, v19 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX12-NEXT: global_store_b128 v1, v[51:54], s[0:1] offset:16 +; GFX12-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v52, 1, v21 +; GFX12-NEXT: v_and_b32_e32 v54, 0xffff, v20 +; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v50, 0xffff, v50 +; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v18, 0xffff, v18 +; GFX12-NEXT: v_mov_b32_e32 v51, v1 +; GFX12-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v27, 1, v39 +; GFX12-NEXT: v_and_b32_e32 v38, 1, v38 +; GFX12-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; GFX12-NEXT: v_and_b32_e32 v56, 1, v28 +; GFX12-NEXT: v_and_b32_e32 v58, 0xffff, v22 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:496 +; GFX12-NEXT: global_store_b128 v1, v[52:55], s[0:1] offset:368 +; GFX12-NEXT: global_store_b128 v1, v[48:51], s[0:1] offset:352 +; GFX12-NEXT: v_mov_b32_e32 v41, v1 +; GFX12-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, v36 +; GFX12-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_and_b32 v33, 0xffff, v33 +; GFX12-NEXT: v_mov_b32_e32 v32, v1 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v1, v[56:59], s[0:1] offset:336 +; GFX12-NEXT: global_store_b128 v1, v[45:48], s[0:1] offset:320 +; GFX12-NEXT: global_store_b128 v1, v[38:41], s[0:1] offset:304 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, v30 +; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v12, 1, v12 +; GFX12-NEXT: v_mov_b32_e32 v15, v1 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:272 +; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v1, 
v[16:19], s[0:1] offset:480 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, v35 :: v_dual_mov_b32 v9, v1 +; GFX12-NEXT: v_mov_b32_e32 v11, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v28, v1 +; GFX12-NEXT: v_mov_b32_e32 v30, v1 +; GFX12-NEXT: s_clause 0x4 +; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:464 +; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:288 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, ptr addrspace(1) %out @@ -7947,6 +9718,253 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MOV T80.W, T80.Z, ; EG-NEXT: MOV * T50.W, T50.Z, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s19, s5 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s26, s3, 22 +; GFX12-NEXT: s_lshr_b32 s28, s3, 23 +; GFX12-NEXT: s_lshr_b32 s30, s3, 20 +; GFX12-NEXT: s_lshr_b32 s34, s3, 21 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX12-NEXT: s_lshr_b32 s20, s3, 18 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v43, s27 +; GFX12-NEXT: v_dual_mov_b32 v42, s26 :: v_dual_mov_b32 v45, s29 +; GFX12-NEXT: v_dual_mov_b32 v44, s28 :: v_dual_mov_b32 v47, s31 +; GFX12-NEXT: s_lshr_b32 s22, s3, 19 +; GFX12-NEXT: v_dual_mov_b32 v46, s30 :: v_dual_mov_b32 v49, s35 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX12-NEXT: v_mov_b32_e32 v48, s34 +; GFX12-NEXT: s_lshr_b32 s24, s3, 16 +; GFX12-NEXT: s_lshr_b32 s36, s3, 17 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX12-NEXT: s_lshr_b32 s12, s2, 22 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:432 +; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:416 +; GFX12-NEXT: v_dual_mov_b32 v43, s21 :: v_dual_mov_b32 v42, s20 +; GFX12-NEXT: v_dual_mov_b32 v45, s23 :: v_dual_mov_b32 v44, s22 +; GFX12-NEXT: v_mov_b32_e32 v47, s25 +; GFX12-NEXT: s_lshr_b32 s14, s2, 23 +; GFX12-NEXT: v_dual_mov_b32 v46, s24 :: v_dual_mov_b32 v49, s37 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX12-NEXT: v_mov_b32_e32 v48, s36 +; GFX12-NEXT: s_lshr_b32 s16, s2, 20 +; GFX12-NEXT: s_lshr_b32 s40, s2, 21 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX12-NEXT: s_lshr_b32 s6, s2, 18 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:400 +; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:384 +; GFX12-NEXT: v_dual_mov_b32 v43, s13 :: v_dual_mov_b32 v42, 
s12 +; GFX12-NEXT: v_dual_mov_b32 v45, s15 :: v_dual_mov_b32 v44, s14 +; GFX12-NEXT: v_mov_b32_e32 v47, s17 +; GFX12-NEXT: s_lshr_b32 s8, s2, 19 +; GFX12-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v49, s41 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX12-NEXT: v_mov_b32_e32 v48, s40 +; GFX12-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX12-NEXT: v_lshrrev_b16 v3, 14, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2 +; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2 +; GFX12-NEXT: v_lshrrev_b16 v9, 13, s2 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:176 +; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:160 +; GFX12-NEXT: v_dual_mov_b32 v43, s7 :: v_dual_mov_b32 v42, s6 +; GFX12-NEXT: v_dual_mov_b32 v45, s9 :: v_dual_mov_b32 v44, s8 +; GFX12-NEXT: v_mov_b32_e32 v47, s11 +; GFX12-NEXT: s_lshr_b32 s42, s2, 17 +; GFX12-NEXT: v_lshrrev_b16 v32, 10, s2 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2 +; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v35, 9, s2 +; GFX12-NEXT: v_lshrrev_b16 v27, 6, s2 +; GFX12-NEXT: v_lshrrev_b16 v29, 7, s2 +; GFX12-NEXT: v_lshrrev_b16 v30, 4, s2 +; GFX12-NEXT: v_lshrrev_b16 v31, 5, s2 +; GFX12-NEXT: v_lshrrev_b16 v24, 2, s2 +; GFX12-NEXT: v_lshrrev_b16 v25, 3, s2 +; GFX12-NEXT: v_lshrrev_b16 v23, 1, s2 +; GFX12-NEXT: v_lshrrev_b16 v18, 14, s3 +; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3 +; GFX12-NEXT: v_lshrrev_b16 v16, 12, s3 +; GFX12-NEXT: v_lshrrev_b16 v19, 13, s3 +; GFX12-NEXT: v_lshrrev_b16 v0, 10, s3 +; GFX12-NEXT: v_lshrrev_b16 v1, 11, s3 +; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3 +; GFX12-NEXT: v_lshrrev_b16 v15, 9, s3 +; GFX12-NEXT: v_lshrrev_b16 v14, 6, s3 +; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3 +; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3 +; GFX12-NEXT: v_lshrrev_b16 v22, 5, s3 +; GFX12-NEXT: v_lshrrev_b16 v26, 2, s3 +; GFX12-NEXT: v_lshrrev_b16 v28, 3, s3 +; GFX12-NEXT: v_lshrrev_b16 v36, 1, s3 +; GFX12-NEXT: s_lshr_b32 s18, s3, 24 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_lshr_b32 s38, s2, 24 +; GFX12-NEXT: v_dual_mov_b32 v46, s10 :: v_dual_mov_b32 v49, s43 +; GFX12-NEXT: v_bfe_i32 v52, v5, 0, 1 +; GFX12-NEXT: v_bfe_i32 v50, v3, 0, 1 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX12-NEXT: v_mov_b32_e32 v48, s42 +; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:144 +; GFX12-NEXT: v_bfe_i32 v44, v9, 0, 1 +; GFX12-NEXT: v_bfe_i32 v42, v7, 0, 1 +; GFX12-NEXT: v_lshrrev_b16 v41, 2, s18 +; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:128 +; GFX12-NEXT: v_lshrrev_b16 v54, 3, s18 +; GFX12-NEXT: v_lshrrev_b16 v56, 6, s38 +; GFX12-NEXT: v_ashrrev_i32_e32 v53, 31, v52 +; GFX12-NEXT: v_ashrrev_i32_e32 v51, 31, v50 +; GFX12-NEXT: v_ashrrev_i32_e32 v45, 31, v44 +; GFX12-NEXT: v_ashrrev_i32_e32 v43, 31, v42 +; GFX12-NEXT: v_bfe_i32 v46, v56, 0, 1 +; GFX12-NEXT: v_bfe_i32 v56, v54, 0, 1 +; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:112 +; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1 +; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:96 +; GFX12-NEXT: v_bfe_i32 v32, v32, 0, 1 +; GFX12-NEXT: v_bfe_i32 v54, v41, 0, 1 +; GFX12-NEXT: v_bfe_i32 v43, v35, 0, 1 +; GFX12-NEXT: v_bfe_i32 v41, v33, 0, 1 +; GFX12-NEXT: v_lshrrev_b16 v39, 4, s18 +; GFX12-NEXT: v_ashrrev_i32_e32 v35, 31, v34 +; GFX12-NEXT: v_ashrrev_i32_e32 v33, 31, v32 +; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43 +; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41 
+; GFX12-NEXT: v_lshrrev_b16 v40, 5, s18 +; GFX12-NEXT: v_lshrrev_b16 v37, 6, s18 +; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:80 +; GFX12-NEXT: v_bfe_i32 v32, v39, 0, 1 +; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1] offset:64 +; GFX12-NEXT: v_bfe_i32 v41, v29, 0, 1 +; GFX12-NEXT: v_bfe_i32 v39, v27, 0, 1 +; GFX12-NEXT: v_bfe_i32 v34, v40, 0, 1 +; GFX12-NEXT: v_bfe_i32 v60, v31, 0, 1 +; GFX12-NEXT: v_bfe_i32 v58, v30, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41 +; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39 +; GFX12-NEXT: v_lshrrev_b16 v38, 7, s18 +; GFX12-NEXT: v_bfe_i32 v62, v37, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v61, 31, v60 +; GFX12-NEXT: v_ashrrev_i32_e32 v59, 31, v58 +; GFX12-NEXT: global_store_b128 v12, v[39:42], s[0:1] offset:48 +; GFX12-NEXT: v_bfe_i32 v39, v25, 0, 1 +; GFX12-NEXT: v_bfe_i32 v37, v24, 0, 1 +; GFX12-NEXT: v_bfe_i32 v64, v38, 0, 1 +; GFX12-NEXT: global_store_b128 v12, v[58:61], s[0:1] offset:32 +; GFX12-NEXT: v_bfe_i32 v43, v23, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39 +; GFX12-NEXT: v_ashrrev_i32_e32 v38, 31, v37 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX12-NEXT: v_bfe_i32 v24, v36, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43 +; GFX12-NEXT: v_dual_mov_b32 v41, s2 :: v_dual_mov_b32 v42, s3 +; GFX12-NEXT: v_mov_b32_e32 v23, s5 +; GFX12-NEXT: global_store_b128 v12, v[37:40], s[0:1] offset:16 +; GFX12-NEXT: v_bfe_i32 v38, v20, 0, 1 +; GFX12-NEXT: v_bfe_i32 v36, v18, 0, 1 +; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1] +; GFX12-NEXT: v_bfe_i32 v20, v19, 0, 1 +; GFX12-NEXT: v_bfe_i32 v18, v16, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v39, 31, v38 +; GFX12-NEXT: v_ashrrev_i32_e32 v37, 31, v36 +; GFX12-NEXT: v_lshrrev_b16 v55, 1, s18 +; GFX12-NEXT: v_bfe_i32 v40, v21, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX12-NEXT: v_lshrrev_b16 v9, 7, s38 +; GFX12-NEXT: v_lshrrev_b16 v4, 3, s38 +; GFX12-NEXT: v_lshrrev_b16 v8, 2, s38 +; GFX12-NEXT: v_lshrrev_b16 v10, 5, s38 +; GFX12-NEXT: v_lshrrev_b16 v11, 4, s38 +; GFX12-NEXT: v_lshrrev_b16 v2, 1, s38 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:368 +; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:352 +; GFX12-NEXT: v_bfe_i32 v38, v1, 0, 1 +; GFX12-NEXT: v_bfe_i32 v36, v0, 0, 1 +; GFX12-NEXT: v_bfe_i32 v52, v55, 0, 1 +; GFX12-NEXT: v_bfe_i32 v20, v15, 0, 1 +; GFX12-NEXT: v_bfe_i32 v18, v13, 0, 1 +; GFX12-NEXT: v_bfe_i32 v48, v9, 0, 1 +; GFX12-NEXT: v_bfe_i32 v16, v17, 0, 1 +; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1 +; GFX12-NEXT: v_bfe_i32 v6, v4, 0, 1 +; GFX12-NEXT: v_bfe_i32 v4, v8, 0, 1 +; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1 +; GFX12-NEXT: v_bfe_i32 v8, v11, 0, 1 +; GFX12-NEXT: v_bfe_i32 v42, v22, 0, 1 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[18:19], 0x10000 +; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1 +; GFX12-NEXT: v_bfe_i32 v26, v26, 0, 1 +; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX12-NEXT: v_ashrrev_i32_e32 v57, 31, v56 +; GFX12-NEXT: v_ashrrev_i32_e32 v55, 31, v54 +; GFX12-NEXT: v_ashrrev_i32_e32 v65, 31, v64 +; GFX12-NEXT: v_ashrrev_i32_e32 v63, 31, v62 +; GFX12-NEXT: v_ashrrev_i32_e32 v39, 31, v38 +; GFX12-NEXT: v_ashrrev_i32_e32 v37, 31, v36 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[38:39], 0x10000 +; GFX12-NEXT: v_ashrrev_i32_e32 v53, 31, v52 +; GFX12-NEXT: v_ashrrev_i32_e32 v35, 31, v34 +; GFX12-NEXT: v_ashrrev_i32_e32 v33, 31, v32 +; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX12-NEXT: v_dual_mov_b32 v22, 
s4 :: v_dual_mov_b32 v51, s9 +; GFX12-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_ashrrev_i32_e32 v49, 31, v48 +; GFX12-NEXT: v_ashrrev_i32_e32 v47, 31, v46 +; GFX12-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX12-NEXT: v_ashrrev_i32_e32 v43, 31, v42 +; GFX12-NEXT: v_ashrrev_i32_e32 v41, 31, v40 +; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GFX12-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:336 +; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:320 +; GFX12-NEXT: global_store_b128 v12, v[14:17], s[0:1] offset:304 +; GFX12-NEXT: global_store_b128 v12, v[40:43], s[0:1] offset:288 +; GFX12-NEXT: global_store_b128 v12, v[26:29], s[0:1] offset:272 +; GFX12-NEXT: global_store_b128 v12, v[22:25], s[0:1] offset:256 +; GFX12-NEXT: global_store_b128 v12, v[62:65], s[0:1] offset:496 +; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:480 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v12, v[54:57], s[0:1] offset:464 +; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:448 +; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v12, v[4:7], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v12, v[0:3], s[0:1] offset:192 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 5332da6827ec3..585f96b9ffb2e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-HSA %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-NOHSA-VI %s ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_i16: @@ -73,6 +74,18 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_i16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load i16, ptr addrspace(4) %in store i16 %ld, ptr addrspace(1) %out @@ -131,6 +144,18 @@ define amdgpu_kernel void 
@constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v2i16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <2 x i16>, ptr addrspace(4) %in store <2 x i16> %ld, ptr addrspace(1) %out @@ -224,6 +249,21 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: OR_INT T6.X, PV.W, PS, ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v3i16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] offset:4 +; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i16>, ptr addrspace(4) %in store <3 x i16> %ld, ptr addrspace(1) %out @@ -285,6 +325,19 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v4i16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <4 x i16>, ptr addrspace(4) %in store <4 x i16> %ld, ptr addrspace(1) %out @@ -352,6 +405,20 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v8i16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <8 x i16>, ptr addrspace(4) %in store <8 x i16> %ld, ptr addrspace(1) %out @@ -452,6 +519,24 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; EG-NEXT: ALU clause starting at 17: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v16i16: +; GFX12: ; %bb.0: ; 
%entry +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i16>, ptr addrspace(4) %in store <16 x i16> %ld, ptr addrspace(1) %out @@ -654,6 +739,36 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; EG-NEXT: ALU clause starting at 11: ; EG-NEXT: MOV * T2.X, literal.x, ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v16i16_align2: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0xf +; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28 +; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24 +; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20 +; GFX12-NEXT: global_load_u16 v0, v8, s[0:1] offset:16 +; GFX12-NEXT: global_load_u16 v7, v8, s[0:1] offset:12 +; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8 +; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4 +; GFX12-NEXT: global_load_u16 v4, v8, s[0:1] +; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30 +; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26 +; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22 +; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18 +; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14 +; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 +; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 +; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 +; GFX12-NEXT: s_waitcnt vmcnt(4) +; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i16>, ptr addrspace(4) %ptr0, align 2 store <16 x i16> %ld, ptr addrspace(1) undef, align 32 @@ -719,6 +834,18 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_i16_to_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = zext i16 %a to i32 store i32 %ext, ptr addrspace(1) %out @@ -785,6 +912,18 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) +; +; 
GFX12-LABEL: constant_sextload_i16_to_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_i16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = sext i16 %a to i32 store i32 %ext, ptr addrspace(1) %out @@ -850,6 +989,18 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v1i16_to_v1i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(1) %out @@ -916,6 +1067,18 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) +; +; GFX12-LABEL: constant_sextload_v1i16_to_v1i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_i16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(1) %out @@ -986,6 +1149,22 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; EG-NEXT: AND_INT T4.X, T4.X, literal.x, ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) +; +; GFX12-LABEL: constant_zextload_v2i16_to_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s3, s2, 0xffff +; GFX12-NEXT: s_lshr_b32 s2, s2, 16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(1) %out @@ -1058,6 +1237,22 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) ; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v2i16_to_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_sext_i32_i16 s3, 
s2 +; GFX12-NEXT: s_ashr_i32 s2, s2, 16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(1) %out @@ -1140,6 +1335,22 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; EG-NEXT: LSHR T4.X, T0.W, literal.x, ; EG-NEXT: MOV * T3.Y, T1.X, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v3i16_to_v3i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s3, s3, 0xffff +; GFX12-NEXT: s_and_b32 s4, s2, 0xffff +; GFX12-NEXT: s_lshr_b32 s2, s2, 16 +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i16>, ptr addrspace(4) %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -1226,6 +1437,22 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LSHR * T3.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v3i16_to_v3i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s4, s2, 16 +; GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i16>, ptr addrspace(4) %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -1315,6 +1542,25 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG-NEXT: AND_INT T5.X, T5.X, literal.x, ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) +; +; GFX12-LABEL: constant_zextload_v4i16_to_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s4, s3, 16 +; GFX12-NEXT: s_and_b32 s3, s3, 0xffff +; GFX12-NEXT: s_and_b32 s5, s2, 0xffff +; GFX12-NEXT: s_lshr_b32 s2, s2, 16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(1) %out @@ -1406,6 +1652,24 @@ define amdgpu_kernel void 
@constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, ; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; +; GFX12-LABEL: constant_sextload_v4i16_to_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s4, s3, 16 +; GFX12-NEXT: s_ashr_i32 s5, s2, 16 +; GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(1) %out @@ -1540,6 +1804,32 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LSHR * T10.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v8i16_to_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s8, s7, 16 +; GFX12-NEXT: s_and_b32 s7, s7, 0xffff +; GFX12-NEXT: s_and_b32 s9, s6, 0xffff +; GFX12-NEXT: s_lshr_b32 s6, s6, 16 +; GFX12-NEXT: s_lshr_b32 s2, s5, 16 +; GFX12-NEXT: s_and_b32 s3, s5, 0xffff +; GFX12-NEXT: s_lshr_b32 s5, s4, 16 +; GFX12-NEXT: s_and_b32 s4, s4, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s8 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2 +; GFX12-NEXT: v_mov_b32_e32 v6, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(4) %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(1) %out @@ -1676,6 +1966,32 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; EG-NEXT: LSHR T10.X, PS, literal.x, ; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; +; GFX12-LABEL: constant_sextload_v8i16_to_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s8, s7, 16 +; GFX12-NEXT: s_ashr_i32 s9, s6, 16 +; GFX12-NEXT: s_sext_i32_i16 s6, s6 +; GFX12-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-NEXT: s_ashr_i32 s2, s5, 16 +; GFX12-NEXT: s_ashr_i32 s3, s4, 16 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_sext_i32_i16 s4, s4 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2 +; GFX12-NEXT: v_mov_b32_e32 v6, s5 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: 
global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(4) %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(1) %out @@ -1900,6 +2216,46 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LSHR * T18.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v16i16_to_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s16, s11, 16 +; GFX12-NEXT: s_and_b32 s11, s11, 0xffff +; GFX12-NEXT: s_and_b32 s17, s10, 0xffff +; GFX12-NEXT: s_lshr_b32 s10, s10, 16 +; GFX12-NEXT: s_lshr_b32 s14, s9, 16 +; GFX12-NEXT: s_and_b32 s9, s9, 0xffff +; GFX12-NEXT: s_lshr_b32 s15, s8, 16 +; GFX12-NEXT: s_and_b32 s8, s8, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10 +; GFX12-NEXT: s_lshr_b32 s12, s7, 16 +; GFX12-NEXT: s_and_b32 s7, s7, 0xffff +; GFX12-NEXT: s_lshr_b32 s13, s6, 16 +; GFX12-NEXT: s_and_b32 s6, s6, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15 +; GFX12-NEXT: s_lshr_b32 s2, s5, 16 +; GFX12-NEXT: s_and_b32 s3, s5, 0xffff +; GFX12-NEXT: s_lshr_b32 s5, s4, 16 +; GFX12-NEXT: s_and_b32 s4, s4, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14 +; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12 +; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5 +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_mov_b32_e32 v14, s3 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(1) %out @@ -2128,6 +2484,46 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; EG-NEXT: LSHR T12.X, PS, literal.x, ; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; +; GFX12-LABEL: constant_sextload_v16i16_to_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s16, s11, 16 +; GFX12-NEXT: s_ashr_i32 s17, s10, 16 +; GFX12-NEXT: s_sext_i32_i16 s10, s10 +; GFX12-NEXT: s_sext_i32_i16 s11, s11 +; GFX12-NEXT: s_ashr_i32 s14, s9, 16 +; GFX12-NEXT: s_ashr_i32 s15, s8, 16 +; GFX12-NEXT: s_sext_i32_i16 s9, s9 +; GFX12-NEXT: s_sext_i32_i16 s8, s8 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17 +; GFX12-NEXT: s_ashr_i32 s12, s7, 16 +; GFX12-NEXT: s_ashr_i32 s13, s6, 16 +; GFX12-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-NEXT: s_sext_i32_i16 s6, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16 +; 
GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15 +; GFX12-NEXT: s_ashr_i32 s2, s5, 16 +; GFX12-NEXT: s_ashr_i32 s3, s4, 16 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_sext_i32_i16 s4, s4 +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14 +; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12 +; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3 +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_mov_b32_e32 v14, s5 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(1) %out @@ -2538,6 +2934,76 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) ; EG-NEXT: LSHR * T34.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v32i16_to_v32i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s33, s15, 16 +; GFX12-NEXT: s_and_b32 s15, s15, 0xffff +; GFX12-NEXT: s_and_b32 s34, s14, 0xffff +; GFX12-NEXT: s_lshr_b32 s14, s14, 16 +; GFX12-NEXT: s_lshr_b32 s30, s13, 16 +; GFX12-NEXT: s_and_b32 s13, s13, 0xffff +; GFX12-NEXT: s_lshr_b32 s31, s12, 16 +; GFX12-NEXT: s_and_b32 s12, s12, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s14 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s33 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31 +; GFX12-NEXT: s_lshr_b32 s29, s10, 16 +; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30 +; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29 +; GFX12-NEXT: s_lshr_b32 s28, s11, 16 +; GFX12-NEXT: s_and_b32 s11, s11, 0xffff +; GFX12-NEXT: s_and_b32 s10, s10, 0xffff +; GFX12-NEXT: s_lshr_b32 s26, s9, 16 +; GFX12-NEXT: s_and_b32 s9, s9, 0xffff +; GFX12-NEXT: s_lshr_b32 s27, s8, 16 +; GFX12-NEXT: s_and_b32 s8, s8, 0xffff +; GFX12-NEXT: s_lshr_b32 s24, s7, 16 +; GFX12-NEXT: s_and_b32 s7, s7, 0xffff +; GFX12-NEXT: s_lshr_b32 s25, s6, 16 +; GFX12-NEXT: s_and_b32 s6, s6, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28 +; GFX12-NEXT: v_mov_b32_e32 v10, s11 +; GFX12-NEXT: s_lshr_b32 s22, s5, 16 +; GFX12-NEXT: s_and_b32 s5, s5, 0xffff +; GFX12-NEXT: s_lshr_b32 s23, s4, 16 +; GFX12-NEXT: s_and_b32 s4, s4, 0xffff +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v0, s8 +; GFX12-NEXT: v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v2, s9 +; GFX12-NEXT: v_mov_b32_e32 v5, s25 +; GFX12-NEXT: s_lshr_b32 s20, s3, 16 +; GFX12-NEXT: s_and_b32 s3, s3, 0xffff +; GFX12-NEXT: s_lshr_b32 s21, s2, 16 +; GFX12-NEXT: s_and_b32 s2, s2, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s24 +; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v13, s23 +; GFX12-NEXT: 
s_lshr_b32 s18, s1, 16 +; GFX12-NEXT: s_and_b32 s1, s1, 0xffff +; GFX12-NEXT: s_lshr_b32 s19, s0, 16 +; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s22 +; GFX12-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v17, s21 +; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s20 +; GFX12-NEXT: v_dual_mov_b32 v18, s3 :: v_dual_mov_b32 v21, s19 +; GFX12-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s18 +; GFX12-NEXT: v_mov_b32_e32 v22, s1 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(4) %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(1) %out @@ -2958,6 +3424,76 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; EG-NEXT: LSHR T24.X, PS, literal.x, ; EG-NEXT: BFE_INT * T34.Y, PV.Z, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; +; GFX12-LABEL: constant_sextload_v32i16_to_v32i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s33, s15, 16 +; GFX12-NEXT: s_ashr_i32 s34, s14, 16 +; GFX12-NEXT: s_sext_i32_i16 s14, s14 +; GFX12-NEXT: s_sext_i32_i16 s15, s15 +; GFX12-NEXT: s_ashr_i32 s30, s13, 16 +; GFX12-NEXT: s_ashr_i32 s31, s12, 16 +; GFX12-NEXT: s_sext_i32_i16 s13, s13 +; GFX12-NEXT: s_sext_i32_i16 s12, s12 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s34 +; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s33 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31 +; GFX12-NEXT: s_ashr_i32 s29, s10, 16 +; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30 +; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29 +; GFX12-NEXT: s_ashr_i32 s28, s11, 16 +; GFX12-NEXT: s_sext_i32_i16 s11, s11 +; GFX12-NEXT: s_sext_i32_i16 s10, s10 +; GFX12-NEXT: s_ashr_i32 s26, s9, 16 +; GFX12-NEXT: s_ashr_i32 s27, s8, 16 +; GFX12-NEXT: s_sext_i32_i16 s9, s9 +; GFX12-NEXT: s_sext_i32_i16 s8, s8 +; GFX12-NEXT: s_ashr_i32 s24, s7, 16 +; GFX12-NEXT: s_ashr_i32 s25, s6, 16 +; GFX12-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-NEXT: s_sext_i32_i16 s6, s6 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28 +; GFX12-NEXT: v_mov_b32_e32 v10, s11 +; GFX12-NEXT: s_ashr_i32 s22, s5, 16 +; GFX12-NEXT: s_ashr_i32 s23, s4, 16 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_sext_i32_i16 s4, s4 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v0, s8 +; GFX12-NEXT: v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v2, s9 +; GFX12-NEXT: v_mov_b32_e32 v5, s25 +; GFX12-NEXT: s_ashr_i32 s20, s3, 16 +; GFX12-NEXT: s_ashr_i32 s21, s2, 16 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s24 +; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: 
v_dual_mov_b32 v13, s23 +; GFX12-NEXT: s_ashr_i32 s18, s1, 16 +; GFX12-NEXT: s_ashr_i32 s19, s0, 16 +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_sext_i32_i16 s0, s0 +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s22 +; GFX12-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v17, s21 +; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s20 +; GFX12-NEXT: v_dual_mov_b32 v18, s3 :: v_dual_mov_b32 v21, s19 +; GFX12-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s18 +; GFX12-NEXT: v_mov_b32_e32 v22, s1 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(4) %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(1) %out @@ -3751,6 +4287,136 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00) ; EG-NEXT: LSHR * T66.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v64i16_to_v64i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0 +; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s49, s31, 16 +; GFX12-NEXT: s_lshr_b32 s65, s15, 16 +; GFX12-NEXT: s_lshr_b32 s66, s14, 16 +; GFX12-NEXT: s_and_b32 s14, s14, 0xffff +; GFX12-NEXT: s_and_b32 s15, s15, 0xffff +; GFX12-NEXT: s_lshr_b32 s63, s13, 16 +; GFX12-NEXT: s_lshr_b32 s64, s12, 16 +; GFX12-NEXT: s_and_b32 s13, s13, 0xffff +; GFX12-NEXT: s_and_b32 s12, s12, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s66 +; GFX12-NEXT: s_lshr_b32 s61, s11, 16 +; GFX12-NEXT: s_lshr_b32 s62, s10, 16 +; GFX12-NEXT: s_and_b32 s11, s11, 0xffff +; GFX12-NEXT: s_and_b32 s10, s10, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s65 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s64 +; GFX12-NEXT: s_lshr_b32 s59, s9, 16 +; GFX12-NEXT: s_lshr_b32 s60, s8, 16 +; GFX12-NEXT: s_and_b32 s9, s9, 0xffff +; GFX12-NEXT: s_and_b32 s8, s8, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s63 +; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s62 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s61 +; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s60 +; GFX12-NEXT: s_lshr_b32 s57, s7, 16 +; GFX12-NEXT: s_lshr_b32 s58, s6, 16 +; GFX12-NEXT: s_and_b32 s7, s7, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s59 +; GFX12-NEXT: v_mov_b32_e32 v14, s9 +; GFX12-NEXT: s_and_b32 s6, s6, 0xffff +; GFX12-NEXT: s_lshr_b32 s55, s5, 16 +; GFX12-NEXT: s_lshr_b32 s56, s4, 16 +; GFX12-NEXT: s_and_b32 s5, s5, 0xffff +; GFX12-NEXT: s_and_b32 s4, s4, 0xffff +; GFX12-NEXT: s_lshr_b32 s53, s3, 16 +; GFX12-NEXT: s_lshr_b32 s54, s2, 16 +; GFX12-NEXT: s_and_b32 s3, s3, 0xffff +; GFX12-NEXT: s_and_b32 s2, s2, 0xffff +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:240 +; 
GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s58 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s7 +; GFX12-NEXT: v_mov_b32_e32 v5, s56 +; GFX12-NEXT: s_lshr_b32 s51, s1, 16 +; GFX12-NEXT: s_lshr_b32 s52, s0, 16 +; GFX12-NEXT: s_and_b32 s1, s1, 0xffff +; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s55 +; GFX12-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v9, s54 +; GFX12-NEXT: s_lshr_b32 s50, s30, 16 +; GFX12-NEXT: s_and_b32 s31, s31, 0xffff +; GFX12-NEXT: s_and_b32 s30, s30, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s53 +; GFX12-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s52 +; GFX12-NEXT: s_lshr_b32 s45, s27, 16 +; GFX12-NEXT: s_lshr_b32 s46, s26, 16 +; GFX12-NEXT: s_lshr_b32 s47, s29, 16 +; GFX12-NEXT: s_lshr_b32 s48, s28, 16 +; GFX12-NEXT: s_and_b32 s27, s27, 0xffff +; GFX12-NEXT: s_and_b32 s26, s26, 0xffff +; GFX12-NEXT: s_and_b32 s29, s29, 0xffff +; GFX12-NEXT: s_and_b32 s28, s28, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s51 +; GFX12-NEXT: v_dual_mov_b32 v14, s1 :: v_dual_mov_b32 v17, s50 +; GFX12-NEXT: s_lshr_b32 s43, s25, 16 +; GFX12-NEXT: s_lshr_b32 s44, s24, 16 +; GFX12-NEXT: s_and_b32 s25, s25, 0xffff +; GFX12-NEXT: s_and_b32 s24, s24, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v16, s30 :: v_dual_mov_b32 v19, s49 +; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v21, s48 +; GFX12-NEXT: s_lshr_b32 s41, s23, 16 +; GFX12-NEXT: s_lshr_b32 s42, s22, 16 +; GFX12-NEXT: s_and_b32 s23, s23, 0xffff +; GFX12-NEXT: s_and_b32 s22, s22, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v23, s47 +; GFX12-NEXT: v_mov_b32_e32 v22, s29 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v0, s26 +; GFX12-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v2, s27 +; GFX12-NEXT: v_mov_b32_e32 v5, s44 +; GFX12-NEXT: s_lshr_b32 s39, s21, 16 +; GFX12-NEXT: s_lshr_b32 s40, s20, 16 +; GFX12-NEXT: s_and_b32 s21, s21, 0xffff +; GFX12-NEXT: s_and_b32 s20, s20, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s43 +; GFX12-NEXT: v_dual_mov_b32 v6, s25 :: v_dual_mov_b32 v9, s42 +; GFX12-NEXT: s_lshr_b32 s35, s19, 16 +; GFX12-NEXT: s_lshr_b32 s38, s18, 16 +; GFX12-NEXT: s_and_b32 s19, s19, 0xffff +; GFX12-NEXT: s_and_b32 s18, s18, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s41 +; GFX12-NEXT: v_dual_mov_b32 v10, s23 :: v_dual_mov_b32 v13, s40 +; GFX12-NEXT: s_lshr_b32 s33, s17, 16 +; GFX12-NEXT: s_lshr_b32 s34, s16, 16 +; GFX12-NEXT: s_and_b32 s17, s17, 0xffff +; GFX12-NEXT: s_and_b32 s16, s16, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v15, s39 +; GFX12-NEXT: v_dual_mov_b32 v14, s21 :: v_dual_mov_b32 v17, s38 +; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s35 +; GFX12-NEXT: v_dual_mov_b32 v18, s19 :: v_dual_mov_b32 v21, s34 +; GFX12-NEXT: v_dual_mov_b32 v20, s16 :: 
v_dual_mov_b32 v23, s33 +; GFX12-NEXT: v_mov_b32_e32 v22, s17 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(4) %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(1) %out @@ -4560,6 +5226,136 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; EG-NEXT: LSHR T48.X, PS, literal.x, ; EG-NEXT: BFE_INT * T66.Y, PV.Z, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; +; GFX12-LABEL: constant_sextload_v64i16_to_v64i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40 +; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s65, s15, 16 +; GFX12-NEXT: s_ashr_i32 s66, s14, 16 +; GFX12-NEXT: s_sext_i32_i16 s14, s14 +; GFX12-NEXT: s_sext_i32_i16 s15, s15 +; GFX12-NEXT: s_ashr_i32 s63, s13, 16 +; GFX12-NEXT: s_ashr_i32 s64, s12, 16 +; GFX12-NEXT: s_sext_i32_i16 s13, s13 +; GFX12-NEXT: s_sext_i32_i16 s12, s12 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s66 +; GFX12-NEXT: s_ashr_i32 s61, s11, 16 +; GFX12-NEXT: s_ashr_i32 s62, s10, 16 +; GFX12-NEXT: s_sext_i32_i16 s11, s11 +; GFX12-NEXT: s_sext_i32_i16 s10, s10 +; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s65 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s64 +; GFX12-NEXT: s_ashr_i32 s59, s9, 16 +; GFX12-NEXT: s_ashr_i32 s60, s8, 16 +; GFX12-NEXT: s_sext_i32_i16 s9, s9 +; GFX12-NEXT: s_sext_i32_i16 s8, s8 +; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s63 +; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s62 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s61 +; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s60 +; GFX12-NEXT: s_ashr_i32 s57, s7, 16 +; GFX12-NEXT: s_ashr_i32 s58, s6, 16 +; GFX12-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s59 +; GFX12-NEXT: v_mov_b32_e32 v14, s9 +; GFX12-NEXT: s_sext_i32_i16 s6, s6 +; GFX12-NEXT: s_ashr_i32 s55, s5, 16 +; GFX12-NEXT: s_ashr_i32 s56, s4, 16 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_sext_i32_i16 s4, s4 +; GFX12-NEXT: s_ashr_i32 s53, s3, 16 +; GFX12-NEXT: s_ashr_i32 s54, s2, 16 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s58 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s7 +; GFX12-NEXT: v_mov_b32_e32 v5, s56 +; GFX12-NEXT: s_ashr_i32 s51, s1, 16 +; GFX12-NEXT: s_ashr_i32 s52, s0, 16 +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_sext_i32_i16 s0, s0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: 
v_dual_mov_b32 v7, s55 +; GFX12-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v9, s54 +; GFX12-NEXT: s_ashr_i32 s49, s31, 16 +; GFX12-NEXT: s_ashr_i32 s50, s30, 16 +; GFX12-NEXT: s_sext_i32_i16 s31, s31 +; GFX12-NEXT: s_sext_i32_i16 s30, s30 +; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s53 +; GFX12-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s52 +; GFX12-NEXT: s_ashr_i32 s45, s27, 16 +; GFX12-NEXT: s_ashr_i32 s46, s26, 16 +; GFX12-NEXT: s_sext_i32_i16 s27, s27 +; GFX12-NEXT: s_sext_i32_i16 s26, s26 +; GFX12-NEXT: s_ashr_i32 s47, s29, 16 +; GFX12-NEXT: s_ashr_i32 s48, s28, 16 +; GFX12-NEXT: s_sext_i32_i16 s29, s29 +; GFX12-NEXT: s_sext_i32_i16 s28, s28 +; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s51 +; GFX12-NEXT: v_dual_mov_b32 v14, s1 :: v_dual_mov_b32 v17, s50 +; GFX12-NEXT: s_ashr_i32 s43, s25, 16 +; GFX12-NEXT: s_ashr_i32 s44, s24, 16 +; GFX12-NEXT: s_sext_i32_i16 s25, s25 +; GFX12-NEXT: s_sext_i32_i16 s24, s24 +; GFX12-NEXT: v_dual_mov_b32 v16, s30 :: v_dual_mov_b32 v19, s49 +; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v21, s48 +; GFX12-NEXT: s_ashr_i32 s41, s23, 16 +; GFX12-NEXT: s_ashr_i32 s42, s22, 16 +; GFX12-NEXT: s_sext_i32_i16 s23, s23 +; GFX12-NEXT: s_sext_i32_i16 s22, s22 +; GFX12-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v23, s47 +; GFX12-NEXT: v_mov_b32_e32 v22, s29 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v0, s26 +; GFX12-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v2, s27 +; GFX12-NEXT: v_mov_b32_e32 v5, s44 +; GFX12-NEXT: s_ashr_i32 s39, s21, 16 +; GFX12-NEXT: s_ashr_i32 s40, s20, 16 +; GFX12-NEXT: s_sext_i32_i16 s21, s21 +; GFX12-NEXT: s_sext_i32_i16 s20, s20 +; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s43 +; GFX12-NEXT: v_dual_mov_b32 v6, s25 :: v_dual_mov_b32 v9, s42 +; GFX12-NEXT: s_ashr_i32 s35, s19, 16 +; GFX12-NEXT: s_ashr_i32 s38, s18, 16 +; GFX12-NEXT: s_sext_i32_i16 s19, s19 +; GFX12-NEXT: s_sext_i32_i16 s18, s18 +; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s41 +; GFX12-NEXT: v_dual_mov_b32 v10, s23 :: v_dual_mov_b32 v13, s40 +; GFX12-NEXT: s_ashr_i32 s33, s17, 16 +; GFX12-NEXT: s_ashr_i32 s34, s16, 16 +; GFX12-NEXT: s_sext_i32_i16 s17, s17 +; GFX12-NEXT: s_sext_i32_i16 s16, s16 +; GFX12-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v15, s39 +; GFX12-NEXT: v_dual_mov_b32 v14, s21 :: v_dual_mov_b32 v17, s38 +; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s35 +; GFX12-NEXT: v_dual_mov_b32 v18, s19 :: v_dual_mov_b32 v21, s34 +; GFX12-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v23, s33 +; GFX12-NEXT: v_mov_b32_e32 v22, s17 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX12-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(4) %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(1) %out @@ -4630,6 +5426,19 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; EG-NEXT: MOV * T0.Y, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_i16_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = zext i16 %a to i64 store i64 %ext, ptr addrspace(1) %out @@ -4707,6 +5516,21 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) ; EG-NEXT: ASHR * T0.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_i16_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = sext i16 %a to i64 store i64 %ext, ptr addrspace(1) %out @@ -4777,6 +5601,19 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; EG-NEXT: MOV * T0.Y, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v1i16_to_v1i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -4849,6 +5686,21 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) ; EG-NEXT: ASHR * T0.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v1i16_to_v1i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ 
-4927,6 +5779,23 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; EG-NEXT: MOV T4.W, 0.0, ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) +; +; GFX12-LABEL: constant_zextload_v2i16_to_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -5010,6 +5879,23 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) ; EG-NEXT: ASHR * T4.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v2i16_to_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s4, s2, 16 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -5120,6 +6006,28 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: LSHR * T8.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v4i16_to_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s4, 0xffff, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0 +; GFX12-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -5245,6 +6153,31 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; EG-NEXT: LSHR T8.X, PV.W, literal.x, ; 
EG-NEXT: ASHR * T7.Y, PV.X, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v4i16_to_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: s_lshr_b32 s8, s3, 16 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000 +; GFX12-NEXT: s_lshr_b32 s2, s2, 16 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v5, s7 +; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s9 +; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -5421,6 +6354,37 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) ; EG-NEXT: LSHR * T14.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v8i16_to_v8i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s2, 0xffff, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0 +; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0 +; GFX12-NEXT: s_and_b32 s3, 0xffff, s5 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0 +; GFX12-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(4) %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(1) %out @@ -5631,6 +6595,44 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; EG-NEXT: LSHR T7.X, PV.W, literal.x, ; EG-NEXT: ASHR * T14.Y, PV.X, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v8i16_to_v8i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s14, s7 +; GFX12-NEXT: s_lshr_b32 s16, s7, 16 +; GFX12-NEXT: s_bfe_i64 
s[12:13], s[6:7], 0x100000 +; GFX12-NEXT: s_lshr_b32 s6, s6, 16 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 +; GFX12-NEXT: s_mov_b32 s8, s5 +; GFX12-NEXT: s_lshr_b32 s10, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GFX12-NEXT: s_lshr_b32 s4, s4, 16 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v9, s15 +; GFX12-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v11, s17 +; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v13, s9 +; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s11 +; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s5 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(4) %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(1) %out @@ -5940,6 +6942,58 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) ; EG-NEXT: LSHR * T26.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v16i16_to_v16i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s10, s5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10 +; GFX12-NEXT: s_lshr_b32 s5, s5, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_lshr_b32 s5, s4, 16 +; GFX12-NEXT: s_and_b32 s4, s4, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_lshr_b32 s4, s7, 16 +; GFX12-NEXT: s_and_b32 s5, s7, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_lshr_b32 s4, s6, 16 +; GFX12-NEXT: s_and_b32 s5, s6, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_lshr_b32 s4, s3, 16 +; GFX12-NEXT: s_and_b32 s3, s3, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_lshr_b32 s3, s2, 16 +; GFX12-NEXT: s_and_b32 s2, s2, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_lshr_b32 s2, s1, 16 +; GFX12-NEXT: s_and_b32 s1, s1, 0xffff +; GFX12-NEXT: global_store_b128 v1, 
v[0:3], s[8:9] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_lshr_b32 s1, s0, 16 +; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -6324,6 +7378,71 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; EG-NEXT: LSHR T12.X, PV.W, literal.x, ; EG-NEXT: ASHR * T26.Y, PV.X, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v16i16_to_v16i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s30, s5 +; GFX12-NEXT: s_lshr_b32 s34, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000 +; GFX12-NEXT: s_lshr_b32 s4, s4, 16 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 +; GFX12-NEXT: s_mov_b32 s24, s7 +; GFX12-NEXT: s_lshr_b32 s26, s7, 16 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 +; GFX12-NEXT: s_lshr_b32 s6, s6, 16 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s29 +; GFX12-NEXT: s_mov_b32 s18, s3 +; GFX12-NEXT: s_lshr_b32 s20, s3, 16 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31 +; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35 +; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 +; GFX12-NEXT: s_lshr_b32 s12, s0, 16 +; GFX12-NEXT: s_mov_b32 s14, s1 +; GFX12-NEXT: s_lshr_b32 s16, s1, 16 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 +; GFX12-NEXT: s_lshr_b32 s2, s2, 16 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21 +; GFX12-NEXT: v_mov_b32_e32 v18, s20 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14 +; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16 +; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10 +; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: 
v_dual_mov_b32 v22, s12 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -6904,6 +8023,98 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; EG-NEXT: LSHR * T50.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v32i16_to_v32i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_and_b32 s18, s15, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s18 +; GFX12-NEXT: s_lshr_b32 s15, s15, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_lshr_b32 s15, s14, 16 +; GFX12-NEXT: s_and_b32 s14, s14, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:240 +; GFX12-NEXT: v_mov_b32_e32 v0, s14 +; GFX12-NEXT: v_mov_b32_e32 v2, s15 +; GFX12-NEXT: s_lshr_b32 s14, s13, 16 +; GFX12-NEXT: s_and_b32 s13, s13, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:224 +; GFX12-NEXT: v_mov_b32_e32 v0, s13 +; GFX12-NEXT: v_mov_b32_e32 v2, s14 +; GFX12-NEXT: s_lshr_b32 s13, s12, 16 +; GFX12-NEXT: s_and_b32 s12, s12, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:208 +; GFX12-NEXT: v_mov_b32_e32 v0, s12 +; GFX12-NEXT: v_mov_b32_e32 v2, s13 +; GFX12-NEXT: s_lshr_b32 s12, s11, 16 +; GFX12-NEXT: s_and_b32 s11, s11, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:192 +; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: v_mov_b32_e32 v2, s12 +; GFX12-NEXT: s_lshr_b32 s11, s10, 16 +; GFX12-NEXT: s_and_b32 s10, s10, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:176 +; GFX12-NEXT: v_mov_b32_e32 v0, s10 +; GFX12-NEXT: v_mov_b32_e32 v2, s11 +; GFX12-NEXT: s_lshr_b32 s10, s9, 16 +; GFX12-NEXT: s_and_b32 s9, s9, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:160 +; GFX12-NEXT: v_mov_b32_e32 v0, s9 +; GFX12-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-NEXT: s_lshr_b32 s9, s8, 16 +; GFX12-NEXT: s_and_b32 s8, s8, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:144 +; GFX12-NEXT: v_mov_b32_e32 v0, s8 +; GFX12-NEXT: v_mov_b32_e32 v2, s9 +; GFX12-NEXT: s_lshr_b32 s8, s7, 16 +; GFX12-NEXT: s_and_b32 s7, s7, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:128 +; GFX12-NEXT: v_mov_b32_e32 v0, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, s8 +; GFX12-NEXT: s_lshr_b32 s7, s6, 16 +; GFX12-NEXT: s_and_b32 s6, s6, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s7 +; GFX12-NEXT: s_lshr_b32 s6, s5, 16 +; GFX12-NEXT: s_and_b32 s5, s5, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], 
s[16:17] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: s_lshr_b32 s5, s4, 16 +; GFX12-NEXT: s_and_b32 s4, s4, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_lshr_b32 s4, s3, 16 +; GFX12-NEXT: s_and_b32 s3, s3, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_lshr_b32 s3, s2, 16 +; GFX12-NEXT: s_and_b32 s2, s2, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_lshr_b32 s2, s1, 16 +; GFX12-NEXT: s_and_b32 s1, s1, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_lshr_b32 s1, s0, 16 +; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(4) %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out @@ -7640,6 +8851,124 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; EG-NEXT: LSHR T38.X, PV.W, literal.x, ; EG-NEXT: ASHR * T50.Y, PV.X, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v32i16_to_v32i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s28, s2, 16 +; GFX12-NEXT: s_lshr_b32 s42, s5, 16 +; GFX12-NEXT: s_lshr_b32 s52, s8, 16 +; GFX12-NEXT: s_mov_b32 s60, s11 +; GFX12-NEXT: s_lshr_b32 s22, s0, 16 +; GFX12-NEXT: s_mov_b32 s24, s1 +; GFX12-NEXT: s_lshr_b32 s26, s1, 16 +; GFX12-NEXT: s_mov_b32 s30, s3 +; GFX12-NEXT: s_lshr_b32 s36, s3, 16 +; GFX12-NEXT: s_lshr_b32 s38, s4, 16 +; GFX12-NEXT: s_mov_b32 s40, s5 +; GFX12-NEXT: s_lshr_b32 s46, s6, 16 +; GFX12-NEXT: s_mov_b32 s48, s7 +; GFX12-NEXT: s_lshr_b32 s50, s7, 16 +; GFX12-NEXT: s_mov_b32 s54, s9 +; GFX12-NEXT: s_lshr_b32 s56, s9, 16 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[10:11], 0x100000 +; GFX12-NEXT: s_lshr_b32 s58, s10, 16 +; GFX12-NEXT: s_lshr_b32 s62, s11, 16 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[52:53], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[60:61], 0x100000 +; GFX12-NEXT: s_lshr_b32 s60, s14, 16 +; GFX12-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000 +; GFX12-NEXT: s_mov_b32 s14, s15 +; GFX12-NEXT: s_lshr_b32 s66, s15, 16 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[30:31], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[36:37], 0x100000 +; GFX12-NEXT: 
s_bfe_i64 s[26:27], s[38:39], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[40:41], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[46:47], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[48:49], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[50:51], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[54:55], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[56:57], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[58:59], 0x100000 +; GFX12-NEXT: s_lshr_b32 s54, s12, 16 +; GFX12-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 +; GFX12-NEXT: s_mov_b32 s12, s13 +; GFX12-NEXT: s_lshr_b32 s58, s13, 16 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s14 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, s66 +; GFX12-NEXT: v_dual_mov_b32 v4, s67 :: v_dual_mov_b32 v5, s64 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v6, s65 :: v_dual_mov_b32 v7, s60 +; GFX12-NEXT: v_dual_mov_b32 v8, s61 :: v_dual_mov_b32 v9, s12 +; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s58 +; GFX12-NEXT: v_dual_mov_b32 v12, s59 :: v_dual_mov_b32 v13, s56 +; GFX12-NEXT: v_dual_mov_b32 v14, s57 :: v_dual_mov_b32 v15, s54 +; GFX12-NEXT: v_mov_b32_e32 v16, s55 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[16:17] offset:240 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[16:17] offset:224 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[16:17] offset:208 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[16:17] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s52 :: v_dual_mov_b32 v2, s53 +; GFX12-NEXT: v_dual_mov_b32 v3, s12 :: v_dual_mov_b32 v4, s13 +; GFX12-NEXT: v_dual_mov_b32 v5, s44 :: v_dual_mov_b32 v6, s45 +; GFX12-NEXT: v_dual_mov_b32 v7, s50 :: v_dual_mov_b32 v8, s51 +; GFX12-NEXT: v_dual_mov_b32 v9, s46 :: v_dual_mov_b32 v10, s47 +; GFX12-NEXT: v_dual_mov_b32 v11, s48 :: v_dual_mov_b32 v12, s49 +; GFX12-NEXT: v_dual_mov_b32 v13, s34 :: v_dual_mov_b32 v14, s35 +; GFX12-NEXT: v_dual_mov_b32 v15, s42 :: v_dual_mov_b32 v16, s43 +; GFX12-NEXT: v_dual_mov_b32 v17, s40 :: v_dual_mov_b32 v18, s41 +; GFX12-NEXT: v_dual_mov_b32 v19, s38 :: v_dual_mov_b32 v20, s39 +; GFX12-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX12-NEXT: v_dual_mov_b32 v23, s36 :: v_dual_mov_b32 v24, s37 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[16:17] offset:176 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[16:17] offset:160 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[16:17] offset:144 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[16:17] offset:128 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[16:17] offset:112 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[16:17] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s30 :: v_dual_mov_b32 v2, s31 +; GFX12-NEXT: v_dual_mov_b32 v3, s28 :: v_dual_mov_b32 v4, s29 +; GFX12-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX12-NEXT: v_dual_mov_b32 v7, s26 :: v_dual_mov_b32 v8, s27 +; GFX12-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX12-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX12-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX12-NEXT: v_dual_mov_b32 v15, s10 :: v_dual_mov_b32 v16, s11 +; GFX12-NEXT: 
v_dual_mov_b32 v17, s8 :: v_dual_mov_b32 v18, s9 +; GFX12-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v20, s7 +; GFX12-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX12-NEXT: v_dual_mov_b32 v23, s4 :: v_dual_mov_b32 v24, s5 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[16:17] offset:80 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[16:17] offset:64 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(4) %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index ffc2cd23ec251..16f95409055b1 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -5,6 +5,7 @@ ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-HSA %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-HSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i32: @@ -69,6 +70,18 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-HSA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load i32, ptr addrspace(4) %in store i32 %ld, ptr addrspace(1) %out @@ -142,6 +155,19 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <2 x i32>, ptr addrspace(4) %in store <2 x i32> %ld, ptr addrspace(1) %out @@ -226,6 +252,19 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-HSA-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v3i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i32>, ptr addrspace(4) %in store <3 x i32> %ld, ptr addrspace(1) %out @@ -307,6 +346,20 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <4 x i32>, ptr addrspace(4) %in store <4 x i32> %ld, ptr addrspace(1) %out @@ -421,6 +474,24 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] offset:16 ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v8i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <8 x i32>, ptr addrspace(4) %in store <8 x i32> %ld, ptr addrspace(1) %out @@ -562,6 +633,27 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v9i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x20 +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, s12 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: global_store_b32 v8, v9, s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <9 x i32>, ptr addrspace(4) %in store <9 x i32> %ld, ptr addrspace(1) %out @@ -708,6 +800,28 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v10i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20 +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: global_store_b64 v10, v[8:9], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v10, v[4:7], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <10 x i32>, ptr addrspace(4) %in store <10 x i32> %ld, ptr addrspace(1) %out @@ -865,6 +979,28 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX9-HSA-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9] ; GFX9-HSA-NEXT: global_store_dwordx3 v7, v[4:6], s[8:9] offset:32 ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v11i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20 +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v8, s12 +; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v11, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v11, v[4:7], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <11 x i32>, ptr addrspace(4) %in store <11 x i32> %ld, ptr addrspace(1) %out @@ -1019,6 +1155,29 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX9-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[8:9] offset:16 ; GFX9-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[8:9] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v12i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20 +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v12, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14 +; GFX12-NEXT: v_dual_mov_b32 v5, 
s5 :: v_dual_mov_b32 v4, s4 +; GFX12-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 +; GFX12-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 +; GFX12-NEXT: v_dual_mov_b32 v11, s3 :: v_dual_mov_b32 v10, s2 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: global_store_b128 v12, v[0:3], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v12, v[8:11], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <12 x i32>, ptr addrspace(4) %in store <12 x i32> %ld, ptr addrspace(1) %out @@ -1203,6 +1362,30 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v16i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9 +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11 +; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s5 +; GFX12-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v11, s7 +; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s1 +; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s3 +; GFX12-NEXT: v_mov_b32_e32 v14, s2 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i32>, ptr addrspace(4) %in store <16 x i32> %ld, ptr addrspace(1) %out @@ -1276,6 +1459,18 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_zextload_i32_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load i32, ptr addrspace(4) %in %ext = zext i32 %ld to i64 store i64 %ext, ptr addrspace(1) %out @@ -1354,6 +1549,21 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_sextload_i32_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s3, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_store_b64 v2, v[0:1], 
s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load i32, ptr addrspace(4) %in %ext = sext i32 %ld to i64 store i64 %ext, ptr addrspace(1) %out @@ -1427,6 +1637,18 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_zextload_v1i32_to_v1i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <1 x i32>, ptr addrspace(4) %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -1505,6 +1727,21 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_sextload_v1i32_to_v1i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s3, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <1 x i32>, ptr addrspace(4) %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -1589,6 +1826,20 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_zextload_v2i32_to_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <2 x i32>, ptr addrspace(4) %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -1686,6 +1937,23 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_sextload_v2i32_to_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s4, s3, 31 +; GFX12-NEXT: s_ashr_i32 s5, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4 +; 
GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <2 x i32>, ptr addrspace(4) %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -1799,6 +2067,23 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_zextload_v4i32_to_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <4 x i32>, ptr addrspace(4) %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -1938,6 +2223,28 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_sextload_v4i32_to_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s8, s7, 31 +; GFX12-NEXT: s_ashr_i32 s9, s6, 31 +; GFX12-NEXT: s_ashr_i32 s2, s5, 31 +; GFX12-NEXT: s_ashr_i32 s3, s4, 31 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2 +; GFX12-NEXT: v_mov_b32_e32 v6, s5 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <4 x i32>, ptr addrspace(4) %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -2114,6 +2421,29 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[8:9] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_zextload_v8i32_to_v8i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <8 x i32>, ptr addrspace(4) %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, ptr addrspace(1) %out @@ -2350,6 +2680,38 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_sextload_v8i32_to_v8i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s16, s11, 31 +; GFX12-NEXT: s_ashr_i32 s17, s10, 31 +; GFX12-NEXT: s_ashr_i32 s14, s9, 31 +; GFX12-NEXT: s_ashr_i32 s15, s8, 31 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17 +; GFX12-NEXT: s_ashr_i32 s12, s7, 31 +; GFX12-NEXT: s_ashr_i32 s13, s6, 31 +; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15 +; GFX12-NEXT: s_ashr_i32 s2, s5, 31 +; GFX12-NEXT: s_ashr_i32 s3, s4, 31 +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14 +; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12 +; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3 +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_mov_b32_e32 v14, s5 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <8 x i32>, ptr addrspace(4) %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, ptr addrspace(1) %out @@ -2777,6 +3139,59 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX9-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_sextload_v16i32_to_v16i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s28, s11, 31 +; GFX12-NEXT: s_ashr_i32 s29, s10, 31 +; GFX12-NEXT: s_ashr_i32 s33, s15, 31 +; GFX12-NEXT: s_ashr_i32 s34, s14, 31 +; GFX12-NEXT: s_ashr_i32 s26, s9, 31 +; GFX12-NEXT: s_ashr_i32 s27, s8, 31 +; GFX12-NEXT: s_ashr_i32 s30, s13, 31 +; GFX12-NEXT: s_ashr_i32 s31, s12, 31 +; GFX12-NEXT: v_dual_mov_b32 v28, 0 :: v_dual_mov_b32 v1, s34 +; GFX12-NEXT: s_ashr_i32 s24, s7, 31 +; GFX12-NEXT: s_ashr_i32 s25, s6, 31 +; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s33 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31 +; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28 +; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s27 +; GFX12-NEXT: s_ashr_i32 s22, s5, 31 +; GFX12-NEXT: s_ashr_i32 s23, s4, 31 +; 
GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30 +; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s26 +; GFX12-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v17, s25 +; GFX12-NEXT: s_ashr_i32 s20, s3, 31 +; GFX12-NEXT: s_ashr_i32 s21, s2, 31 +; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s24 +; GFX12-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v21, s23 +; GFX12-NEXT: s_ashr_i32 s18, s1, 31 +; GFX12-NEXT: s_ashr_i32 s19, s0, 31 +; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s22 +; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v25, s21 +; GFX12-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v27, s20 +; GFX12-NEXT: v_mov_b32_e32 v26, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v28, v[0:3], s[16:17] offset:112 +; GFX12-NEXT: global_store_b128 v28, v[4:7], s[16:17] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v2, s1 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v28, v[8:11], s[16:17] offset:80 +; GFX12-NEXT: global_store_b128 v28, v[12:15], s[16:17] offset:64 +; GFX12-NEXT: global_store_b128 v28, v[16:19], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v28, v[20:23], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v28, v[24:27], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v28, v[0:3], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <16 x i32>, ptr addrspace(4) %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -3085,6 +3500,41 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_zextload_v16i32_to_v16i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s14 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s15 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112 +; GFX12-NEXT: v_mov_b32_e32 v0, s12 +; GFX12-NEXT: v_mov_b32_e32 v2, s13 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v0, s10 +; GFX12-NEXT: v_mov_b32_e32 v2, s11 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80 +; GFX12-NEXT: v_mov_b32_e32 v0, s8 +; GFX12-NEXT: v_mov_b32_e32 v2, s9 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s7 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <16 x i32>, ptr addrspace(4) %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -3902,6 +4352,104 @@ define 
amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s33 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_sextload_v32i32_to_v32i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 +; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s49, s15, 31 +; GFX12-NEXT: s_ashr_i32 s64, s31, 31 +; GFX12-NEXT: s_ashr_i32 s65, s30, 31 +; GFX12-NEXT: s_ashr_i32 s62, s29, 31 +; GFX12-NEXT: s_ashr_i32 s63, s28, 31 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s65 +; GFX12-NEXT: s_ashr_i32 s60, s27, 31 +; GFX12-NEXT: s_ashr_i32 s61, s26, 31 +; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, s64 +; GFX12-NEXT: v_dual_mov_b32 v2, s31 :: v_dual_mov_b32 v5, s63 +; GFX12-NEXT: s_ashr_i32 s58, s25, 31 +; GFX12-NEXT: s_ashr_i32 s59, s24, 31 +; GFX12-NEXT: v_dual_mov_b32 v4, s28 :: v_dual_mov_b32 v7, s62 +; GFX12-NEXT: v_dual_mov_b32 v6, s29 :: v_dual_mov_b32 v9, s61 +; GFX12-NEXT: v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v11, s60 +; GFX12-NEXT: v_dual_mov_b32 v10, s27 :: v_dual_mov_b32 v13, s59 +; GFX12-NEXT: s_ashr_i32 s57, s23, 31 +; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s58 +; GFX12-NEXT: s_ashr_i32 s24, s22, 31 +; GFX12-NEXT: v_mov_b32_e32 v14, s25 +; GFX12-NEXT: s_ashr_i32 s55, s21, 31 +; GFX12-NEXT: s_ashr_i32 s56, s20, 31 +; GFX12-NEXT: s_ashr_i32 s53, s19, 31 +; GFX12-NEXT: s_ashr_i32 s54, s18, 31 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s22 +; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s23 +; GFX12-NEXT: v_mov_b32_e32 v5, s56 +; GFX12-NEXT: s_ashr_i32 s51, s17, 31 +; GFX12-NEXT: s_ashr_i32 s52, s16, 31 +; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s55 +; GFX12-NEXT: v_dual_mov_b32 v6, s21 :: v_dual_mov_b32 v9, s54 +; GFX12-NEXT: s_ashr_i32 s50, s14, 31 +; GFX12-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v11, s53 +; GFX12-NEXT: v_dual_mov_b32 v10, s19 :: v_dual_mov_b32 v13, s52 +; GFX12-NEXT: s_ashr_i32 s45, s11, 31 +; GFX12-NEXT: s_ashr_i32 s46, s10, 31 +; GFX12-NEXT: s_ashr_i32 s47, s13, 31 +; GFX12-NEXT: s_ashr_i32 s48, s12, 31 +; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s51 +; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v17, s50 +; GFX12-NEXT: s_ashr_i32 s43, s9, 31 +; GFX12-NEXT: s_ashr_i32 s44, s8, 31 +; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s49 +; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v21, s48 +; GFX12-NEXT: s_ashr_i32 s41, s7, 31 +; GFX12-NEXT: s_ashr_i32 s42, s6, 31 +; GFX12-NEXT: v_dual_mov_b32 v20, s12 :: v_dual_mov_b32 v23, s47 +; GFX12-NEXT: v_mov_b32_e32 v22, s13 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:112 +; GFX12-NEXT: 
global_store_b128 v24, v[20:23], s[36:37] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v0, s10 +; GFX12-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v2, s11 +; GFX12-NEXT: v_mov_b32_e32 v5, s44 +; GFX12-NEXT: s_ashr_i32 s39, s5, 31 +; GFX12-NEXT: s_ashr_i32 s40, s4, 31 +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s43 +; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s42 +; GFX12-NEXT: s_ashr_i32 s35, s3, 31 +; GFX12-NEXT: s_ashr_i32 s38, s2, 31 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s41 +; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s40 +; GFX12-NEXT: s_ashr_i32 s33, s1, 31 +; GFX12-NEXT: s_ashr_i32 s34, s0, 31 +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s39 +; GFX12-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v17, s38 +; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s35 +; GFX12-NEXT: v_dual_mov_b32 v18, s3 :: v_dual_mov_b32 v21, s34 +; GFX12-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s33 +; GFX12-NEXT: v_mov_b32_e32 v22, s1 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <32 x i32>, ptr addrspace(4) %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out @@ -4480,6 +5028,67 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[36:37] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_zextload_v32i32_to_v32i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, s31 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:240 +; GFX12-NEXT: v_mov_b32_e32 v0, s28 +; GFX12-NEXT: v_mov_b32_e32 v2, s29 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:224 +; GFX12-NEXT: v_mov_b32_e32 v0, s26 +; GFX12-NEXT: v_mov_b32_e32 v2, s27 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:208 +; GFX12-NEXT: v_mov_b32_e32 v0, s24 +; GFX12-NEXT: v_mov_b32_e32 v2, s25 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:192 +; GFX12-NEXT: v_mov_b32_e32 v0, s22 +; GFX12-NEXT: v_mov_b32_e32 v2, s23 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:176 +; GFX12-NEXT: v_mov_b32_e32 v0, s20 +; GFX12-NEXT: v_mov_b32_e32 v2, s21 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:160 +; GFX12-NEXT: v_mov_b32_e32 v0, s18 +; GFX12-NEXT: v_mov_b32_e32 v2, s19 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:144 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: v_mov_b32_e32 v2, s17 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:128 +; GFX12-NEXT: v_mov_b32_e32 v0, s14 +; GFX12-NEXT: v_mov_b32_e32 v2, s15 +; GFX12-NEXT: 
global_store_b128 v1, v[0:3], s[36:37] offset:112 +; GFX12-NEXT: v_mov_b32_e32 v0, s12 +; GFX12-NEXT: v_mov_b32_e32 v2, s13 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v0, s10 +; GFX12-NEXT: v_mov_b32_e32 v2, s11 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:80 +; GFX12-NEXT: v_mov_b32_e32 v0, s8 +; GFX12-NEXT: v_mov_b32_e32 v2, s9 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s7 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <32 x i32>, ptr addrspace(4) %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out @@ -4814,6 +5423,44 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] ; GFX9-HSA-NEXT: s_endpgm +; +; GFX12-LABEL: constant_load_v32i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 +; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29 +; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31 +; GFX12-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25 +; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s27 +; GFX12-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v9, s21 +; GFX12-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s17 +; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s19 +; GFX12-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v17, s13 +; GFX12-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v19, s15 +; GFX12-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s9 +; GFX12-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v23, s11 +; GFX12-NEXT: v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v25, s5 +; GFX12-NEXT: v_dual_mov_b32 v24, s4 :: v_dual_mov_b32 v27, s7 +; GFX12-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v29, s1 +; GFX12-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v31, s3 +; GFX12-NEXT: v_mov_b32_e32 v30, s2 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v32, v[0:3], s[36:37] offset:112 +; GFX12-NEXT: global_store_b128 v32, v[4:7], s[36:37] offset:96 +; GFX12-NEXT: global_store_b128 v32, v[8:11], s[36:37] offset:80 +; GFX12-NEXT: global_store_b128 v32, v[12:15], s[36:37] offset:64 +; GFX12-NEXT: global_store_b128 v32, v[16:19], s[36:37] offset:48 +; GFX12-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32 +; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16 +; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load <32 x i32>, ptr addrspace(4) %in store <32 x i32> %ld, ptr 
addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 17061e41b5bb6..9b3830671acbd 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_i64: @@ -59,6 +60,19 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load i64, ptr addrspace(4) %in store i64 %ld, ptr addrspace(1) %out ret void @@ -125,6 +139,20 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v2i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <2 x i64>, ptr addrspace(4) %in store <2 x i64> %ld, ptr addrspace(1) %out @@ -222,6 +250,25 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v3i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9 +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i64>, ptr addrspace(4) %in store <3 x i64> %ld, ptr addrspace(1) %out 
@@ -322,6 +369,24 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 17: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v4i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <4 x i64>, ptr addrspace(4) %in store <4 x i64> %ld, ptr addrspace(1) %out @@ -490,6 +555,30 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 35: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v8i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9 +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11 +; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s5 +; GFX12-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v11, s7 +; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s1 +; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s3 +; GFX12-NEXT: v_mov_b32_e32 v14, s2 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <8 x i64>, ptr addrspace(4) %in store <8 x i64> %ld, ptr addrspace(1) %out @@ -807,6 +896,44 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; EG-NEXT: ALU clause starting at 71: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v16i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 +; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29 +; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31 +; GFX12-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25 +; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s27 +; GFX12-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v9, s21 +; GFX12-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: 
v_dual_mov_b32 v13, s17 +; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s19 +; GFX12-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v17, s13 +; GFX12-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v19, s15 +; GFX12-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s9 +; GFX12-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v23, s11 +; GFX12-NEXT: v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v25, s5 +; GFX12-NEXT: v_dual_mov_b32 v24, s4 :: v_dual_mov_b32 v27, s7 +; GFX12-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v29, s1 +; GFX12-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v31, s3 +; GFX12-NEXT: v_mov_b32_e32 v30, s2 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v32, v[0:3], s[36:37] offset:112 +; GFX12-NEXT: global_store_b128 v32, v[4:7], s[36:37] offset:96 +; GFX12-NEXT: global_store_b128 v32, v[8:11], s[36:37] offset:80 +; GFX12-NEXT: global_store_b128 v32, v[12:15], s[36:37] offset:64 +; GFX12-NEXT: global_store_b128 v32, v[16:19], s[36:37] offset:48 +; GFX12-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32 +; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16 +; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i64>, ptr addrspace(4) %in store <16 x i64> %ld, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 9ebd201879825..f18a34515a826 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s ; TODO: NOT AND define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { @@ -74,6 +75,18 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_i8: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load i8, ptr addrspace(4) %in store i8 %ld, ptr addrspace(1) %out @@ -149,6 +162,18 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v2i8: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <2 x i8>, ptr 
addrspace(4) %in store <2 x i8> %ld, ptr addrspace(1) %out @@ -250,6 +275,20 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, ; EG-NEXT: LSHR * T8.X, T0.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v3i8: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[0:1] offset:2 +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i8>, ptr addrspace(4) %in store <3 x i8> %ld, ptr addrspace(1) %out @@ -308,6 +347,18 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v4i8: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <4 x i8>, ptr addrspace(4) %in store <4 x i8> %ld, ptr addrspace(1) %out @@ -369,6 +420,19 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v8i8: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <8 x i8>, ptr addrspace(4) %in store <8 x i8> %ld, ptr addrspace(1) %out @@ -436,6 +500,20 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_load_v16i8: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i8>, ptr addrspace(4) %in store <16 x i8> %ld, ptr addrspace(1) %out @@ -501,6 +579,18 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
+; +; GFX12-LABEL: constant_zextload_i8_to_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %in %ext = zext i8 %a to i32 store i32 %ext, ptr addrspace(1) %out @@ -567,6 +657,18 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) +; +; GFX12-LABEL: constant_sextload_i8_to_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ld = load i8, ptr addrspace(4) %in %ext = sext i8 %ld to i32 store i32 %ext, ptr addrspace(1) %out @@ -632,6 +734,18 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v1i8_to_v1i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i8>, ptr addrspace(4) %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(1) %out @@ -698,6 +812,18 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) +; +; GFX12-LABEL: constant_sextload_v1i8_to_v1i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i8>, ptr addrspace(4) %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(1) %out @@ -782,6 +908,22 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; EG-NEXT: AND_INT T4.X, T0.W, literal.x, ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, ; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) +; +; GFX12-LABEL: constant_zextload_v2i8_to_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: 
s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(1) %out @@ -866,6 +1008,22 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) ; EG-NEXT: BFE_INT * T4.Y, PV.W, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v2i8_to_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(1) %out @@ -951,6 +1109,23 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LSHR * T7.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v3i8_to_v3i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 +; GFX12-NEXT: s_and_b32 s3, s2, 0xff +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i8>, ptr addrspace(4) %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -1038,6 +1213,24 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; EG-NEXT: LSHR T4.X, PS, literal.x, ; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; +; GFX12-LABEL: constant_sextload_v3i8_to_v3i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 +; GFX12-NEXT: s_sext_i32_i8 s3, s2 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i8>, ptr addrspace(4) %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -1126,6 +1319,25 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; EG-NEXT: AND_INT T4.X, T4.X, literal.x, ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, ; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) +; +; GFX12-LABEL: constant_zextload_v4i8_to_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 +; GFX12-NEXT: s_lshr_b32 s3, s2, 24 +; GFX12-NEXT: s_and_b32 s4, s2, 0xff +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i8>, ptr addrspace(4) %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(1) %out @@ -1216,6 +1428,25 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, ; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; +; GFX12-LABEL: constant_sextload_v4i8_to_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 +; GFX12-NEXT: s_ashr_i32 s3, s2, 24 +; GFX12-NEXT: s_sext_i32_i8 s4, s2 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i8>, ptr addrspace(4) %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(1) %out @@ -1349,6 +1580,32 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) ; EG-NEXT: LSHR * T8.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v8i8_to_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 +; GFX12-NEXT: s_lshr_b32 s5, s2, 24 +; GFX12-NEXT: s_and_b32 s7, s2, 0xff +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: s_lshr_b32 s4, s3, 24 +; GFX12-NEXT: s_and_b32 s6, s3, 0xff +; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5 +; GFX12-NEXT: v_mov_b32_e32 v6, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(1) %out @@ -1487,6 +1744,34 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; EG-NEXT: LSHR T8.X, PS, literal.x, ; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; +; GFX12-LABEL: 
constant_sextload_v8i8_to_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 +; GFX12-NEXT: s_ashr_i32 s6, s2, 24 +; GFX12-NEXT: s_sext_i32_i8 s7, s2 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX12-NEXT: s_ashr_i32 s4, s3, 24 +; GFX12-NEXT: s_bfe_i32 s5, s3, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s3, s3 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v4, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s5 +; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(1) %out @@ -1711,6 +1996,46 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LSHR * T14.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v16i8_to_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 +; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5 +; GFX12-NEXT: s_lshr_b32 s8, s6, 24 +; GFX12-NEXT: s_lshr_b32 s9, s7, 24 +; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4 +; GFX12-NEXT: s_and_b32 s12, s6, 0xff +; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX12-NEXT: s_and_b32 s13, s7, 0xff +; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX12-NEXT: s_and_b32 s11, s5, 0xff +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s9 +; GFX12-NEXT: s_lshr_b32 s3, s5, 24 +; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3 +; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: s_lshr_b32 s2, s4, 24 +; GFX12-NEXT: s_and_b32 s10, s4, 0xff +; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5 +; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9 +; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_and_b32 v13, 0xffff, v13 +; GFX12-NEXT: v_mov_b32_e32 v14, s4 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(1) %out @@ -1947,6 +2272,50 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; EG-NEXT: LSHR T14.X, PS, literal.x, ; EG-NEXT: BFE_INT * T12.Y, PV.W, 
0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; +; GFX12-LABEL: constant_sextload_v16i8_to_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 +; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5 +; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4 +; GFX12-NEXT: s_ashr_i32 s12, s7, 24 +; GFX12-NEXT: s_sext_i32_i8 s13, s7 +; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80010 +; GFX12-NEXT: s_ashr_i32 s10, s6, 24 +; GFX12-NEXT: s_bfe_i32 s11, s6, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s6, s6 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s12 +; GFX12-NEXT: s_ashr_i32 s8, s5, 24 +; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8 +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: s_ashr_i32 s2, s4, 24 +; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s4, s4 +; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_mov_b32_e32 v6, s11 +; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v8, s5 +; GFX12-NEXT: v_mov_b32_e32 v10, s9 +; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v12, s4 +; GFX12-NEXT: v_mov_b32_e32 v14, s3 +; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(1) %out @@ -2352,6 +2721,77 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) ; EG-NEXT: LSHR * T26.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v32i8_to_v32i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10 +; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9 +; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 +; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7 +; GFX12-NEXT: s_lshr_b32 s15, s9, 24 +; GFX12-NEXT: s_lshr_b32 s17, s11, 24 +; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6 +; GFX12-NEXT: s_and_b32 s23, s9, 0xff +; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX12-NEXT: s_and_b32 s25, s11, 0xff +; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX12-NEXT: s_lshr_b32 s14, s8, 24 +; GFX12-NEXT: s_lshr_b32 s16, s10, 24 +; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5 +; GFX12-NEXT: s_and_b32 s22, s8, 0xff +; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX12-NEXT: s_and_b32 s24, s10, 0xff +; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s17 +; GFX12-NEXT: s_lshr_b32 s13, s7, 24 +; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4 +; GFX12-NEXT: s_and_b32 s21, s7, 0xff +; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16 +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13 +; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_and_b32 v25, 0xffff, v11 +; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_and_b32 v29, 0xffff, v10 +; GFX12-NEXT: v_dual_mov_b32 v24, s21 :: v_dual_and_b32 v9, 0xffff, v9 +; GFX12-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v11, s15 +; GFX12-NEXT: v_mov_b32_e32 v26, s7 +; GFX12-NEXT: s_lshr_b32 s12, s6, 24 +; GFX12-NEXT: s_and_b32 s20, s6, 0xff +; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14 +; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12 +; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20 +; GFX12-NEXT: s_lshr_b32 s3, s5, 24 +; GFX12-NEXT: s_and_b32 s19, s5, 0xff +; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v27, s13 :: v_dual_mov_b32 v22, s6 +; GFX12-NEXT: s_lshr_b32 s2, s4, 24 +; GFX12-NEXT: s_and_b32 s18, s4, 0xff +; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19 +; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_mov_b32_e32 v14, s4 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(1) %out @@ -2788,6 +3228,84 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; EG-NEXT: LSHR T26.X, PS, literal.x, ; EG-NEXT: BFE_INT * T24.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; +; GFX12-LABEL: constant_sextload_v32i8_to_v32i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11 +; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9 +; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 +; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10 +; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6 +; GFX12-NEXT: s_ashr_i32 s20, s9, 24 +; GFX12-NEXT: s_bfe_i32 s21, s9, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s9, s9 +; GFX12-NEXT: s_ashr_i32 s24, s11, 24 +; GFX12-NEXT: s_sext_i32_i8 s25, s11 +; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80010 +; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5 +; GFX12-NEXT: s_ashr_i32 s18, s8, 24 +; GFX12-NEXT: s_bfe_i32 s19, s8, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s8, s8 +; GFX12-NEXT: s_ashr_i32 s22, s10, 24 +; GFX12-NEXT: s_bfe_i32 s23, s10, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s10, s10 +; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s24 +; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4 +; GFX12-NEXT: s_ashr_i32 
s12, s5, 24 +; GFX12-NEXT: s_ashr_i32 s14, s6, 24 +; GFX12-NEXT: s_ashr_i32 s16, s7, 24 +; GFX12-NEXT: s_bfe_i32 s17, s7, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s7, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22 +; GFX12-NEXT: v_mov_b32_e32 v2, s11 +; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14 +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8 +; GFX12-NEXT: v_bfe_i32 v29, v10, 0, 8 +; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8 +; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s12 +; GFX12-NEXT: v_mov_b32_e32 v11, s20 +; GFX12-NEXT: s_ashr_i32 s2, s4, 24 +; GFX12-NEXT: s_bfe_i32 s15, s6, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s6, s6 +; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18 +; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16 +; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_mov_b32_e32 v30, s19 +; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s5, s5 +; GFX12-NEXT: v_mov_b32_e32 v24, s7 +; GFX12-NEXT: v_mov_b32_e32 v26, s17 +; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s4, s4 +; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v20, s6 +; GFX12-NEXT: v_mov_b32_e32 v22, s15 +; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v16, s5 +; GFX12-NEXT: v_mov_b32_e32 v18, s13 +; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v12, s4 +; GFX12-NEXT: v_mov_b32_e32 v14, s3 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(1) %out @@ -3565,6 +4083,140 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00) ; EG-NEXT: LSHR * T50.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v64i8_to_v64i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v0, 8, s15 +; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 +; GFX12-NEXT: s_lshr_b32 s34, s15, 24 +; GFX12-NEXT: s_and_b32 s50, s15, 0xff +; GFX12-NEXT: s_bfe_u32 s15, s15, 0x80010 +; GFX12-NEXT: s_lshr_b32 s27, s9, 24 +; GFX12-NEXT: v_lshrrev_b16 v2, 8, s14 +; GFX12-NEXT: v_lshrrev_b16 v3, 8, s13 +; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11 +; GFX12-NEXT: v_lshrrev_b16 v8, 8, s9 +; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6 +; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5 +; GFX12-NEXT: v_lshrrev_b16 v15, 8, s4 +; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3 +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0 +; GFX12-NEXT: v_dual_mov_b32 v52, 0 :: v_dual_and_b32 v5, 0xffff, v5 +; GFX12-NEXT: 
v_dual_mov_b32 v48, s50 :: v_dual_and_b32 v9, 0xffff, v9 +; GFX12-NEXT: v_mov_b32_e32 v50, s15 +; GFX12-NEXT: s_and_b32 s44, s9, 0xff +; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v36, s9 :: v_dual_and_b32 v49, 0xffff, v0 +; GFX12-NEXT: v_mov_b32_e32 v51, s34 +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s12 +; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 +; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7 +; GFX12-NEXT: s_lshr_b32 s31, s13, 24 +; GFX12-NEXT: s_lshr_b32 s33, s14, 24 +; GFX12-NEXT: s_and_b32 s43, s8, 0xff +; GFX12-NEXT: s_and_b32 s48, s13, 0xff +; GFX12-NEXT: s_and_b32 s49, s14, 0xff +; GFX12-NEXT: s_bfe_u32 s14, s14, 0x80010 +; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX12-NEXT: s_lshr_b32 s30, s12, 24 +; GFX12-NEXT: s_and_b32 s47, s12, 0xff +; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010 +; GFX12-NEXT: s_lshr_b32 s26, s8, 24 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: v_dual_mov_b32 v44, s49 :: v_dual_and_b32 v13, 0xffff, v13 +; GFX12-NEXT: v_dual_mov_b32 v46, s14 :: v_dual_and_b32 v17, 0xffff, v15 +; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX12-NEXT: v_and_b32_e32 v23, 0xffff, v12 +; GFX12-NEXT: s_and_b32 s42, s7, 0xff +; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v34, s44 :: v_dual_and_b32 v45, 0xffff, v2 +; GFX12-NEXT: v_dual_mov_b32 v47, s33 :: v_dual_mov_b32 v30, s43 +; GFX12-NEXT: global_store_b128 v52, v[48:51], s[16:17] offset:240 +; GFX12-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_and_b32 v49, 0xffff, v3 +; GFX12-NEXT: v_mov_b32_e32 v48, s48 +; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s31 +; GFX12-NEXT: v_mov_b32_e32 v26, s42 +; GFX12-NEXT: s_lshr_b32 s25, s7, 24 +; GFX12-NEXT: s_lshr_b32 s28, s10, 24 +; GFX12-NEXT: v_lshrrev_b16 v7, 8, s10 +; GFX12-NEXT: v_dual_mov_b32 v40, s47 :: v_dual_and_b32 v27, 0xffff, v11 +; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v42, s12 :: v_dual_and_b32 v31, 0xffff, v10 +; GFX12-NEXT: s_and_b32 s45, s10, 0xff +; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010 +; GFX12-NEXT: v_and_b32_e32 v41, 0xffff, v4 +; GFX12-NEXT: v_dual_mov_b32 v43, s30 :: v_dual_mov_b32 v28, s7 +; GFX12-NEXT: s_lshr_b32 s29, s11, 24 +; GFX12-NEXT: s_and_b32 s46, s11, 0xff +; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX12-NEXT: s_lshr_b32 s24, s6, 24 +; GFX12-NEXT: s_and_b32 s41, s6, 0xff +; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v8 +; GFX12-NEXT: global_store_b128 v52, v[44:47], s[16:17] offset:224 +; GFX12-NEXT: v_mov_b32_e32 v46, s29 +; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v6 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v52, v[48:51], s[16:17] offset:208 +; GFX12-NEXT: global_store_b128 v52, v[40:43], s[16:17] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v43, s46 :: v_dual_mov_b32 v22, s41 +; GFX12-NEXT: v_dual_mov_b32 v45, s11 :: v_dual_mov_b32 v24, s6 +; GFX12-NEXT: s_and_b32 s40, s5, 0xff +; GFX12-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_and_b32 v39, 0xffff, v7 +; GFX12-NEXT: v_dual_mov_b32 v40, s10 :: v_dual_mov_b32 v41, s28 +; GFX12-NEXT: v_mov_b32_e32 v20, s40 +; GFX12-NEXT: s_lshr_b32 s23, s5, 24 +; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX12-NEXT: v_mov_b32_e32 v37, s27 +; GFX12-NEXT: s_lshr_b32 s22, s4, 24 +; GFX12-NEXT: s_and_b32 s38, s3, 0xff +; GFX12-NEXT: s_and_b32 s39, s4, 0xff +; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v33, s26 :: v_dual_mov_b32 v16, s39 +; GFX12-NEXT: v_dual_mov_b32 v29, s25 :: v_dual_mov_b32 v18, s4 +; GFX12-NEXT: 
s_lshr_b32 s21, s3, 24 +; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v12, s38 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v52, v[43:46], s[16:17] offset:176 +; GFX12-NEXT: global_store_b128 v52, v[38:41], s[16:17] offset:160 +; GFX12-NEXT: global_store_b128 v52, v[34:37], s[16:17] offset:144 +; GFX12-NEXT: global_store_b128 v52, v[30:33], s[16:17] offset:128 +; GFX12-NEXT: global_store_b128 v52, v[26:29], s[16:17] offset:112 +; GFX12-NEXT: global_store_b128 v52, v[22:25], s[16:17] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s23 +; GFX12-NEXT: v_mov_b32_e32 v14, s3 +; GFX12-NEXT: s_lshr_b32 s20, s2, 24 +; GFX12-NEXT: s_and_b32 s37, s2, 0xff +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v8, s37 +; GFX12-NEXT: s_lshr_b32 s19, s1, 24 +; GFX12-NEXT: s_and_b32 s36, s1, 0xff +; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v15, s21 :: v_dual_mov_b32 v10, s2 +; GFX12-NEXT: s_lshr_b32 s18, s0, 24 +; GFX12-NEXT: s_and_b32 s35, s0, 0xff +; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v4, s36 +; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s19 +; GFX12-NEXT: v_dual_mov_b32 v0, s35 :: v_dual_mov_b32 v3, s18 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v52, v[20:23], s[16:17] offset:80 +; GFX12-NEXT: global_store_b128 v52, v[16:19], s[16:17] offset:64 +; GFX12-NEXT: global_store_b128 v52, v[12:15], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v52, v[8:11], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v52, v[4:7], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v52, v[0:3], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <64 x i8>, ptr addrspace(4) %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(1) %out @@ -4396,6 +5048,151 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: LSHR T50.X, PS, literal.x, ; EG-NEXT: BFE_INT * T48.Y, PV.W, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; +; GFX12-LABEL: constant_sextload_v64i8_to_v64i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v0, 8, s15 +; GFX12-NEXT: s_ashr_i32 s49, s15, 24 +; GFX12-NEXT: s_bfe_i32 s50, s15, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s15, s15 +; GFX12-NEXT: v_lshrrev_b16 v2, 8, s14 +; GFX12-NEXT: v_lshrrev_b16 v3, 8, s13 +; GFX12-NEXT: v_dual_mov_b32 v54, 0 :: v_dual_mov_b32 v45, s49 +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s12 +; GFX12-NEXT: s_ashr_i32 s45, s13, 24 +; GFX12-NEXT: s_ashr_i32 s47, s14, 24 +; GFX12-NEXT: v_bfe_i32 v43, v0, 0, 8 +; GFX12-NEXT: v_dual_mov_b32 v42, s15 :: v_dual_mov_b32 v49, s47 +; GFX12-NEXT: v_dual_mov_b32 v44, s50 :: v_dual_mov_b32 v53, s45 +; GFX12-NEXT: s_bfe_i32 s46, s13, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s13, s13 +; GFX12-NEXT: s_bfe_i32 s48, s14, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s14, s14 +; GFX12-NEXT: s_ashr_i32 s43, s12, 24 +; GFX12-NEXT: s_bfe_i32 s44, s12, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s12, s12 +; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11 +; GFX12-NEXT: s_ashr_i32 s39, s10, 24 +; GFX12-NEXT: s_ashr_i32 s41, s11, 24 +; GFX12-NEXT: s_bfe_i32 s42, s11, 0x80010 +; 
GFX12-NEXT: v_bfe_i32 v47, v2, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v46, s14 +; GFX12-NEXT: v_bfe_i32 v51, v3, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v50, s13 +; GFX12-NEXT: v_mov_b32_e32 v52, s46 +; GFX12-NEXT: s_sext_i32_i8 s11, s11 +; GFX12-NEXT: v_mov_b32_e32 v48, s48 +; GFX12-NEXT: v_lshrrev_b16 v7, 8, s10 +; GFX12-NEXT: s_ashr_i32 s35, s8, 24 +; GFX12-NEXT: s_ashr_i32 s37, s9, 24 +; GFX12-NEXT: v_bfe_i32 v41, v4, 0, 8 +; GFX12-NEXT: global_store_b128 v54, v[42:45], s[16:17] offset:240 +; GFX12-NEXT: v_dual_mov_b32 v40, s12 :: v_dual_mov_b32 v37, s37 +; GFX12-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v33, s35 +; GFX12-NEXT: v_mov_b32_e32 v43, s43 +; GFX12-NEXT: v_lshrrev_b16 v8, 8, s9 +; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 +; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6 +; GFX12-NEXT: s_bfe_i32 s40, s10, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s10, s10 +; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5 +; GFX12-NEXT: v_lshrrev_b16 v15, 8, s4 +; GFX12-NEXT: s_ashr_i32 s33, s7, 24 +; GFX12-NEXT: s_bfe_i32 s38, s9, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s9, s9 +; GFX12-NEXT: global_store_b128 v54, v[46:49], s[16:17] offset:224 +; GFX12-NEXT: v_bfe_i32 v44, v6, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v45, s42 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v54, v[50:53], s[16:17] offset:208 +; GFX12-NEXT: global_store_b128 v54, v[40:43], s[16:17] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v43, s11 :: v_dual_mov_b32 v46, s41 +; GFX12-NEXT: v_mov_b32_e32 v29, s33 +; GFX12-NEXT: s_ashr_i32 s28, s5, 24 +; GFX12-NEXT: s_ashr_i32 s30, s6, 24 +; GFX12-NEXT: s_bfe_i32 s36, s8, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s8, s8 +; GFX12-NEXT: v_bfe_i32 v39, v7, 0, 8 +; GFX12-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v25, s30 +; GFX12-NEXT: v_dual_mov_b32 v40, s40 :: v_dual_mov_b32 v41, s39 +; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3 +; GFX12-NEXT: s_ashr_i32 s24, s3, 24 +; GFX12-NEXT: s_ashr_i32 s26, s4, 24 +; GFX12-NEXT: s_bfe_i32 s31, s6, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s6, s6 +; GFX12-NEXT: s_bfe_i32 s34, s7, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s7, s7 +; GFX12-NEXT: v_bfe_i32 v35, v8, 0, 8 +; GFX12-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v19, s26 +; GFX12-NEXT: v_mov_b32_e32 v36, s38 +; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2 +; GFX12-NEXT: s_ashr_i32 s18, s0, 24 +; GFX12-NEXT: s_ashr_i32 s20, s1, 24 +; GFX12-NEXT: s_ashr_i32 s22, s2, 24 +; GFX12-NEXT: s_bfe_i32 s29, s5, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s5, s5 +; GFX12-NEXT: v_bfe_i32 v31, v10, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v30, s8 +; GFX12-NEXT: v_dual_mov_b32 v32, s36 :: v_dual_mov_b32 v7, s20 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 +; GFX12-NEXT: s_bfe_i32 s27, s4, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s4, s4 +; GFX12-NEXT: v_bfe_i32 v23, v12, 0, 8 +; GFX12-NEXT: v_bfe_i32 v27, v11, 0, 8 +; GFX12-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v3, s18 +; GFX12-NEXT: v_mov_b32_e32 v28, s34 +; GFX12-NEXT: v_mov_b32_e32 v22, s6 +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0 +; GFX12-NEXT: s_bfe_i32 s25, s3, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s3, s3 +; GFX12-NEXT: v_bfe_i32 v17, v15, 0, 8 +; GFX12-NEXT: v_bfe_i32 v21, v14, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v24, s31 +; GFX12-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v15, s24 +; GFX12-NEXT: v_mov_b32_e32 v11, s22 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v54, v[43:46], s[16:17] offset:176 +; GFX12-NEXT: global_store_b128 v54, v[38:41], s[16:17] offset:160 +; GFX12-NEXT: global_store_b128 v54, v[34:37], s[16:17] offset:144 +; GFX12-NEXT: 
global_store_b128 v54, v[30:33], s[16:17] offset:128 +; GFX12-NEXT: global_store_b128 v54, v[26:29], s[16:17] offset:112 +; GFX12-NEXT: global_store_b128 v54, v[22:25], s[16:17] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s28 +; GFX12-NEXT: s_bfe_i32 s23, s2, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s2, s2 +; GFX12-NEXT: v_mov_b32_e32 v16, s4 +; GFX12-NEXT: v_mov_b32_e32 v18, s27 +; GFX12-NEXT: s_bfe_i32 s21, s1, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s1, s1 +; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v12, s3 +; GFX12-NEXT: v_mov_b32_e32 v14, s25 +; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s0, s0 +; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v10, s23 +; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v4, s1 +; GFX12-NEXT: v_mov_b32_e32 v6, s21 +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s19 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v54, v[20:23], s[16:17] offset:80 +; GFX12-NEXT: global_store_b128 v54, v[16:19], s[16:17] offset:64 +; GFX12-NEXT: global_store_b128 v54, v[12:15], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v54, v[8:11], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v54, v[4:7], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v54, v[0:3], s[16:17] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <64 x i8>, ptr addrspace(4) %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(1) %out @@ -4466,6 +5263,19 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; EG-NEXT: MOV * T0.Y, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_i8_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %in %ext = zext i8 %a to i64 store i64 %ext, ptr addrspace(1) %out @@ -4539,6 +5349,21 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) ; EG-NEXT: ASHR * T0.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_i8_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %in %ext = sext i8 %a to i64 store i64 %ext, ptr addrspace(1) %out @@ -4608,6 +5433,18 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; EG-NEXT: MOV * T0.Y, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: 
constant_zextload_v1i8_to_v1i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i8>, ptr addrspace(4) %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -4681,6 +5518,21 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) ; EG-NEXT: ASHR * T0.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v1i8_to_v1i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i8>, ptr addrspace(4) %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -4772,6 +5624,22 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; EG-NEXT: MOV T4.W, 0.0, ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, ; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) +; +; GFX12-LABEL: constant_zextload_v2i8_to_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v2, 8, v0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -4867,6 +5735,25 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, ; EG-NEXT: ASHR * T4.W, PV.Z, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v2i8_to_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v4, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -4980,6 +5867,29 @@ define 
amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: LSHR * T7.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v4i8_to_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: s_lshr_b32 s4, s2, 24 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_and_b32 s2, s2, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i8>, ptr addrspace(4) %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -5108,6 +6018,32 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; EG-NEXT: ASHR T4.Y, PV.X, literal.y, ; EG-NEXT: ASHR * T5.W, PV.Z, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v4i8_to_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 +; GFX12-NEXT: s_lshr_b32 s4, s2, 16 +; GFX12-NEXT: s_lshr_b32 s6, s2, 24 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i8>, ptr addrspace(4) %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -5288,6 +6224,40 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44) ; EG-NEXT: LSHR * T12.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v8i8_to_v8i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_lshr_b32 s5, s3, 24 +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 +; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 
+; GFX12-NEXT: s_lshr_b32 s4, s2, 24 +; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_and_b32 s2, s2, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: s_and_b32 s2, s3, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(1) %out @@ -5506,6 +6476,46 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; EG-NEXT: ASHR T11.W, PV.Z, literal.y, ; EG-NEXT: ASHR * T7.W, T7.Z, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v7, 8, s3 +; GFX12-NEXT: s_lshr_b32 s6, s3, 16 +; GFX12-NEXT: s_lshr_b32 s8, s2, 16 +; GFX12-NEXT: s_lshr_b32 s10, s2, 24 +; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 8 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 +; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 +; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s13 +; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX12-NEXT: v_mov_b32_e32 v12, s4 +; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(1) %out @@ -5820,6 +6830,61 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) ; EG-NEXT: LSHR * T22.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v16i8_to_v16i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_lshr_b32 s3, s7, 24 +; GFX12-NEXT: s_lshr_b32 s2, s5, 24 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010 +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_lshr_b32 s2, s6, 24 +; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_lshr_b32 s2, s4, 24 +; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_and_b32 s2, s6, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5 +; GFX12-NEXT: s_and_b32 s2, s7, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 +; GFX12-NEXT: s_and_b32 s2, s5, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: s_and_b32 s2, s4, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -6214,6 +7279,72 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; EG-NEXT: ASHR T7.W, PV.Z, literal.y, ; EG-NEXT: ASHR * T13.W, T13.Z, literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v16i8_to_v16i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v10, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v11, 8, s6 +; GFX12-NEXT: v_lshrrev_b16 v21, 8, s5 +; GFX12-NEXT: v_lshrrev_b16 v23, 8, s4 +; GFX12-NEXT: s_lshr_b32 s8, s7, 16 +; GFX12-NEXT: s_lshr_b32 s10, s6, 16 +; GFX12-NEXT: s_lshr_b32 s12, s6, 24 +; GFX12-NEXT: v_bfe_i32 v22, v10, 0, 8 +; GFX12-NEXT: v_bfe_i32 v10, v11, 0, 8 +; GFX12-NEXT: s_lshr_b32 s18, s4, 24 +; GFX12-NEXT: s_mov_b32 s20, s7 +; GFX12-NEXT: s_lshr_b32 s14, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 +; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: v_bfe_i32 v28, v21, 0, 8 +; GFX12-NEXT: 
s_lshr_b32 s16, s4, 16 +; GFX12-NEXT: s_mov_b32 s22, s5 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5 +; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v13, s11 +; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v5, s15 +; GFX12-NEXT: v_bfe_i32 v24, v23, 0, 8 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25 +; GFX12-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21 +; GFX12-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v27, s23 +; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v17, s17 +; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GFX12-NEXT: v_mov_b32_e32 v26, s22 +; GFX12-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19 +; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v30, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v30, v[20:23], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v30, v[12:15], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v30, v[8:11], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v30, v[4:7], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v30, v[26:29], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v30, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v30, v[22:25], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out @@ -6807,6 +7938,105 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; EG-NEXT: LSHR * T42.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v32i8_to_v32i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80010 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10 +; GFX12-NEXT: s_lshr_b32 s11, s7, 24 +; GFX12-NEXT: s_lshr_b32 s10, s5, 24 +; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 +; GFX12-NEXT: s_and_b32 s7, s7, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:240 +; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-NEXT: s_lshr_b32 s10, s3, 24 +; GFX12-NEXT: s_bfe_u32 s11, s3, 0x80010 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: 
global_store_b128 v1, v[0:3], s[8:9] offset:176 +; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-NEXT: s_lshr_b32 s10, s1, 24 +; GFX12-NEXT: s_bfe_u32 s11, s1, 0x80010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112 +; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-NEXT: s_lshr_b32 s10, s6, 24 +; GFX12-NEXT: s_bfe_u32 s11, s6, 0x80010 +; GFX12-NEXT: s_and_b32 s6, s6, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-NEXT: s_lshr_b32 s10, s4, 24 +; GFX12-NEXT: s_bfe_u32 s11, s4, 0x80010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:208 +; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-NEXT: s_lshr_b32 s10, s2, 24 +; GFX12-NEXT: s_bfe_u32 s11, s2, 0x80010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:144 +; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-NEXT: s_lshr_b32 s10, s0, 24 +; GFX12-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: v_mov_b32_e32 v0, s11 +; GFX12-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5 +; GFX12-NEXT: s_and_b32 s5, s5, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:224 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 +; GFX12-NEXT: s_and_b32 s4, s4, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:192 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 +; GFX12-NEXT: s_and_b32 s3, s3, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:160 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2 +; GFX12-NEXT: s_and_b32 s2, s2, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:128 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 +; GFX12-NEXT: s_and_b32 s1, s1, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0 +; GFX12-NEXT: s_and_b32 s0, s0, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out @@ -7567,6 +8797,124 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; EG-NEXT: ASHR T11.W, PV.Z, literal.y, ; EG-NEXT: ASHR * T26.W, T26.Z, 
literal.y, ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) +; +; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6 +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5 +; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2 +; GFX12-NEXT: s_lshr_b32 s24, s7, 16 +; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8 +; GFX12-NEXT: s_lshr_b32 s42, s2, 24 +; GFX12-NEXT: s_mov_b32 s48, s7 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 +; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1 +; GFX12-NEXT: s_lshr_b32 s26, s6, 16 +; GFX12-NEXT: s_lshr_b32 s44, s1, 16 +; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3 +; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24 +; GFX12-NEXT: s_lshr_b32 s28, s6, 24 +; GFX12-NEXT: s_lshr_b32 s30, s5, 16 +; GFX12-NEXT: s_lshr_b32 s40, s2, 16 +; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8 +; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8 +; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8 +; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58 +; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26 +; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48 +; GFX12-NEXT: v_mov_b32_e32 v30, s49 +; GFX12-NEXT: s_lshr_b32 s46, s0, 24 +; GFX12-NEXT: s_mov_b32 s50, s5 +; GFX12-NEXT: s_mov_b32 s52, s3 +; GFX12-NEXT: s_lshr_b32 s34, s4, 16 +; GFX12-NEXT: s_lshr_b32 s36, s4, 24 +; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56 +; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56 +; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX12-NEXT: s_lshr_b32 s38, s3, 16 +; GFX12-NEXT: s_mov_b32 s54, s1 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000 +; GFX12-NEXT: s_lshr_b32 s20, s0, 16 +; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56 +; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28 +; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30 +; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56 +; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34 +; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40 +; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000 +; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GFX12-NEXT: global_store_b128 v0, 
v[33:36], s[8:9] offset:240 +; GFX12-NEXT: v_mov_b32_e32 v33, s44 +; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224 +; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17 +; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4 +; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v17, s14 +; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12 +; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36 +; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38 +; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18 +; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20 +; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2 +; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10 +; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22 +; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6 +; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v0, v[37:40], s[8:9] offset:208 +; GFX12-NEXT: global_store_b128 v0, v[25:28], s[8:9] offset:192 +; GFX12-NEXT: global_store_b128 v0, v[41:44], s[8:9] offset:176 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[8:9] offset:160 +; GFX12-NEXT: global_store_b128 v0, v[45:48], s[8:9] offset:144 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[8:9] offset:128 +; GFX12-NEXT: global_store_b128 v0, v[49:52], s[8:9] offset:112 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[8:9] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v2, s11 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[53:56], s[8:9] offset:80 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[8:9] offset:64 +; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out @@ -7657,6 +9005,18 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_i8_to_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %in %ext = zext i8 %a to i16 store i16 %ext, ptr addrspace(1) %out @@ -7733,6 +9093,18 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_i8_to_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; 
GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %in %ext = sext i8 %a to i16 store i16 %ext, ptr addrspace(1) %out @@ -7807,6 +9179,18 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v1i8_to_v1i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i8>, ptr addrspace(4) %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, ptr addrspace(1) %out @@ -7883,6 +9267,18 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v1i8_to_v1i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <1 x i8>, ptr addrspace(4) %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, ptr addrspace(1) %out @@ -7965,6 +9361,23 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; EG-NEXT: OR_INT T5.X, PS, PV.W, ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_zextload_v2i8_to_v2i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX12-NEXT: v_lshrrev_b16 v1, 8, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, ptr addrspace(1) %out @@ -8061,6 +9474,23 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; EG-NEXT: OR_INT T5.X, PS, PV.W, ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX12-LABEL: constant_sextload_v2i8_to_v2i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX12-NEXT: v_ashrrev_i16 v1, 8, v1 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, ptr addrspace(1) %out @@ -8173,6 +9603,29 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV T5.X, PV.Y, ; EG-NEXT: MOV * T8.X, T4.X, +; +; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s3, s2, 16 +; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2 +; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3 +; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2 +; GFX12-NEXT: s_lshr_b32 s2, s2, 24 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1 +; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i8>, ptr addrspace(4) %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, ptr addrspace(1) %out @@ -8301,6 +9754,28 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV T5.X, PV.Y, ; EG-NEXT: MOV * T8.X, T4.X, +; +; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x80000 +; GFX12-NEXT: s_lshr_b32 s3, s2, 16 +; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2 +; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4 +; GFX12-NEXT: s_ashr_i32 s2, s2, 24 +; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <4 x i8>, ptr addrspace(4) %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, ptr addrspace(1) %out @@ -8470,6 +9945,36 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; EG-NEXT: MOV T5.X, PV.W, ; EG-NEXT: MOV * T12.X, T8.X, ; EG-NEXT: MOV * T12.Z, T4.X, +; +; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s5, s2, 16 +; GFX12-NEXT: s_lshr_b32 s6, s3, 16 +; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2 +; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s3 +; GFX12-NEXT: v_and_b32_e64 v3, 0xff, s6 +; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s5 +; 
GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3 +; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: s_lshr_b32 s4, s2, 24 +; GFX12-NEXT: s_lshr_b32 s2, s3, 24 +; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3 +; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, ptr addrspace(1) %out @@ -8671,6 +10176,35 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; EG-NEXT: MOV T5.X, PV.W, ; EG-NEXT: MOV * T12.X, T8.X, ; EG-NEXT: MOV * T12.Z, T4.X, +; +; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_bfe_i32 s8, s2, 0x80000 +; GFX12-NEXT: s_bfe_i32 s9, s3, 0x80000 +; GFX12-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-NEXT: s_lshr_b32 s7, s3, 16 +; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2 +; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3 +; GFX12-NEXT: s_ashr_i64 s[4:5], s[2:3], 56 +; GFX12-NEXT: v_and_b32_e64 v3, 0xffff, s8 +; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s9 +; GFX12-NEXT: s_ashr_i32 s2, s2, 24 +; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000 +; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3 +; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, ptr addrspace(1) %out @@ -8972,6 +10506,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG-NEXT: MOV * T20.Z, T12.X, ; EG-NEXT: MOV T19.X, T8.X, ; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 +; +; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s3, s6, 16 +; GFX12-NEXT: s_lshr_b32 s9, s7, 16 +; GFX12-NEXT: s_lshr_b32 s11, s4, 16 +; GFX12-NEXT: s_lshr_b32 s13, s5, 16 +; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s5 +; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s4 +; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s7 +; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s6 +; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s9 +; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s3 +; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s13 +; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s11 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6 +; GFX12-NEXT: v_lshrrev_b16 v3, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v0, 8, s4 +; GFX12-NEXT: v_lshrrev_b16 v2, 8, s5 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 +; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX12-NEXT: s_lshr_b32 s2, s6, 24 +; GFX12-NEXT: s_lshr_b32 s8, s7, 24 +; GFX12-NEXT: s_lshr_b32 s10, s4, 24 +; GFX12-NEXT: s_lshr_b32 s12, s5, 24 +; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5 +; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6 +; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7 +; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11 +; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12 +; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9 +; GFX12-NEXT: v_lshl_or_b32 v1, s10, 16, v10 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, ptr addrspace(1) %out @@ -9334,6 +10917,56 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; EG-NEXT: MOV * T20.Z, T12.X, ; EG-NEXT: MOV T19.X, T8.X, ; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 +; +; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s8, s6, 16 +; GFX12-NEXT: v_ashrrev_i16 v5, 8, s6 +; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000 +; GFX12-NEXT: s_lshr_b32 s10, s4, 16 +; GFX12-NEXT: s_lshr_b32 s11, s5, 16 +; GFX12-NEXT: v_ashrrev_i16 v1, 8, s4 +; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5 +; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000 +; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000 +; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 +; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6 +; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000 +; GFX12-NEXT: s_lshr_b32 s9, s7, 16 +; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s4 +; GFX12-NEXT: s_bfe_i32 s3, s11, 0x80000 +; GFX12-NEXT: s_bfe_i32 s4, s10, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v2, 8, s7 +; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5 +; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12 +; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8 +; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6 +; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11 +; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10 +; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000 +; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3 +; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4 +; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7 +; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v11 +; GFX12-NEXT: v_lshl_or_b32 v0, v5, 16, v12 +; GFX12-NEXT: v_lshl_or_b32 v1, v13, 16, v16 +; GFX12-NEXT: v_lshl_or_b32 v7, v9, 16, v14 +; GFX12-NEXT: v_lshl_or_b32 v5, v10, 16, v15 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, ptr addrspace(1) %out @@ -9895,6 +11528,93 @@ define amdgpu_kernel void 
@constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: MOV * T38.Z, T28.X, ; EG-NEXT: MOV T35.X, T24.X, ; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 +; +; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s25, s1, 16 +; GFX12-NEXT: s_lshr_b32 s21, s3, 16 +; GFX12-NEXT: s_lshr_b32 s23, s0, 16 +; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s1 +; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s3 +; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s2 +; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s5 +; GFX12-NEXT: v_and_b32_e64 v13, 0xff, s4 +; GFX12-NEXT: v_and_b32_e64 v14, 0xff, s25 +; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s0 +; GFX12-NEXT: v_and_b32_e64 v15, 0xff, s23 +; GFX12-NEXT: v_and_b32_e64 v17, 0xff, s21 +; GFX12-NEXT: s_lshr_b32 s17, s5, 16 +; GFX12-NEXT: v_lshrrev_b16 v8, 8, s4 +; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5 +; GFX12-NEXT: v_lshrrev_b16 v3, 8, s2 +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s3 +; GFX12-NEXT: v_lshrrev_b16 v2, 8, s1 +; GFX12-NEXT: v_and_b32_e64 v19, 0xff, s17 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_and_b32 v7, 0xffff, v7 +; GFX12-NEXT: v_lshrrev_b16 v0, 8, s0 +; GFX12-NEXT: v_and_b32_e32 v20, 0xffff, v15 +; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX12-NEXT: s_lshr_b32 s11, s6, 16 +; GFX12-NEXT: s_lshr_b32 s13, s7, 16 +; GFX12-NEXT: s_lshr_b32 s24, s1, 24 +; GFX12-NEXT: s_lshr_b32 s15, s4, 16 +; GFX12-NEXT: s_lshr_b32 s20, s3, 24 +; GFX12-NEXT: s_lshr_b32 s19, s2, 16 +; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX12-NEXT: v_lshl_or_b32 v6, v4, 16, v10 +; GFX12-NEXT: v_lshl_or_b32 v4, v3, 16, v11 +; GFX12-NEXT: v_lshl_or_b32 v3, s24, 16, v14 +; GFX12-NEXT: v_lshl_or_b32 v10, v9, 16, v12 +; GFX12-NEXT: v_lshl_or_b32 v8, v8, 16, v13 +; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s7 +; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s6 +; GFX12-NEXT: v_and_b32_e64 v13, 0xff, s13 +; GFX12-NEXT: v_and_b32_e64 v14, 0xff, s11 +; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v7 +; GFX12-NEXT: v_lshl_or_b32 v7, s20, 16, v15 +; GFX12-NEXT: v_and_b32_e64 v15, 0xff, s15 +; GFX12-NEXT: v_and_b32_e64 v18, 0xff, s19 +; GFX12-NEXT: s_lshr_b32 s16, s5, 24 +; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 +; GFX12-NEXT: v_lshl_or_b32 v11, s16, 16, v17 +; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v14 +; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX12-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX12-NEXT: s_lshr_b32 s10, s6, 24 +; GFX12-NEXT: s_lshr_b32 s12, s7, 24 +; GFX12-NEXT: s_lshr_b32 s14, s4, 24 +; GFX12-NEXT: s_lshr_b32 s18, s2, 24 +; GFX12-NEXT: v_lshl_or_b32 v14, v5, 16, v9 +; GFX12-NEXT: v_lshl_or_b32 v12, v1, 16, v12 +; GFX12-NEXT: v_lshl_or_b32 v15, s12, 16, v13 +; GFX12-NEXT: v_lshl_or_b32 v13, s10, 16, v17 +; GFX12-NEXT: s_lshr_b32 s22, s0, 24 +; GFX12-NEXT: v_lshl_or_b32 v9, s14, 16, v19 +; GFX12-NEXT: v_lshl_or_b32 v5, s18, 16, v18 +; GFX12-NEXT: v_lshl_or_b32 v1, s22, 16, v20 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: 
global_store_b128 v16, v[12:15], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, ptr addrspace(1) %out @@ -10582,6 +12302,94 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: MOV * T38.Z, T28.X, ; EG-NEXT: MOV T35.X, T24.X, ; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 +; +; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_lshr_b32 s12, s4, 16 +; GFX12-NEXT: s_lshr_b32 s14, s2, 16 +; GFX12-NEXT: v_ashrrev_i16 v4, 8, s2 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX12-NEXT: s_bfe_i32 s20, s5, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v7, 8, s4 +; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000 +; GFX12-NEXT: s_lshr_b32 s17, s1, 16 +; GFX12-NEXT: s_lshr_b32 s15, s3, 16 +; GFX12-NEXT: s_lshr_b32 s16, s0, 16 +; GFX12-NEXT: v_ashrrev_i16 v0, 8, s1 +; GFX12-NEXT: s_bfe_i32 s18, s1, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0 +; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v5, 8, s5 +; GFX12-NEXT: s_ashr_i64 s[0:1], s[4:5], 56 +; GFX12-NEXT: v_and_b32_e64 v10, 0xffff, s2 +; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s20 +; GFX12-NEXT: s_bfe_i32 s1, s17, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v3, 8, s3 +; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX12-NEXT: s_bfe_i32 s2, s15, 0x80000 +; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1 +; GFX12-NEXT: s_bfe_i32 s1, s12, 0x80000 +; GFX12-NEXT: v_and_b32_e64 v2, 0xffff, s18 +; GFX12-NEXT: v_and_b32_e64 v6, 0xffff, s19 +; GFX12-NEXT: v_and_b32_e64 v8, 0xffff, s3 +; GFX12-NEXT: v_ashrrev_i16 v11, 8, s15 +; GFX12-NEXT: v_and_b32_e64 v13, 0xffff, s4 +; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s2 +; GFX12-NEXT: v_lshl_or_b32 v4, v4, 16, v10 +; GFX12-NEXT: v_lshl_or_b32 v10, v5, 16, v12 +; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s1 +; GFX12-NEXT: s_bfe_i32 s1, s7, 0x80000 +; GFX12-NEXT: s_lshr_b32 s11, s7, 16 +; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s1 +; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000 +; GFX12-NEXT: s_lshr_b32 s10, s6, 16 +; GFX12-NEXT: v_lshl_or_b32 v2, v0, 16, v2 +; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v6 +; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v8 +; GFX12-NEXT: v_lshl_or_b32 v8, v7, 16, v13 +; GFX12-NEXT: v_lshl_or_b32 v7, v11, 16, v15 +; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s1 +; GFX12-NEXT: s_bfe_i32 s1, s11, 0x80000 +; GFX12-NEXT: s_lshr_b32 s13, s5, 16 +; GFX12-NEXT: v_and_b32_e64 v22, 0xffff, s1 +; GFX12-NEXT: s_bfe_i32 s1, s10, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v9, 8, s17 +; GFX12-NEXT: s_bfe_i32 s3, s14, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v11, 8, s7 +; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6 +; GFX12-NEXT: v_ashrrev_i16 v21, 8, s11 +; GFX12-NEXT: v_ashrrev_i16 v23, 8, s10 +; GFX12-NEXT: v_and_b32_e64 v24, 0xffff, s1 +; GFX12-NEXT: s_bfe_i32 s5, s16, 0x80000 +; GFX12-NEXT: v_ashrrev_i16 v1, 8, s12 +; GFX12-NEXT: v_ashrrev_i16 v18, 8, s14 +; GFX12-NEXT: s_bfe_i32 s4, s13, 0x80000 +; GFX12-NEXT: v_and_b32_e64 v20, 0xffff, s3 +; GFX12-NEXT: v_ashrrev_i16 v17, 8, s16 +; GFX12-NEXT: v_and_b32_e64 v19, 0xffff, s5 +; GFX12-NEXT: 
s_pack_ll_b32_b16 s0, s4, s0 +; GFX12-NEXT: v_mov_b32_e32 v16, 0 +; GFX12-NEXT: v_lshl_or_b32 v3, v9, 16, v14 +; GFX12-NEXT: v_lshl_or_b32 v14, v11, 16, v12 +; GFX12-NEXT: v_mov_b32_e32 v11, s0 +; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v15 +; GFX12-NEXT: v_lshl_or_b32 v15, v21, 16, v22 +; GFX12-NEXT: v_lshl_or_b32 v13, v23, 16, v24 +; GFX12-NEXT: v_lshl_or_b32 v9, v1, 16, v5 +; GFX12-NEXT: v_lshl_or_b32 v5, v18, 16, v20 +; GFX12-NEXT: v_lshl_or_b32 v1, v17, 16, v19 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir index 7cd6635a62113..08e0f2e58a369 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX11 +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX12 --- name: merge_s_load_x1_x1 @@ -16,18 +17,76 @@ body: | ... --- -name: merge_s_load_x1_x1_x1_x1 +name: merge_s_load_x1_x2 body: | bb.0: - ; CHECK-LABEL: name: merge_s_load_x1_x1_x1_x1 + ; CHECK-LABEL: name: merge_s_load_x1_x2 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 4, 0 :: (dereferenceable invariant load (s64)) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64)) +... 
+ +--- +name: merge_s_load_x1_x3 +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x1_x3 + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96)) +... + +--- +name: merge_s_load_x1_x1_x1 +body: | + bb.0: + ; GFX11-LABEL: name: merge_s_load_x1_x1_x1 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-LABEL: name: merge_s_load_x1_x1_x1 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) + %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) +... 
+ +--- +name: merge_s_load_x1_x1_x1_x1 +body: | + bb.0: + ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1 + ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1 %0:sgpr_64 = IMPLICIT_DEF %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) @@ -39,23 +98,40 @@ body: | name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 body: | bb.0: - ; CHECK-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY8]].sub0 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1 + ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + 
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY8]].sub0 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1 + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0 + ; GFX11-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1 + ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY4]].sub0 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY4]].sub1 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[COPY1]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub3 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]].sub0_sub1 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub2 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY10]].sub0 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY10]].sub1 %0:sgpr_64 = IMPLICIT_DEF %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) @@ -67,6 +143,24 @@ body: | %8:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 28, 0 :: (dereferenceable invariant load (s32)) ... 
+--- +name: merge_s_load_x2_x1 +body: | + bb.0: + ; GFX11-LABEL: name: merge_s_load_x2_x1 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; GFX12-LABEL: name: merge_s_load_x2_x1 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 + %0:sgpr_64 = IMPLICIT_DEF + %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) +... + --- name: merge_s_load_x2_x2 body: | @@ -101,6 +195,20 @@ body: | %4:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64)) ... +--- +name: merge_s_load_x3_x1 +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x3_x1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 + %0:sgpr_64 = IMPLICIT_DEF + %1:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32)) +... + --- name: merge_s_load_x4_x4 body: | diff --git a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir index 5ff0dbe65b5d1..c739c3caf1eb3 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir @@ -1,8 +1,8 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX10 +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX12 # CHECK-LABEL: name: merge_s_buffer_load_x2 # CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 4) - name: merge_s_buffer_load_x2 tracksRegLiveness: true body: | @@ -17,6 +17,41 @@ body: | ... --- +# CHECK-LABEL: name: merge_s_buffer_load_x1_x2 +# CHECK: S_BUFFER_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32)) +# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 4, 0 :: (dereferenceable invariant load (s64)) +name: merge_s_buffer_load_x1_x2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s64)) + + S_ENDPGM 0 +... 
+--- + +# CHECK-LABEL: name: merge_s_buffer_load_x2_x1 +# GFX10: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64)) +# GFX10: S_BUFFER_LOAD_DWORD_IMM %0, 8, 0 :: (dereferenceable invariant load (s32)) +# GFX12: S_BUFFER_LOAD_DWORDX3_IMM %0, 0, 0 :: (dereferenceable invariant load (s96), align 8) +name: merge_s_buffer_load_x2_x1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + # CHECK-LABEL: name: merge_s_buffer_load_x4 # CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128), align 4) name: merge_s_buffer_load_x4 @@ -35,6 +70,39 @@ body: | ... --- +# CHECK-LABEL: name: merge_s_buffer_load_x1_x3 +# CHECK: S_BUFFER_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32)) +# CHECK: S_BUFFER_LOAD_DWORDX3_IMM %0, 4, 0 :: (dereferenceable invariant load (s96), align 16) +name: merge_s_buffer_load_x1_x3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s96)) + + S_ENDPGM 0 +... +--- + +# CHECK-LABEL: name: merge_s_buffer_load_x3_x1 +# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128)) +name: merge_s_buffer_load_x3_x1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s96)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + # CHECK-LABEL: name: merge_s_buffer_load_x8 # CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4) name: merge_s_buffer_load_x8 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 098a4cbb36ede..4695cadd45aee 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -95,9 +95,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)