diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 3412bb5acf28c..5a2416debb417 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1863,15 +1863,6 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr, SIInstrFlags::FlatScratch); } -// If this matches zero_extend i32:x, return x -static SDValue matchZExtFromI32(SDValue Op) { - if (Op.getOpcode() != ISD::ZERO_EXTEND) - return SDValue(); - - SDValue ExtSrc = Op.getOperand(0); - return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue(); -} - // If this matches *_extend i32:x, return x // Otherwise if the value is I32 returns x. static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, @@ -1890,12 +1881,13 @@ static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, } // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) -bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, - SDValue Addr, - SDValue &SAddr, - SDValue &VOffset, - SDValue &Offset) const { +// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset) +bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, bool &ScaleOffset, + bool NeedIOffset) const { int64_t ImmOffset = 0; + ScaleOffset = false; // Match the immediate offset first, which canonically is moved as low as // possible. @@ -1905,7 +1897,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, int64_t COffsetVal = cast(RHS)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, + if (NeedIOffset && + TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) { Addr = LHS; ImmOffset = COffsetVal; @@ -1915,11 +1908,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, // saddr + large_offset -> saddr + // (voffset = large_offset & ~MaxOffset) + // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( - COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal; + if (NeedIOffset) { + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + } - if (isUInt<32>(RemainderOffset)) { + if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset) + : isUInt<32>(RemainderOffset)) { SDNode *VMov = CurDAG->getMachineNode( AMDGPU::V_MOV_B32_e32, SL, MVT::i32, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); @@ -1946,21 +1942,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, // Match the variable offset. 
if (Addr.getOpcode() == ISD::ADD) { LHS = Addr.getOperand(0); - RHS = Addr.getOperand(1); if (!LHS->isDivergent()) { - // add (i64 sgpr), (zero_extend (i32 vgpr)) - if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + // add (i64 sgpr), (*_extend (i32 vgpr)) + RHS = Addr.getOperand(1); + ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtRHS = matchExtFromI32orI32( + RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { SAddr = LHS; - VOffset = ZextRHS; + VOffset = ExtRHS; } } + RHS = Addr.getOperand(1); if (!SAddr && !RHS->isDivergent()) { - // add (zero_extend (i32 vgpr)), (i64 sgpr) - if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + // add (*_extend (i32 vgpr)), (i64 sgpr) + ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtLHS = matchExtFromI32orI32( + LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { SAddr = RHS; - VOffset = ZextLHS; + VOffset = ExtLHS; } } @@ -1970,6 +1971,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, } } + if (Subtarget->hasScaleOffset() && + (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset() + ? AMDGPUISD::MAD_I64_I32 + : AMDGPUISD::MAD_U64_U32) || + (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 && + CurDAG->SignBitIsZero(Addr.getOperand(0)))) && + Addr.getOperand(0)->isDivergent() && + isa(Addr.getOperand(1)) && + !Addr.getOperand(2)->isDivergent()) { + // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr) + unsigned Size = + (unsigned)cast(N)->getMemoryVT().getFixedSizeInBits() / 8; + ScaleOffset = Addr.getConstantOperandVal(1) == Size; + if (ScaleOffset) { + SAddr = Addr.getOperand(2); + VOffset = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + return true; + } + } + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || isa(Addr)) return false; @@ -1989,10 +2011,12 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const { - if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) return false; - CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(), MVT::i32); return true; } @@ -2000,10 +2024,11 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const { - if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) return false; - unsigned CPolVal = AMDGPU::CPol::GLC; + unsigned CPolVal = (ScaleOffset ? 
AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC; CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32); return true; } @@ -2091,7 +2116,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, - SDValue &Offset) const { + SDValue &Offset, + SDValue &CPol) const { int64_t ImmOffset = 0; SDValue LHS, RHS; @@ -2123,6 +2149,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); + CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); return true; } } @@ -2156,6 +2183,10 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; SAddr = SelectSAddrFI(CurDAG, SAddr); Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); + + bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */); + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index f7c7b3e144758..6123d75d7b616 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -162,7 +162,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, - SDValue &VOffset, SDValue &Offset) const; + SDValue &VOffset, SDValue &Offset, bool &ScaleOffset, + bool NeedIOffset = true) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; @@ -174,7 +175,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const; bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &SAddr, SDValue &Offset) const; + SDValue &SAddr, SDValue &Offset, + SDValue &CPol) const; bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index d2e718c1272f8..877c3ac34d555 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5616,7 +5616,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, - unsigned CPolBits) const { + unsigned CPolBits, + bool NeedIOffset) const { Register Addr = Root.getReg(); Register PtrBase; int64_t ConstOffset; @@ -5627,7 +5628,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0) { - if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, + if (NeedIOffset && + TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) { Addr = PtrBase; ImmOffset = ConstOffset; @@ -5640,11 +5642,15 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // saddr + large_offset -> saddr + // 
(voffset = large_offset & ~MaxOffset) + // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( - ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset; + if (NeedIOffset) { + std::tie(SplitImmOffset, RemainderOffset) = + TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal); + } - if (isUInt<32>(RemainderOffset)) { + if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset) + : isUInt<32>(RemainderOffset)) { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); Register HighBits = @@ -5654,12 +5660,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, HighBits) .addImm(RemainderOffset); + if (NeedIOffset) + return {{ + [=](MachineInstrBuilder &MIB) { + MIB.addReg(PtrBase); + }, // saddr + [=](MachineInstrBuilder &MIB) { + MIB.addReg(HighBits); + }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }, + }}; return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }, }}; } @@ -5691,18 +5707,33 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // It's possible voffset is an SGPR here, but the copy to VGPR will be // inserted later. - if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) { + bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset, + Subtarget->hasSignedGVSOffset()); + if (Register VOffset = matchExtendFromS32OrS32( + PtrBaseOffset, Subtarget->hasSignedGVSOffset())) { + if (NeedIOffset) + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(SAddr); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(VOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(ImmOffset); + }, + [=](MachineInstrBuilder &MIB) { // cpol + MIB.addImm(CPolBits | + (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); + }}}; return {{[=](MachineInstrBuilder &MIB) { // saddr MIB.addReg(SAddr); }, [=](MachineInstrBuilder &MIB) { // voffset MIB.addReg(VOffset); }, - [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(ImmOffset); - }, [=](MachineInstrBuilder &MIB) { // cpol - MIB.addImm(CPolBits); + MIB.addImm(CPolBits | + (ScaleOffset ? 
AMDGPU::CPol::SCAL : 0)); }}}; } } @@ -5723,10 +5754,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) .addImm(0); + if (NeedIOffset) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol + }}; return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol }}; } @@ -5858,22 +5895,32 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) return std::nullopt; + unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */) + ? AMDGPU::CPol::SCAL + : 0; + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { int FI = LHSDef->MI->getOperand(1).getIndex(); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol }}; } + if (!isSGPR(LHS)) + if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI)) + LHS = Def->Reg; + if (!isSGPR(LHS)) return std::nullopt; return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr - [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol }}; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index e58fbb48ffb20..5f7f05c52ad21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -256,7 +256,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { selectScratchOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns - selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const; + selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits, + bool NeedIOffset = true) const; InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 1cc717bb437de..679c55dd0ea48 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -14,7 +14,7 @@ let WantsRoot = true in { def GlobalSAddr : ComplexPattern; def GlobalSAddrGLC : ComplexPattern; def ScratchSAddr : ComplexPattern; - def ScratchSVAddr : ComplexPattern; + def ScratchSVAddr : ComplexPattern; } class True16D16Table { @@ -1443,19 +1443,19 @@ class ScratchStoreSaddrPat ; class ScratchLoadSVaddrPat : GCNPat < - (vt (node 
(ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), - (inst $vaddr, $saddr, $offset, 0) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (inst $vaddr, $saddr, $offset, $cpol) >; class ScratchStoreSVaddrPat : GCNPat < - (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset)), - (inst getVregSrcForVT.ret:$data, $vaddr, $saddr, $offset) + (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)), + (inst getVregSrcForVT.ret:$data, $vaddr, $saddr, $offset, $cpol) >; class ScratchLoadSVaddrPat_D16 : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset), vt:$in)), - (inst $vaddr, $saddr, $offset, 0, $in) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)), + (inst $vaddr, $saddr, $offset, $cpol, $in) >; class ScratchLoadSVaddrPat_D16_t16 : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 407d79a30599b..56851571c6c68 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1167,6 +1167,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasFlatGVSMode() const { return FlatGVSMode; } + // FLAT GLOBAL VOffset is signed + bool hasSignedGVSOffset() const { return GFX1250Insts; } + bool enableSIScheduler() const { return EnableSIScheduler; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 8a80afd4a768f..fa0e4b9c23df3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -257,20 +257,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 15 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; @@ -357,20 +353,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; UNALIGNED_GFX12: ; %bb.0: ; %bb ; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 ; 
UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 -; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_endpgm bb: @@ -937,19 +929,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-NEXT: s_add_co_u32 s0, 0x100, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; @@ -1048,19 +1038,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x100, s0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 -; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_endpgm bb: @@ -1579,19 +1567,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
v_mov_b32_e32 v2, 15 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; @@ -1692,19 +1678,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 -; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_endpgm bb: @@ -4060,9 +4044,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_endpgm ; @@ -4113,9 +4095,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: ; UNALIGNED_GFX12: ; %bb.0: ; %bb ; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_endpgm bb: @@ -4172,9 +4152,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; GFX12-LABEL: 
sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS +; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_endpgm ; @@ -4223,9 +4201,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; UNALIGNED_GFX12: ; %bb.0: ; %bb ; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index d103423ae1675..95504052249e0 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -145,12 +145,13 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: s_wait_xcnt 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] ; GCN-NEXT: s_mov_b32 s0, exec_lo ; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index f54fbbaabe9f5..e6018e413a85d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -95,12 +95,24 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) { } define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) { -; GFX1250-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_mov_b32_e32 v0, 0xff800000 -; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967295 %load = load i8, ptr %gep0 %zext = zext i8 %load to i32 @@ -551,12 +563,21 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff ; Both 64-bit base and 32-bit offset are scalar define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) { -; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %load = load i8, ptr %gep0 @@ -567,12 +588,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, ; Both 64-bit base and 32-bit offset are scalar, with immediate offset. define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) { -; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] offset:-24 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -24 @@ -584,12 +614,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr ; Both components uniform, zext forced to LHS of addressing expression define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) { -; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: ; 
return to shader part epilog +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %sbase.as.int = ptrtoint ptr %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -602,12 +641,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in ; Both components uniform, zext forced to LHS of addressing expression, with immediate offset define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) { -; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_mov_b32_e32 v0, s4 -; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] offset:128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %sbase.as.int = ptrtoint ptr %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -686,33 +734,13 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3 ; Cannot push the shift into 32-bits, and cannot match. 
define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) { -; GFX1250-SDAG-LABEL: flat_load_saddr_f32_natural_addressing: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3] -; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_f32_natural_addressing: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %voffset = load i32, ptr %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset @@ -743,8 +771,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, pt ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog %voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{} @@ -760,8 +787,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400 scale_offset ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog %voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{} @@ -774,33 +800,13 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg ; Range is 1 beyond the limit where we can move the shift into 32-bits. 
define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) { -; GFX1250-SDAG-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3] -; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %voffset = load i32, ptr %voffset.ptr, !range !1, !noundef !{} %zext.offset = zext i32 %voffset to i64 %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index a98df5c97293c..b0e6752386285 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -150,13 +150,11 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -321,15 +319,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; 
GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -494,15 +491,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -664,17 +660,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-GISEL-LABEL: soff2_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -850,13 +844,11 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; 
GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1032,13 +1024,11 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1200,17 +1190,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-GISEL-LABEL: soff4_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1386,13 +1374,11 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 
offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1565,13 +1551,11 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1672,9 +1656,7 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index 5d35adc8cbe0a..79907fd0c60bc 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -482,17 +482,16 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-SDAG-LABEL: test_v7i16_load_store_kernel: ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 ; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 8 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[12:13], 0 -; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0x1 -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] -; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset +; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 ; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7 ; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6 @@ -509,21 +508,20 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-GISEL-LABEL: test_v7i16_load_store_kernel: ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GCN-GISEL-NEXT: s_wait_xcnt 0x0 ; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4 -; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 8 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 10 ; 
GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0x1 -; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] -; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset +; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 ; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4 ; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll new file mode 100644 index 0000000000000..64392a15e9a9b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -0,0 +1,436 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idx32(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b32_idx32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr %p, i32 %idx + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) { +; SDAG-LABEL: flat_load_b32_idxprom_wrong_stride: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] +; SDAG-NEXT: flat_load_b32 v0, v[0:1] +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: flat_load_b32_idxprom_wrong_stride: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] +; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_ioffset(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd + %ld = load i16, ptr 
%arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @flat_load_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom + %ret = load <2 x float>, ptr %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxpromi_ioffset(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b96_idxpromi_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @flat_load_b128_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom + %ret = load <4 x float>, ptr %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] offset:64 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxadd + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +; Note: this is a byte load, there is nothing to scale + +define amdgpu_ps float @flat_load_b8_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: 
flat_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u8 v0, v0, s[0:1] offset:16 +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr %p, i64 %idxadd + %ld = load i8, ptr %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom + %ld = load i16, ptr %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd + %ld = load i16, ptr %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @flat_load_b64_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom + %ret = load <2 x float>, ptr %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b96_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @flat_load_b128_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom + %ret = load <4 x float>, ptr %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + store float 1.0, ptr %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b16_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom + store i16 1, ptr %arrayidx, align 2 + ret void +} + +define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds double, ptr %p, i64 %idxprom + store double 1.0, ptr %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_atomicrmw_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idxprom + atomicrmw add ptr %arrayidx, i32 1 monotonic + ret void +} + +define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %p, i32 %idx) { +; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; SDAG-NEXT: s_mov_b32 s0, exec_lo +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; SDAG-NEXT: s_cbranch_execnz .LBB21_3 +; SDAG-NEXT: ; %bb.1: ; %Flow +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; SDAG-NEXT: s_cbranch_execnz .LBB21_4 +; SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi +; SDAG-NEXT: s_wait_alu 0xfffe +; 
SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: s_branch .LBB21_5 +; SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 1 +; SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_wait_xcnt 0x0 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; SDAG-NEXT: s_cbranch_execz .LBB21_2 +; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; SDAG-NEXT: s_wait_loadcnt 0x0 +; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; SDAG-NEXT: s_wait_xcnt 0x0 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SDAG-NEXT: s_branch .LBB21_5 +; SDAG-NEXT: .LBB21_5: +; +; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GISEL-NEXT: s_mov_b32 s2, exec_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0 +; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_cmpx_ne_u32_e64 s3, v5 +; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GISEL-NEXT: s_cbranch_execnz .LBB21_3 +; GISEL-NEXT: ; %bb.1: ; %Flow +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 +; GISEL-NEXT: s_cbranch_execnz .LBB21_4 +; GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_branch .LBB21_5 +; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GISEL-NEXT: s_wait_xcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 +; GISEL-NEXT: s_cbranch_execz .LBB21_2 +; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GISEL-NEXT: s_wait_alu 0xfffd +; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off +; GISEL-NEXT: s_wait_xcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GISEL-NEXT: s_branch .LBB21_5 +; GISEL-NEXT: .LBB21_5: +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i64, ptr %p, i64 %idxprom + %ret = atomicrmw add ptr %arrayidx, i64 1 monotonic + %ret.cast = bitcast i64 %ret to <2 x float> + ret <2 x float> %ret.cast +} + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll new file mode 100644 index 
0000000000000..faea84e34d7eb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b32_idx32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i32 %idx + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; SDAG-LABEL: global_load_b32_idxprom_wrong_stride: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] +; SDAG-NEXT: global_load_b32 v0, v[0:1], off +; SDAG-NEXT: s_wait_loadcnt 0x0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: global_load_b32_idxprom_wrong_stride: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] +; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd + %ld = load i16, ptr addrspace(1) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext 
i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b96_idxpromi_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd + %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxadd + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +; Note: this is a byte load, there is nothing to scale + +define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: 
global_load_u8 v0, v0, s[0:1] offset:16 +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %idxadd + %ld = load i8, ptr addrspace(1) %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom + %ld = load i16, ptr addrspace(1) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd + %ld = load i16, ptr addrspace(1) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b96_idxprom_range_ioffset: +; GCN: 
; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd + %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_store_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom + store float 1.0, ptr addrspace(1) %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) { +; GCN-LABEL: global_store_b16_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom + store i16 1, ptr addrspace(1) %arrayidx, align 2 + ret void +} + +define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_store_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GCN-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds double, ptr addrspace(1) %p, i64 %idxprom + store double 1.0, ptr addrspace(1) %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_atomicrmw_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %idxprom + atomicrmw add ptr addrspace(1) %arrayidx, i32 1 monotonic + ret void +} + +define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) { +; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1 +; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part 
epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %p, i64 %idxprom + %ret = atomicrmw add ptr addrspace(1) %arrayidx, i64 1 monotonic + %ret.cast = bitcast i64 %ret to <2 x float> + ret <2 x float> %ret.cast +} + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll new file mode 100644 index 0000000000000..27ecc837ea732 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll @@ -0,0 +1,322 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s + +define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) { +; GCN-LABEL: scratch_load_b32_alloca_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %p = alloca [32 x i32], align 4, addrspace(5) + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idx32(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b32_idx32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i32 %idx + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_wrong_stride(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b32_idxprom_wrong_stride: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: scratch_load_b32 v0, v0, s0 +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @scratch_load_b64_idxprom(ptr addrspace(5) align 4 inreg %p, 
i32 %idx) { +; GCN-LABEL: scratch_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @scratch_load_b96_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @scratch_load_b96_idxpromi_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b96_idxpromi_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @scratch_load_b128_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b32 v0, v0, s0 offset:64 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxadd + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b8_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) 
{ +; GCN-LABEL: scratch_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_u8 v0, v0, s0 offset:16 +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr addrspace(5) %p, i64 %idxadd + %ld = load i8, ptr addrspace(5) %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_u16 v0, v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @scratch_load_b64_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <2 x float> %ret +} + +; Multiplication is unsigned here, so we cannot match it. 
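+; (However, the !range metadata above bounds the loaded index to [0, 1024),
+; so its sign bit is known to be zero and the scaled form is still selected
+; in the checks below.)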
+ +define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b96_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @scratch_load_b128_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_store_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: scratch_store_b32 v0, v1, s0 scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + store float 1.0, ptr addrspace(5) %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_store_b16_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: scratch_store_b16 v0, v1, s0 scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom + store i16 1, ptr addrspace(5) %arrayidx, align 2 + ret void +} + +define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_store_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GCN-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds double, ptr addrspace(5) %p, i64 %idxprom + store double 1.0, ptr addrspace(5) %arrayidx, align 4 + ret void +} + +!0 = !{i32 0, i32 1024}