diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index bba7682cd7a0d..08804e4a86358 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1980,12 +1980,31 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
   return true;
 }
 
+// For unbuffered smem loads, it is illegal for the Immediate Offset to be
+// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
+// Handle the case where the Immediate Offset + SOffset is negative.
+bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
+                                                     bool Imm32Only,
+                                                     bool IsBuffer,
+                                                     int64_t ImmOffset) const {
+  if (AMDGPU::hasSMRDSignedImmOffset(*Subtarget) && !IsBuffer && !Imm32Only &&
+      ImmOffset < 0) {
+    KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
+    if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
+      return false;
+  }
+
+  return true;
+}
+
 // Match an immediate (if Offset is not null) or an SGPR (if SOffset is
 // not null) offset. If Imm32Only is true, match only 32-bit immediate
 // offsets available on CI.
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                           SDValue *SOffset, SDValue *Offset,
-                                          bool Imm32Only, bool IsBuffer) const {
+                                          bool Imm32Only, bool IsBuffer,
+                                          bool HasSOffset,
+                                          int64_t ImmOffset) const {
   assert((!SOffset || !Offset) &&
          "Cannot match both soffset and offset at the same time!");
 
@@ -1993,15 +2012,18 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
   if (!C) {
     if (!SOffset)
       return false;
+
     if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
       *SOffset = ByteOffsetNode;
-      return true;
+      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
+                                         ImmOffset);
     }
     if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
       if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
         *SOffset = ByteOffsetNode.getOperand(0);
-        return true;
+        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
+                                           ImmOffset);
       }
     }
     return false;
   }
@@ -2012,8 +2034,8 @@
   SDLoc SL(ByteOffsetNode);
   // GFX9 and GFX10 have signed byte immediate offsets. The immediate
   // offset for S_BUFFER instructions is unsigned.
   int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
-  std::optional<int64_t> EncodedOffset =
-      AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
+  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
+      *Subtarget, ByteOffset, IsBuffer, HasSOffset);
   if (EncodedOffset && Offset && !Imm32Only) {
     *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
     return true;
   }
@@ -2072,13 +2094,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
 // true, match only 32-bit immediate offsets available on CI.
 bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
                                               SDValue *SOffset, SDValue *Offset,
-                                              bool Imm32Only,
-                                              bool IsBuffer) const {
+                                              bool Imm32Only, bool IsBuffer,
+                                              bool HasSOffset,
+                                              int64_t ImmOffset) const {
   if (SOffset && Offset) {
     assert(!Imm32Only && !IsBuffer);
     SDValue B;
-    return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
-           SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
+
+    if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+      return false;
+
+    int64_t ImmOff = 0;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
+      ImmOff = C->getSExtValue();
+
+    return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
+                                ImmOff);
   }
 
   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2097,11 +2128,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
   }
   if (!N0 || !N1)
     return false;
-  if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
+
+  if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+                       ImmOffset)) {
     SBase = N0;
     return true;
   }
-  if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
+  if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+                       ImmOffset)) {
     SBase = N1;
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f987b747c0e21..d145511ccaae4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -143,6 +143,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool isFlatScratchBaseLegal(SDValue Addr) const;
   bool isFlatScratchBaseLegalSV(SDValue Addr) const;
   bool isFlatScratchBaseLegalSVImm(SDValue Addr) const;
+  bool isSOffsetLegalWithImmOffset(SDValue *SOffset, bool Imm32Only,
+                                   bool IsBuffer, int64_t ImmOffset = 0) const;
 
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -185,11 +187,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
                         SDValue *Offset, bool Imm32Only = false,
-                        bool IsBuffer = false) const;
+                        bool IsBuffer = false, bool HasSOffset = false,
+                        int64_t ImmOffset = 0) const;
   SDValue Expand32BitAddress(SDValue Addr) const;
   bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
                             SDValue *Offset, bool Imm32Only = false,
-                            bool IsBuffer = false) const;
+                            bool IsBuffer = false, bool HasSOffset = false,
+                            int64_t ImmOffset = 0) const;
   bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
                   SDValue *Offset, bool Imm32Only = false) const;
   bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -201,6 +205,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
   bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                                SDValue &Offset) const;
+  bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
+                             SDValue &Offset) const;
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
 
   bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e13c13913d4e8..9f238795cbb7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4198,10 +4198,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
     return false;
 
   const GEPInfo &GEPI = AddrInfo[0];
-  std::optional<int64_t> EncodedImm =
-      AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
+  std::optional<int64_t> EncodedImm;
 
   if (SOffset && Offset) {
+    EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
+                                              /*HasSOffset=*/true);
     if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
         AddrInfo.size() > 1) {
       const GEPInfo &GEPI2 = AddrInfo[1];
@@ -4211,6 +4212,17 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
         Base = GEPI2.SgprParts[0];
         *SOffset = OffsetReg;
         *Offset = *EncodedImm;
+        if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
+          return true;
+
+        // For unbuffered smem loads, it is illegal for the Immediate Offset
+        // to be negative if the resulting (Offset + (M0 or SOffset or zero))
+        // is negative. Handle the case where the Immediate Offset + SOffset
+        // is negative.
+        auto SKnown = KB->getKnownBits(*SOffset);
+        if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
+          return false;
+        return true;
       }
     }
@@ -4218,6 +4230,8 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
     return false;
   }
 
+  EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
+                                            /*HasSOffset=*/false);
   if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
     Base = GEPI.SgprParts[0];
     *Offset = *EncodedImm;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8a4a46ce50d1d..25c24c924f0a2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1315,6 +1315,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // of sign-extending.
   bool hasGetPCZeroExtension() const { return GFX12Insts; }
 
+  // \returns true if the target supports signed immediate offset for SMRD
+  // instructions.
+  bool hasSignedSMRDImmOffset() const { return getGeneration() >= GFX9; }
+
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4e0074451aa58..2f10e8b6e9935 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -158,6 +158,12 @@ namespace llvm {
 
 namespace AMDGPU {
 
+/// \returns true if the target supports signed immediate offset for SMRD
+/// instructions.
+bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
+  return isGFX9Plus(ST);
+}
+
 /// \returns True if \p STI is AMDHSA.
 bool isHsaAbi(const MCSubtargetInfo &STI) {
   return STI.getTargetTriple().getOS() == Triple::AMDHSA;
@@ -2874,10 +2880,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
   return isGCN3Encoding(ST) || isGFX10Plus(ST);
 }
 
-static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
-  return isGFX9Plus(ST);
-}
-
 bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
                                       int64_t EncodedOffset) {
   if (isGFX12Plus(ST))
@@ -2912,7 +2914,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
 }
 
 std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
-                                            int64_t ByteOffset, bool IsBuffer) {
+                                            int64_t ByteOffset, bool IsBuffer,
+                                            bool HasSOffset) {
+  // For unbuffered smem loads, it is illegal for the Immediate Offset to be
+  // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
+  // Handle the case where SOffset is not present.
+  if (!IsBuffer && hasSMRDSignedImmOffset(ST) && !HasSOffset && ByteOffset < 0)
+    return std::nullopt;
+
   if (isGFX12Plus(ST)) // 24 bit signed offsets
     return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset)
                                  : std::nullopt;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 943588fe701cc..a326ac927ef6c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1300,6 +1300,7 @@ bool hasVOPD(const MCSubtargetInfo &STI);
 bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
 int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
 unsigned hasKernargPreload(const MCSubtargetInfo &STI);
+bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST);
 
 /// Is Reg - scalar register
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -1472,7 +1473,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
 /// S_LOAD instructions have a signed offset, on other subtargets it is
 /// unsigned. S_BUFFER has an unsigned offset for all subtargets.
 std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
-                                            int64_t ByteOffset, bool IsBuffer);
+                                            int64_t ByteOffset, bool IsBuffer,
+                                            bool HasSOffset = false);
 
 /// \return The encoding that can be used for a 32-bit literal offset in an SMRD
 /// instruction. This is only useful on CI.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index c44477273dad0..504f7697a0fcc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -1234,7 +1234,15 @@ body:             |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -1, 0 :: (load (s32), addrspace 4)
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+    ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+    ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -1
@@ -1304,7 +1312,15 @@ body:             |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -524288, 0 :: (load (s32), addrspace 4)
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+    ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
+    ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -524288
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index 139f82b3dc9f7..9ee0acf2aa2db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -88,11 +88,13 @@ entry:
   ret void
 }
 
-; GFX9_10 can use a signed immediate byte offset
+; GFX9+ can use a signed immediate byte offset, but not without an SGPR offset
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
-; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], -0x4
+; GFX9_10: s_add_u32 s2, s2, -4
+; GFX9_10: s_addc_u32 s3, s3, -1
+; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
   %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index 54dc5b8b9d3dd..41d2360dd5e1e 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -297,9 +297,11 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:  .LBB5_1: ; %loop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s3, s[0:1], -0x190
 ; GFX9-NEXT:    s_add_i32 s2, s2, -1
+; GFX9-NEXT:    s_add_u32 s4, s0, 0xfffffe70
+; GFX9-NEXT:    s_addc_u32 s5, s1, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %end
@@ -307,10 +309,14 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ;
 ; GFX12-LABEL: test_sink_smem_offset_neg400:
 ; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_movk_i32 s4, 0xfe70
+; GFX12-NEXT:    s_mov_b32 s5, -1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:  .LBB5_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], -0x190
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index c69207c0472e7..08da89ec0fb22 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -19,15 +19,31 @@ define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(
 }
 
 define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_s_load_i8_imm:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_i8 s0, s[0:1], -0x64
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GCN-NEXT:    s_endpgm
+; DAG-LABEL: test_s_load_i8_imm:
+; DAG:       ; %bb.0:
+; DAG-NEXT:    s_movk_i32 s2, 0xff9c
+; DAG-NEXT:    s_mov_b32 s3, -1
+; DAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; DAG-NEXT:    s_load_i8 s0, s[0:1], 0x0
+; DAG-NEXT:    s_wait_kmcnt 0x0
+; DAG-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-NEXT:    s_nop 0
+; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_s_load_i8_imm:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_add_co_u32 s0, s0, 0xffffff9c
+; GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; GISEL-NEXT:    s_load_i8 s0, s[0:1], 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 -100
   %ld = load i8, ptr addrspace(4) %gep
   %sext = sext i8 %ld to i32
@@ -195,15 +211,31 @@ define amdgpu_ps void @test_s_load_i16(ptr addrspace(4) inreg %in, ptr addrspace
 }
 
 define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_s_load_i16_imm:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_i16 s0, s[0:1], -0xc8
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GCN-NEXT:    s_endpgm
+; DAG-LABEL: test_s_load_i16_imm:
+; DAG:       ; %bb.0:
+; DAG-NEXT:    s_movk_i32 s2, 0xff38
+; DAG-NEXT:    s_mov_b32 s3, -1
+; DAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; DAG-NEXT:    s_load_i16 s0, s[0:1], 0x0
+; DAG-NEXT:    s_wait_kmcnt 0x0
+; DAG-NEXT:    v_mov_b32_e32 v2, s0
+; DAG-NEXT:    global_store_b32 v[0:1], v2, off
+; DAG-NEXT:    s_nop 0
+; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_s_load_i16_imm:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_add_co_u32 s0, s0, 0xffffff38
+; GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; GISEL-NEXT:    s_load_i16 s0, s[0:1], 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT:    s_endpgm
   %gep = getelementptr i16, ptr addrspace(4) %in, i64 -100
   %ld = load i16, ptr addrspace(4) %gep
   %sext = sext i16 %ld to i32
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index d9cbbc11f9a73..2f7e91faa4184 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -157,12 +157,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: global_load_saddr_i8_offset_neg4096:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x1000
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4096:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0xf000
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4096:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xfffff000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -198,12 +211,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: global_load_saddr_i8_offset_neg4097:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x1001
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4097:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0xefff
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4097:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xffffefff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -239,12 +265,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: global_load_saddr_i8_offset_neg4098:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x1002
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4098:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0xeffe
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4098:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xffffeffe
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -376,12 +415,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: global_load_saddr_i8_offset_neg2048:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x800
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2048:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0xf800
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2048:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xfffff800
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -413,12 +465,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: global_load_saddr_i8_offset_neg2049:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x801
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2049:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0xf7ff
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2049:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xfffff7ff
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -450,12 +515,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: global_load_saddr_i8_offset_neg2050:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x802
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2050:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0xf7fe
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2050:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xfffff7fe
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -525,12 +603,25 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], -0x800000
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_mov_b32 s0, 0xff800000
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0xFFFFFF:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xff800000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
+; GFX12-GISEL-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -8388608
   %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
@@ -1721,12 +1812,29 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_u8 s0, s[2:3], s4 offset:-0x18
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_mov_b32 s5, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[4:5]
+; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0xffe8
+; GFX12-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s2, s4
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xffffffe8
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index 77fd0bc058aca..2b517736ecff3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -53,14 +53,25 @@ entry:
 }
 
 define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: prefetch_data_sgpr_min_offset:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_data s[0:1], -0x800000, null, 0
-; GFX12-NEXT:    s_endpgm
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_min_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_mov_b32 s2, 0xff800000
+; GFX12-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: prefetch_data_sgpr_min_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_min_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xff800000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -215,14 +226,25 @@
 }
 
 define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: prefetch_inst_sgpr_min_offset:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_prefetch_inst s[0:1], -0x800000, null, 0
-; GFX12-NEXT:    s_endpgm
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_mov_b32 s2, 0xff800000
+; GFX12-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: prefetch_inst_sgpr_min_offset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xff800000
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index 4ce9260b8d53d..52db7fea08e05 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -88,11 +88,13 @@ entry:
   ret void
 }
 
-; GFX9_10 can use a signed immediate byte offset
+; GFX9+ can use a signed immediate byte offset, but not without an SGPR offset
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
-; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, -0x4
+; GFX9_10: s_add_u32 s2, s2, -4
+; GFX9_10: s_addc_u32 s3, s3, -1
+; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
   %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1
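
The legality rule that all of the hunks above enforce can be condensed into a small standalone C++ sketch (not part of the patch; trySMEMImmOffset is a hypothetical name, and MinSOffset stands in for the known-bits lower bound of SOffset that the patch obtains via computeKnownBits/getKnownBits):

#include <cstdint>
#include <optional>

// On GFX9+ an unbuffered SMEM load may carry a negative immediate offset
// only if an SOffset operand is present and the effective hardware offset
// (ImmOffset + SOffset) provably stays non-negative. MinSOffset is the
// proven lower bound of SOffset; pass 0 when HasSOffset is false.
std::optional<int64_t> trySMEMImmOffset(int64_t ImmOffset, bool IsBuffer,
                                        bool HasSOffset, int64_t MinSOffset) {
  if (!IsBuffer && ImmOffset < 0) {
    if (!HasSOffset)
      return std::nullopt; // nothing can compensate: lower to a 64-bit add
    if (ImmOffset + MinSOffset < 0)
      return std::nullopt; // the sum may still be negative: reject
  }
  return ImmOffset; // still subject to the usual width/encoding checks
}

This is why the updated tests materialize the address with s_add_u32/s_addc_u32 (or s_add_nc_u64) and load with a 0x0 immediate where a bare negative offset used to be emitted, while the SOffset-plus-immediate form survives only when the known-bits query proves the sum non-negative.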