diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index e32d0c861ffbb..fc55e29325cd2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1805,6 +1805,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, return true; } +// Check whether the flat scratch SVS swizzle bug affects this access. +bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( + SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const { + if (!Subtarget->hasFlatScratchSVSSwizzleBug()) + return false; + + // The bug affects the swizzling of SVS accesses if there is any carry out + // from the two low order bits (i.e. from bit 1 into bit 2) when adding + // voffset to (soffset + inst_offset). + KnownBits VKnown = CurDAG->computeKnownBits(VAddr); + KnownBits SKnown = KnownBits::computeForAddSub( + true, false, CurDAG->computeKnownBits(SAddr), + KnownBits::makeConstant(APInt(32, ImmOffset))); + uint64_t VMax = VKnown.getMaxValue().getZExtValue(); + uint64_t SMax = SKnown.getMaxValue().getZExtValue(); + return (VMax & 3) + (SMax & 3) >= 4; +} + bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const { @@ -1832,6 +1850,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); VAddr = SDValue(VMov, 0); SAddr = LHS; + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) + return false; Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); return true; } @@ -1854,6 +1874,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; } + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) + return false; SAddr = SelectSAddrFI(CurDAG, SAddr); Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 21f97f2e87f28..93d43e17ba794 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -188,6 +188,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { SDValue &VOffset, SDValue &Offset) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; + bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, + uint64_t ImmOffset) const; bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a01582c60897c..f20cd8e4a7d1e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3985,6 +3985,24 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { }}; } +// Check whether the flat scratch SVS swizzle bug affects this access. +bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( + Register VAddr, Register SAddr, uint64_t ImmOffset) const { + if (!Subtarget->hasFlatScratchSVSSwizzleBug()) + return false; + + // The bug affects the swizzling of SVS accesses if there is any carry out + // from the two low order bits (i.e. from bit 1 into bit 2) when adding + // voffset to (soffset + inst_offset). + auto VKnown = KnownBits->getKnownBits(VAddr); + auto SKnown = KnownBits::computeForAddSub( + true, false, KnownBits->getKnownBits(SAddr), + KnownBits::makeConstant(APInt(32, ImmOffset))); + uint64_t VMax = VKnown.getMaxValue().getZExtValue(); + uint64_t SMax = SKnown.getMaxValue().getZExtValue(); + return (VMax & 3) + (SMax & 3) >= 4; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); @@ -4013,6 +4031,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { Register LHS = AddrDef->MI->getOperand(1).getReg(); auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); + if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) + return None; + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { int FI = LHSDef->MI->getOperand(1).getIndex(); return {{ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 6a101f950abf9..dd74a26efdac9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -210,6 +210,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; + bool checkFlatScratchSVSSwizzleBug(Register VAddr, Register SAddr, + uint64_t ImmOffset) const; InstructionSelector::ComplexRendererFns selectScratchSVAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a7102351ae197..4f54e76ccbb68 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1044,6 +1044,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasVOPDInsts() const { return HasVOPDInsts; } + bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } + /// Return true if the target has the S_DELAY_ALU instruction. bool hasDelayAlu() const { return GFX11Insts; } diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 3dd8dcf0c09f6..32297e863a46b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -51,12 +51,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -132,17 +132,17 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -311,12 +311,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -395,18 +395,18 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -576,16 +576,17 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 2 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: v_add3_u32 v2, 4, s0, v0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v2, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v2, v3, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -664,18 +665,19 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: v_add3_u32 v3, 4, s0, v0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v3, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index a8e97b5f7f49a..c49c617bbe8a8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4091,11 +4091,11 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_movk_i32 s0, 0xef7f -; GFX11-NEXT: scratch_store_b8 v0, v1, s0 dlc +; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_u8 v0, v0, s0 glc dlc +; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4149,11 +4149,11 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-PAL-NEXT: s_movk_i32 s0, 0xef7f -; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, s0 dlc +; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, s0 glc dlc +; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] bb: