diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 1265ec40c06d6..2104bda1cfffd 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -6405,6 +6405,13 @@ operations. ``buffer/global/flat_load/store/atomic`` instructions to global memory are termed vector memory operations. +``global_load_lds`` or ``buffer/global_load`` instructions with the `lds` flag +are LDS DMA loads. They interact with caches as if the loaded data were +being loaded to registers and not to LDS, and so therefore support the same +cache modifiers. They cannot be performed atomically. They implement volatile +(via aux/cpol bit 31) and nontemporal (via metadata) as if they were loads +from the global address space. + Private address space uses ``buffer_load/store`` using the scratch V# (GFX6-GFX8), or ``scratch_load/store`` (GFX9-GFX11). Since only a single thread is accessing the memory, atomic memory orderings are not meaningful, and all diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index afce1fe6af854..2e8942b88c214 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2819,20 +2819,24 @@ class AMDGPULoadToLDS : "", [SDNPMemOperand]>; def int_amdgcn_load_to_lds : AMDGPULoadToLDS; -class AMDGPUGlobalLoadLDS : - ClangBuiltin<"__builtin_amdgcn_global_load_lds">, - Intrinsic < - [], - [LLVMQualPointerType<1>, // Base global pointer to load from - LLVMQualPointerType<3>, // LDS base pointer to store to - llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) - llvm_i32_ty, // imm offset (applied to both global and LDS address) - llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, - // bit 1 = sc1, - // bit 4 = scc)) - [IntrWillReturn, NoCapture>, NoCapture>, - ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], - "", [SDNPMemOperand]>; +class AMDGPUGlobalLoadLDS + : ClangBuiltin<"__builtin_amdgcn_global_load_lds">, + Intrinsic< + [], + [LLVMQualPointerType<1>, // Base global pointer to load from + LLVMQualPointerType<3>, // LDS base pointer to store to + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) + llvm_i32_ty, // imm offset (applied to both global and LDS address) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, + // bit 1 = sc1, + // bit 4 = scc, + // bit 31 = volatile + // (compiler + // implemented))) + [IntrWillReturn, NoCapture>, NoCapture>, + ImmArg>, ImmArg>, ImmArg>, + IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand]>; def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; // This is IntrHasSideEffects because it reads from a volatile hardware register. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 12915c7344426..97c2c9c5316b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3446,10 +3446,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { : 0); // swz MachineMemOperand *LoadMMO = *MI.memoperands_begin(); + // Don't set the offset value here because the pointer points to the base of + // the buffer. MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); - LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); + MachinePointerInfo StorePtrI = LoadPtrI; - StorePtrI.V = nullptr; + LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(), + AMDGPUAS::BUFFER_RESOURCE)); + LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & @@ -3627,13 +3631,17 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ if (isSGPR(Addr)) MIB.addReg(VOffset); - MIB.add(MI.getOperand(4)) // offset - .add(MI.getOperand(5)); // cpol + MIB.add(MI.getOperand(4)); // offset + + unsigned Aux = MI.getOperand(5).getImm(); + MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol MachineMemOperand *LoadMMO = *MI.memoperands_begin(); MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); LoadPtrI.Offset = MI.getOperand(4).getImm(); MachinePointerInfo StorePtrI = LoadPtrI; + LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(), + AMDGPUAS::GLOBAL_ADDRESS)); LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index ecc28244cc71e..b7a92a0a1d634 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -423,6 +423,9 @@ enum CPol { // Volatile (used to preserve/signal operation volatility for buffer // operations not a real instruction bit) VOLATILE = 1 << 31, + // The set of "cache policy" bits used for compiler features that + // do not correspond to handware features. + VIRTUAL_BITS = VOLATILE, }; } // namespace CPol diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 363717b017ef0..72955030591ed 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1652,6 +1652,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); Info.ptrVal = CI.getArgOperand(1); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + auto *Aux = cast(CI.getArgOperand(CI.arg_size() - 1)); + if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE) + Info.flags |= MachineMemOperand::MOVolatile; return true; } case Intrinsic::amdgcn_ds_bvh_stack_rtn: @@ -11237,8 +11240,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, MachinePointerInfo StorePtrI = LoadPtrI; LoadPtrI.V = PoisonValue::get( - PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); - LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; + PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE)); + LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & @@ -11325,7 +11328,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } Ops.push_back(Op.getOperand(5)); // Offset - Ops.push_back(Op.getOperand(6)); // CPol + + unsigned Aux = Op.getConstantOperandVal(6); + Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL, + MVT::i32)); // CPol + Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c501ebba0c7ed..e037926105a14 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" @@ -276,6 +277,12 @@ class SIMemOpAccess final { /// rmw operation, "std::nullopt" otherwise. std::optional getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; + + /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store, + /// along with an indication of whether this is a load or store. If it is not + /// a direct-to-LDS operation, returns std::nullopt. + std::optional + getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const; }; class SICacheControl { @@ -691,6 +698,9 @@ class SIMemoryLegalizer final { /// instructions are added/deleted or \p MI is modified, false otherwise. bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + /// Expands LDS DMA operation \p MI. Returns true if instructions are + /// added/deleted or \p MI is modified, false otherwise. + bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); public: SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}; @@ -820,6 +830,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { return SIAtomicAddrSpace::SCRATCH; if (AS == AMDGPUAS::REGION_ADDRESS) return SIAtomicAddrSpace::GDS; + if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || + AS == AMDGPUAS::BUFFER_STRIDED_POINTER) + return SIAtomicAddrSpace::GLOBAL; return SIAtomicAddrSpace::OTHER; } @@ -975,6 +988,16 @@ std::optional SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( return constructFromMIWithMMO(MI); } +std::optional +SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!SIInstrInfo::isLDSDMA(*MI)) + return std::nullopt; + + return constructFromMIWithMMO(MI); +} + SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { TII = ST.getInstrInfo(); IV = getIsaVersion(ST.getCPU()); @@ -1078,7 +1101,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1405,7 +1428,7 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1707,7 +1730,7 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1940,7 +1963,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -2236,7 +2259,7 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -2575,7 +2598,7 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write instructions. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -2891,6 +2914,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && MI->mayStore()); + + // The volatility or nontemporal-ness of the operation is a + // function of the global memory, not the LDS. + SIMemOp OpKind = + SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE; + + // Handle volatile and/or nontemporal markers on direct-to-LDS loads and + // stores. The operation is treated as a volatile/nontemporal store + // to its second argument. + return CC->enableVolatileAndOrNonTemporal( + MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(), + MOI.isNonTemporal(), MOI.isLastUse()); +} + bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) { const MachineModuleInfo &MMI = getAnalysis().getMMI(); @@ -2942,14 +2982,17 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; - if (const auto &MOI = MOA.getLoadInfo(MI)) + if (const auto &MOI = MOA.getLoadInfo(MI)) { Changed |= expandLoad(*MOI, MI); - else if (const auto &MOI = MOA.getStoreInfo(MI)) { + } else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); - } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) + } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) { + Changed |= expandLDSDMA(*MOI, MI); + } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) { Changed |= expandAtomicFence(*MOI, MI); - else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) + } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) { Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); + } } } diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 53b2542cf9a7e..142290a39f8f4 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) { ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc -; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc -; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll index 5d03dfb56c8cc..352af044b0a6d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll @@ -218,3 +218,174 @@ main_body: ret void } +define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b32 m0, s0 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:256 lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_volatile: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b32 m0, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:256 +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 m0, s0 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:256 lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:256 +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +main_body: + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: buffer_load_lds_dword_volatile: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: buffer_load_lds_dword_volatile: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_load_lds_dword_volatile: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +main_body: + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v0, s[0:1] glc slc lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v0, s[0:1] nt +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v0, s[0:1] slc lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: global_load_lds_dword v0, s[0:1] nt +; GFX942-GISEL-NEXT: s_endpgm +main_body: + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + ret void +} + +define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: buffer_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: buffer_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: s_endpgm +main_body: + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + ret void +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll index f0204bd81470a..1dcd032e14c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll @@ -110,3 +110,43 @@ main_body: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0) ret void } + +define amdgpu_ps float @buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GCN-LABEL: buffer_load_lds_dword_volatile: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc lds +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:256 lds +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:512 lds +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 2147483648) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 256, i32 0) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 512, i32 0) + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +define amdgpu_ps float @buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GCN-LABEL: buffer_load_lds_dword_nontemporal: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc slc lds +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0), !nontemporal !0 + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index b5d741b99c582..a9799993f5cdc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -80,25 +80,29 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(p ; PREGFX10-LABEL: buffer_load_volatile: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc +; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: buffer_load_volatile: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: buffer_load_volatile: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index 97db15ba637a5..1d1d3e4a68fee 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -107,6 +107,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -120,7 +121,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen ; GFX12-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 9dac2393fd966..1e4b63327651e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -354,6 +354,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s10 @@ -365,8 +366,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-SDAG-NEXT: s_mov_b32 s3, s10 ; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -384,6 +385,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 ; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 @@ -395,8 +397,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 ; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX942-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -414,6 +416,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-SDAG-NEXT: s_mov_b32 s5, s12 @@ -425,8 +428,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -444,6 +447,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 @@ -455,8 +459,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -475,6 +479,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] ; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 @@ -488,8 +493,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX10-SDAG-NEXT: s_mov_b32 s3, s10 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -508,6 +513,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] ; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 @@ -519,8 +525,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -541,6 +547,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -554,8 +561,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -576,6 +583,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 @@ -588,8 +596,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -610,6 +618,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -623,8 +632,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -645,6 +654,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] ; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 @@ -657,8 +667,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll new file mode 100644 index 0000000000000..0b40c81af414d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll @@ -0,0 +1,542 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -global-isel=0 < %s | FileCheck --check-prefixes=GFX90A %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=0 < %s | FileCheck --check-prefixes=GFX942 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=1 < %s | FileCheck --check-prefixes=GFX942-GISEL %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -global-isel=0 < %s | FileCheck --check-prefixes=GFX10 %s + +define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off slc lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off slc lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p7_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p7_dword_volatile_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +!0 = !{i32 1}