297 changes: 174 additions & 123 deletions llvm/docs/AMDGPUUsage.rst

Large diffs are not rendered by default.

113 changes: 112 additions & 1 deletion llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,20 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
Position Pos) const override;
};

/// Cache control variant layered on top of SIGfx10CacheControl. It overrides
/// only the two hooks declared below; everything else is inherited unchanged.
/// SICacheControl::create returns this class when the subtarget generation is
/// not below AMDGPUSubtarget::GFX11.
class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
/// Forwards the subtarget \p ST to the GFX10 base class.
SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

/// Sets the GLC bit on the load \p MI when \p AddrSpace includes the global
/// address space and \p Scope requires it (system and agent scope always;
/// workgroup scope only when the subtarget is not in CU mode).
/// Returns the accumulated result of the bit-setting helper (presumably
/// whether \p MI was modified -- confirm against enableGLCBit).
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;

/// Applies the cache-policy bits for a volatile or nontemporal load/store
/// \p MI. Volatile is handled first and additionally inserts a system-scope
/// wait after the instruction; nontemporal is only considered when the
/// access is not volatile. \p Op must be SIMemOp::LOAD or SIMemOp::STORE.
/// Returns the accumulated result of the helpers that were invoked.
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

Expand Down Expand Up @@ -834,7 +848,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
return std::make_unique<SIGfx7CacheControl>(ST);
return std::make_unique<SIGfx10CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX11)
return std::make_unique<SIGfx10CacheControl>(ST);
return std::make_unique<SIGfx11CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
Expand Down Expand Up @@ -2012,6 +2028,101 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}

/// Request cache bypass for the pure load \p MI where the synchronization
/// scope \p Scope demands it. Only accesses that may touch the global address
/// space are affected. Returns the result of the bit-setting helper, or false
/// when nothing needed to be set.
bool SIGfx11CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());

  // The scratch address space does not need the global memory caches to be
  // bypassed as all memory operations by the same thread are sequentially
  // consistent, and no other thread can access scratch memory. Other address
  // spaces do not have a cache. So there is nothing to do unless the access
  // may hit the global address space.
  const bool TouchesGlobal =
      (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
  if (!TouchesGlobal)
    return false;

  switch (Scope) {
  case SIAtomicScope::SYSTEM:
  case SIAtomicScope::AGENT:
    // Set the L0 and L1 cache policies to MISS_EVICT.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    return enableGLCBit(MI);

  case SIAtomicScope::WORKGROUP:
    // In CU mode all waves of a work-group are on the same CU, and so the
    // per-CU L0 does not need to be bypassed.
    if (ST.isCuModeEnabled())
      return false;
    // In WGP mode the waves of a work-group can be executing on either CU of
    // the WGP, therefore the L0 has to be bypassed.
    return enableGLCBit(MI);

  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // No cache to bypass.
    return false;

  default:
    llvm_unreachable("Unsupported synchronization scope");
  }
}

/// Apply the cache-policy bits for a volatile and/or nontemporal load or
/// store \p MI. A volatile access takes precedence: when \p IsVolatile is set
/// the nontemporal handling is not applied. Returns the accumulated result of
/// every helper that was invoked, or false when neither flag is set.
bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Atomic read-modify-write instructions are not handled here: they use glc
  // to indicate whether the atomic returns a result, so glc must not be used
  // for cache control on them. Only plain loads and stores are accepted.
  assert(MI->mayLoad() ^ MI->mayStore());

  // LLVM IR atomic read-modify-write instructions are likewise excluded. They
  // are always marked volatile, so honouring that would pessimize all
  // atomics, and they do not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  // Neither attribute present: leave the instruction untouched.
  if (!IsVolatile && !IsNonTemporal)
    return false;

  bool Modified = false;

  if (!IsVolatile) {
    // Nontemporal only.
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
      Modified |= enableGLCBit(MI);

    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
    // and L2 cache policy to STREAM.
    Modified |= enableSLCBit(MI);

    // Set MALL NOALLOC for load and store instructions.
    Modified |= enableDLCBit(MI);
    return Modified;
  }

  // Volatile access.
  // Set L0 and L1 cache policy to be MISS_EVICT for load instructions and
  // MISS_LRU for store instructions.
  // Note: there is no L2 cache coherent bypass control at the ISA level.
  if (Op == SIMemOp::LOAD)
    Modified |= enableGLCBit(MI);

  // Set MALL NOALLOC for load and store instructions.
  Modified |= enableDLCBit(MI);

  // Ensure operation has completed at system scope to cause all volatile
  // operations to be visible outside the program in a global order. Do not
  // request cross address space as only the global address space can be
  // observable outside the program, so no need to cause a waitcnt for LDS
  // address space operations.
  Modified |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                         Position::AFTER);
  return Modified;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand All @@ -271,7 +271,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand Down Expand Up @@ -404,7 +404,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -419,7 +419,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -566,7 +566,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -583,7 +583,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -7785,7 +7785,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand All @@ -7798,7 +7798,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand Down Expand Up @@ -7937,7 +7937,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -7953,7 +7953,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -8107,7 +8107,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -8125,7 +8125,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand All @@ -134,7 +134,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand Down Expand Up @@ -271,7 +271,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand All @@ -285,7 +285,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand Down Expand Up @@ -415,7 +415,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_0:
Expand All @@ -428,7 +428,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -565,7 +565,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_1:
Expand All @@ -579,7 +579,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand All @@ -271,7 +271,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand Down Expand Up @@ -406,7 +406,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -421,7 +421,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -570,7 +570,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -587,7 +587,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -7911,7 +7911,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand All @@ -7924,7 +7924,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Expand Down Expand Up @@ -8065,7 +8065,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -8081,7 +8081,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -8237,7 +8237,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -8255,7 +8255,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
Expand All @@ -279,7 +279,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
Expand Down Expand Up @@ -363,7 +363,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
Expand All @@ -378,7 +378,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -260,7 +260,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -390,7 +390,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -402,7 +402,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -540,7 +540,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -553,7 +553,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -7847,7 +7847,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -7857,7 +7857,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -7987,7 +7987,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -7999,7 +7999,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -8137,7 +8137,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -8150,7 +8150,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -279,7 +279,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -407,7 +407,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_0:
Expand All @@ -418,7 +418,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_1:
Expand All @@ -557,7 +557,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -260,7 +260,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -392,7 +392,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -404,7 +404,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -544,7 +544,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -557,7 +557,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -7079,7 +7079,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -7089,7 +7089,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -7221,7 +7221,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -7233,7 +7233,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down Expand Up @@ -7373,7 +7373,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
Expand All @@ -7386,7 +7386,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
Expand All @@ -287,7 +287,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
Expand Down Expand Up @@ -377,7 +377,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
Expand All @@ -389,7 +389,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc
; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -171,7 +171,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc
; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -335,7 +335,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -347,7 +347,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -508,7 +508,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_0:
Expand All @@ -520,7 +520,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
Expand Down Expand Up @@ -683,7 +683,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_1:
Expand All @@ -696,7 +696,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
Expand All @@ -364,7 +364,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
Expand Down Expand Up @@ -479,7 +479,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
Expand All @@ -493,7 +493,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
Expand Down