Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 3 additions & 58 deletions llvm/docs/AMDGPUUsage.rst

Large diffs are not rendered by default.

51 changes: 14 additions & 37 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,12 +390,6 @@ class SICacheControl {
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;

/// Inserts any necessary instructions before the barrier start instruction
/// \p MI in order to support pairing of barriers and fences.
///
/// \returns true if an instruction was inserted. This default implementation
/// inserts nothing and always returns false.
virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
return false;
};

/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
};
Expand Down Expand Up @@ -576,12 +570,8 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order, bool AtomicsOnly) const override;

bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;

bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
Expand Down Expand Up @@ -2046,8 +2036,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// the WGP. Therefore need to wait for operations to complete to ensure
// they are visible to waves in the other CU as the L0 is per CU.
// Otherwise in CU mode and all waves of a work-group are on the same CU
// which shares the same L0.
if (!ST.isCuModeEnabled()) {
// which shares the same L0. Note that we still need to wait when
// performing a release in this mode to respect the transitivity of
// happens-before: e.g. another wave of the work-group must be able to
// release, at a wider scope, the memory written by this wave.
if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
VMCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
Expand Down Expand Up @@ -2202,22 +2195,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}

/// Emits an S_WAITCNT_DEPCTR, with its vm_vsrc field encoded as 0, ahead of
/// the barrier start instruction \p MI when the subtarget is in CU mode and
/// does not have GFX1250 instructions.
///
/// \returns true if the wait was emitted, false if nothing was inserted.
bool SIGfx10CacheControl::insertBarrierStart(
MachineBasicBlock::iterator &MI) const {
// We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
// mode. This is because a CU mode release fence does not emit any wait, which
// is fine when only dealing with vmem, but isn't sufficient in the presence
// of barriers which do not go through vmem.
// GFX12.5 does not require this additional wait.
// Nothing to insert outside CU mode or on targets with GFX1250 instructions.
if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
return false;

// Build the wait in MI's parent block at position MI, reusing MI's debug
// location; the only operand is the immediate produced by
// encodeFieldVmVsrc(0).
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
return true;
}

bool SIGfx11CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
Expand Down Expand Up @@ -2396,15 +2373,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// In WGP mode the waves of a work-group can be executing on either CU
// of the WGP. Therefore need to wait for operations to complete to
// ensure they are visible to waves in the other CU as the L0 is per CU.
//
// Otherwise in CU mode and all waves of a work-group are on the same CU
// which shares the same L0.
// which shares the same L0. Note that we still need to wait when
// performing a release in this mode to respect the transitivity of
// happens-before: e.g. another wave of the work-group must be able to
// release, at a wider scope, the memory written by this wave.
//
// GFX12.5:
// CU$ has two ports. To ensure operations are visible at the workgroup
// level, we need to ensure all operations in this port have completed
// so the other SIMDs in the WG can see them. There is no ordering
// guarantee between the ports.
if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
Expand Down Expand Up @@ -2977,11 +2959,6 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
MI = II->getIterator();
}

if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
Changed |= CC->insertBarrierStart(MI);
continue;
}

if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,9 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
Expand All @@ -562,6 +564,8 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_release
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") release
Expand All @@ -587,7 +591,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
Expand All @@ -599,6 +605,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_acq_rel
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") acq_rel
Expand All @@ -624,7 +632,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 16240
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
Expand All @@ -636,6 +646,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
;
; GFX11CU-LABEL: name: workgroup_one_as_seq_cst
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 1015
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup-one-as") seq_cst
Expand Down Expand Up @@ -1305,8 +1317,9 @@ define amdgpu_kernel void @workgroup_release() #0 {
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
Expand All @@ -1317,7 +1330,8 @@ define amdgpu_kernel void @workgroup_release() #0 {
;
; GFX11CU-LABEL: name: workgroup_release
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 64519
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") release
Expand Down Expand Up @@ -1345,8 +1359,9 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
;
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
Expand All @@ -1358,7 +1373,8 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
;
; GFX11CU-LABEL: name: workgroup_acq_rel
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 64519
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") acq_rel
Expand Down Expand Up @@ -1386,8 +1402,9 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
;
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
; GFX10CU-NEXT: S_WAITCNT_soft 112
; GFX10CU-NEXT: S_WAITCNT_lds_direct
; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
Expand All @@ -1399,7 +1416,8 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
;
; GFX11CU-LABEL: name: workgroup_seq_cst
; GFX11CU: bb.0.entry:
; GFX11CU-NEXT: S_WAITCNT_soft 64519
; GFX11CU-NEXT: S_WAITCNT_soft 7
; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX11CU-NEXT: S_ENDPGM 0
entry:
fence syncscope("workgroup") seq_cst
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10CU-NEXT: s_barrier
; GFX10CU-NEXT: ds_read_b32 v0, v0
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
Expand Down
20 changes: 8 additions & 12 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX10-CU-LABEL: test_s_barrier:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
Expand All @@ -26,7 +25,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX11-CU-LABEL: test_s_barrier:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
Expand All @@ -38,7 +36,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX12-CU-LABEL: test_s_barrier:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
Expand All @@ -63,8 +60,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
Expand All @@ -77,8 +74,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
Expand All @@ -94,8 +91,10 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
Expand Down Expand Up @@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-CU-NEXT: s_barrier
; GFX10-CU-NEXT: s_endpgm
;
Expand All @@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX11-CU-NEXT: s_barrier
; GFX11-CU-NEXT: s_endpgm
;
Expand All @@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_alu 0xffe3
; GFX12-CU-NEXT: s_barrier_signal -1
; GFX12-CU-NEXT: s_barrier_wait -1
; GFX12-CU-NEXT: s_endpgm
Expand Down
Loading
Loading