Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1871,9 +1871,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// \returns true if the subtarget supports clusters of workgroups.
bool hasClusters() const { return HasClusters; }

/// \returns true if the subtarget requires a wait for xcnt before atomic
/// flat/global stores & rmw.
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
/// \returns true if the subtarget requires a wait for xcnt before VMEM
/// accesses that must never be repeated in the event of a page fault/re-try.
/// Atomic stores/rmw and all volatile accesses fall under this criteria.
bool requiresWaitXCntForSingleAccessInstructions() const {
return GFX1250Insts;
}

/// \returns the number of significant bits in the immediate field of the
/// S_NOP instruction.
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9934,6 +9934,11 @@ void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
}
}

bool SIInstrInfo::isVBUFFER(const MachineInstr &MI) const {
return (ST.getGeneration() == GCNSubtarget::GFX12) &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer to have the isGFX12 check done in SIMemoryLegalizer, right next to the FIXME comment that explains why we need it. Then this helper function can become a generation-independent isBUF.

Also nit: you don't need the parens.

(MI.getDesc().TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF));
}

bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
if (!isSMRD(MI))
return false;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
}

/// \returns true if \p MI is a GFX12 VBUFFER instruction.
bool isVBUFFER(const MachineInstr &MI) const;

static bool isSMRD(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
}
Expand Down
19 changes: 16 additions & 3 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
}

// FIXME: The MMO of buffer atomic instructions does not always have an atomic
// ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
// here, but the lowering should really be cleaned up at some point.
if (ST.getInstrInfo()->isVBUFFER(*MI) && SIInstrInfo::isAtomic(*MI) &&
Ordering == AtomicOrdering::NotAtomic)
Ordering = AtomicOrdering::Monotonic;

SIAtomicScope Scope = SIAtomicScope::NONE;
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
Expand Down Expand Up @@ -2059,6 +2066,13 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);

if (ST.requiresWaitXCntForSingleAccessInstructions() &&
(TII->isFLAT(*MI) || TII->isVBUFFER(*MI))) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simpler to use isVMEM here and in finalizeStore below?

MachineBasicBlock &MBB = *MI->getParent();
BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
Changed = true;
}

// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
Expand All @@ -2077,9 +2091,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
bool Changed = false;

// GFX12.5 only: xcnt wait is needed before flat and global atomics
// stores/rmw.
if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
(TII->isFLAT(MI) || TII->isVBUFFER(MI))) {
MachineBasicBlock &MBB = *MI.getParent();
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
Changed = true;
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5136,6 +5136,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
Expand Down Expand Up @@ -6215,6 +6216,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: v_readlane_b32 s31, v9, 1
Expand Down
6 changes: 6 additions & 0 deletions llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7409,8 +7409,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v1, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v0
Expand All @@ -7431,8 +7433,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v1, v0
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v1, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v0
Expand Down Expand Up @@ -7686,6 +7690,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v0, v1
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v1
Expand All @@ -7706,6 +7711,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v0, v1
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v1
Expand Down
Loading