Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,12 @@ def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug",
[FeatureFP8ConversionInsts]
>;

def FeatureWriteCombiningMissesHazards : SubtargetFeature<"write-combining-misses-hazards",
"HasWriteCombiningMissesHazards",
"true",
"Write combining misses hazards that require s_wait_xcnt(0) before every atomic operation"
>;

def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
"HasPkFmacF16Inst",
"true",
Expand Down Expand Up @@ -2145,6 +2151,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureXNACK,
FeatureClusters,
FeatureD16Writes32BitVgpr,
FeatureWriteCombiningMissesHazards,
]>;

def FeatureISAVersion12_51 : FeatureSet<
Expand Down Expand Up @@ -2945,6 +2952,8 @@ def HasGWS : Predicate<"Subtarget->hasGWS()">;
def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">;
def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;

def HasWriteCombiningMissesHazards : Predicate<"Subtarget->hasWriteCombiningMissesHazards()">;

def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;

def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;

bool HasWriteCombiningMissesHazards = false;
bool RequiresCOV6 = false;
bool UseBlockVGPROpsForCSR = false;
bool HasGloballyAddressableScratch = false;
Expand Down Expand Up @@ -1834,6 +1834,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return getGeneration() == GFX12;
}

bool hasWriteCombiningMissesHazards() const {
return HasWriteCombiningMissesHazards;
}

// Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
// read.
bool hasScratchBaseForwardingHazard() const {
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2086,6 +2086,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);

// An s_wait_xcnt(0) before every atomic store/RMW operation is required to
// work around the write combining misses hazard.
if (ST->hasWriteCombiningMissesHazards() && SIInstrInfo::isAtomic(MI) &&
SIInstrInfo::isVMEM(MI))
Wait.XCnt = 0;

// When forcing emit, we need to skip terminators because that would break the
// terminators of the MBB if we emit a waitcnt between terminators.
if (ForceEmitZeroFlag && !MI.isTerminator())
Expand Down
6 changes: 6 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1764,6 +1764,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
Expand Down Expand Up @@ -1802,6 +1803,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
Expand Down Expand Up @@ -1842,6 +1844,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
Expand Down Expand Up @@ -2088,6 +2091,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
Expand Down Expand Up @@ -2126,6 +2130,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
Expand Down Expand Up @@ -2168,6 +2173,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
Expand Down
Loading
Loading