Skip to content

Commit

Permalink
AMDGPU: Simplify DS atomicrmw fadd handling (#89468)
Browse files Browse the repository at this point in the history
DS atomic fadd F32 does respect the denormal mode, so we do not need to
consider the expected FP mode or unsafe-fp-atomics attribute. They don't
respect the rounding mode, but we don't care outside of strictfp. This
also reveals the fp-mode-is-flush check has been missing in the cases
that should be considering it alongside amdgpu-unsafe-fp-atomics.

This also stops considering the case where flushing is enabled for f64,
as flushing isn't mandated and we barely handle this case.
  • Loading branch information
arsenm committed Apr 22, 2024
1 parent 087b33b commit 5b6db43
Show file tree
Hide file tree
Showing 9 changed files with 342 additions and 155 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1896,7 +1896,7 @@ def HasVINTERPEncoding : Predicate<"Subtarget->hasVINTERPEncoding()">,
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of FeatureGFX9Insts)>;

def HasLDSFPAtomicAdd : Predicate<"Subtarget->hasLDSFPAtomicAdd()">,
def HasLDSFPAtomicAddF32 : Predicate<"Subtarget->hasLDSFPAtomicAddF32()">,
AssemblerPredicate<(all_of FeatureGFX8Insts)>;

def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1624,7 +1624,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}

auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAdd()) {
if (ST.hasLDSFPAtomicAddF32()) {
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasLdsAtomicAddF64())
Atomic.legalFor({{S64, LocalPtr}});
Expand Down
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">;
defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">;
defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">;

let SubtargetPredicate = HasLDSFPAtomicAdd in {
let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">;
}

Expand Down Expand Up @@ -523,7 +523,7 @@ defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;

defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32>;

let SubtargetPredicate = HasLDSFPAtomicAdd in {
let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32>;
}
defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32>;
Expand Down Expand Up @@ -697,7 +697,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",

} // let SubtargetPredicate = isGFX8Plus

let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] in {
let SubtargetPredicate = HasLDSFPAtomicAddF32, OtherPredicates = [HasDsSrc2Insts] in {
def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
}

Expand Down Expand Up @@ -1088,7 +1088,7 @@ let SubtargetPredicate = isGFX11Plus in {
defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B32, DS_CMPSTORE_B32, i32, "atomic_cmp_swap">;
}

let SubtargetPredicate = HasLDSFPAtomicAdd in {
let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_F32, DS_ADD_F32, f32, "atomic_load_fadd">;
}

Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -960,7 +960,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasScalarAtomics;
}

bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }

/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
Expand Down
47 changes: 28 additions & 19 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15977,6 +15977,8 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
SNaN, Depth);
}

#if 0
// FIXME: This should be checked before unsafe fp atomics are enabled
// Global FP atomic instructions have a hardcoded FP mode and do not support
// FP32 denormals, and only support v2f16 denormals.
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
Expand All @@ -15986,6 +15988,7 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
return DenormMode == DenormalMode::getPreserveSign();
return DenormMode == DenormalMode::getIEEE();
}
#endif

// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
// floating point atomic instructions. May generate more efficient code,
Expand Down Expand Up @@ -16046,8 +16049,31 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::FAdd: {
Type *Ty = RMW->getType();

if (Ty->isHalfTy())
// TODO: Handle REGION_ADDRESS
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
// DS F32 FP atomics do respect the denormal mode, but the rounding mode
// is fixed to round-to-nearest-even.
//
// F64 / PK_F16 / PK_BF16 never flush and are also fixed to
// round-to-nearest-even.
//
// We ignore the rounding mode problem, even in strictfp. The C++ standard
// suggests it is OK if the floating-point mode may not match the calling
// thread.
if (Ty->isFloatTy()) {
return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
: AtomicExpansionKind::CmpXChg;
}

if (Ty->isDoubleTy()) {
// Ignores denormal mode, but we don't consider flushing mandatory.
return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
: AtomicExpansionKind::CmpXChg;
}

// TODO: Handle v2f16/v2bf16 cases for gfx940
return AtomicExpansionKind::CmpXChg;
}

if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;
Expand Down Expand Up @@ -16091,7 +16117,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// space. If it is in global address space, we emit the global atomic
// fadd; if it is in shared address space, we emit the LDS atomic fadd.
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
Subtarget->hasLDSFPAtomicAdd()) {
Subtarget->hasLDSFPAtomicAddF32()) {
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
return AtomicExpansionKind::Expand;
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
Expand All @@ -16101,23 +16127,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}

// DS FP atomics do respect the denormal mode, but the rounding mode is
// fixed to round-to-nearest-even.
// The only exception is DS_ADD_F64 which never flushes regardless of mode.
if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
if (!Ty->isDoubleTy())
return AtomicExpansionKind::None;

if (fpModeMatchesGlobalFPAtomicMode(RMW))
return AtomicExpansionKind::None;

return RMW->getFunction()
->getFnAttribute("amdgpu-unsafe-fp-atomics")
.getValueAsString() == "true"
? ReportUnsafeHWInst(AtomicExpansionKind::None)
: AtomicExpansionKind::CmpXChg;
}

return AtomicExpansionKind::CmpXChg;
}
case AtomicRMWInst::FMin:
Expand Down
38 changes: 8 additions & 30 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2074,28 +2074,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB67_3
; GFX90A-NEXT: s_cbranch_execz .LBB67_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, s0
; GFX90A-NEXT: ds_read_b64 v[2:3], v4
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB67_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB67_2
; GFX90A-NEXT: .LBB67_3:
; GFX90A-NEXT: .LBB67_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
Expand All @@ -2106,28 +2095,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB67_3
; GFX940-NEXT: s_cbranch_execz .LBB67_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, s0
; GFX940-NEXT: ds_read_b64 v[2:3], v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB67_2: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB67_2
; GFX940-NEXT: .LBB67_3:
; GFX940-NEXT: .LBB67_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=si-lower \
; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-HW

; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope system due to an unsafe request.
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope agent due to an unsafe request.
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope workgroup due to an unsafe request.
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope wavefront due to an unsafe request.
Expand Down
52 changes: 14 additions & 38 deletions llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2213,29 +2213,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB72_3
; GFX90A-NEXT: s_cbranch_execz .LBB72_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
; GFX90A-NEXT: ds_read_b64 v[2:3], v0
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB72_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB72_2
; GFX90A-NEXT: .LBB72_3:
; GFX90A-NEXT: .LBB72_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
Expand All @@ -2245,29 +2233,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB72_3
; GFX940-NEXT: s_cbranch_execz .LBB72_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s4
; GFX940-NEXT: ds_read_b64 v[2:3], v0
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB72_2: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB72_2
; GFX940-NEXT: .LBB72_3:
; GFX940-NEXT: .LBB72_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
Expand Down
Loading

0 comments on commit 5b6db43

Please sign in to comment.