diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 9b09550159993..5c2c6d4b13c66 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1896,7 +1896,7 @@ def HasVINTERPEncoding : Predicate<"Subtarget->hasVINTERPEncoding()">, def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; -def HasLDSFPAtomicAdd : Predicate<"Subtarget->hasLDSFPAtomicAdd()">, +def HasLDSFPAtomicAddF32 : Predicate<"Subtarget->hasLDSFPAtomicAddF32()">, AssemblerPredicate<(all_of FeatureGFX8Insts)>; def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e55d1de01b4fd..780dfaae11ef3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1624,7 +1624,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); - if (ST.hasLDSFPAtomicAdd()) { + if (ST.hasLDSFPAtomicAddF32()) { Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); if (ST.hasLdsAtomicAddF64()) Atomic.legalFor({{S64, LocalPtr}}); diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 0773ef7f32341..d63f04ab6d4c9 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -443,7 +443,7 @@ defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">; defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">; defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">; -let SubtargetPredicate = HasLDSFPAtomicAdd in { +let SubtargetPredicate = HasLDSFPAtomicAddF32 in { defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">; } @@ -523,7 +523,7 @@ defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>; defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32>; -let SubtargetPredicate = HasLDSFPAtomicAdd in { +let SubtargetPredicate = HasLDSFPAtomicAddF32 in { defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32>; } defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32>; @@ -697,7 +697,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", } // let SubtargetPredicate = isGFX8Plus -let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] in { +let SubtargetPredicate = HasLDSFPAtomicAddF32, OtherPredicates = [HasDsSrc2Insts] in { def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; } @@ -1088,7 +1088,7 @@ let SubtargetPredicate = isGFX11Plus in { defm : DSAtomicCmpXChg_mc; } -let SubtargetPredicate = HasLDSFPAtomicAdd in { +let SubtargetPredicate = HasLDSFPAtomicAddF32 in { defm : DSAtomicRetNoRetPat_mc; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 8a4a46ce50d1d..2ca5ae306b11b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -960,7 +960,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return HasScalarAtomics; } - bool hasLDSFPAtomicAdd() const { return GFX8Insts; } + bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } + bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c4592d43f4f8a..12b7f05c44630 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15977,6 +15977,8 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, SNaN, Depth); } +#if 0 +// FIXME: This should be checked before unsafe fp atomics are enabled // Global FP atomic instructions have a hardcoded FP mode and do not support // FP32 denormals, and only support v2f16 denormals. static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { @@ -15986,6 +15988,7 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { return DenormMode == DenormalMode::getPreserveSign(); return DenormMode == DenormalMode::getIEEE(); } +#endif // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe // floating point atomic instructions. May generate more efficient code, @@ -16046,8 +16049,31 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::FAdd: { Type *Ty = RMW->getType(); - if (Ty->isHalfTy()) + // TODO: Handle REGION_ADDRESS + if (AS == AMDGPUAS::LOCAL_ADDRESS) { + // DS F32 FP atomics do respect the denormal mode, but the rounding mode + // is fixed to round-to-nearest-even. + // + // F64 / PK_F16 / PK_BF16 never flush and are also fixed to + // round-to-nearest-even. + // + // We ignore the rounding mode problem, even in strictfp. The C++ standard + // suggests it is OK if the floating-point mode may not match the calling + // thread. + if (Ty->isFloatTy()) { + return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None + : AtomicExpansionKind::CmpXChg; + } + + if (Ty->isDoubleTy()) { + // Ignores denormal mode, but we don't consider flushing mandatory. + return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None + : AtomicExpansionKind::CmpXChg; + } + + // TODO: Handle v2f16/v2bf16 cases for gfx940 return AtomicExpansionKind::CmpXChg; + } if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy())) return AtomicExpansionKind::CmpXChg; @@ -16091,7 +16117,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // space. If it is in global address space, we emit the global atomic // fadd; if it is in shared address space, we emit the LDS atomic fadd. if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() && - Subtarget->hasLDSFPAtomicAdd()) { + Subtarget->hasLDSFPAtomicAddF32()) { if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) return AtomicExpansionKind::Expand; if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) @@ -16101,23 +16127,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::CmpXChg; } - // DS FP atomics do respect the denormal mode, but the rounding mode is - // fixed to round-to-nearest-even. - // The only exception is DS_ADD_F64 which never flushes regardless of mode. - if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) { - if (!Ty->isDoubleTy()) - return AtomicExpansionKind::None; - - if (fpModeMatchesGlobalFPAtomicMode(RMW)) - return AtomicExpansionKind::None; - - return RMW->getFunction() - ->getFnAttribute("amdgpu-unsafe-fp-atomics") - .getValueAsString() == "true" - ? ReportUnsafeHWInst(AtomicExpansionKind::None) - : AtomicExpansionKind::CmpXChg; - } - return AtomicExpansionKind::CmpXChg; } case AtomicRMWInst::FMin: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 1a76f8cf87ffb..4e94a646f6da5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -2074,28 +2074,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB67_3 +; GFX90A-NEXT: s_cbranch_execz .LBB67_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, s0 -; GFX90A-NEXT: ds_read_b64 v[2:3], v4 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 -; GFX90A-NEXT: .LBB67_2: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB67_2 -; GFX90A-NEXT: .LBB67_3: +; GFX90A-NEXT: .LBB67_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: @@ -2106,28 +2095,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB67_3 +; GFX940-NEXT: s_cbranch_execz .LBB67_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, s0 -; GFX940-NEXT: ds_read_b64 v[2:3], v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB67_2: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] -; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] +; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB67_2 -; GFX940-NEXT: .LBB67_3: +; GFX940-NEXT: .LBB67_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll index 001a4e999aee9..b3fb67a7d7e0c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll @@ -1,7 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=si-lower \ ; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-HW -; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope system due to an unsafe request. ; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope agent due to an unsafe request. ; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope workgroup due to an unsafe request. ; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope wavefront due to an unsafe request. diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index a948fab8f1c1b..121fab51024fd 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -2213,29 +2213,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB72_3 +; GFX90A-NEXT: s_cbranch_execz .LBB72_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: ds_read_b64 v[2:3], v0 -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB72_2: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB72_2 -; GFX90A-NEXT: .LBB72_3: +; GFX90A-NEXT: .LBB72_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: @@ -2245,29 +2233,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB72_3 +; GFX940-NEXT: s_cbranch_execz .LBB72_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: ds_read_b64 v[2:3], v0 -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB72_2: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] -; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] +; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB72_2 -; GFX940-NEXT: .LBB72_3: +; GFX940-NEXT: .LBB72_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 337e51f9a912c..17318b2c62ca8 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -3670,84 +3670,308 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignor } define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(ptr addrspace(3) %ptr, double %value) #5 { -; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret( -; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret( +; GFX90A-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void ; %unused = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic ret void } define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(ptr addrspace(3) %ptr, double %value) #5 { -; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret( -; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret double [[TMP5]] +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret( +; GFX90A-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8 +; GFX90A-NEXT: ret double [[RET]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8 +; GFX940-NEXT: ret double [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] ; %ret = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic ret double %ret } define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, double %value) #5 { -; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode( -; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void ; %unused = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 ret void } define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, double %value) #5 { -; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode( -; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret double [[TMP5]] +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret double [[RET]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret double [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[TMP5]] ; %ret = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 ret double %ret