diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b3d5f11978814..2ef194a66bafd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12942,6 +12942,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (AMDGPU::isFlatGlobalAddrSpace(AS) && Subtarget->hasAtomicFaddNoRtnInsts()) { + if (Subtarget->hasGFX940Insts()) + return AtomicExpansionKind::None; + if (unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 31d35df6d0990..dd5b07ee28bb6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: flat_load_dword v1, v[0:1] -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret void @@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: flat_load_dword v1, v[0:1] -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret void @@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v2, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index acad79d009a48..eb300bb7baff6 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -56,25 +56,11 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX940-LABEL: syncscope_system: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: syncscope_system: @@ -373,23 +359,8 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX940-LABEL: no_unsafe: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: no_unsafe: diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 0872c61601999..4bd9991912977 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: flat_load_dword v1, v[0:1] -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret void @@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: flat_load_dword v1, v[0:1] -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret void @@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v2, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index ae62666a6878e..a9200419569ed 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -704,26 +704,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst @@ -789,26 +776,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst @@ -893,26 +867,12 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst @@ -976,26 +936,12 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_system: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst @@ -1068,26 +1014,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[4:5] -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] sc0 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1124,26 +1057,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst @@ -1212,27 +1133,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst @@ -1269,26 +1177,12 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst @@ -1353,27 +1247,13 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_system: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX940-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst @@ -1454,26 +1334,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll index 22eec1d98072f..5d7825bb37887 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll @@ -38,20 +38,8 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX90A-NEXT: ret float [[TMP5]] ; ; GFX940-LABEL: @syncscope_system( -; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: ret float [[TMP5]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] seq_cst, align 4 +; GFX940-NEXT: ret float [[RES]] ; ; GFX1100-LABEL: @syncscope_system( ; GFX1100-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 4 @@ -278,20 +266,8 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX90A-NEXT: ret float [[TMP5]] ; ; GFX940-LABEL: @no_unsafe( -; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("workgroup") seq_cst seq_cst, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: ret float [[TMP5]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4 +; GFX940-NEXT: ret float [[RES]] ; ; GFX1100-LABEL: @no_unsafe( ; GFX1100-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 4 diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 87c4e92684200..6287b5f7ffdbc 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -433,20 +433,8 @@ define float @test_atomicrmw_fadd_f32_flat(ptr %ptr, float %value) { ; GFX90A-NEXT: ret float [[TMP5]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat( -; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: ret float [[TMP5]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat( ; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4 @@ -534,20 +522,8 @@ define float @test_atomicrmw_fadd_f32_global(ptr addrspace(1) %ptr, float %value ; GFX90A-NEXT: ret float [[TMP5]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global( -; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: ret float [[TMP5]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global( ; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 @@ -635,19 +611,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_ieee(ptr addrspace(1) %ptr, f ; GFX90A-NEXT: ret void ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( -; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 ; GFX940-NEXT: ret void ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( @@ -736,19 +700,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1) ; GFX90A-NEXT: ret void ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( -; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 ; GFX940-NEXT: ret void ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( @@ -1582,20 +1534,8 @@ define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) { ; GFX90A-NEXT: ret double [[TMP5]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat( -; GFX940-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: ret double [[TMP5]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8 +; GFX940-NEXT: ret double [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat( ; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 @@ -1683,20 +1623,8 @@ define double @test_atomicrmw_fadd_f64_global(ptr addrspace(1) %ptr, double %val ; GFX90A-NEXT: ret double [[TMP5]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f64_global( -; GFX940-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: ret double [[TMP5]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8 +; GFX940-NEXT: ret double [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f64_global( ; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 @@ -1861,20 +1789,8 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: ret float [[TMP5]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_agent( -; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") monotonic monotonic, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: ret float [[TMP5]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] syncscope("agent") monotonic, align 4 +; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_agent( ; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 @@ -1962,20 +1878,8 @@ define float @test_atomicrmw_fadd_f32_global_one_as(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: ret float [[TMP5]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_one_as( -; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 -; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("one-as") monotonic monotonic, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: ret float [[TMP5]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] syncscope("one-as") monotonic, align 4 +; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_one_as( ; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4