Skip to content

Commit

Permalink
[AMDGPU] Do not exapnd fp atomics on gfx940
Browse files Browse the repository at this point in the history
FP atomics are safe on gfx940. This fixes regression after D131560.

Fixes: SWDEV-380468

Differential Revision: https://reviews.llvm.org/D143603
  • Loading branch information
rampitec committed Feb 8, 2023
1 parent e887627 commit 94def1b
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 411 deletions.
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -12942,6 +12942,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {

if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
Subtarget->hasAtomicFaddNoRtnInsts()) {
if (Subtarget->hasGFX940Insts())
return AtomicExpansionKind::None;

if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;

Expand Down
49 changes: 6 additions & 43 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
Expand Up @@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_mov_b64 s[2:3], 0
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: flat_load_dword v1, v[0:1]
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB1_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret void
Expand All @@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_mov_b64 s[2:3], 0
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: flat_load_dword v1, v[0:1]
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret void
Expand All @@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret float %ret
Expand Down
33 changes: 2 additions & 31 deletions llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
Expand Up @@ -56,25 +56,11 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX940-LABEL: syncscope_system:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: v_add_f32_e32 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB0_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: syncscope_system:
Expand Down Expand Up @@ -373,23 +359,8 @@ define float @no_unsafe(ptr %addr, float %val) {
; GFX940-LABEL: no_unsafe:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: v_add_f32_e32 v4, v5, v2
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB3_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: no_unsafe:
Expand Down
49 changes: 6 additions & 43 deletions llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
Expand Up @@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_mov_b64 s[2:3], 0
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: flat_load_dword v1, v[0:1]
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB1_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret void
Expand All @@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_mov_b64 s[2:3], 0
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: flat_load_dword v1, v[0:1]
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret void
Expand All @@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret float %ret
Expand Down

0 comments on commit 94def1b

Please sign in to comment.