Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Disable atomic optimization of fadd/fsub with result #96479

Merged
merged 3 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,17 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
case AtomicRMWInst::FMax:
case AtomicRMWInst::FMin:
break;
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
if (!I.use_empty()) {
// Bail out because the way we would calculate the result value is
// incorrect in the presence of NaNs and infinities.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a todo to fix the 0->-0 case and check no-nan/no-infs?

return;
}
break;
}

// Only 32 and 64 bit floating point atomic ops are supported.
Expand Down
91 changes: 15 additions & 76 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -149,91 +149,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %p
}

define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX11-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE
; GFX11-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.2
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.2 (%ir-block.5):
; GFX11-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $exec_lo
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_3]], 0, [[S_MOV_B32_3]], [[V_ADD_F32_e64_3]], 0, implicit $exec
; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 15
; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_4]]
; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 16
; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_5]], [[V_MOV_B32_dpp5]]
; GFX11-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 31
; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_6]]
; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]]
; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY13]], implicit $exec
; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY14]], implicit $exec
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.3 (%ir-block.36):
; GFX11-NEXT: successors: %bb.5(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX11-NEXT: S_BRANCH %bb.5
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.4.Flow:
; GFX11-NEXT: successors: %bb.6(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.6
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.5 (%ir-block.39):
; GFX11-NEXT: successors: %bb.4(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.4
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.6 (%ir-block.47):
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
ret float %ret
Expand Down
84 changes: 16 additions & 68 deletions llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -155,82 +155,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %p
}

define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
;
; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]]
; GFX11-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32 = SI_PS_LIVE
; GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.1 (%ir-block.5):
; GFX11-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $exec_lo
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], killed [[S_MOV_B32_1]], implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15
; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_3]]
; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16
; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed [[V_READLANE_B32_]], killed [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]]
; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31
; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_5]]
; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.2
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.2 (%ir-block.36):
; GFX11-NEXT: successors: %bb.4(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %2
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY5]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX11-NEXT: S_BRANCH %bb.4
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.3.Flow:
; GFX11-NEXT: successors: %bb.5(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %7, %bb.4
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.5
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.4 (%ir-block.39):
; GFX11-NEXT: successors: %bb.3(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.5 (%ir-block.47):
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
ret float %ret
Expand Down
44 changes: 14 additions & 30 deletions llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,43 +4,27 @@
define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_4
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GCN-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GCN-NEXT: s_mov_b64 s[2:3], 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s6, s[0:1], 0x0
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: .LBB0_2: ; %atomicrmw.start
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: .LBB0_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_mov_b32_e32 v5, v1
; GCN-NEXT: v_add_f32_e32 v4, v5, v2
; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_add_f32_e32 v1, 4.0, v2
; GCN-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz .LBB0_2
; GCN-NEXT: ; %bb.3: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: .LBB0_4: ; %Flow2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN-NEXT: s_cbranch_execnz .LBB0_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: v_readfirstlane_b32 s0, v1
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT: v_mad_f32 v0, v0, 4.0, s0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: global_store_dword v[0:1], v1, off
; GCN-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
store float %result, ptr addrspace(1) undef
Expand Down
Loading
Loading