480 changes: 40 additions & 440 deletions llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
@@ -1,7 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s

declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck %s

define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) {
; CHECK-LABEL: IllegalGEPConst:
@@ -17,14 +15,16 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_addc_u32 s1, s5, s1
; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
%i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
%i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8
%i.5 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}

attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" }
!0 = !{}
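
Note: this file shows the rewrite pattern applied throughout the PR. A minimal before/after sketch, with hypothetical operands %p and %v:

; before (deprecated fadd intrinsic; carries no ordering or scope):
;   %r = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %p, double %v)
; after (generic atomicrmw; the metadata asserts the access never touches
; fine-grained remote memory, so the backend may still select the hardware
; fadd instruction):
%r = atomicrmw fadd ptr %p, double %v syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0

!0 = !{}

The new s_waitcnt vmcnt(0) / buffer_wbinvl1_vol CHECK lines follow from the seq_cst agent-scope ordering that the intrinsic form never expressed, and the RUN line gains -amdgpu-atomic-optimizer-strategy=None so the test keeps checking the plain atomic instruction rather than a wavefront reduction.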
115 changes: 0 additions & 115 deletions llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -5,118 +5,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s

define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX908_GFX11_GFX12-NEXT: {{ $}}
; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0
;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
ret void
}

define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX908_GFX11_GFX12-NEXT: {{ $}}
; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0
;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
ret void
}

define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX908_GFX11_GFX12-NEXT: {{ $}}
; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0
;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
ret void
}

define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX908_GFX11_GFX12-NEXT: {{ $}}
; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0
;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
ret void
}

define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) {
; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw
; GFX908_GFX11_GFX12: bb.0 (%ir-block.0):
@@ -324,7 +212,4 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
ret void
}

declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)

!0 = !{}
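
Note: the *_intrinsic functions and their declare lines are deleted outright rather than rewritten, since the surviving *_atomicrmw functions in this file already cover GLOBAL_ATOMIC_ADD_F32 and its _SADDR form. Their body is roughly this sketch (operand names hypothetical):

%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0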
120 changes: 0 additions & 120 deletions llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -4,126 +4,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s

define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
;
; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
ret float %ret
}

define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
;
; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
ret float %ret
}

define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
;
; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
ret float %ret
}

define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
;
; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
ret float %ret
}

define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw
; GFX90A_GFX940: bb.0 (%ir-block.0):
171 changes: 0 additions & 171 deletions llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -2,174 +2,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s

define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}

define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}

define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}

define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}

define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}

define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}

define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret void
}

define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]]
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
ret double %ret
}

define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -299,7 +131,4 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
ret double %ret
}

declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)

!0 = !{}
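
Note: whether the no-rtn or _RTN opcode is selected depends only on whether the atomicrmw result is used, which is why each addressing mode is tested in both flavors. Sketch with hypothetical operands:

; result unused -> GLOBAL_ATOMIC_ADD_F64
%dead = atomicrmw fadd ptr addrspace(1) %p, double %v syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
; result consumed -> GLOBAL_ATOMIC_ADD_F64_RTN
%r = atomicrmw fadd ptr addrspace(1) %p, double %v syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret double %r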
58 changes: 29 additions & 29 deletions llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
@@ -1,13 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s

define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic

define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn
; GFX908: bb.0 (%ir-block.0):
; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX908-NEXT: {{ $}}
@@ -16,10 +17,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
@@ -28,14 +29,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}

define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn
; GFX908: bb.0 (%ir-block.0):
; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX908-NEXT: {{ $}}
@@ -44,10 +45,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs
; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
@@ -56,14 +57,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}

define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat
; GFX908: bb.0 (%ir-block.0):
; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX908-NEXT: {{ $}}
@@ -72,10 +73,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
@@ -84,14 +85,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}

define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat
; GFX908: bb.0 (%ir-block.0):
; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX908-NEXT: {{ $}}
@@ -100,10 +101,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr
; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
@@ -112,11 +113,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}

declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
!0 = !{}
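
Note: besides the function renames, the MachineMemOperands now carry the atomic's real semantics. Each operand changes from

  (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)

to

  (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)

so ordering and scope survive into MIR instead of being approximated with a volatile flag.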
39 changes: 19 additions & 20 deletions llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s

define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic
define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
@@ -14,15 +14,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspac
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %ret
}

define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic
define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
@@ -31,15 +31,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr ad
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %ret
}

define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic
define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX90A_GFX940-NEXT: {{ $}}
@@ -48,15 +48,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr add
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %ret
}

define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic
define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
@@ -65,12 +65,11 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(p
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %ret
}

declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
!0 = !{}
23 changes: 14 additions & 9 deletions llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck --check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck --check-prefix=GCN %s

; Test using saddr addressing mode of global_* flat atomic instructions.

@@ -11,49 +11,54 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn(ptr addrspace(1) inreg %sbase
; GCN-LABEL: global_fadd_saddr_f32_nortn:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep0, float %data)
%ret = atomicrmw fadd ptr addrspace(1) %gep0, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}

define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, float %data) {
; GCN-LABEL: global_fadd_saddr_f32_nortn_neg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep1, float %data)
%ret = atomicrmw fadd ptr addrspace(1) %gep1, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}

define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x half> %data) {
; GCN-LABEL: global_fadd_saddr_v2f16_nortn:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep0, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %gep0, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}

define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x half> %data) {
; GCN-LABEL: global_fadd_saddr_v2f16_nortn_neg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep1, <2 x half> %data)
%ret = atomicrmw fadd ptr addrspace(1) %gep1, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
ret void
}

declare float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) nocapture, float) #0
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) nocapture, <2 x half>) #0

attributes #0 = { argmemonly nounwind willreturn }
!0 = !{}
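
Note: the f32 rewrites attach both !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode, while the v2f16 ones attach only the former; global_atomic_add_f32 on this hardware flushes f32 denormals regardless of mode, so the extra metadata records that flushing is acceptable for the access, whereas pk_add_f16 needs no such waiver. Sketch with hypothetical operands:

%r = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0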
77 changes: 0 additions & 77 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll

This file was deleted.

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -40,12 +40,12 @@ define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr a
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to ptr addrspace(1)
call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %castback, float 100.0)
%unused = atomicrmw fadd ptr addrspace(1) %castback, float 100.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret void
}

declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #1

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind willreturn }

!0 = !{}
28 changes: 0 additions & 28 deletions llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll

This file was deleted.


99 changes: 69 additions & 30 deletions llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -1,9 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s

declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8
declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #8
declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #8
declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #0
declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #0

define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-LABEL: InferNothing:
Expand All @@ -21,37 +20,49 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr %b, i64 %i.2
%i.4 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.3, double %c) #8
%i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}


define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) {
; CHECK-LABEL: InferFadd:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24
; CHECK-NEXT: s_mov_b64 s[2:3], exec
; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_cbranch_execz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_load_dword s8, s[0:1], 0x24
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s3, s2, 31
; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; CHECK-NEXT: s_ashr_i32 s9, s8, 31
; CHECK-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
; CHECK-NEXT: s_add_u32 s0, s4, s0
; CHECK-NEXT: v_mov_b32_e32 v0, s6
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_addc_u32 s1, s5, s1
; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1]
; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
%i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
%i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8
%0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}

Expand All @@ -75,7 +86,7 @@ entry:
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
%i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
%i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #8
%i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #1
ret void
}

Expand All @@ -99,41 +110,57 @@ entry:
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
%i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
%i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #8
%i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #1
ret void
}

define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) {
; CHECK-LABEL: InferMixed:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_mov_b64 s[2:3], exec
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s3, s2, 31
; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; CHECK-NEXT: v_mov_b32_e32 v0, s8
; CHECK-NEXT: v_mov_b32_e32 v1, s9
; CHECK-NEXT: s_add_u32 s0, s4, s0
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; CHECK-NEXT: s_addc_u32 s1, s5, s1
; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: global_atomic_add_f64 v4, v[2:3], s[0:1] offset:-7
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc
; CHECK-NEXT: s_cbranch_execz .LBB4_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_load_dword s0, s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s1, s0, 31
; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; CHECK-NEXT: s_add_u32 s0, s4, s0
; CHECK-NEXT: s_addc_u32 s1, s5, s1
; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1]
; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: .LBB4_2:
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
%i.2 = sext i32 %i to i64
%i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
br label %bb1

bb1:
bb1: ; preds = %entry
%i.7 = ptrtoint ptr addrspace(1) %i.3 to i64
%i.8 = add nsw i64 %i.7, 1
%i.9 = inttoptr i64 %i.8 to ptr addrspace(1)
%i.10 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %d, double %c) #23
%0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
%i.11 = addrspacecast ptr addrspace(1) %i.9 to ptr
%i.12 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.11, double %c) #23
%1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}

Expand All @@ -158,10 +185,21 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_cbranch_vccnz .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %bb1
; CHECK-NEXT: v_mov_b32_e32 v0, s6
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_mov_b64 s[0:1], exec
; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_cbranch_execz .LBB5_4
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: .LBB5_4:
; CHECK-NEXT: s_endpgm
entry:
%i = add nsw i32 %a, -1
Expand All @@ -170,20 +208,21 @@ entry:
%i.4 = ptrtoint ptr addrspace(1) %i.3 to i64
br label %bb0

bb0:
bb0: ; preds = %bb0, %entry
%phi = phi ptr addrspace(1) [ %i.3, %entry ], [ %i.9, %bb0 ]
%i.7 = ptrtoint ptr addrspace(1) %phi to i64
%i.8 = sub nsw i64 %i.7, 1
%cmp2 = icmp eq i64 %i.8, 0
%i.9 = inttoptr i64 %i.7 to ptr addrspace(1)
br i1 %cmp2, label %bb1, label %bb0

bb1:
bb1: ; preds = %bb0
%i.10 = addrspacecast ptr addrspace(1) %i.9 to ptr
%i.11 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.10, double %c) #23
%0 = atomicrmw fadd ptr %i.10, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
ret void
}

attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { mustprogress nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }

attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" }

!0 = !{}