@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s -check-prefix=GFX940
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %ptr , float %data )
declare <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %ptr , <2 x half > %data )
Expand All
@@ -15,7 +15,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret:
Expand All
@@ -25,6 +28,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %ptr , float %data )
ret void
Expand Down
Expand Up
@@ -90,8 +95,10 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_rtn:
Expand All
@@ -101,8 +108,10 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %ptr , float %data )
ret float %ret
Expand Down
Expand Up
@@ -145,7 +154,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
Expand All
@@ -155,6 +167,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
%ret = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %ptr , <2 x half > %data )
ret void
Expand All
@@ -164,8 +178,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn:
Expand All
@@ -175,8 +191,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %ptr , <2 x half > %data )
ret <2 x half > %ret
Expand Down
Expand Up
@@ -286,8 +304,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %dat
; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset:
Expand All
@@ -297,8 +317,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %dat
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float , ptr %ptr , i64 1023
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %gep , float %data )
Expand All
@@ -312,8 +334,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %dat
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset:
Expand All
@@ -323,8 +347,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %dat
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float , ptr %ptr , i64 -256
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %gep , float %data )
Expand All
@@ -335,8 +361,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %da
; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset:
Expand All
@@ -346,8 +374,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %da
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float , ptr %ptr , i64 1023
%unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %gep , float %data )
Expand All
@@ -361,8 +391,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %da
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset:
Expand All
@@ -372,8 +404,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %da
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float , ptr %ptr , i64 -256
%unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %gep , float %data )
Expand All
@@ -384,8 +418,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2
; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset:
Expand All
@@ -395,8 +431,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half >, ptr %ptr , i64 1023
%result = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %gep , <2 x half > %data )
Expand All
@@ -410,8 +448,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset:
Expand All
@@ -421,8 +461,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half >, ptr %ptr , i64 -256
%result = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %gep , <2 x half > %data )
Expand All
@@ -433,8 +475,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x ha
; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset:
Expand All
@@ -444,8 +488,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half >, ptr %ptr , i64 1023
%unused = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %gep , <2 x half > %data )
Expand All
@@ -459,8 +505,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset:
Expand All
@@ -470,8 +518,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half >, ptr %ptr , i64 -256
%unused = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %gep , <2 x half > %data )
Expand Down