@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s -check-prefix=GFX940
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
; Intrinsic declarations used by the call sites in this test: the flat-pointer
; (p0) atomic fadd intrinsic, once for float data and once for <2 x half> data.
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
Expand All
@@ -15,7 +15,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret:
Expand All
@@ -25,6 +28,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %ptr , float %data )
ret void
Expand Down
Expand Up
@@ -88,8 +93,10 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_rtn:
Expand All
@@ -99,8 +106,10 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %ptr , float %data )
ret float %ret
Expand Down
Expand Up
@@ -142,7 +151,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
Expand All
@@ -152,6 +164,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
%ret = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %ptr , <2 x half > %data )
ret void
Expand All
@@ -161,8 +175,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn:
Expand All
@@ -172,8 +188,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %ptr , <2 x half > %data )
ret <2 x half > %ret
Expand Down
Expand Up
@@ -279,8 +297,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %dat
; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset:
Expand All
@@ -290,8 +310,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %dat
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float , ptr %ptr , i64 1023
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %gep , float %data )
Expand All
@@ -305,8 +327,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %dat
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset:
Expand All
@@ -316,8 +340,10 @@ define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %dat
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float , ptr %ptr , i64 -256
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %gep , float %data )
Expand All
@@ -328,8 +354,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %da
; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset:
Expand All
@@ -339,8 +367,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %da
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float , ptr %ptr , i64 1023
%unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %gep , float %data )
Expand All
@@ -354,8 +384,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %da
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset:
Expand All
@@ -365,8 +397,10 @@ define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %da
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float , ptr %ptr , i64 -256
%unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32 (ptr %gep , float %data )
Expand All
@@ -377,8 +411,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2
; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset:
Expand All
@@ -388,8 +424,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half >, ptr %ptr , i64 1023
%result = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %gep , <2 x half > %data )
Expand All
@@ -403,8 +441,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset:
Expand All
@@ -414,8 +454,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half >, ptr %ptr , i64 -256
%result = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %gep , <2 x half > %data )
Expand All
@@ -426,8 +468,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x ha
; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset:
Expand All
@@ -437,8 +481,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half >, ptr %ptr , i64 1023
%unused = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %gep , <2 x half > %data )
Expand All
@@ -452,8 +498,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset:
Expand All
@@ -463,8 +511,10 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half >, ptr %ptr , i64 -256
%unused = call <2 x half > @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16 (ptr %gep , <2 x half > %data )
Expand Down