Expand Up
@@ -234,17 +234,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All
@@ -265,17 +265,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W32-NEXT: s_mul_i32 s3, s3, 5
; GFX12W32-NEXT: v_mov_b32_e32 v1, s3
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -512,17 +512,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
Expand All
@@ -544,17 +544,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_mul_i32 s4, s2, s4
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
Expand Down
Expand Up
@@ -857,17 +857,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v0, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -900,16 +900,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; %bb.3:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12W32-NEXT: v_mov_b32_e32 v0, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -1230,18 +1230,18 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_load_b32 s5, s[0:1], 0x44
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v0, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mov_b32_e32 v2, s5
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB3_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -1276,17 +1276,17 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: s_load_b32 s8, s[0:1], 0x44
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12W32-NEXT: v_mov_b32_e32 v0, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mov_b32_e32 v2, s8
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB3_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -1368,10 +1368,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v1, 1
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -1608,18 +1608,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All
@@ -1640,18 +1640,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W32-NEXT: s_mul_i32 s3, s3, 5
; GFX12W32-NEXT: v_mov_b32_e32 v1, s3
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -1892,17 +1892,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
Expand All
@@ -1925,17 +1925,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_mul_i32 s4, s2, s4
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
Expand Down
Expand Up
@@ -2240,17 +2240,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v0, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB7_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -2283,17 +2283,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; %bb.3:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12W32-NEXT: v_mov_b32_e32 v0, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Expand Up
@@ -2375,10 +2375,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v1, 1
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down