370 changes: 185 additions & 185 deletions llvm/test/CodeGen/AMDGPU/cttz.ll

Large diffs are not rendered by default.

424 changes: 212 additions & 212 deletions llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll

Large diffs are not rendered by default.

665 changes: 331 additions & 334 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Large diffs are not rendered by default.

48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -376,32 +376,32 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
;
; GFX9-LABEL: uniform_vec_i16_LH:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_LH:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[4:5]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_LH:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3
; GFX11-NEXT: s_pack_lh_b32_b16 s0, s6, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down Expand Up @@ -466,32 +466,32 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
;
; GFX9-LABEL: uniform_vec_i16_HH:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_HH:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[4:5]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_HH:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3
; GFX11-NEXT: s_pack_hh_b32_b16 s0, s6, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/ds_read2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -319,16 +319,16 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
Expand Down Expand Up @@ -370,16 +370,16 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2 offset:32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
Expand Down
50 changes: 25 additions & 25 deletions llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -208,30 +208,30 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
;
; VI-LABEL: extract_vector_elt_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extract_vector_elt_v3f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s6
; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:2
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down Expand Up @@ -278,16 +278,16 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x34
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s4, s4, 4
; GFX11-NEXT: s_lshl_b32 s0, s2, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/fabs.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -212,14 +212,14 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
;
; GFX11-LABEL: s_fabs_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; GFX11-NEXT: s_and_b32 s0, s6, 0x7fff7fff
; GFX11-NEXT: s_and_b32 s1, s7, 0x7fff7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AMDGPU/fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,14 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; VI-LABEL: fabs_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s3, 31
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_and_b32 s0, s7, 0x7fffffff
; VI-NEXT: s_and_b32 s1, s6, 0x7fffffff
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
Expand Down Expand Up @@ -185,12 +185,12 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa
;
; VI-LABEL: fabsf_fn_fold:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mul_f32_e64 v2, |s6|, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @fabsf(float %in0)
Expand All @@ -215,12 +215,12 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i
;
; VI-LABEL: fabs_fold:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mul_f32_e64 v2, |s6|, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in0)
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
;
; GFX12-LABEL: s_test_canonicalize_var_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: v_max_num_f32_e64 v1, s6, s6
; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
Expand Down
63 changes: 31 additions & 32 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1811,60 +1811,59 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
;
; VI-LABEL: s_copysign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_movk_i32 s4, 0x7fff
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_lshr_b32 s3, s3, 16
; VI-NEXT: s_lshr_b32 s2, s2, 16
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_lshr_b32 s1, s7, 16
; VI-NEXT: s_lshr_b32 s2, s6, 16
; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_copysign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: s_lshr_b32 s1, s7, 16
; GFX9-NEXT: s_lshr_b32 s2, s6, 16
; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: s_copysign_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s3
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
; GFX11-NEXT: s_lshr_b32 s2, s2, 16
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_lshr_b32 s0, s7, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_lshr_b32 s0, s6, 16
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
66 changes: 33 additions & 33 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,25 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag
;
; VI-LABEL: s_test_copysign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_bfi_b32 v2, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s6, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down Expand Up @@ -1018,26 +1018,26 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f32_fpext_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_bfi_b32 v2, s4, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s7
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_bfi_b32 v2, s0, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_fpext_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s6, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down Expand Up @@ -1116,26 +1116,26 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f32_fpext_bf16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_bfi_b32 v2, s4, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s7
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_bfi_b32 v2, s0, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_fpext_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s6, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
286 changes: 143 additions & 143 deletions llvm/test/CodeGen/AMDGPU/fdiv.ll

Large diffs are not rendered by default.

40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/flat_atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4300,27 +4300,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
;
; GCN2-LABEL: atomic_cmpxchg_i32_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: s_add_u32 s0, s4, 16
; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down Expand Up @@ -4557,25 +4557,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
;
; GCN2-LABEL: atomic_cmpxchg_i32:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_cmpxchg_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down
112 changes: 56 additions & 56 deletions llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3853,13 +3853,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
;
; GCN2-LABEL: atomic_max_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_ashr_i32 s5, s3, 31
; GCN2-NEXT: s_mov_b32 s4, s3
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_ashr_i32 s1, s7, 31
; GCN2-NEXT: s_mov_b32 s0, s7
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -3869,7 +3869,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
Expand All @@ -3883,21 +3883,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
;
; GCN3-LABEL: atomic_max_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down Expand Up @@ -4057,21 +4057,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
;
; GCN2-LABEL: atomic_max_i32_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_ashr_i32 s5, s3, 31
; GCN2-NEXT: s_mov_b32 s4, s3
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_ashr_i32 s1, s7, 31
; GCN2-NEXT: s_mov_b32 s0, s7
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
Expand All @@ -4085,21 +4085,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
;
; GCN3-LABEL: atomic_max_i32_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down Expand Up @@ -4996,13 +4996,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
;
; GCN2-LABEL: atomic_umax_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_ashr_i32 s5, s3, 31
; GCN2-NEXT: s_mov_b32 s4, s3
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_ashr_i32 s1, s7, 31
; GCN2-NEXT: s_mov_b32 s0, s7
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -5012,7 +5012,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
Expand All @@ -5026,21 +5026,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
;
; GCN3-LABEL: atomic_umax_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_max_u32_e32 v2, s2, v3
; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down Expand Up @@ -6790,13 +6790,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
;
; GCN2-LABEL: atomic_min_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_ashr_i32 s5, s3, 31
; GCN2-NEXT: s_mov_b32 s4, s3
; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_ashr_i32 s1, s7, 31
; GCN2-NEXT: s_mov_b32 s0, s7
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -6806,7 +6806,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
Expand All @@ -6820,21 +6820,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
;
; GCN3-LABEL: atomic_min_i32_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_ashr_i32 s5, s3, 31
; GCN3-NEXT: s_mov_b32 s4, s3
; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: s_ashr_i32 s1, s7, 31
; GCN3-NEXT: s_mov_b32 s0, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_min_i32_e32 v2, s2, v3
; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
Expand Down
176 changes: 88 additions & 88 deletions llvm/test/CodeGen/AMDGPU/fma-combine.ll

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,19 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
;
; GFX11-LABEL: multiple_fadd_use_test_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0
; GFX11-NEXT: v_add_f32_e64 v1, s2, -1.0
; GFX11-NEXT: v_add_f32_e64 v0, s7, -1.0
; GFX11-NEXT: v_add_f32_e64 v1, s6, -1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v0|, |v1|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_add_f32_e64 v0, |v0|, |v0|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0
; GFX11-NEXT: v_fma_f32 v0, -v1, v0, 1.0
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down Expand Up @@ -174,14 +174,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
;
; GFX11-LABEL: multiple_use_fadd_fmad_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2|
; GFX11-NEXT: v_fma_f32 v2, |s2|, 2.0, s3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
; GFX11-NEXT: v_add_f32_e64 v1, |s6|, |s6|
; GFX11-NEXT: v_fma_f32 v2, |s6|, 2.0, s7
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc
; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
480 changes: 240 additions & 240 deletions llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/fnearbyint.ll
Original file line number Diff line number Diff line change
Expand Up @@ -124,23 +124,23 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fnearbyint_v2f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rndne_f32_e32 v1, s3
; VI-NEXT: v_rndne_f32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_rndne_f32_e32 v1, s7
; VI-NEXT: v_rndne_f32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v1, s3
; GFX11-NEXT: v_rndne_f32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: v_rndne_f32_e32 v1, s7
; GFX11-NEXT: v_rndne_f32_e32 v0, s6
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
46 changes: 23 additions & 23 deletions llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2813,15 +2813,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %
;
; VI-LABEL: s_fneg_select_infloop_regression_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_bitcmp1_b32 s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, float 0.0, float %arg
Expand Down Expand Up @@ -3161,15 +3161,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
;
; VI-LABEL: s_fneg_select_infloop_regression_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, 1, s1
; VI-NEXT: s_cselect_b32 s0, 0, s0
; VI-NEXT: s_xor_b32 s0, s0, 0x80008000
; VI-NEXT: s_cmp_eq_u32 s1, 1
; VI-NEXT: s_cselect_b32 s0, 0, s0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_and_b32 s0, 1, s5
; VI-NEXT: s_cselect_b32 s1, 0, s4
; VI-NEXT: s_xor_b32 s1, s1, 0x80008000
; VI-NEXT: s_cmp_eq_u32 s0, 1
; VI-NEXT: s_cselect_b32 s0, 0, s1
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
Expand Down Expand Up @@ -3293,15 +3293,15 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %
;
; VI-LABEL: s_fabs_select_infloop_regression_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_bitcmp1_b32 s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, float 0.0, float %arg
Expand Down Expand Up @@ -3343,15 +3343,15 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1
;
; VI-LABEL: s_fneg_fabs_select_infloop_regression:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_bitcmp1_b32 s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, float 0.0, float %arg
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -447,14 +447,14 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
;
; GFX11-LABEL: fneg_fabs_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000
; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000
; GFX11-NEXT: s_or_b32 s0, s6, 0x80008000
; GFX11-NEXT: s_or_b32 s1, s7, 0x80008000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x,
;
; VI-LABEL: fneg_fabsf_fadd_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_sub_f32_e64 v2, s3, |v0|
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_sub_f32_e64 v2, s7, |v0|
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %x)
Expand All @@ -49,12 +49,12 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x,
;
; VI-LABEL: fneg_fabsf_fmul_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0|
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mul_f32_e64 v2, s7, -|v0|
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %x)
Expand Down Expand Up @@ -213,14 +213,14 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fneg_fabsf_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s3, 31
; VI-NEXT: s_bitset1_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_or_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s1, s6, 0x80000000
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
Expand Down
26 changes: 13 additions & 13 deletions llvm/test/CodeGen/AMDGPU/fneg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -65,27 +65,27 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
;
; VI-LABEL: s_fneg_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_xor_b32 s0, s7, 0x80000000
; VI-NEXT: s_xor_b32 s1, s6, 0x80000000
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s6, 0x80000000
; GFX11-NEXT: s_xor_b32 s1, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
52 changes: 26 additions & 26 deletions llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
Expand All @@ -36,19 +36,19 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
Expand Down Expand Up @@ -116,19 +116,19 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6
; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
Expand Down Expand Up @@ -164,19 +164,19 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6
; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
Expand Down Expand Up @@ -212,20 +212,20 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1]
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
Expand Down
44 changes: 22 additions & 22 deletions llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX12-NEXT: s_endpgm
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
Expand Down Expand Up @@ -190,10 +190,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX12-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
Expand Down Expand Up @@ -235,10 +235,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
;
; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX12-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
Expand Down Expand Up @@ -280,10 +280,10 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
;
; GFX12-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
Expand Down Expand Up @@ -316,18 +316,18 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: ds_pk_add_f16 v0, v1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: ds_pk_add_f16 v0, v1
; GFX12-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
Expand Down Expand Up @@ -359,10 +359,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: ds_pk_add_bf16 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -371,9 +371,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
;
; GFX12-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
Expand Down
26 changes: 13 additions & 13 deletions llvm/test/CodeGen/AMDGPU/fp-classify.ll
Original file line number Diff line number Diff line change
Expand Up @@ -362,29 +362,29 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_not_pattern_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2
; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0
; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s6, s6
; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s7|, v0
; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2
; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
; GFX11-NEXT: v_cmp_o_f32_e64 s0, s6, s6
; GFX11-NEXT: v_cmp_neq_f32_e64 s1, 0x7f800000, |s7|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s2, s3
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_and_b32 s0, s0, s1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
66 changes: 33 additions & 33 deletions llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -664,51 +664,51 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
;
; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; GFX10-NEXT: global_store_dword v1, v0, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; GFX1030-NEXT: v_mov_b32_e32 v0, s8
; GFX1030-NEXT: v_mov_b32_e32 v1, s9
; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; GFX1030-NEXT: v_mov_b32_e32 v1, 0
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: global_store_dword v1, v0, s[6:7]
; GFX1030-NEXT: global_store_dword v1, v0, s[10:11]
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX1100-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
; GFX1100-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
; GFX1100-NEXT: v_mov_b32_e32 v1, 0
; GFX1100-NEXT: s_waitcnt vmcnt(0)
; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7]
; GFX1100-NEXT: global_store_b32 v1, v0, s[10:11]
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
;
; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_mov_b32 s0, 4
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[4:7], s0 offen th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v1, v0, s[6:7]
; GFX12-NEXT: global_store_b32 v1, v0, s[10:11]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
Expand Down Expand Up @@ -743,37 +743,37 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
;
; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11]
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s8
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s9
; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; G_GFX1030-NEXT: v_mov_b32_e32 v1, 0
; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
; G_GFX1030-NEXT: global_store_dword v1, v0, s[6:7]
; G_GFX1030-NEXT: global_store_dword v1, v0, s[10:11]
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
; G_GFX1100-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0
; G_GFX1100-NEXT: s_waitcnt vmcnt(0)
; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7]
; G_GFX1100-NEXT: global_store_b32 v1, v0, s[10:11]
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
Expand Down
92 changes: 46 additions & 46 deletions llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8)
; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
; GFX1030-NEXT: v_mov_b32_e32 v0, s2
; GFX1030-NEXT: v_mov_b32_e32 v1, s3
; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_clause 0x1
; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
Expand Down Expand Up @@ -408,22 +408,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8)
; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
; GFX1030-NEXT: v_mov_b32_e32 v0, s2
; GFX1030-NEXT: v_mov_b32_e32 v1, s3
; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_clause 0x1
; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
Expand Down Expand Up @@ -607,37 +607,37 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
;
; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; GFX10-NEXT: global_store_dword v1, v0, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; GFX1030-NEXT: v_mov_b32_e32 v0, s8
; GFX1030-NEXT: v_mov_b32_e32 v1, s9
; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; GFX1030-NEXT: v_mov_b32_e32 v1, 0
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: global_store_dword v1, v0, s[6:7]
; GFX1030-NEXT: global_store_dword v1, v0, s[10:11]
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX1100-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
; GFX1100-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
; GFX1100-NEXT: v_mov_b32_e32 v1, 0
; GFX1100-NEXT: s_waitcnt vmcnt(0)
; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7]
; GFX1100-NEXT: global_store_b32 v1, v0, s[10:11]
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
Expand Down Expand Up @@ -672,37 +672,37 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
;
; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11]
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s8
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s9
; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
; G_GFX1030-NEXT: v_mov_b32_e32 v1, 0
; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
; G_GFX1030-NEXT: global_store_dword v1, v0, s[6:7]
; G_GFX1030-NEXT: global_store_dword v1, v0, s[10:11]
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
; G_GFX1100-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0
; G_GFX1100-NEXT: s_waitcnt vmcnt(0)
; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7]
; G_GFX1100-NEXT: global_store_b32 v1, v0, s[10:11]
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
Expand Down
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,38 +28,38 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp16_to_fp32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
; GFX8-NEXT: s_mov_b32 s11, s7
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_mov_b32 s10, s2
; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s8, s2
; GFX8-NEXT: s_mov_b32 s9, s3
; GFX8-NEXT: s_mov_b32 s8, s6
; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: test_convert_fp16_to_fp32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s8, s6
; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,41 +27,41 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp16_to_fp64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
; GFX8-NEXT: s_mov_b32 s11, s7
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_mov_b32 s10, s2
; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s8, s2
; GFX8-NEXT: s_mov_b32 s9, s3
; GFX8-NEXT: s_mov_b32 s8, s6
; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: test_convert_fp16_to_fp64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s8, s6
; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,38 +27,38 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp32_to_fp16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
; GFX8-NEXT: s_mov_b32 s11, s7
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_mov_b32 s10, s2
; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s8, s2
; GFX8-NEXT: s_mov_b32 s9, s3
; GFX8-NEXT: s_mov_b32 s8, s6
; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: test_convert_fp32_to_fp16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
; GFX11-NEXT: s_mov_b32 s11, s7
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s8, s2
; GFX11-NEXT: s_mov_b32 s9, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s8, s6
; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -452,26 +452,26 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s7
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s11
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b64 v2, v[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: v_mov_b32_e32 v2, s6
; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; GFX1030-NEXT: v_mov_b32_e32 v2, s7
; GFX1030-NEXT: v_mov_b32_e32 v0, s8
; GFX1030-NEXT: v_mov_b32_e32 v1, s9
; GFX1030-NEXT: v_mov_b32_e32 v2, s10
; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; GFX1030-NEXT: v_mov_b32_e32 v2, s11
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: ds_write_b64 v2, v[0:1]
; GFX1030-NEXT: s_endpgm
Expand Down Expand Up @@ -506,26 +506,26 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: v_mov_b32_e32 v2, s6
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s7
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: v_mov_b32_e32 v2, s10
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s11
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; G_GFX1030-NEXT: v_mov_b32_e32 v2, s7
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s8
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s9
; G_GFX1030-NEXT: v_mov_b32_e32 v2, s10
; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; G_GFX1030-NEXT: v_mov_b32_e32 v2, s11
; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
; G_GFX1030-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX1030-NEXT: s_endpgm
Expand Down
76 changes: 38 additions & 38 deletions llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: v_mov_b32_e32 v2, s6
; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
; GFX1030-NEXT: v_mov_b32_e32 v0, s2
; GFX1030-NEXT: v_mov_b32_e32 v1, s3
; GFX1030-NEXT: v_mov_b32_e32 v2, s8
; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
Expand Down Expand Up @@ -291,14 +291,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: v_mov_b32_e32 v2, s6
; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
; GFX1030-NEXT: v_mov_b32_e32 v0, s2
; GFX1030-NEXT: v_mov_b32_e32 v1, s3
; GFX1030-NEXT: v_mov_b32_e32 v2, s8
; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
Expand Down Expand Up @@ -452,26 +452,26 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
;
; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s7
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s11
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b64 v2, v[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: v_mov_b32_e32 v2, s6
; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; GFX1030-NEXT: v_mov_b32_e32 v2, s7
; GFX1030-NEXT: v_mov_b32_e32 v0, s8
; GFX1030-NEXT: v_mov_b32_e32 v1, s9
; GFX1030-NEXT: v_mov_b32_e32 v2, s10
; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; GFX1030-NEXT: v_mov_b32_e32 v2, s11
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: ds_write_b64 v2, v[0:1]
; GFX1030-NEXT: s_endpgm
Expand Down Expand Up @@ -506,26 +506,26 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
;
; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: v_mov_b32_e32 v2, s6
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s7
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: v_mov_b32_e32 v2, s10
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s11
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; G_GFX1030-NEXT: v_mov_b32_e32 v2, s7
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s8
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s9
; G_GFX1030-NEXT: v_mov_b32_e32 v2, s10
; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; G_GFX1030-NEXT: v_mov_b32_e32 v2, s11
; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
; G_GFX1030-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX1030-NEXT: s_endpgm
Expand Down
38 changes: 19 additions & 19 deletions llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,15 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fp_to_sint_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e32 v1, s3
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: v_cvt_i32_f32_e32 v1, s7
; VI-NEXT: v_cvt_i32_f32_e32 v0, s6
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_v2i32:
Expand Down Expand Up @@ -329,24 +329,24 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fp_to_sint_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s8, 0x2f800000
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s3
; VI-NEXT: v_trunc_f32_e32 v0, s7
; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_floor_f32_e32 v1, v1
; VI-NEXT: s_mov_b32 s0, 0xcf800000
; VI-NEXT: v_fma_f32 v2, v1, s0, |v0|
; VI-NEXT: v_trunc_f32_e32 v4, s2
; VI-NEXT: s_mov_b32 s4, 0xcf800000
; VI-NEXT: v_fma_f32 v2, v1, s4, |v0|
; VI-NEXT: v_trunc_f32_e32 v4, s6
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
; VI-NEXT: v_floor_f32_e32 v3, v3
; VI-NEXT: v_cvt_u32_f32_e32 v5, v3
; VI-NEXT: v_fma_f32 v3, v3, s0, |v4|
; VI-NEXT: v_fma_f32 v3, v3, s4, |v4|
; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; VI-NEXT: v_cvt_u32_f32_e32 v6, v3
; VI-NEXT: v_xor_b32_e32 v2, v2, v0
Expand All @@ -357,9 +357,9 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
; VI-NEXT: v_xor_b32_e32 v0, v6, v1
; VI-NEXT: v_xor_b32_e32 v4, v5, v1
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_v2i64:
Expand Down
38 changes: 19 additions & 19 deletions llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x
;
; VI-LABEL: fp_to_uint_v2f32_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v1, s3
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: v_cvt_u32_f32_e32 v1, s7
; VI-NEXT: v_cvt_u32_f32_e32 v0, s6
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v2f32_to_v2i32:
Expand Down Expand Up @@ -264,26 +264,26 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
;
; VI-LABEL: fp_to_uint_v2f32_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0xcf800000
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s3
; VI-NEXT: v_trunc_f32_e32 v4, s2
; VI-NEXT: v_trunc_f32_e32 v0, s7
; VI-NEXT: v_trunc_f32_e32 v4, s6
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; VI-NEXT: v_floor_f32_e32 v5, v1
; VI-NEXT: s_mov_b32 s2, 0xcf800000
; VI-NEXT: v_floor_f32_e32 v6, v2
; VI-NEXT: v_fma_f32 v0, v5, s2, v0
; VI-NEXT: v_fma_f32 v0, v5, s0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v2, v0
; VI-NEXT: v_fma_f32 v0, v6, s2, v4
; VI-NEXT: v_fma_f32 v0, v6, s0, v4
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
Expand Down
88 changes: 44 additions & 44 deletions llvm/test/CodeGen/AMDGPU/fshl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -126,23 +126,23 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
;
; VI-LABEL: fshl_i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_alignbit_b32 v2, s6, v0, 25
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 25
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32_imm:
Expand All @@ -159,20 +159,20 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
;
; GFX10-LABEL: fshl_i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 25
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: v_alignbit_b32 v1, s6, s7, 25
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down Expand Up @@ -720,29 +720,29 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
;
; VI-LABEL: orxor2or1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s4, s2, 7
; VI-NEXT: s_or_b32 s4, s3, s4
; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: s_cselect_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_lshl_b32 s0, s6, 7
; VI-NEXT: s_or_b32 s0, s7, s0
; VI-NEXT: s_cmp_eq_u32 s0, 0
; VI-NEXT: s_cselect_b32 s0, s6, s7
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: orxor2or1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s4, s2, 7
; GFX9-NEXT: s_or_b32 s4, s3, s4
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_lshl_b32 s0, s6, 7
; GFX9-NEXT: s_or_b32 s0, s7, s0
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: orxor2or1:
Expand All @@ -761,29 +761,29 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
;
; GFX10-LABEL: orxor2or1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl_b32 s4, s2, 7
; GFX10-NEXT: s_or_b32 s4, s3, s4
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_lshl_b32 s0, s6, 7
; GFX10-NEXT: s_or_b32 s0, s7, s0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cselect_b32 s0, s6, s7
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: orxor2or1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s4, s2, 7
; GFX11-NEXT: s_lshl_b32 s0, s6, 7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s4, s3, s4
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_cselect_b32 s2, s2, s3
; GFX11-NEXT: s_or_b32 s0, s7, s0
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cselect_b32 s0, s6, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
Expand Down
Loading