13 changes: 4 additions & 9 deletions llvm/test/CodeGen/AMDGPU/select-i1.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN

; GCN-LABEL: {{^}}select_i1:
; GCN: v_cndmask_b32
; GCN: s_cselect_b32
; GCN-NOT: v_cndmask_b32
define amdgpu_kernel void @select_i1(ptr addrspace(1) %out, i32 %cond, i1 %a, i1 %b) nounwind {
%cmp = icmp ugt i32 %cond, 5
Expand All @@ -16,14 +16,9 @@ define amdgpu_kernel void @select_i1(ptr addrspace(1) %out, i32 %cond, i1 %a, i1
; GCN-LABEL: {{^}}s_minmax_i1:
; GCN: s_load_dword [[LOAD:s[0-9]+]],
; GCN: s_bitcmp1_b32 [[LOAD]], 0
; GCN: s_cselect_b64 vcc, -1, 0
; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16

; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
; GCN: s_cselect_b32 [[SHIFTVAL:s[0-9]+]], 8, 16
; GCN: s_lshr_b32 [[LOAD]], [[LOAD]], [[SHIFTVAL]]
; GCN: s_and_b32 [[LOAD]], [[LOAD]], 1
define amdgpu_kernel void @s_minmax_i1(ptr addrspace(1) %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
%cmp = icmp slt i1 %cond, false
%sel = select i1 %cmp, i1 %a, i1 %b
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/select-vectors.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

; This is worse when i16 is legal and packed is not because
; SelectionDAGBuilder for some reason changes the select type.
; VI: v_cndmask_b32
; VI: s_cselect_b64
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
%a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
Expand Down Expand Up @@ -111,8 +111,7 @@ define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1
; SI: cndmask
; SI-NOT: cndmask

; VI: s_cselect_b32
; VI: s_cselect_b32
; VI: s_cselect_b64
; GFX9: cndmask
; GFX9: cndmask
define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
Expand Down
17 changes: 5 additions & 12 deletions llvm/test/CodeGen/AMDGPU/setcc-opt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -154,19 +154,12 @@ define amdgpu_kernel void @zext_bool_icmp_ne_neg1(ptr addrspace(1) %out, i32 %a,
}

; FUNC-LABEL: {{^}}cmp_zext_k_i8max:
; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff
; SI: s_cmpk_lg_i32 [[B]], 0xff
; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0

; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], 0xff
; VI: s_movk_i32 [[K255:s[0-9]+]], 0xff
; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]]
; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]]
; GCN: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff
; GCN: s_cmpk_lg_i32 [[B]], 0xff
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0

; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; VI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define amdgpu_kernel void @cmp_zext_k_i8max(ptr addrspace(1) %out, i8 %b) nounwind {
Expand Down
19 changes: 9 additions & 10 deletions llvm/test/CodeGen/AMDGPU/sign_extend.ll
Original file line number Diff line number Diff line change
Expand Up @@ -372,14 +372,14 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s4
; VI-NEXT: s_ashr_i32 s5, s4, 24
; VI-NEXT: s_bfe_i32 s6, s4, 0x80010
; VI-NEXT: s_bfe_i32 s7, s4, 0x80008
; VI-NEXT: s_sext_i32_i8 s4, s4
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
Expand Down Expand Up @@ -447,19 +447,18 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
; VI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
; VI-NEXT: v_bfe_i32 v2, v0, 16, 8
; VI-NEXT: v_bfe_i32 v3, v0, 8, 8
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in
%cast = bitcast i32 %a to <4 x i8>
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -861,26 +861,26 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
; GFX9-NEXT: global_load_dword v2, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX9-NEXT: v_cmp_gt_i32_sdwa vcc, sext(v1), sext(v2) src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_cmp_gt_i32_sdwa s[0:1], sext(v1), sext(v2) src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX9-NEXT: v_cmp_gt_i16_e32 vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc
; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], v3, v4
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[0:1]
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX9-NEXT: v_lshl_or_b32 v4, v6, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v4
; GFX9-NEXT: global_store_dword v0, v5, s[4:5]
; GFX9-NEXT: v_or_b32_e32 v2, v5, v2
; GFX9-NEXT: global_store_dword v0, v4, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, v2, v3
; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
; GFX9-NEXT: v_and_b32_e32 v0, 3, v2
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -921,7 +921,7 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
; VI-NEXT: s_cselect_b32 s0, s2, s4
; VI-NEXT: s_cselect_b32 s1, s4, s2
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: v_lshlrev_b16_e32 v4, 1, v4
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v4
; VI-NEXT: s_lshl_b32 s2, s3, 16
; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s5
Expand Down
34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AMDGPU/srem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_movk_i32 s0, 0x4925
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v1, v0, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_i32 v2, v1, 0, 16
; GCN-NEXT: v_mul_lo_u32 v2, v2, s0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 31, v2
; GCN-NEXT: v_ashrrev_i32_e32 v2, 17, v2
; GCN-NEXT: v_add_u16_e32 v2, v2, v3
; GCN-NEXT: v_mul_lo_u16_e32 v2, 7, v2
; GCN-NEXT: v_sub_u16_e32 v1, v1, v2
; GCN-NEXT: v_readfirstlane_b32 s0, v1
; GCN-NEXT: s_sext_i32_i16 s0, s0
; GCN-NEXT: s_mulk_i32 s0, 0x4925
; GCN-NEXT: s_lshr_b32 s1, s0, 31
; GCN-NEXT: s_ashr_i32 s0, s0, 17
; GCN-NEXT: s_add_i32 s0, s0, s1
; GCN-NEXT: s_mul_i32 s0, s0, 7
; GCN-NEXT: v_subrev_u32_e32 v1, s0, v1
; GCN-NEXT: global_store_short v0, v1, s[4:5]
; GCN-NEXT: s_endpgm
;
Expand Down Expand Up @@ -54,17 +54,17 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mov_b32_e32 v0, s2
; TONGA-NEXT: v_mov_b32_e32 v1, s3
; TONGA-NEXT: flat_load_ushort v2, v[0:1]
; TONGA-NEXT: s_movk_i32 s2, 0x4925
; TONGA-NEXT: v_mov_b32_e32 v0, s0
; TONGA-NEXT: v_mov_b32_e32 v1, s1
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_bfe_i32 v0, v2, 0, 16
; TONGA-NEXT: v_mul_lo_u32 v3, v0, s2
; TONGA-NEXT: v_mov_b32_e32 v0, s0
; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 17, v3
; TONGA-NEXT: v_add_u16_e32 v3, v3, v4
; TONGA-NEXT: v_mul_lo_u16_e32 v3, 7, v3
; TONGA-NEXT: v_sub_u16_e32 v2, v2, v3
; TONGA-NEXT: v_readfirstlane_b32 s0, v2
; TONGA-NEXT: s_sext_i32_i16 s0, s0
; TONGA-NEXT: s_mulk_i32 s0, 0x4925
; TONGA-NEXT: s_lshr_b32 s1, s0, 31
; TONGA-NEXT: s_ashr_i32 s0, s0, 17
; TONGA-NEXT: s_add_i32 s0, s0, s1
; TONGA-NEXT: s_mul_i32 s0, s0, 7
; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2
; TONGA-NEXT: flat_store_short v[0:1], v2
; TONGA-NEXT: s_endpgm
;
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/trunc-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v0, 4, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%load0 = load i32, ptr addrspace(1) undef
%load1 = load i32, ptr addrspace(1) null
Expand Down Expand Up @@ -70,7 +70,7 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v0, 4, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%load0 = load float, ptr addrspace(1) undef
%load1 = load float, ptr addrspace(1) null
Expand Down
136 changes: 80 additions & 56 deletions llvm/test/CodeGen/AMDGPU/trunc-store.ll
Original file line number Diff line number Diff line change
Expand Up @@ -58,35 +58,47 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s17
; VI-NEXT: v_mov_b32_e32 v1, s16
; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s19
; VI-NEXT: v_mov_b32_e32 v3, s18
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s13
; VI-NEXT: v_mov_b32_e32 v1, s12
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s15
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s9
; VI-NEXT: v_mov_b32_e32 v1, s8
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s11
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s5
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s7
; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_lshl_b32 s2, s19, 8
; VI-NEXT: s_and_b32 s3, s18, 0xff
; VI-NEXT: s_lshl_b32 s17, s17, 8
; VI-NEXT: s_and_b32 s16, s16, 0xff
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_or_b32 s3, s16, s17
; VI-NEXT: s_lshl_b32 s2, s2, 16
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_lshl_b32 s3, s15, 8
; VI-NEXT: s_and_b32 s14, s14, 0xff
; VI-NEXT: s_lshl_b32 s13, s13, 8
; VI-NEXT: s_and_b32 s12, s12, 0xff
; VI-NEXT: s_lshl_b32 s11, s11, 8
; VI-NEXT: s_and_b32 s10, s10, 0xff
; VI-NEXT: s_lshl_b32 s9, s9, 8
; VI-NEXT: s_and_b32 s8, s8, 0xff
; VI-NEXT: s_lshl_b32 s7, s7, 8
; VI-NEXT: s_and_b32 s6, s6, 0xff
; VI-NEXT: s_lshl_b32 s5, s5, 8
; VI-NEXT: s_and_b32 s4, s4, 0xff
; VI-NEXT: s_or_b32 s3, s14, s3
; VI-NEXT: s_or_b32 s12, s12, s13
; VI-NEXT: s_or_b32 s10, s10, s11
; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: s_or_b32 s6, s6, s7
; VI-NEXT: s_or_b32 s4, s4, s5
; VI-NEXT: s_lshl_b32 s3, s3, 16
; VI-NEXT: s_and_b32 s12, s12, 0xffff
; VI-NEXT: s_lshl_b32 s10, s10, 16
; VI-NEXT: s_and_b32 s8, s8, 0xffff
; VI-NEXT: s_lshl_b32 s6, s6, 16
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: s_or_b32 s3, s12, s3
; VI-NEXT: s_or_b32 s8, s8, s10
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s8
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
Expand Down Expand Up @@ -153,35 +165,47 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0xa4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s26
; VI-NEXT: v_mov_b32_e32 v1, s24
; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s30
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s28
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s18
; VI-NEXT: v_mov_b32_e32 v1, s16
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s22
; VI-NEXT: v_mov_b32_e32 v2, s20
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s10
; VI-NEXT: v_mov_b32_e32 v1, s8
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s14
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s2
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s6
; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_lshl_b32 s1, s30, 8
; VI-NEXT: s_and_b32 s3, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s26, 8
; VI-NEXT: s_or_b32 s1, s3, s1
; VI-NEXT: s_and_b32 s3, s24, 0xff
; VI-NEXT: s_or_b32 s3, s3, s5
; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_or_b32 s1, s3, s1
; VI-NEXT: s_lshl_b32 s3, s22, 8
; VI-NEXT: s_and_b32 s5, s20, 0xff
; VI-NEXT: s_or_b32 s3, s5, s3
; VI-NEXT: s_lshl_b32 s5, s18, 8
; VI-NEXT: s_and_b32 s7, s16, 0xff
; VI-NEXT: s_or_b32 s5, s7, s5
; VI-NEXT: s_lshl_b32 s3, s3, 16
; VI-NEXT: s_and_b32 s5, s5, 0xffff
; VI-NEXT: s_or_b32 s3, s5, s3
; VI-NEXT: s_lshl_b32 s5, s14, 8
; VI-NEXT: s_and_b32 s7, s12, 0xff
; VI-NEXT: s_or_b32 s5, s7, s5
; VI-NEXT: s_lshl_b32 s7, s10, 8
; VI-NEXT: s_and_b32 s8, s8, 0xff
; VI-NEXT: s_lshl_b32 s6, s6, 8
; VI-NEXT: s_and_b32 s4, s4, 0xff
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: s_and_b32 s0, s0, 0xff
; VI-NEXT: s_or_b32 s7, s8, s7
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_and_b32 s7, s7, 0xffff
; VI-NEXT: s_lshl_b32 s4, s4, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s5, s7, s5
; VI-NEXT: s_or_b32 s0, s0, s4
; VI-NEXT: v_mov_b32_e32 v4, s34
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s35
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
Expand Down
12 changes: 7 additions & 5 deletions llvm/test/CodeGen/AMDGPU/uaddo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -499,8 +499,10 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_lt_u16_e32 vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v5
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5
; VI-NEXT: v_cmp_lt_u32_e32 vcc, v6, v4
; VI-NEXT: flat_store_short v[0:1], v5
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
Expand All @@ -514,9 +516,9 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_lt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
Expand Down
12 changes: 7 additions & 5 deletions llvm/test/CodeGen/AMDGPU/usubo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -499,8 +499,10 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_gt_u16_e32 vcc, v5, v4
; VI-NEXT: v_sub_u32_e32 v5, vcc, v4, v5
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5
; VI-NEXT: v_cmp_gt_u32_e32 vcc, v6, v4
; VI-NEXT: flat_store_short v[0:1], v5
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
Expand All @@ -514,9 +516,9 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_gt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,7 @@ entry:
; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
; GCN-ALLOCA: buffer_store_dword

; GCN-PROMOTE: v_cmp_eq_u16
; GCN-PROMOTE: v_cndmask
; GCN-PROMOTE: s_cmp_eq_u32

; GCN: s_cbranch

Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,11 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_mov_b32 s4, 0xffff
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4
; CHECK-NEXT: v_and_b32_e64 v0, s4, v0
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4
; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_4
; CHECK-NEXT: ; %bb.3: ; %bb201
Expand Down
44 changes: 21 additions & 23 deletions llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
Original file line number Diff line number Diff line change
Expand Up @@ -318,42 +318,40 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
; VI-LABEL: widen_v2i8_constant_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 44
; VI-NEXT: v_mov_b32_e32 v1, 3
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s0, 0xffff
; VI-NEXT: s_and_b32 s1, s0, 0xffffff00
; VI-NEXT: s_add_i32 s0, s0, 12
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: s_and_b32 s0, s0, 0xff
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: s_addk_i32 s0, 0x2c00
; VI-NEXT: s_or_b32 s0, s0, 0x300
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_i32 s1, s1, 12
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; VI-NEXT: s_or_b32 s0, s1, 4
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: widen_v2i8_constant_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v0, s0, 12
; GFX11-NEXT: v_and_b32_e64 v1, 0xffffff00, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 4, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x2c00
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v2, 0x300, v2
; GFX11-NEXT: s_add_i32 s1, s0, 12
; GFX11-NEXT: s_and_b32 s0, s0, 0xff00
; GFX11-NEXT: s_or_b32 s1, s1, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_addk_i32 s0, 0x2c00
; GFX11-NEXT: s_or_b32 s0, s0, 0x300
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down