Skip to content

Commit

Permalink
[AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 3.
Browse files (browse the repository at this point in the history)
Reviewed By: Joe_Nash

Differential Revision: https://reviews.llvm.org/D152716
  • Loading branch information
kosarev committed Jun 15, 2023
1 parent f79b033 commit e9d77cd
Show file tree
Hide file tree
Showing 4 changed files with 605 additions and 0 deletions.
158 changes: 158 additions & 0 deletions llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s

define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
; GCN-LABEL: uniform_vec_0_i16:
Expand Down Expand Up @@ -37,6 +38,19 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s2, s2, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tmp = insertelement <2 x i16> undef, i16 0, i32 0
%vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
%val = bitcast <2 x i16> %vec to i32
Expand All @@ -62,6 +76,13 @@ define i32 @divergent_vec_0_i16(i16 %a) {
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tmp = insertelement <2 x i16> undef, i16 0, i32 0
%vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
%val = bitcast <2 x i16> %vec to i32
Expand Down Expand Up @@ -102,6 +123,19 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tmp = insertelement <2 x i16> undef, i16 %a, i32 0
%vec = insertelement <2 x i16> %tmp, i16 0, i32 1
%val = bitcast <2 x i16> %vec to i32
Expand All @@ -127,6 +161,13 @@ define i32 @divergent_vec_i16_0(i16 %a) {
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tmp = insertelement <2 x i16> undef, i16 %a, i32 0
%vec = insertelement <2 x i16> %tmp, i16 0, i32 1
%val = bitcast <2 x i16> %vec to i32
Expand Down Expand Up @@ -167,6 +208,19 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
; GFX906-NEXT: v_mov_b32_e32 v1, s0
; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tmp = insertelement <2 x half> undef, half %a, i32 0
%vec = insertelement <2 x half> %tmp, half 0.0, i32 1
%val = bitcast <2 x half> %vec to float
Expand All @@ -192,6 +246,13 @@ define float @divergent_vec_f16_0(half %a) {
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tmp = insertelement <2 x half> undef, half %a, i32 0
%vec = insertelement <2 x half> %tmp, half 0.0, i32 1
%val = bitcast <2 x half> %vec to float
Expand Down Expand Up @@ -239,6 +300,19 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_LL:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
%val0 = load volatile i32, ptr addrspace(4) %in0
%val1 = load volatile i32, ptr addrspace(4) %in1
%lo = trunc i32 %val0 to i16
Expand Down Expand Up @@ -272,6 +346,13 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_LL:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tmp = insertelement <2 x i16> undef, i16 %a, i32 0
%vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
%val = bitcast <2 x i16> %vec to i32
Expand Down Expand Up @@ -313,6 +394,17 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_LH:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%shift = lshr i32 %b, 16
%tr = trunc i32 %shift to i16
%tmp = insertelement <2 x i16> undef, i16 %a, i32 0
Expand Down Expand Up @@ -343,6 +435,13 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
; GFX906-NEXT: s_mov_b32 s4, 0xffff
; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_LH:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shift = lshr i32 %b, 16
%tr = trunc i32 %shift to i16
%tmp = insertelement <2 x i16> undef, i16 %a, i32 0
Expand Down Expand Up @@ -386,6 +485,17 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_i16_HH:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%shift_a = lshr i32 %a, 16
%tr_a = trunc i32 %shift_a to i16
%shift_b = lshr i32 %b, 16
Expand Down Expand Up @@ -419,6 +529,13 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
; GFX906-NEXT: s_mov_b32 s4, 0x7060302
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_HH:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shift_a = lshr i32 %a, 16
%tr_a = trunc i32 %shift_a to i16
%shift_b = lshr i32 %b, 16
Expand Down Expand Up @@ -470,6 +587,19 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: uniform_vec_f16_LL:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
%val0 = load volatile i32, ptr addrspace(4) %in0
%val1 = load volatile i32, ptr addrspace(4) %in1
%lo.i = trunc i32 %val0 to i16
Expand Down Expand Up @@ -507,6 +637,13 @@ define float @divergent_vec_f16_LL(half %a, half %b) {
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_f16_LL:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tmp = insertelement <2 x half> undef, half %a, i32 0
%vec = insertelement <2 x half> %tmp, half %b, i32 1
%val = bitcast <2 x half> %vec to float
Expand Down Expand Up @@ -535,6 +672,14 @@ define <2 x i16> @build_vec_v2i16_undeflo_divergent(ptr addrspace(3) %in) #0 {
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: build_vec_v2i16_undeflo_divergent:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_load_u16_d16 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i16, ptr addrspace(3) %in
%build = insertelement <2 x i16> undef, i16 %load, i32 0
Expand Down Expand Up @@ -579,6 +724,19 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in,
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_store_dword v1, v0, s[2:3]
; GFX906-NEXT: s_endpgm
;
; GFX11-LABEL: build_vec_v2i16_undeflo_uniform:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: ds_load_u16_d16 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%load = load i16, ptr addrspace(3) %in
%build = insertelement <2 x i16> undef, i16 %load, i32 0
Expand Down

0 comments on commit e9d77cd

Please sign in to comment.