Input IR
; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - < %s
;; Note: the IR didn't originally come in this form. The loads and stores were
;; on <16 x i8>, and we'd bitcast to <32 x i4> and back for the select. Then
;; `opt` would fold the bitcasts into the loads/stores.
define void @v32i4(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
%v = load <32 x i4>, ptr addrspace(1) %x, align 16
%vMasked = select i1 %cond, <32 x i4> %v, <32 x i4> zeroinitializer
store <32 x i4> %vMasked, ptr addrspace(3) %y, align 16
ret void
}
define void @v16i8(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
%v = load <16 x i8>, ptr addrspace(1) %x, align 16
%vMasked = select i1 %cond, <16 x i8> %v, <16 x i8> zeroinitializer
store <16 x i8> %vMasked, ptr addrspace(3) %y, align 16
ret void
}
define void @v4i32(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
%v = load <4 x i32>, ptr addrspace(1) %x, align 16
%vMasked = select i1 %cond, <4 x i32> %v, <4 x i32> zeroinitializer
store <4 x i32> %vMasked, ptr addrspace(3) %y, align 16
ret void
}
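For reference, a sketch of roughly what the pre-`opt` IR looked like, with the bitcasts still explicit as described in the note above (the function name @v32i4_preopt and the intermediate value names are reconstructed, not taken from the original source):
define void @v32i4_preopt(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
; load/store on <16 x i8>; bitcast to <32 x i4> and back around the select
%v8 = load <16 x i8>, ptr addrspace(1) %x, align 16
%v = bitcast <16 x i8> %v8 to <32 x i4>
%vMasked = select i1 %cond, <32 x i4> %v, <32 x i4> zeroinitializer
%m8 = bitcast <32 x i4> %vMasked to <16 x i8>
store <16 x i8> %m8, ptr addrspace(3) %y, align 16
ret void
}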
Current output
Assembly output of the poorly-compiling program at time of writing
v32i4: ; @v32i4
; %bb.0:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
v_mov_b32_e32 v5, v2
v_mov_b32_e32 v4, v1
global_load_dwordx4 v[4:7], v[4:5], off
s_mov_b32 s0, 0x5040100
v_and_b32_e32 v0, 1, v0
v_cmp_eq_u32_e32 vcc, 1, v0
s_waitcnt vmcnt(0)
scratch_store_dwordx4 off, v[4:7], s32
scratch_load_dwordx2 v[4:5], off, s32
s_nop 0
scratch_load_dwordx2 v[6:7], off, s32 offset:8
s_waitcnt vmcnt(1)
v_lshrrev_b32_e32 v8, 28, v4
s_waitcnt vmcnt(0)
v_lshrrev_b32_e32 v12, 28, v6
v_and_b32_e32 v13, 15, v6
v_bfe_u32 v29, v6, 24, 4
v_bfe_u32 v30, v6, 16, 4
v_bfe_u32 v31, v6, 20, 4
v_bfe_u32 v32, v6, 8, 4
v_bfe_u32 v33, v6, 12, 4
v_bfe_u32 v6, v6, 4, 4
v_and_b32_e32 v9, 15, v4
v_bfe_u32 v19, v4, 24, 4
v_bfe_u32 v20, v4, 16, 4
v_bfe_u32 v21, v4, 20, 4
v_bfe_u32 v22, v4, 8, 4
v_bfe_u32 v23, v4, 12, 4
v_bfe_u32 v4, v4, 4, 4
v_perm_b32 v6, v6, v13, s0
v_perm_b32 v13, v33, v32, s0
v_lshrrev_b32_e32 v1, 28, v5
v_and_b32_e32 v2, 15, v5
v_bfe_u32 v14, v5, 24, 4
v_bfe_u32 v15, v5, 16, 4
v_bfe_u32 v16, v5, 20, 4
v_bfe_u32 v17, v5, 8, 4
v_bfe_u32 v18, v5, 12, 4
v_bfe_u32 v5, v5, 4, 4
v_bfe_u32 v25, v7, 16, 4
v_bfe_u32 v26, v7, 20, 4
v_perm_b32 v30, v31, v30, s0
v_perm_b32 v4, v4, v9, s0
v_perm_b32 v9, v23, v22, s0
v_cndmask_b32_e32 v0, 0, v6, vcc
v_cndmask_b32_e32 v6, 0, v13, vcc
v_perm_b32 v12, v12, v29, s0
v_perm_b32 v25, v26, v25, s0
v_perm_b32 v20, v21, v20, s0
v_perm_b32 v2, v5, v2, s0
v_perm_b32 v5, v18, v17, s0
v_cndmask_b32_e32 v13, 0, v30, vcc
v_cndmask_b32_e32 v4, 0, v4, vcc
v_cndmask_b32_e32 v9, 0, v9, vcc
v_lshrrev_b32_e32 v17, 12, v0
v_lshrrev_b32_e32 v18, 4, v6
v_and_b32_e32 v6, 15, v6
v_lshrrev_b32_e32 v10, 28, v7
v_and_b32_e32 v11, 15, v7
v_bfe_u32 v24, v7, 24, 4
v_bfe_u32 v27, v7, 8, 4
v_bfe_u32 v28, v7, 12, 4
v_bfe_u32 v7, v7, 4, 4
v_perm_b32 v8, v8, v19, s0
v_perm_b32 v15, v16, v15, s0
v_perm_b32 v1, v1, v14, s0
v_cndmask_b32_e32 v12, 0, v12, vcc
v_cndmask_b32_e32 v14, 0, v25, vcc
v_cndmask_b32_e32 v16, 0, v20, vcc
v_cndmask_b32_e32 v2, 0, v2, vcc
v_cndmask_b32_e32 v5, 0, v5, vcc
v_lshlrev_b32_e32 v19, 16, v13
v_lshlrev_b32_e32 v13, 4, v13
v_lshrrev_b32_e32 v25, 12, v4
v_lshrrev_b32_e32 v26, 4, v9
v_and_b32_e32 v9, 15, v9
v_and_or_b32 v0, v0, 15, v17
v_lshlrev_b32_e32 v6, 8, v6
v_perm_b32 v7, v7, v11, s0
v_perm_b32 v11, v28, v27, s0
v_cndmask_b32_e32 v8, 0, v8, vcc
v_cndmask_b32_e32 v15, 0, v15, vcc
v_lshlrev_b32_e32 v20, 24, v12
v_lshlrev_b32_e32 v12, 12, v12
v_lshlrev_b32_e32 v27, 16, v16
v_lshlrev_b32_e32 v16, 4, v16
v_lshrrev_b32_e32 v29, 12, v2
v_and_b32_e32 v2, 15, v2
v_lshrrev_b32_e32 v30, 4, v5
v_and_b32_e32 v5, 15, v5
v_and_b32_e32 v13, 0xf00000, v13
v_and_or_b32 v4, v4, 15, v25
v_lshlrev_b32_e32 v9, 8, v9
v_or3_b32 v0, v0, v6, v18
v_cndmask_b32_e32 v7, 0, v7, vcc
v_cndmask_b32_e32 v11, 0, v11, vcc
v_cndmask_b32_e32 v1, 0, v1, vcc
v_lshlrev_b32_e32 v28, 24, v8
v_lshlrev_b32_e32 v8, 12, v8
v_lshlrev_b32_e32 v31, 16, v15
v_lshlrev_b32_e32 v15, 4, v15
v_and_b32_e32 v12, 0xf0000000, v12
v_and_b32_e32 v16, 0xf00000, v16
v_or3_b32 v2, 0, v2, v29
v_lshlrev_b32_e32 v5, 8, v5
v_or3_b32 v4, v4, v9, v26
v_or3_b32 v0, v0, v19, v13
v_perm_b32 v10, v10, v24, s0
v_lshrrev_b32_e32 v21, 12, v7
v_and_b32_e32 v7, 15, v7
v_lshrrev_b32_e32 v22, 4, v11
v_and_b32_e32 v11, 15, v11
v_lshlrev_b32_e32 v32, 24, v1
v_lshlrev_b32_e32 v1, 12, v1
v_and_b32_e32 v8, 0xf0000000, v8
v_and_b32_e32 v15, 0xf00000, v15
v_or3_b32 v2, v2, v5, v30
v_or3_b32 v4, v4, v27, v16
v_or3_b32 v0, v0, v20, v12
v_cndmask_b32_e32 v10, 0, v10, vcc
v_lshlrev_b32_e32 v23, 16, v14
v_lshlrev_b32_e32 v14, 4, v14
v_or3_b32 v7, 0, v7, v21
v_lshlrev_b32_e32 v11, 8, v11
v_and_b32_e32 v17, 0xf0000000, v1
v_or3_b32 v2, v2, v31, v15
v_or3_b32 v4, v4, v28, v8
v_or3_b32 v0, v0, 0, 0
v_lshlrev_b32_e32 v24, 24, v10
v_lshlrev_b32_e32 v10, 12, v10
v_and_b32_e32 v14, 0xf00000, v14
v_or3_b32 v1, v7, v11, v22
v_or3_b32 v5, v2, v32, v17
v_or3_b32 v2, v4, 0, 0
v_or3_b32 v0, v0, 0, 0
v_and_b32_e32 v10, 0xf0000000, v10
v_or3_b32 v1, v1, v23, v14
v_or3_b32 v2, v2, 0, 0
v_or3_b32 v0, v0, 0, 0
v_or3_b32 v1, v1, v24, v10
v_or3_b32 v2, v2, 0, 0
v_or3_b32 v0, v0, 0, 0
v_or3_b32 v4, v2, 0, 0
scratch_store_dwordx2 off, v[0:1], s32 offset:24
scratch_store_dwordx2 off, v[4:5], s32 offset:16
scratch_load_dwordx4 v[4:7], off, s32 offset:16
s_waitcnt vmcnt(0)
ds_write_b128 v3, v[4:7]
s_waitcnt lgkmcnt(0)
s_setpc_b64 s[30:31]
v16i8: ; @v16i8
; %bb.0:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
v_mov_b32_e32 v5, v2
v_mov_b32_e32 v4, v1
global_load_dwordx4 v[4:7], v[4:5], off
v_and_b32_e32 v0, 1, v0
v_cmp_eq_u32_e32 vcc, 1, v0
s_waitcnt vmcnt(0)
s_nop 0
v_cndmask_b32_e32 v7, 0, v7, vcc
v_cndmask_b32_e32 v6, 0, v6, vcc
v_cndmask_b32_e32 v5, 0, v5, vcc
v_cndmask_b32_e32 v4, 0, v4, vcc
ds_write_b128 v3, v[4:7]
s_waitcnt lgkmcnt(0)
s_setpc_b64 s[30:31]
v4i32: ; @v4i32
; %bb.0:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
v_mov_b32_e32 v5, v2
v_mov_b32_e32 v4, v1
global_load_dwordx4 v[4:7], v[4:5], off
v_and_b32_e32 v0, 1, v0
v_cmp_eq_u32_e32 vcc, 1, v0
s_waitcnt vmcnt(0)
s_nop 0
v_cndmask_b32_e32 v7, 0, v7, vcc
v_cndmask_b32_e32 v6, 0, v6, vcc
v_cndmask_b32_e32 v5, 0, v5, vcc
v_cndmask_b32_e32 v4, 0, v4, vcc
ds_write_b128 v3, v[4:7]
s_waitcnt lgkmcnt(0)
s_setpc_b64 s[30:31]
Expected behavior
@v32i4, @v16i8, and @v4i32 generate the same or substantially similar code.
Actual behavior
@v32i4 doesn't seem to get the scalar-condition select optimization; instead it emits a long sequence of per-nibble bitfield extracts and re-packs, including ones that cause scratch spills.
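A plausible source-level workaround, given that @v4i32 compiles cleanly: bitcast to <4 x i32> around the select so the masking happens on 32-bit lanes. This is a sketch only; @v32i4_workaround is a made-up name, and whether it fully avoids the v_bfe_u32/scratch sequence should be verified.
define void @v32i4_workaround(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
%v = load <32 x i4>, ptr addrspace(1) %x, align 16
; select on <4 x i32> instead of <32 x i4>; both types are 128 bits wide
%v32 = bitcast <32 x i4> %v to <4 x i32>
%m32 = select i1 %cond, <4 x i32> %v32, <4 x i32> zeroinitializer
%vMasked = bitcast <4 x i32> %m32 to <32 x i4>
store <32 x i4> %vMasked, ptr addrspace(3) %y, align 16
ret void
}
Since zeroinitializer bitcasts to zeroinitializer, this is semantically identical to @v32i4 above.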