
[AMDGPU] select i1 %cond, <32 x i4> %x, <32 x i4> zeroinitializer doesn't act like other 128-bit scalar selects #160969

@krzysz00

Description

Input IR

; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - < %s

;; Note: the IR didn't originally come like this. The loads and stores were on
;; <16 x i8> and we'd bitcast to <32 x i4> and back for the select. Then, `opt`
;; would fold the bitcasts into the loads/stores. (A sketch of the pre-`opt`
;; form follows this listing.)
define void @v32i4(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
  %v = load <32 x i4>, ptr addrspace(1) %x, align 16
  %vMasked = select i1 %cond, <32 x i4> %v, <32 x i4> zeroinitializer
  store <32 x i4> %vMasked, ptr addrspace(3) %y, align 16
  ret void
}

define void @v16i8(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
  %v = load <16 x i8>, ptr addrspace(1) %x, align 16
  %vMasked = select i1 %cond, <16 x i8> %v, <16 x i8> zeroinitializer
  store <16 x i8> %vMasked, ptr addrspace(3) %y, align 16
  ret void
}

define void @v4i32(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
  %v = load <4 x i32>, ptr addrspace(1) %x, align 16
  %vMasked = select i1 %cond, <4 x i32> %v, <4 x i32> zeroinitializer
  store <4 x i32> %vMasked, ptr addrspace(3) %y, align 16
  ret void
}
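
;; For reference, a sketch of the pre-`opt` form described in the comment on
;; @v32i4. This is a reconstruction, not the original source; the function name
;; is invented. Running `opt` on it folds the bitcasts into the load and store,
;; yielding @v32i4 above.
define void @v32i4_pre_opt(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
  %v8 = load <16 x i8>, ptr addrspace(1) %x, align 16
  %v = bitcast <16 x i8> %v8 to <32 x i4>
  %vMasked = select i1 %cond, <32 x i4> %v, <32 x i4> zeroinitializer
  %vMasked8 = bitcast <32 x i4> %vMasked to <16 x i8>
  store <16 x i8> %vMasked8, ptr addrspace(3) %y, align 16
  ret void
}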

Current output

Assembly output at the time of writing (note how much longer @v32i4 is than the other two functions):
v32i4:                                  ; @v32i4
; %bb.0:
	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	v_mov_b32_e32 v5, v2
	v_mov_b32_e32 v4, v1
	global_load_dwordx4 v[4:7], v[4:5], off
	s_mov_b32 s0, 0x5040100
	v_and_b32_e32 v0, 1, v0
	v_cmp_eq_u32_e32 vcc, 1, v0
	s_waitcnt vmcnt(0)
	scratch_store_dwordx4 off, v[4:7], s32
	scratch_load_dwordx2 v[4:5], off, s32
	s_nop 0
	scratch_load_dwordx2 v[6:7], off, s32 offset:8
	s_waitcnt vmcnt(1)
	v_lshrrev_b32_e32 v8, 28, v4
	s_waitcnt vmcnt(0)
	v_lshrrev_b32_e32 v12, 28, v6
	v_and_b32_e32 v13, 15, v6
	v_bfe_u32 v29, v6, 24, 4
	v_bfe_u32 v30, v6, 16, 4
	v_bfe_u32 v31, v6, 20, 4
	v_bfe_u32 v32, v6, 8, 4
	v_bfe_u32 v33, v6, 12, 4
	v_bfe_u32 v6, v6, 4, 4
	v_and_b32_e32 v9, 15, v4
	v_bfe_u32 v19, v4, 24, 4
	v_bfe_u32 v20, v4, 16, 4
	v_bfe_u32 v21, v4, 20, 4
	v_bfe_u32 v22, v4, 8, 4
	v_bfe_u32 v23, v4, 12, 4
	v_bfe_u32 v4, v4, 4, 4
	v_perm_b32 v6, v6, v13, s0
	v_perm_b32 v13, v33, v32, s0
	v_lshrrev_b32_e32 v1, 28, v5
	v_and_b32_e32 v2, 15, v5
	v_bfe_u32 v14, v5, 24, 4
	v_bfe_u32 v15, v5, 16, 4
	v_bfe_u32 v16, v5, 20, 4
	v_bfe_u32 v17, v5, 8, 4
	v_bfe_u32 v18, v5, 12, 4
	v_bfe_u32 v5, v5, 4, 4
	v_bfe_u32 v25, v7, 16, 4
	v_bfe_u32 v26, v7, 20, 4
	v_perm_b32 v30, v31, v30, s0
	v_perm_b32 v4, v4, v9, s0
	v_perm_b32 v9, v23, v22, s0
	v_cndmask_b32_e32 v0, 0, v6, vcc
	v_cndmask_b32_e32 v6, 0, v13, vcc
	v_perm_b32 v12, v12, v29, s0
	v_perm_b32 v25, v26, v25, s0
	v_perm_b32 v20, v21, v20, s0
	v_perm_b32 v2, v5, v2, s0
	v_perm_b32 v5, v18, v17, s0
	v_cndmask_b32_e32 v13, 0, v30, vcc
	v_cndmask_b32_e32 v4, 0, v4, vcc
	v_cndmask_b32_e32 v9, 0, v9, vcc
	v_lshrrev_b32_e32 v17, 12, v0
	v_lshrrev_b32_e32 v18, 4, v6
	v_and_b32_e32 v6, 15, v6
	v_lshrrev_b32_e32 v10, 28, v7
	v_and_b32_e32 v11, 15, v7
	v_bfe_u32 v24, v7, 24, 4
	v_bfe_u32 v27, v7, 8, 4
	v_bfe_u32 v28, v7, 12, 4
	v_bfe_u32 v7, v7, 4, 4
	v_perm_b32 v8, v8, v19, s0
	v_perm_b32 v15, v16, v15, s0
	v_perm_b32 v1, v1, v14, s0
	v_cndmask_b32_e32 v12, 0, v12, vcc
	v_cndmask_b32_e32 v14, 0, v25, vcc
	v_cndmask_b32_e32 v16, 0, v20, vcc
	v_cndmask_b32_e32 v2, 0, v2, vcc
	v_cndmask_b32_e32 v5, 0, v5, vcc
	v_lshlrev_b32_e32 v19, 16, v13
	v_lshlrev_b32_e32 v13, 4, v13
	v_lshrrev_b32_e32 v25, 12, v4
	v_lshrrev_b32_e32 v26, 4, v9
	v_and_b32_e32 v9, 15, v9
	v_and_or_b32 v0, v0, 15, v17
	v_lshlrev_b32_e32 v6, 8, v6
	v_perm_b32 v7, v7, v11, s0
	v_perm_b32 v11, v28, v27, s0
	v_cndmask_b32_e32 v8, 0, v8, vcc
	v_cndmask_b32_e32 v15, 0, v15, vcc
	v_lshlrev_b32_e32 v20, 24, v12
	v_lshlrev_b32_e32 v12, 12, v12
	v_lshlrev_b32_e32 v27, 16, v16
	v_lshlrev_b32_e32 v16, 4, v16
	v_lshrrev_b32_e32 v29, 12, v2
	v_and_b32_e32 v2, 15, v2
	v_lshrrev_b32_e32 v30, 4, v5
	v_and_b32_e32 v5, 15, v5
	v_and_b32_e32 v13, 0xf00000, v13
	v_and_or_b32 v4, v4, 15, v25
	v_lshlrev_b32_e32 v9, 8, v9
	v_or3_b32 v0, v0, v6, v18
	v_cndmask_b32_e32 v7, 0, v7, vcc
	v_cndmask_b32_e32 v11, 0, v11, vcc
	v_cndmask_b32_e32 v1, 0, v1, vcc
	v_lshlrev_b32_e32 v28, 24, v8
	v_lshlrev_b32_e32 v8, 12, v8
	v_lshlrev_b32_e32 v31, 16, v15
	v_lshlrev_b32_e32 v15, 4, v15
	v_and_b32_e32 v12, 0xf0000000, v12
	v_and_b32_e32 v16, 0xf00000, v16
	v_or3_b32 v2, 0, v2, v29
	v_lshlrev_b32_e32 v5, 8, v5
	v_or3_b32 v4, v4, v9, v26
	v_or3_b32 v0, v0, v19, v13
	v_perm_b32 v10, v10, v24, s0
	v_lshrrev_b32_e32 v21, 12, v7
	v_and_b32_e32 v7, 15, v7
	v_lshrrev_b32_e32 v22, 4, v11
	v_and_b32_e32 v11, 15, v11
	v_lshlrev_b32_e32 v32, 24, v1
	v_lshlrev_b32_e32 v1, 12, v1
	v_and_b32_e32 v8, 0xf0000000, v8
	v_and_b32_e32 v15, 0xf00000, v15
	v_or3_b32 v2, v2, v5, v30
	v_or3_b32 v4, v4, v27, v16
	v_or3_b32 v0, v0, v20, v12
	v_cndmask_b32_e32 v10, 0, v10, vcc
	v_lshlrev_b32_e32 v23, 16, v14
	v_lshlrev_b32_e32 v14, 4, v14
	v_or3_b32 v7, 0, v7, v21
	v_lshlrev_b32_e32 v11, 8, v11
	v_and_b32_e32 v17, 0xf0000000, v1
	v_or3_b32 v2, v2, v31, v15
	v_or3_b32 v4, v4, v28, v8
	v_or3_b32 v0, v0, 0, 0
	v_lshlrev_b32_e32 v24, 24, v10
	v_lshlrev_b32_e32 v10, 12, v10
	v_and_b32_e32 v14, 0xf00000, v14
	v_or3_b32 v1, v7, v11, v22
	v_or3_b32 v5, v2, v32, v17
	v_or3_b32 v2, v4, 0, 0
	v_or3_b32 v0, v0, 0, 0
	v_and_b32_e32 v10, 0xf0000000, v10
	v_or3_b32 v1, v1, v23, v14
	v_or3_b32 v2, v2, 0, 0
	v_or3_b32 v0, v0, 0, 0
	v_or3_b32 v1, v1, v24, v10
	v_or3_b32 v2, v2, 0, 0
	v_or3_b32 v0, v0, 0, 0
	v_or3_b32 v4, v2, 0, 0
	scratch_store_dwordx2 off, v[0:1], s32 offset:24
	scratch_store_dwordx2 off, v[4:5], s32 offset:16
	scratch_load_dwordx4 v[4:7], off, s32 offset:16
	s_waitcnt vmcnt(0)
	ds_write_b128 v3, v[4:7]
	s_waitcnt lgkmcnt(0)
	s_setpc_b64 s[30:31]

v16i8:                                  ; @v16i8
; %bb.0:
	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	v_mov_b32_e32 v5, v2
	v_mov_b32_e32 v4, v1
	global_load_dwordx4 v[4:7], v[4:5], off
	v_and_b32_e32 v0, 1, v0
	v_cmp_eq_u32_e32 vcc, 1, v0
	s_waitcnt vmcnt(0)
	s_nop 0
	v_cndmask_b32_e32 v7, 0, v7, vcc
	v_cndmask_b32_e32 v6, 0, v6, vcc
	v_cndmask_b32_e32 v5, 0, v5, vcc
	v_cndmask_b32_e32 v4, 0, v4, vcc
	ds_write_b128 v3, v[4:7]
	s_waitcnt lgkmcnt(0)
	s_setpc_b64 s[30:31]

v4i32:                                  ; @v4i32
; %bb.0:
	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	v_mov_b32_e32 v5, v2
	v_mov_b32_e32 v4, v1
	global_load_dwordx4 v[4:7], v[4:5], off
	v_and_b32_e32 v0, 1, v0
	v_cmp_eq_u32_e32 vcc, 1, v0
	s_waitcnt vmcnt(0)
	s_nop 0
	v_cndmask_b32_e32 v7, 0, v7, vcc
	v_cndmask_b32_e32 v6, 0, v6, vcc
	v_cndmask_b32_e32 v5, 0, v5, vcc
	v_cndmask_b32_e32 v4, 0, v4, vcc
	ds_write_b128 v3, v[4:7]
	s_waitcnt lgkmcnt(0)
	s_setpc_b64 s[30:31]


Expected behavior

@v32i4, @v16i8, and @v4i32 should generate the same or substantially similar code, since each select operates on the same 128 bits of data.

Actual behavior

@v32i4 doesn't get the scalar-condition optimization: instead of the four whole-register v_cndmask_b32 instructions emitted for @v16i8 and @v4i32, the select is scalarized into a long sequence of per-nibble bitfield extracts (v_bfe_u32) and re-packs, including round trips through scratch (the scratch_store/scratch_load spills above).
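
A possible front-end workaround (an untested sketch; the function name is invented for illustration) is to bitcast to a legal 32-bit element type around the select, since the <4 x i32> form above lowers to plain v_cndmask_b32:

define void @v32i4_workaround(i1 %cond, ptr addrspace(1) %x, ptr addrspace(3) %y) {
  %v = load <32 x i4>, ptr addrspace(1) %x, align 16
  ;; Assumption: a select on <4 x i32> (a legal 128-bit type) should take the
  ;; same lowering path as @v4i32 above, avoiding the per-nibble scalarization.
  %v32 = bitcast <32 x i4> %v to <4 x i32>
  %masked32 = select i1 %cond, <4 x i32> %v32, <4 x i32> zeroinitializer
  %vMasked = bitcast <4 x i32> %masked32 to <32 x i4>
  store <32 x i4> %vMasked, ptr addrspace(3) %y, align 16
  ret void
}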
