[RISCV] Be more aggressive about shrinking constant build_vector etype (#67175)

If LMUL is more than m1, we can be more aggressive about narrowing the
build_vector via a vsext if legal. If the narrow build_vector gets
lowered as a load, then while both forms are linear in LMUL, load uops
are generally more expensive than extend uops. If the narrow
build_vector gets lowered via dominant values, that work is linear in
both the number of unique elements and LMUL. So provided the number of
unique values is greater than 2, this is a net win in work performed.
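
The trade-off can be sanity-checked with a back-of-the-envelope model. The sketch below is illustrative only: the uop weights, the VLEN=128 assumption, and the strictly-linear-in-LMUL costs are assumptions made for this example, not numbers taken from the patch or from any RISC-V scheduling model.

#include <cstdio>

// Toy model: compare the wide constant-pool-load lowering against the
// narrow-load-plus-vsext lowering for a 16 x i64 constant at VLEN=128.
int main() {
  const int WideLMUL = 8;    // 16 x i64 = 1024 bits -> m8 at VLEN=128
  const int NarrowLMUL = 1;  // the same 16 elements as i8 -> m1
  const int LoadWeight = 2;  // assumption: a load uop costs ~2x an extend uop
  const int ExtWeight = 1;

  // Constant-pool path: one wide load vs. one narrow load plus a vsext.vf8.
  int Wide = WideLMUL * LoadWeight;
  int Narrow = NarrowLMUL * LoadWeight + WideLMUL * ExtWeight;
  printf("constant pool: wide ~%d vs narrow ~%d weighted uops\n", Wide, Narrow);

  // Dominant-values path: work scales with #unique elements times LMUL,
  // plus the final extend for the narrow form.
  for (int Unique = 2; Unique <= 5; ++Unique) {
    int WideDom = Unique * WideLMUL;
    int NarrowDom = Unique * NarrowLMUL + WideLMUL * ExtWeight;
    printf("unique=%d: wide ~%d vs narrow ~%d weighted uops\n", Unique,
           WideDom, NarrowDom);
  }
  return 0;
}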
preames committed Sep 25, 2023
1 parent d374a78 commit 95ce3c2
Showing 11 changed files with 406 additions and 283 deletions.
llvm/lib/Target/RISCV/RISCVISelLowering.cpp (8 additions, 8 deletions)

@@ -3511,17 +3511,14 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
     }
   }
 
-  if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
-    return Res;
-
   // If the number of signbits allows, see if we can lower as a <N x i8>.
-  // We restrict this to N <= 4 to ensure the resulting narrow vector is
-  // 32 bits of smaller and can thus be materialized cheaply from scalar.
-  // The main motivation for this is the constant index vector required
-  // by vrgather.vv. This covers all indice vectors up to size 4.
+  // Our main goal here is to reduce LMUL (and thus work) required to
+  // build the constant, but we will also narrow if the resulting
+  // narrow vector is known to materialize cheaply.
   // TODO: We really should be costing the smaller vector. There are
   // profitable cases this misses.
-  if (EltBitSize > 8 && NumElts <= 4) {
+  if (EltBitSize > 8 &&
+      (NumElts <= 4 || VT.getSizeInBits() > Subtarget.getRealMinVLen())) {
     unsigned SignBits = DAG.ComputeNumSignBits(Op);
     if (EltBitSize - SignBits < 8) {
       SDValue Source =
@@ -3533,6 +3530,9 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
     }
   }
 
+  if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
+    return Res;
+
   // For constant vectors, use generic constant pool lowering. Otherwise,
   // we'd have to materialize constants in GPRs just to move them into the
   // vector.
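
For readers not used to the LMUL arithmetic, the hypothetical helper below mirrors the shape of the outer guard added in the hunk above (the sign-bit check that follows it is omitted): narrowing is attempted either for very small vectors (four or fewer elements) or whenever the fixed-length vector is wider than one vector register, i.e. would need LMUL > 1. The function name, the example VLEN of 128, and the sample vector shapes are invented for illustration; only the condition itself comes from the patch.

#include <cstdio>

// Hypothetical stand-alone mirror of the new guard; not LLVM code.
static bool wouldTryToNarrow(unsigned NumElts, unsigned EltBits,
                             unsigned MinVLen) {
  unsigned VTBits = NumElts * EltBits;
  return EltBits > 8 && (NumElts <= 4 || VTBits > MinVLen);
}

int main() {
  const unsigned MinVLen = 128; // e.g. a zvl128b subtarget
  const struct { unsigned N, EltBits; } Cases[] = {
      {4, 32},  // 128 bits, one register  -> narrowed (small-vector rule)
      {8, 16},  // 128 bits, one register  -> left alone
      {8, 32},  // 256 bits, two registers -> narrowed (LMUL > 1)
      {16, 64}, // 1024 bits, eight regs   -> narrowed (LMUL > 1)
  };
  for (const auto &C : Cases) {
    unsigned Regs = (C.N * C.EltBits + MinVLen - 1) / MinVLen;
    printf("<%u x i%u> spans %u register(s): %s\n", C.N, C.EltBits, Regs,
           wouldTryToNarrow(C.N, C.EltBits, MinVLen) ? "try to narrow"
                                                     : "leave as is");
  }
  return 0;
}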
llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll (33 additions, 22 deletions)

@@ -106,11 +106,12 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
-; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
 ; CHECK-NEXT: vid.v v16
 ; CHECK-NEXT: vsaddu.vx v16, v16, a1
 ; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 2
@@ -125,27 +126,30 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
-; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
 ; CHECK-NEXT: vid.v v16
 ; CHECK-NEXT: vsaddu.vx v16, v16, a1
 ; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 2
 ; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 4
 ; CHECK-NEXT: lui a0, %hi(.LCPI9_2)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 6
@@ -160,59 +164,66 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0)
-; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
 ; CHECK-NEXT: vid.v v16
 ; CHECK-NEXT: vsaddu.vx v16, v16, a1
 ; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 2
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 4
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 6
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 8
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 10
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 12
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
 ; CHECK-NEXT: vmsltu.vx v16, v8, a2
 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT: vslideup.vi v0, v16, 14
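
Each .LCPI index-vector constant in these active_lane_mask tests has 16 i64 elements, so narrowing the build_vector to i8 shrinks every constant-pool entry from 16 x 8 = 128 bytes to 16 x 1 = 16 bytes. Assuming VLEN=128, it also turns an m8 vle64.v into a vle8.v that touches a single register (EMUL=1) followed by one vsext.vf8 back to e64/m8, which is exactly the load-versus-extend trade described in the commit message.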
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll (16 additions, 16 deletions)

@@ -950,16 +950,16 @@ define i32 @extractelt_mul_v4i32(<4 x i32> %x) {
 define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV32NOM-LABEL: extractelt_sdiv_v4i32:
 ; RV32NOM: # %bb.0:
-; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32NOM-NEXT: vmv.v.i v9, 0
 ; RV32NOM-NEXT: lui a0, %hi(.LCPI42_0)
 ; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI42_0)
-; RV32NOM-NEXT: vle32.v v10, (a0)
-; RV32NOM-NEXT: li a0, -1
-; RV32NOM-NEXT: vslide1down.vx v9, v9, a0
-; RV32NOM-NEXT: vand.vv v9, v8, v9
-; RV32NOM-NEXT: vmulh.vv v8, v8, v10
-; RV32NOM-NEXT: vadd.vv v8, v8, v9
+; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32NOM-NEXT: vle32.v v9, (a0)
+; RV32NOM-NEXT: vmulh.vv v9, v8, v9
+; RV32NOM-NEXT: lui a0, 1044480
+; RV32NOM-NEXT: vmv.s.x v10, a0
+; RV32NOM-NEXT: vsext.vf4 v11, v10
+; RV32NOM-NEXT: vand.vv v8, v8, v11
+; RV32NOM-NEXT: vadd.vv v8, v9, v8
 ; RV32NOM-NEXT: lui a0, 12320
 ; RV32NOM-NEXT: addi a0, a0, 257
 ; RV32NOM-NEXT: vmv.s.x v9, a0
@@ -986,16 +986,16 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ;
 ; RV64NOM-LABEL: extractelt_sdiv_v4i32:
 ; RV64NOM: # %bb.0:
-; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64NOM-NEXT: vmv.v.i v9, 0
 ; RV64NOM-NEXT: lui a0, %hi(.LCPI42_0)
 ; RV64NOM-NEXT: addi a0, a0, %lo(.LCPI42_0)
-; RV64NOM-NEXT: vle32.v v10, (a0)
-; RV64NOM-NEXT: li a0, -1
-; RV64NOM-NEXT: vslide1down.vx v9, v9, a0
-; RV64NOM-NEXT: vand.vv v9, v8, v9
-; RV64NOM-NEXT: vmulh.vv v8, v8, v10
-; RV64NOM-NEXT: vadd.vv v8, v8, v9
+; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64NOM-NEXT: vle32.v v9, (a0)
+; RV64NOM-NEXT: vmulh.vv v9, v8, v9
+; RV64NOM-NEXT: lui a0, 1044480
+; RV64NOM-NEXT: vmv.s.x v10, a0
+; RV64NOM-NEXT: vsext.vf4 v11, v10
+; RV64NOM-NEXT: vand.vv v8, v8, v11
+; RV64NOM-NEXT: vadd.vv v8, v9, v8
 ; RV64NOM-NEXT: lui a0, 12320
 ; RV64NOM-NEXT: addiw a0, a0, 257
 ; RV64NOM-NEXT: vmv.s.x v9, a0
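
In the extractelt_sdiv_v4i32 diff above, the old code built the <4 x i32> mask {0, 0, 0, -1} with a vmv.v.i/vslide1down.vx sequence. The new code materializes it from the single 32-bit scalar 0xff000000 (lui a0, 1044480), treats that register as a <4 x i8> constant {0, 0, 0, -1}, and widens it with vsext.vf4; this is the existing NumElts <= 4 narrowing path, which now wins because the patch moved the dominant-values lowering after the narrowing attempt.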
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll (4 additions, 2 deletions)

@@ -292,7 +292,8 @@ define <4 x i64> @buildvec_vid_step1_add0_v4i64() {
 ; RV32-NEXT: lui a0, %hi(.LCPI25_0)
 ; RV32-NEXT: addi a0, a0, %lo(.LCPI25_0)
 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vle8.v v10, (a0)
+; RV32-NEXT: vsext.vf4 v8, v10
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: buildvec_vid_step1_add0_v4i64:
@@ -309,7 +310,8 @@ define <4 x i64> @buildvec_vid_step2_add0_v4i64() {
 ; RV32-NEXT: lui a0, %hi(.LCPI26_0)
 ; RV32-NEXT: addi a0, a0, %lo(.LCPI26_0)
 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vle8.v v10, (a0)
+; RV32-NEXT: vsext.vf4 v8, v10
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: buildvec_vid_step2_add0_v4i64:
