[RISCV] Recursively split concat_vector into smaller LMULs #83035
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

This is the concat_vector equivalent of #81312, in that we recursively split concat_vectors with more than two operands into smaller concat_vectors. This allows us to break up the chain of vslideups, as well as perform the vslideups at a smaller LMUL, which in turn reduces register pressure, as the previous lowering performed N vslideups at the highest result LMUL. This is done as a DAG combine so that any undef operands are combined away: if we do this during lowering, we end up with unnecessary vslideups of undefs.

Patch is 114.13 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/83035.diff

11 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0c98642748d4ec..64661ff26f5444 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15262,13 +15262,54 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
}
+// Recursively split up concat_vectors with more than 2 operands:
+//
+// concat_vector op1, op2, op3, op4
+// ->
+// concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
+//
+// This reduces the length of the chain of vslideups and allows us to perform
+// the vslideups at a smaller LMUL.
+//
+// We do this as a DAG combine rather than during lowering so that any undef
+// operands can get combined away.
+static SDValue
+performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVTargetLowering &TLI) {
+ SDLoc DL(N);
+
+ if (N->getNumOperands() <= 2)
+ return SDValue();
+
+ if (!TLI.isTypeLegal(N->getValueType(0)))
+ return SDValue();
+ MVT VT = N->getSimpleValueType(0);
+
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ size_t HalfNumOps = (N->getNumOperands() + 1) / 2;
+ SDValue BotSubConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+ N->ops().take_front(HalfNumOps));
+ SDValue TopSubConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+ N->ops().drop_front(HalfNumOps));
+
+ // Lower to an insert_subvector directly so the concat_vectors don't get
+ // recombined.
+ SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
+ BotSubConcat, DAG.getVectorIdxConstant(0, DL));
+ Vec = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, VT, Vec, TopSubConcat,
+ DAG.getVectorIdxConstant(HalfVT.getVectorMinNumElements(), DL));
+ return Vec;
+}
+
// If we're concatenating a series of vector loads like
// concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
// Then we can turn this into a strided load by widening the vector elements
// vlse32 p, stride=n
-static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget,
- const RISCVTargetLowering &TLI) {
+static SDValue
+performCONCAT_VECTORSStridedLoadCombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget,
+ const RISCVTargetLowering &TLI) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
@@ -16373,7 +16414,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return V;
break;
case ISD::CONCAT_VECTORS:
- if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
+ if (SDValue V =
+ performCONCAT_VECTORSStridedLoadCombine(N, DAG, Subtarget, *this))
+ return V;
+ if (SDValue V = performCONCAT_VECTORSSplitCombine(N, DAG, *this))
return V;
break;
case ISD::INSERT_VECTOR_ELT:
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 87d95d7596d4fa..fd087ba3a21d12 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -123,36 +123,35 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
; CHECK-LABEL: fv64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 4
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI9_2)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v8, 2
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 6
+; CHECK-NEXT: vslideup.vi v0, v9, 4
; CHECK-NEXT: ret
%mask = call <64 x i1> @llvm.get.active.lane.mask.v64i1.i64(i64 %index, i64 %tc)
ret <64 x i1> %mask
@@ -161,72 +160,69 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
; CHECK-LABEL: fv128:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 4
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
; CHECK-NEXT: lui a0, %hi(.LCPI10_2)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 6
; CHECK-NEXT: lui a0, %hi(.LCPI10_3)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v10, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 8
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v10
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v10, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v8, 2
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v9, 4
; CHECK-NEXT: lui a0, %hi(.LCPI10_4)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 10
; CHECK-NEXT: lui a0, %hi(.LCPI10_5)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 12
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI10_6)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 14
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v8, 2
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v9, 4
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v10, 8
; CHECK-NEXT: ret
%mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
index c64216180c2af7..ed434deea1a837 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
; RV32-NEXT: th.swia a0, (a1), 4, 0
; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; RV32-NEXT: vle8.v v10, (a3)
-; RV32-NEXT: vsetivli zero, 8, e8, m1, tu, ma
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; RV32-NEXT: vslideup.vi v10, v9, 4
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT: vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
; RV64-NEXT: th.swia a0, (a1), 4, 0
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; RV64-NEXT: vle8.v v10, (a3)
-; RV64-NEXT: vsetivli zero, 8, e8, m1, tu, ma
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; RV64-NEXT: vslideup.vi v10, v9, 4
; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; RV64-NEXT: vzext.vf4 v12, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index a2d02b6bb641b2..418e842a6725e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -469,9 +469,8 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v13, v10, a0
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v13, v10, a0
; CHECK-NEXT: vslidedown.vx v12, v9, a0
; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index 8acc70faaa1fc9..8f9d26ba001c22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -7,14 +7,14 @@
define <4 x i1> @load_large_vector(ptr %p) {
; ZVE32X-LABEL: load_large_vector:
; ZVE32X: # %bb.0:
-; ZVE32X-NEXT: ld a1, 80(a0)
-; ZVE32X-NEXT: ld a2, 72(a0)
-; ZVE32X-NEXT: ld a3, 56(a0)
-; ZVE32X-NEXT: ld a4, 32(a0)
-; ZVE32X-NEXT: ld a5, 24(a0)
-; ZVE32X-NEXT: ld a6, 48(a0)
-; ZVE32X-NEXT: ld a7, 8(a0)
-; ZVE32X-NEXT: ld a0, 0(a0)
+; ZVE32X-NEXT: ld a1, 8(a0)
+; ZVE32X-NEXT: ld a2, 0(a0)
+; ZVE32X-NEXT: ld a3, 32(a0)
+; ZVE32X-NEXT: ld a4, 80(a0)
+; ZVE32X-NEXT: ld a5, 72(a0)
+; ZVE32X-NEXT: ld a6, 24(a0)
+; ZVE32X-NEXT: ld a7, 56(a0)
+; ZVE32X-NEXT: ld a0, 48(a0)
; ZVE32X-NEXT: xor a4, a5, a4
; ZVE32X-NEXT: snez a4, a4
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
@@ -28,12 +28,10 @@ define <4 x i1> @load_large_vector(ptr %p) {
; ZVE32X-NEXT: vmv.s.x v10, a0
; ZVE32X-NEXT: vand.vi v10, v10, 1
; ZVE32X-NEXT: vmsne.vi v0, v10, 0
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; ZVE32X-NEXT: vmv.v.i v10, 0
; ZVE32X-NEXT: vmerge.vim v11, v10, 1, v0
-; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma
; ZVE32X-NEXT: vslideup.vi v11, v9, 1
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; ZVE32X-NEXT: vmsne.vi v0, v11, 0
; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
; ZVE32X-NEXT: xor a0, a6, a3
@@ -42,22 +40,21 @@ define <4 x i1> @load_large_vector(ptr %p) {
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; ZVE32X-NEXT: vand.vi v11, v11, 1
; ZVE32X-NEXT: vmsne.vi v0, v11, 0
-; ZVE32X-NEXT: vmerge.vim v11, v8, 1, v0
-; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
-; ZVE32X-NEXT: vslideup.vi v9, v11, 2
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmsne.vi v0, v9, 0
-; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
+; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
; ZVE32X-NEXT: xor a1, a2, a1
; ZVE32X-NEXT: snez a0, a1
-; ZVE32X-NEXT: vmv.s.x v10, a0
-; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT: vand.vi v10, v10, 1
+; ZVE32X-NEXT: vmv.s.x v11, a0
+; ZVE32X-NEXT: vand.vi v11, v11, 1
+; ZVE32X-NEXT: vmsne.vi v0, v11, 0
+; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; ZVE32X-NEXT: vmerge.vim v10, v10, 1, v0
+; ZVE32X-NEXT: vslideup.vi v10, v8, 1
; ZVE32X-NEXT: vmsne.vi v0, v10, 0
-; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vslideup.vi v9, v8, 3
-; ZVE32X-NEXT: vmsne.vi v0, v9, 0
+; ZVE32X-NEXT: vmv.v.i v8, 0
+; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
+; ZVE32X-NEXT: vslideup.vi v8, v9, 2
+; ZVE32X-NEXT: vmsne.vi v0, v8, 0
; ZVE32X-NEXT: ret
;
; ZVE64X-LABEL: load_large_vector:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 6ef5aa846d6d96..59fd864e191716 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -16,14 +16,11 @@ define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: concat_4xv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v12, v11
-; CHECK-NEXT: vmv1r.v v14, v9
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v14, 2
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 2
+; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 6
+; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: ret
%ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -34,24 +31,18 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
; CHECK-LABEL: concat_8xv1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v16, v15
-; CHECK-NEXT: vmv1r.v v18, v13
-; CHECK-NEXT: vmv1r.v v20, v11
-; CHECK-NEXT: vmv1r.v v22, v9
-; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v22, 1
-; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v14, v15, 1
+; CHECK-NEXT: vslideup.vi v12, v13, 1
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v14, 2
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 1
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 2
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v20, 3
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 4
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v18, 5
-; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v14, 6
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 7
+; CHECK-NEXT: vslideup.vi v8, v12, 4
; CHECK-NEXT: ret
%ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
%cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
@@ -77,15 +68,14 @@ define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: concat_4xv4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v12, v11
-; CHECK-NEXT: vmv1r.v v16, v10
-; CHECK-NEXT: vmv1r.v v20, v9
-; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v20, 4
-; CHECK-NEXT: vsetivli zero, 12, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 8
+; CHECK-NEXT: vmv1r.v v14, v11
+; CHECK-NEXT: vmv1r.v v12, v10
+; CHECK-NEXT: vmv1r.v v10, v9
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v14, 4
+; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 12
+; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: ret
%ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -96,26 +86,18 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
; CHECK-LABEL: concat_8xv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v16, v15
-; CHECK-NEXT: vmv1r.v v20, v14
-; CHECK-NEXT: vmv1r.v v24, v13
-; CHECK-NEXT: vmv1r.v v28, v11
-; CHECK-NEXT: vmv1r.v v0, v10
-; CHECK-NEXT: vmv1r.v v4, v9
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v4, 2
-; CHECK-NEXT: vsetivli zero, 6, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v0, 4
-; CHECK-NEXT: ...
[truncated]
  MVT VT = N->getSimpleValueType(0);

  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  size_t HalfNumOps = (N->getNumOperands() + 1) / 2;
Should this be HalfVT.getVectorNumElements()? And what if VT isn't a power-of-2 type?
HalfNumOps is the number of operands in each sub-concat_vector, so HalfVT.getVectorNumElements() == HalfNumOps * N->getOperand(0).getSimpleValueType().getVectorNumElements().
And I believe !TLI.isTypeLegal(N->getValueType(0)) should rule out any non-power-of-2 types. Which, come to think of it, should mean that N->getNumOperands() is always a power of two as well?
N->getNumOperands() should always be a power of two as well?
I think so.
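To make that arithmetic concrete, here is a minimal standalone check (plain C++, not LLVM code; the types are an assumed example, not taken from the patch):

#include <cassert>
#include <cstddef>

int main() {
  // Assumed example: a legal VT = v8i32 concatenated from four v2i32 operands.
  std::size_t NumOps = 4;    // N->getNumOperands()
  std::size_t OpNumElts = 2; // elements per operand (v2i32)

  std::size_t VTNumElts = NumOps * OpNumElts; // 8 -> VT = v8i32
  std::size_t HalfVTNumElts = VTNumElts / 2;  // 4 -> HalfVT = v4i32
  std::size_t HalfNumOps = (NumOps + 1) / 2;  // 2 operands per sub-concat

  // The invariant from the reply above: each half concat covers exactly
  // the elements of HalfVT.
  assert(HalfVTNumElts == HalfNumOps * OpNumElts);
  return 0;
}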
; ZVE32X-NEXT: ld a6, 48(a0)
; ZVE32X-NEXT: ld a7, 8(a0)
; ZVE32X-NEXT: ld a0, 0(a0)
; ZVE32X-NEXT: ld a1, 8(a0)
These offsets become more irregular (though they were not so regular before). Maybe this can be a future optimization?
I'm not sure why this combine specifically causes these loads to become unordered, but #73789 may fix this.
I just tried it, the result is:
; ZVE32X-NEXT: ld a1, 48(a0)
; ZVE32X-NEXT: ld a2, 56(a0)
; ZVE32X-NEXT: ld a3, 72(a0)
; ZVE32X-NEXT: ld a4, 80(a0)
; ZVE32X-NEXT: ld a5, 0(a0)
; ZVE32X-NEXT: ld a6, 8(a0)
; ZVE32X-NEXT: ld a7, 24(a0)
; ZVE32X-NEXT: ld a0, 32(a0)
Without your changes, they can be ordered with -riscv-misched-load-clustering=true.
But with your changes, it seems that the loads within each subvector are ordered, but the subvectors themselves are not.
After limiting the splitting down to MF2, the original order of these loads seems to be restored.
Thanks! I don't know which one is more worthwhile. Maybe we can loosen the limitation in the future. :-)
Force-pushed from cfd4f77 to 0f44137
LGTM.
LGTM
Just because it doesn't seem to be clearly reflected otherwise in the review... I talked with Luke about this one offline on Monday and pushed strongly for a restriction on the minimum LMUL we'd split down to. The concern was to avoid increasing register pressure in an unbounded manner. The exact threshold value isn't particularly important. For the corresponding build_vector lowering change, we also chose mf2, with the intention to (someday) explore that tradeoff point more carefully.
I'm a bit hesitant about this being a combine instead of a lowering. In particular, I'm concerned about the split form inhibiting other combines - such as e.g. the load combine. However, I'm choosing to approve for now, we can revisit if this turns out to be an actual problem. Moving this into lowering shouldn't be too hard.
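To make the threshold concrete, here is a minimal standalone model of the guard (plain C++, not the patch's code; the eighths-of-a-register encoding is an assumption for illustration):

#include <cassert>

// Standalone model, not LLVM code: encode LMUL in eighths of a vector
// register, so mf8 = 1, mf4 = 2, mf2 = 4, m1 = 8, m2 = 16, m4 = 32, m8 = 64.
// Each recursive split halves the LMUL the vslideups run at; the threshold
// discussed above stops splitting once the halves would drop below mf2.
constexpr int MF2 = 4;

bool shouldSplit(int lmulEighths) {
  return lmulEighths / 2 >= MF2; // the halves must still be at least mf2
}

int main() {
  assert(shouldSplit(16));  // m2 -> two m1 halves: allowed
  assert(shouldSplit(8));   // m1 -> two mf2 halves: allowed
  assert(!shouldSplit(4));  // mf2 -> would produce mf4 halves: stop
  return 0;
}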
Force-pushed from 0f44137 to 8e6c8cf
Yup, looks like this is interfering with the strided load combine on some x264 workloads, e.g. pixel_var_8x8.
This adds a reduced test case for the regression seen in x264 with #83035. If the intermediate concatenating shuffles are large enough, then the splitting combine will prevent the strided load combine, which is preferable.
This is a reimplementation of the combine added in llvm#83035 but as a lowering. Previously the combine had to concatenate the split vectors with insert_subvector instead of concat_vectors to prevent an infinite combine loop. And we kept it as a combine because if we emitted the insert_subvector during lowering then we didn't fold away inserts of undef subvectors. However it turns out we can avoid this if we just do this in lowering and select a concat_vector directly, since we get the undef folding for free with DAG.getNode via foldCONCAT_VECTORS.
…ing (#85825) This is a reimplementation of the combine added in #83035 but as a lowering instead of a combine, so we don't regress the test case added in e59f120 by interfering with the strided load combine. Previously the combine had to concatenate the split vectors with insert_subvector instead of concat_vectors to prevent an infinite combine loop. And the reasoning behind keeping it as a combine was that if we emitted the insert_subvector during lowering then we didn't fold away inserts of undef subvectors. However it turns out we can avoid this if we just do this in lowering and select a concat_vector directly, since we get the undef folding for free with `DAG.getNode(ISD::CONCAT_VECTOR, ...)` via foldCONCAT_VECTORS.
…lvm#83035)" This reverts commit c59129a. This causes regressions in some x264 workloads like pixel_var_8x8 due to it interfering with the strided load combine. Reverting so I can try to rework it as a lowering instead.
This is the concat_vector equivalent of #81312, in that we recursively split concat_vectors with more than two operands into smaller concat_vectors.
This allows us to break up the chain of vslideups, as well as perform the vslideups at a smaller LMUL, which in turn reduces register pressure as the previous lowering performed N vslideups at the highest result LMUL. For now, it stops splitting past MF2.
This is done as a DAG combine so that any undef operands are combined away: if we do this during lowering, we end up with unnecessary vslideups of undefs.
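To see why the tree shape shortens the dependency chain the description mentions, here is a minimal standalone sketch (plain C++, hypothetical, not the patch's code) that merges operands pairwise the way the combine splits them; the longest chain of dependent merges drops from N-1 to roughly log2(N):

#include <cstddef>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Standalone sketch, not LLVM code: pairwise tree concatenation. Each
// merge stands in for one vslideup; merges on the same level are
// independent of each other and operate on half-sized (smaller-LMUL) values.
std::string concatTree(std::vector<std::string> ops) {
  while (ops.size() > 1) {
    std::vector<std::string> next;
    for (std::size_t i = 0; i + 1 < ops.size(); i += 2)
      next.push_back(ops[i] + ops[i + 1]); // one "vslideup" per pair
    if (ops.size() % 2)                    // odd count: carry the tail up
      next.push_back(ops.back());
    ops = std::move(next);
  }
  return ops.empty() ? std::string() : ops.front();
}

int main() {
  // Four operands -> two levels of merges instead of a three-deep chain.
  std::printf("%s\n", concatTree({"op1", "op2", "op3", "op4"}).c_str());
  return 0;
}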