[RISCV]Do not combine to 'vw' if the number of extended instructions cannot be reduced #159715
base: main
Conversation
…cannot be reduced

Only zero_ext and signed_ext are handled, which can reduce the number of extend instructions.
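As a reader's aid, here is a minimal standalone sketch of the rule the patch encodes. The helper name is hypothetical, and the real check in RISCVISelLowering.cpp below additionally special-cases splat, VMV_V_X_VL and INSERT_SUBVECTOR operands:

// Illustrative sketch only, not the actual LLVM helpers: a 'vw' instruction
// widens by exactly 2x, so the combine only removes an extend when the result
// is at most twice as wide as the un-extended source; for vf4/vf8 extensions
// an explicit vsext/vzext is still required in front of the 'vw' op, so
// nothing is saved and the combine is skipped.
#include <cstdint>
#include <iostream>

// DstScalarBits: scalar width of the arithmetic result.
// SrcScalarBits: scalar width of the operand before any extension.
bool widenSavesAnExtend(uint64_t DstScalarBits, uint64_t SrcScalarBits) {
  return DstScalarBits <= SrcScalarBits * 2;
}

int main() {
  // i16 -> i32: the extend folds into the widening op, so combining helps.
  std::cout << widenSavesAnExtend(32, 16) << '\n'; // prints 1
  // i16 -> i64: a vf4 extend is needed either way, so the combine is skipped.
  std::cout << widenSavesAnExtend(64, 16) << '\n'; // prints 0
}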
@llvm/pr-subscribers-backend-risc-v

Author: Liao Chunyu (ChunyuLiao)

Changes: Only zero_ext and signed_ext are handled, which can reduce some instructions.

Patch is 59.35 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/159715.diff

10 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4f659471253a4..b980945ac0e00 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -17489,9 +17489,18 @@ struct CombineResult {
Passthru = DAG.getUNDEF(Root->getValueType(0));
break;
}
- return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0),
- LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, LHSExt),
- RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, RHSExt),
+ SDValue L = LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, LHSExt);
+ SDValue R = RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, RHSExt);
+ // Return SDValue() if the instructions are not reduced.
+ if (L->getOpcode() == Root->getOperand(0).getOpcode() &&
+ (R->getOpcode() == RISCVISD::VZEXT_VL ||
+ R->getOpcode() == RISCVISD::VSEXT_VL) &&
+ (R->getOperand(0).getOpcode() != ISD::SPLAT_VECTOR &&
+ R->getOperand(0).getOpcode() != RISCVISD::VMV_V_X_VL &&
+ R->getOperand(0).getOpcode() != ISD::INSERT_SUBVECTOR))
+ return SDValue();
+
+ return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0), L, R,
Passthru, Mask, VL);
}
};
@@ -17740,6 +17749,30 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
if (!NodeExtensionHelper::isSupportedRoot(N, Subtarget))
return SDValue();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ unsigned Opc0 = Op0.getOpcode();
+ unsigned Opc1 = Op1.getOpcode();
+ // Do not combine to the 'vw' instructions if the number of extended
+ // instructions cannot be reduced.
+ // vx and vi, if v is ext_mf4/ext_mf8
+ // vv, if op0 is ext_mf4/ext_mf8 and op1 is ext_mf8(except: imm and scalar)
+ if ((Opc0 == RISCVISD::VZEXT_VL || Opc0 == RISCVISD::VSEXT_VL ||
+ Opc0 == ISD::ZERO_EXTEND || Opc0 == ISD::SIGN_EXTEND) &&
+ (N->getValueType(0).getScalarSizeInBits() >
+ Op0.getOperand(0)->getValueType(0).getScalarSizeInBits() * 2) &&
+ (Opc1 == ISD::SPLAT_VECTOR || Opc1 == RISCVISD::VMV_V_X_VL ||
+ Opc1 == ISD::INSERT_SUBVECTOR ||
+ ((Opc1 == RISCVISD::VZEXT_VL || Opc1 == RISCVISD::VSEXT_VL ||
+ Opc1 == ISD::ZERO_EXTEND || Opc1 == ISD::SIGN_EXTEND) &&
+ Op1.getOperand(0).getOpcode() != ISD::SPLAT_VECTOR &&
+ Op1.getOperand(0).getOpcode() != RISCVISD::VMV_V_X_VL &&
+ Op1.getOperand(0).getOpcode() != ISD::INSERT_SUBVECTOR &&
+ Op1->getValueType(0).getScalarSizeInBits() >
+ Op1.getOperand(0)->getValueType(0).getScalarSizeInBits() * 4))) {
+ return SDValue();
+ }
+
SmallVector<SDNode *> Worklist;
SmallPtrSet<SDNode *, 8> Inserted;
Worklist.push_back(N);
@@ -17817,6 +17850,9 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
ValuesToReplace.reserve(CombinesToApply.size());
for (CombineResult Res : CombinesToApply) {
SDValue NewValue = Res.materialize(DAG, Subtarget);
+ if (!NewValue)
+ return SDValue();
+
if (!InputRootReplacement) {
assert(Res.Root == N &&
"First element is expected to be the current node");
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
index e89bac54a7b66..b58468c9010b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
@@ -869,12 +869,9 @@ define <4 x i64> @crash(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: crash:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT: vmv1r.v v10, v9
-; CHECK-NEXT: vmv1r.v v11, v8
-; CHECK-NEXT: vsext.vf4 v8, v11
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vzext.vf2 v11, v10
-; CHECK-NEXT: vwaddu.wv v8, v8, v11
+; CHECK-NEXT: vsext.vf4 v10, v8
+; CHECK-NEXT: vzext.vf4 v12, v9
+; CHECK-NEXT: vadd.vv v8, v10, v12
; CHECK-NEXT: ret
%a = sext <4 x i16> %x to <4 x i64>
%b = zext <4 x i16> %y to <4 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
index 2c9aed6274dd8..ef048e22553b8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
@@ -716,10 +716,10 @@ define <4 x i64> @vwsll_vv_v4i64_v4i8_zext(<4 x i8> %a, <4 x i8> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_v4i8_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf4 v11, v9
-; CHECK-ZVBB-NEXT: vwsll.vv v8, v10, v11
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i8> %a to <4 x i64>
%y = zext <4 x i8> %b to <4 x i64>
@@ -917,9 +917,9 @@ define <4 x i64> @vwsll_vi_v4i64_v4i8(<4 x i8> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v4i64_v4i8:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8
-; CHECK-ZVBB-NEXT: vwsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i8> %a to <4 x i64>
%z = shl <4 x i64> %x, splat (i64 2)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
index e6ca6875e1412..48f47b68151d7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
@@ -7,11 +7,10 @@
define i32 @vqdot_vv(<16 x i8> %a, <16 x i8> %b) {
; NODOT-LABEL: vqdot_vv:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v12, v8
-; NODOT-NEXT: vsext.vf2 v14, v9
-; NODOT-NEXT: vwmul.vv v8, v12, v14
-; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v12, v8
+; NODOT-NEXT: vsext.vf4 v16, v9
+; NODOT-NEXT: vmul.vv v8, v12, v16
; NODOT-NEXT: vmv.s.x v12, zero
; NODOT-NEXT: vredsum.vs v8, v8, v12
; NODOT-NEXT: vmv.x.s a0, v8
@@ -37,11 +36,10 @@ entry:
define i32 @vqdot_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vqdot_vx_constant:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsext.vf4 v12, v8
; CHECK-NEXT: li a0, 23
-; CHECK-NEXT: vwmul.vx v8, v12, a0
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmul.vx v8, v12, a0
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
@@ -56,11 +54,10 @@ entry:
define i32 @vqdot_vx_constant_swapped(<16 x i8> %a) {
; CHECK-LABEL: vqdot_vx_constant_swapped:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsext.vf4 v12, v8
; CHECK-NEXT: li a0, 23
-; CHECK-NEXT: vwmul.vx v8, v12, a0
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmul.vx v8, v12, a0
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
@@ -105,11 +102,10 @@ entry:
define i32 @vqdotu_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vqdotu_vx_constant:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v12, v8
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vzext.vf4 v12, v8
; CHECK-NEXT: li a0, 123
-; CHECK-NEXT: vwmulu.vx v8, v12, a0
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmul.vx v8, v12, a0
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
@@ -124,11 +120,10 @@ entry:
define i32 @vqdotsu_vv(<16 x i8> %a, <16 x i8> %b) {
; NODOT-LABEL: vqdotsu_vv:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v12, v8
-; NODOT-NEXT: vzext.vf2 v14, v9
-; NODOT-NEXT: vwmulsu.vv v8, v12, v14
-; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v12, v8
+; NODOT-NEXT: vzext.vf4 v16, v9
+; NODOT-NEXT: vmul.vv v8, v12, v16
; NODOT-NEXT: vmv.s.x v12, zero
; NODOT-NEXT: vredsum.vs v8, v8, v12
; NODOT-NEXT: vmv.x.s a0, v8
@@ -184,11 +179,10 @@ entry:
define i32 @vdotqsu_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vdotqsu_vx_constant:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsext.vf4 v12, v8
; CHECK-NEXT: li a0, 123
-; CHECK-NEXT: vwmul.vx v8, v12, a0
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmul.vx v8, v12, a0
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
@@ -203,12 +197,10 @@ entry:
define i32 @vdotqus_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vdotqus_vx_constant:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v12, v8
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vzext.vf4 v12, v8
; CHECK-NEXT: li a0, -23
-; CHECK-NEXT: vmv.v.x v14, a0
-; CHECK-NEXT: vwmulsu.vv v8, v14, v12
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmul.vx v8, v12, a0
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
@@ -277,13 +269,12 @@ entry:
define i32 @vqdot_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
; NODOT-LABEL: vqdot_vv_accum:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v10, v8
-; NODOT-NEXT: vsext.vf2 v16, v9
-; NODOT-NEXT: vwmacc.vv v12, v10, v16
-; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v16, v8
+; NODOT-NEXT: vsext.vf4 v20, v9
+; NODOT-NEXT: vmadd.vv v20, v16, v12
; NODOT-NEXT: vmv.s.x v8, zero
-; NODOT-NEXT: vredsum.vs v8, v12, v8
+; NODOT-NEXT: vredsum.vs v8, v20, v8
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
@@ -345,13 +336,12 @@ entry:
define i32 @vqdotsu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
; NODOT-LABEL: vqdotsu_vv_accum:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v10, v8
-; NODOT-NEXT: vzext.vf2 v16, v9
-; NODOT-NEXT: vwmaccsu.vv v12, v10, v16
-; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v16, v8
+; NODOT-NEXT: vzext.vf4 v20, v9
+; NODOT-NEXT: vmadd.vv v20, v16, v12
; NODOT-NEXT: vmv.s.x v8, zero
-; NODOT-NEXT: vredsum.vs v8, v12, v8
+; NODOT-NEXT: vredsum.vs v8, v20, v8
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
@@ -379,11 +369,10 @@ entry:
define i32 @vqdot_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
; NODOT-LABEL: vqdot_vv_scalar_add:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v12, v8
-; NODOT-NEXT: vsext.vf2 v14, v9
-; NODOT-NEXT: vwmul.vv v8, v12, v14
-; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v12, v8
+; NODOT-NEXT: vsext.vf4 v16, v9
+; NODOT-NEXT: vmul.vv v8, v12, v16
; NODOT-NEXT: vmv.s.x v12, a0
; NODOT-NEXT: vredsum.vs v8, v8, v12
; NODOT-NEXT: vmv.x.s a0, v8
@@ -441,11 +430,10 @@ entry:
define i32 @vqdotsu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
; NODOT-LABEL: vqdotsu_vv_scalar_add:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v12, v8
-; NODOT-NEXT: vzext.vf2 v14, v9
-; NODOT-NEXT: vwmulsu.vv v8, v12, v14
-; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v12, v8
+; NODOT-NEXT: vzext.vf4 v16, v9
+; NODOT-NEXT: vmul.vv v8, v12, v16
; NODOT-NEXT: vmv.s.x v12, a0
; NODOT-NEXT: vredsum.vs v8, v8, v12
; NODOT-NEXT: vmv.x.s a0, v8
@@ -472,16 +460,15 @@ entry:
define i32 @vqdot_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; NODOT-LABEL: vqdot_vv_split:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v12, v8
-; NODOT-NEXT: vsext.vf2 v14, v9
-; NODOT-NEXT: vsext.vf2 v16, v10
-; NODOT-NEXT: vsext.vf2 v18, v11
-; NODOT-NEXT: vwmul.vv v8, v12, v14
-; NODOT-NEXT: vwmacc.vv v8, v16, v18
-; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; NODOT-NEXT: vmv.s.x v12, zero
-; NODOT-NEXT: vredsum.vs v8, v8, v12
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v12, v8
+; NODOT-NEXT: vsext.vf4 v16, v9
+; NODOT-NEXT: vmul.vv v12, v12, v16
+; NODOT-NEXT: vsext.vf4 v16, v10
+; NODOT-NEXT: vsext.vf4 v20, v11
+; NODOT-NEXT: vmadd.vv v20, v16, v12
+; NODOT-NEXT: vmv.s.x v8, zero
+; NODOT-NEXT: vredsum.vs v8, v20, v8
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
@@ -510,20 +497,19 @@ entry:
define <1 x i32> @vqdot_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) {
; NODOT-LABEL: vqdot_vv_partial_reduce_v1i32_v4i8:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; NODOT-NEXT: vsext.vf2 v10, v8
-; NODOT-NEXT: vsext.vf2 v8, v9
-; NODOT-NEXT: vwmul.vv v9, v10, v8
-; NODOT-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; NODOT-NEXT: vslidedown.vi v8, v9, 3
-; NODOT-NEXT: vslidedown.vi v10, v9, 2
+; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; NODOT-NEXT: vsext.vf4 v10, v8
+; NODOT-NEXT: vsext.vf4 v8, v9
+; NODOT-NEXT: vmul.vv v8, v10, v8
+; NODOT-NEXT: vslidedown.vi v9, v8, 3
+; NODOT-NEXT: vslidedown.vi v10, v8, 2
; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; NODOT-NEXT: vadd.vv v8, v8, v9
+; NODOT-NEXT: vadd.vv v9, v9, v8
; NODOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; NODOT-NEXT: vslidedown.vi v9, v9, 1
+; NODOT-NEXT: vslidedown.vi v8, v8, 1
; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; NODOT-NEXT: vadd.vv v9, v9, v10
-; NODOT-NEXT: vadd.vv v8, v9, v8
+; NODOT-NEXT: vadd.vv v8, v8, v10
+; NODOT-NEXT: vadd.vv v8, v8, v9
; NODOT-NEXT: ret
;
; DOT-LABEL: vqdot_vv_partial_reduce_v1i32_v4i8:
@@ -648,20 +634,19 @@ entry:
define <1 x i32> @vqdotsu_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) {
; NODOT-LABEL: vqdotsu_vv_partial_reduce_v1i32_v4i8:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; NODOT-NEXT: vsext.vf2 v10, v8
-; NODOT-NEXT: vzext.vf2 v8, v9
-; NODOT-NEXT: vwmulsu.vv v9, v10, v8
-; NODOT-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; NODOT-NEXT: vslidedown.vi v8, v9, 3
-; NODOT-NEXT: vslidedown.vi v10, v9, 2
+; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; NODOT-NEXT: vsext.vf4 v10, v8
+; NODOT-NEXT: vzext.vf4 v8, v9
+; NODOT-NEXT: vmul.vv v8, v10, v8
+; NODOT-NEXT: vslidedown.vi v9, v8, 3
+; NODOT-NEXT: vslidedown.vi v10, v8, 2
; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; NODOT-NEXT: vadd.vv v8, v8, v9
+; NODOT-NEXT: vadd.vv v9, v9, v8
; NODOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; NODOT-NEXT: vslidedown.vi v9, v9, 1
+; NODOT-NEXT: vslidedown.vi v8, v8, 1
; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; NODOT-NEXT: vadd.vv v9, v9, v10
-; NODOT-NEXT: vadd.vv v8, v9, v8
+; NODOT-NEXT: vadd.vv v8, v8, v10
+; NODOT-NEXT: vadd.vv v8, v8, v9
; NODOT-NEXT: ret
;
; DOT-LABEL: vqdotsu_vv_partial_reduce_v1i32_v4i8:
@@ -740,10 +725,10 @@ entry:
define <2 x i32> @vqdot_vv_partial_reduce_v2i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
; NODOT-LABEL: vqdot_vv_partial_reduce_v2i32_v8i8:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; NODOT-NEXT: vsext.vf2 v10, v8
-; NODOT-NEXT: vsext.vf2 v11, v9
-; NODOT-NEXT: vwmul.vv v8, v10, v11
+; NODOT-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; NODOT-NEXT: vsext.vf4 v10, v8
+; NODOT-NEXT: vsext.vf4 v12, v9
+; NODOT-NEXT: vmul.vv v8, v10, v12
; NODOT-NEXT: vsetivli zero, 2, e32, m2, ta, ma
; NODOT-NEXT: vslidedown.vi v10, v8, 6
; NODOT-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
@@ -952,10 +937,10 @@ entry:
define <4 x i32> @vqdot_vv_partial_reduce_v4i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
; NODOT-LABEL: vqdot_vv_partial_reduce_v4i32_v16i8:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v12, v8
-; NODOT-NEXT: vsext.vf2 v14, v9
-; NODOT-NEXT: vwmul.vv v8, v12, v14
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v12, v8
+; NODOT-NEXT: vsext.vf4 v16, v9
+; NODOT-NEXT: vmul.vv v8, v12, v16
; NODOT-NEXT: vsetivli zero, 4, e32, m4, ta, ma
; NODOT-NEXT: vslidedown.vi v12, v8, 12
; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
@@ -1030,10 +1015,10 @@ entry:
define <4 x i32> @vqdot_vv_partial_reduce_m1_accum(<16 x i8> %a, <16 x i8> %b, <4 x i32> %accum) {
; NODOT-LABEL: vqdot_vv_partial_reduce_m1_accum:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v16, v8
-; NODOT-NEXT: vsext.vf2 v18, v9
-; NODOT-NEXT: vwmul.vv v12, v16, v18
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v12, v8
+; NODOT-NEXT: vsext.vf4 v16, v9
+; NODOT-NEXT: vmul.vv v12, v12, v16
; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; NODOT-NEXT: vadd.vv v16, v10, v12
; NODOT-NEXT: vsetivli zero, 4, e32, m4, ta, ma
@@ -1066,10 +1051,10 @@ entry:
define <16 x i32> @vqdot_vv_partial_reduce3(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vqdot_vv_partial_reduce3:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vsext.vf2 v12, v8
-; CHECK-NEXT: vsext.vf2 v14, v9
-; CHECK-NEXT: vwmul.vv v8, v12, v14
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsext.vf4 v12, v8
+; CHECK-NEXT: vsext.vf4 v16, v9
+; CHECK-NEXT: vmul.vv v8, v12, v16
; CHECK-NEXT: ret
entry:
%a.sext = sext <16 x i8> %a to <16 x i32>
@@ -1556,11 +1541,10 @@ entry:
define i32 @vqdot_vv_accum_disjoint_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
; NODOT-LABEL: vqdot_vv_accum_disjoint_or:
; NODOT: # %bb.0: # %entry
-; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; NODOT-NEXT: vsext.vf2 v16, v8
-; NODOT-NEXT: vsext.vf2 v18, v9
-; NODOT-NEXT: vwmul.vv v8, v16, v18
-; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vsext.vf4 v16, v8
+; NODOT-NEXT: vsext.vf4 v20, v9
+; NODOT-NEXT: vmul.vv v8, v16, v20
; NODOT-NEXT: vor.vv v8, v8, v12
; NODOT-NEXT: vmv.s.x v12, zero
; NODOT-NEXT: vredsum.vs v8, v8, v12
@@ -1591,11 +1575,10 @@ entry:
define i32 @vqdot_vv_accum_or(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
; CHECK-LABEL: vqdot_vv_accum_or:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vsext.vf2 v16, v8
-; CHECK-NEXT: vsext.vf2 v18, v9
-; CHECK-NEXT: vwmul.vv v8, v16, v18
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsext.vf4 v16, v8
+; CHECK-NEXT: vsext.vf4 v20, v9
+; CHECK-NEXT: vmul.vv v8, v16, v20
; CHECK-NEXT: vor.vv v8, v8, v12
; CHECK-NEXT: vmv.s.x v12, zero
...
[truncated]
; CHECK-NEXT: li a0, 123
; CHECK-NEXT: vwmul.vx v8, v16, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmul.vx v8, v16, a0
I'm not sure if this is generally profitable. We're increasing register pressure because we're now sign extending to m8 first.
I cross-compiled part of SPEC CPU 2017 (only part of it builds in this configuration), and I will supplement the objdump results later. If the number of spills does not increase, perhaps this could be considered an improvement?
I think the vsext.vf4 is more expensive too. At least on the Banana Pi F3 from https://camel-cdr.github.io/rvv-bench-results/bpi_f3/index.html, this is now 16 (vsext.vf4) + 8 (vmul.vx) = 24 total cycles. Previously it was 8 (vsext.vf2) + 8 (vwmul.vx) = 16.
Overall, there is a slight reduction in the number of instructions.

(Screenshot of the per-benchmark instruction counts omitted.)

Build: -march=rva23u64 -mllvm -force-tail-folding-style=data-with-evl -mllvm -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -O3
Total: llvm-objdump --mattr=+d,+zfh,+zfbfmin,+v,+zvbb, then wc -l
vset*: grep -nirI vset | wc -l
On sifive-x280, v(s/z)ext.vf* have a latency of 4 cycles per DLEN and a reciprocal throughput of (output_lmul*2), producing DLEN bits each cycle. vwadd has a latency of 8 cycles per DLEN and a reciprocal throughput of (input_lmul*2), producing DLEN*2 bits each cycle. A plain add has a latency of 4 cycles per DLEN and a reciprocal throughput of (lmul*2), producing DLEN bits each cycle.
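Applying those numbers to the vqdot_vv diff above (and extrapolating the vwadd figures to vwmul, which may not be exact): the old sequence is two vsext.vf2 producing LMUL=2 (reciprocal throughput 2*2 = 4 each) plus one vwmul.vv on LMUL=2 inputs (2*2 = 4), roughly 12 cycles of occupancy, while the new sequence is two vsext.vf4 producing LMUL=4 (4*2 = 8 each) plus one vmul.vv at LMUL=4 (4*2 = 8), roughly 24.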
Overall, there is a slight reduction in the number of instructions.
From our measurements, vsetvli instructions usually aren't the bottleneck. I think in this example the increased number of uops issued by the higher-LMUL vsext will likely outweigh the benefit of removing the vsetvli toggle.