[LLVM][CodeGen][SVE] Add lowering for ISD::VECREDUCE_MUL/FMUL. #161842
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Paul Walker (paulwalker-arm)

Changes

We might be able to do better by using SVE2, and perhaps even NEON for the final stages, but this version works everywhere so it seems like a good place to start.

NOTE: I plan to follow up to replace the SVEMaxBitsPerVector upper limit by one that is derived from vscale_range, but some refactoring is necessary before that is possible.

Fixes #155468

Full diff: https://github.com/llvm/llvm-project/pull/161842.diff

4 Files Affected:

- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
- llvm/lib/Target/AArch64/AArch64ISelLowering.h
- llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
- llvm/test/CodeGen/AArch64/sve-int-reduce.ll
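The idea behind the lowering can be modelled in scalar C++. The sketch below is illustration only and not part of the patch — `reduceMulModel` and `MaxN` are hypothetical names, with `MaxN` playing the role of the architectural maximum element count, which (like the real lowering assumes) must be a power of two:

```cpp
#include <array>
#include <cstddef>

// Scalar model of the identity-padded tree reduction. Each iteration models
// one VECTOR_DEINTERLEAVE of {V, identity} followed by a lane-wise multiply:
// lanes read beyond MaxN come from the identity vector, i.e. they are 1.0
// and leave the running product unchanged.
template <std::size_t MaxN>
double reduceMulModel(std::array<double, MaxN> V) {
  for (std::size_t Stage = 1; Stage < MaxN; Stage *= 2) {
    std::array<double, MaxN> Evens, Odds;
    for (std::size_t I = 0; I < MaxN; ++I) {
      Evens[I] = (2 * I < MaxN) ? V[2 * I] : 1.0;       // even lanes
      Odds[I] = (2 * I + 1 < MaxN) ? V[2 * I + 1] : 1.0; // odd lanes
    }
    for (std::size_t I = 0; I < MaxN; ++I)
      V[I] = Evens[I] * Odds[I];
  }
  return V[0]; // models the final extract of element 0
}
```

After log2(MaxN) stages the full product sits in lane 0, and because the padding lanes hold the multiplicative identity, any stages that run after the live data already fits in one lane are no-ops — which is why the lowering can always run the worst-case number of stages regardless of the runtime vector length.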
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 70d5ad7d660f1..4752cd1bbd95f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1560,6 +1560,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
@@ -1716,6 +1717,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -7774,6 +7776,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM:
return LowerVECREDUCE(Op, DAG);
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_FMUL:
+ return LowerVECREDUCE_MUL(Op, DAG);
case ISD::ATOMIC_LOAD_AND:
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
@@ -16790,6 +16795,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
}
}
+SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ assert(SrcVT.isScalableVector() && "Unexpected operand type!");
+
+ SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
+ unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+ SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
+
+ // Whilst we don't know the size of the vector we do know the maximum size so
+ // can perform a tree reduction with an identity vector, which means once we
+ // arrive at the result the remaining stages (when the vector is smaller than
+ // the maximum) have no effect.
+
+ unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+ unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
+
+ for (unsigned I = 0; I < Stages; ++I) {
+ Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
+ Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
+ }
+
+ return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
+}
+
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e472e7d565d9b..bbf7a383602ff 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -753,6 +753,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
index 15ee6a02a1639..36655f6b781b9 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
@@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x
ret float %r
}
+; No FMULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define half @fmulv_nxv2f16(half %init, <vscale x 2 x half> %a) {
+; CHECK-LABEL: fmulv_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv2f16(half %init, <vscale x 2 x half> %a)
+ ret half %res
+}
+
+define half @fmulv_nxv4f16(half %init, <vscale x 4 x half> %a) {
+; CHECK-LABEL: fmulv_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv4f16(half %init, <vscale x 4 x half> %a)
+ ret half %res
+}
+
+define half @fmulv_nxv8f16(half %init, <vscale x 8 x half> %a) {
+; CHECK-LABEL: fmulv_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv8f16(half %init, <vscale x 8 x half> %a)
+ ret half %res
+}
+
+define float @fmulv_nxv2f32(float %init, <vscale x 2 x float> %a) {
+; CHECK-LABEL: fmulv_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.s, #1.00000000
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %res = call fast float @llvm.vector.reduce.fmul.nxv2f32(float %init, <vscale x 2 x float> %a)
+ ret float %res
+}
+
+define float @fmulv_nxv4f32(float %init, <vscale x 4 x float> %a) {
+; CHECK-LABEL: fmulv_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.s, #1.00000000
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %res = call fast float @llvm.vector.reduce.fmul.nxv4f32(float %init, <vscale x 4 x float> %a)
+ ret float %res
+}
+
+define double @fmulv_nxv2f64(double %init, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fmulv_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.d, #1.00000000
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: ret
+ %res = call fast double @llvm.vector.reduce.fmul.nxv2f64(double %init, <vscale x 2 x double> %a)
+ ret double %res
+}
+
declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
-declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
-declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
-declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
@@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>)
declare float @llvm.vector.reduce.fminimum.nxv2f32(<vscale x 2 x float>)
declare float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float>)
declare double @llvm.vector.reduce.fminimum.nxv2f64(<vscale x 2 x double>)
+
+declare half @llvm.vector.reduce.fmul.nxv2f16(half, <vscale x 2 x half>)
+declare half @llvm.vector.reduce.fmul.nxv4f16(half, <vscale x 4 x half>)
+declare half @llvm.vector.reduce.fmul.nxv8f16(half, <vscale x 8 x half>)
+declare float @llvm.vector.reduce.fmul.nxv2f32(float, <vscale x 2 x float>)
+declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>)
+declare double @llvm.vector.reduce.fmul.nxv2f64(double, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index be936f0fd6d4a..6fb0315e27fb5 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -369,6 +369,131 @@ define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) {
ret i64 %res
}
+; No MULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define i8 @mulv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: mulv_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.b, #1 // =0x1
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.vector.reduce.mul.nxv16i8(<vscale x 16 x i8> %a)
+ ret i8 %res
+}
+
+define i16 @mulv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: mulv_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.h, #1 // =0x1
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i16 @llvm.vector.reduce.mul.nxv8i16(<vscale x 8 x i16> %a)
+ ret i16 %res
+}
+
+define i32 @mulv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: mulv_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %a)
+ ret i32 %res
+}
+
+define i64 @mulv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: mulv_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.vector.reduce.mul.nxv2i64(<vscale x 2 x i64> %a)
+ ret i64 %res
+}
+
; Test widen vector reduce type
declare i8 @llvm.vector.reduce.smin.nxv10i8(<vscale x 10 x i8>)
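As a sanity check on the stage counts in the tests above: with AArch64::SVEMaxBitsPerVector = 2048 and AArch64::SVEBitsPerBlock = 128, Segments = 2048 / 128 = 16, so mulv_nxv16i8 needs Log2_32(16 × 16) = 8 stages and fmulv_nxv2f16 needs Log2_32(16 × 2) = 5 — matching the eight and five uzp2/uzp1/mul sequences in the generated code.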
LGTM!
> // the maximum) have no effect.
>
> unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
> unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
Looks like this is assuming the minimum number of elements is a power of 2. I think it should be, but perhaps worth asserting?
By this point everything is type-legal and so they can only be a power-of-two in length.
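For illustration, the assertion suggested above might look like the following sketch (not part of the patch, which relies on type legalisation instead; llvm::isPowerOf2_32 comes from llvm/Support/MathExtras.h):

```cpp
// Hypothetical guard making the assumption behind the Log2_32 computation
// explicit; type legalisation is expected to guarantee it already holds.
assert(llvm::isPowerOf2_32(SrcVT.getVectorMinNumElements()) &&
       "Expected a power-of-two element count after type legalisation!");
```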