Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1560,6 +1560,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
Expand Down Expand Up @@ -1716,6 +1717,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
Expand Down Expand Up @@ -7774,6 +7776,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM:
return LowerVECREDUCE(Op, DAG);
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_FMUL:
return LowerVECREDUCE_MUL(Op, DAG);
case ISD::ATOMIC_LOAD_AND:
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
Expand Down Expand Up @@ -16790,6 +16795,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
}
}

/// Lower VECREDUCE_MUL / VECREDUCE_FMUL for scalable vectors.
///
/// There is no (F)MULV reduction instruction in SVE, so instead perform a
/// fixed-depth tree reduction: repeatedly deinterleave the vector against an
/// identity (neutral-element) vector and multiply the two halves. The depth is
/// chosen from the architectural maximum SVE register size, so for narrower
/// runtime vector lengths the surplus stages multiply by the identity and are
/// therefore harmless.
///
/// \param Op  A VECREDUCE_MUL or VECREDUCE_FMUL node with a scalable operand.
/// \returns   The scalar reduction result (element 0 after the tree collapses).
SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  assert(SrcVT.isScalableVector() && "Unexpected operand type!");
  // By this point the type is legal, so the minimum element count can only be
  // a power of two; the Log2 based stage count below relies on this.
  assert(llvm::isPowerOf2_32(SrcVT.getVectorMinNumElements()) &&
         "Expected a power-of-two minimum element count!");

  SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
  // MUL for the integer reduction, FMUL for the floating-point one.
  unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
  SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());

  // Whilst we don't know the size of the vector we do know the maximum size so
  // can perform a tree reduction with an identity vector, which means once we
  // arrive at the result the remaining stages (when the vector is smaller than
  // the maximum) have no effect.

  unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
  unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());

  for (unsigned I = 0; I < Stages; ++I) {
    // Split even/odd lanes (padding with the identity) and multiply them.
    Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
    Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
  }

  // After all stages lane 0 holds the product of every element.
  return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
}

SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
Expand Down
178 changes: 175 additions & 3 deletions llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
Original file line number Diff line number Diff line change
Expand Up @@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x
ret float %r
}

; No FMULV instruction so use knowledge about the architectural maximum size of
; an SVE register to "scalarise" the reduction.

; nxv2f16: expect 5 deinterleave+fmul stages (log2 of the architectural
; maximum element count) followed by a scalar fmul with %init.
define half @fmulv_nxv2f16(half %init, <vscale x 2 x half> %a) {
; CHECK-LABEL: fmulv_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z2.h, #1.00000000
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fmul.nxv2f16(half %init, <vscale x 2 x half> %a)
  ret half %res
}

; nxv4f16: expect 6 deinterleave+fmul stages followed by a scalar fmul with
; %init.
define half @fmulv_nxv4f16(half %init, <vscale x 4 x half> %a) {
; CHECK-LABEL: fmulv_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z2.h, #1.00000000
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fmul.nxv4f16(half %init, <vscale x 4 x half> %a)
  ret half %res
}

; nxv8f16: a packed type, so unpredicated fmul is used; expect 7
; deinterleave+fmul stages followed by a scalar fmul with %init.
define half @fmulv_nxv8f16(half %init, <vscale x 8 x half> %a) {
; CHECK-LABEL: fmulv_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z2.h, #1.00000000
; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z3.h
; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z3.h
; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fmul.nxv8f16(half %init, <vscale x 8 x half> %a)
  ret half %res
}

; nxv2f32: expect 5 deinterleave+fmul stages followed by a scalar fmul with
; %init.
define float @fmulv_nxv2f32(float %init, <vscale x 2 x float> %a) {
; CHECK-LABEL: fmulv_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z2.s, #1.00000000
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fmul.nxv2f32(float %init, <vscale x 2 x float> %a)
  ret float %res
}

; nxv4f32: a packed type, so unpredicated fmul is used; expect 6
; deinterleave+fmul stages followed by a scalar fmul with %init.
define float @fmulv_nxv4f32(float %init, <vscale x 4 x float> %a) {
; CHECK-LABEL: fmulv_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z2.s, #1.00000000
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z3.s
; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z3.s
; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fmul.nxv4f32(float %init, <vscale x 4 x float> %a)
  ret float %res
}

; nxv2f64: a packed type, so unpredicated fmul is used; expect 5
; deinterleave+fmul stages followed by a scalar fmul with %init.
define double @fmulv_nxv2f64(double %init, <vscale x 2 x double> %a) {
; CHECK-LABEL: fmulv_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z2.d, #1.00000000
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z3.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z3.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z3.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z3.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z3.d
; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fmul.nxv2f64(double %init, <vscale x 2 x double> %a)
  ret double %res
}

declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
Expand Down Expand Up @@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>)
declare float @llvm.vector.reduce.fminimum.nxv2f32(<vscale x 2 x float>)
declare float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float>)
declare double @llvm.vector.reduce.fminimum.nxv2f64(<vscale x 2 x double>)

declare half @llvm.vector.reduce.fmul.nxv2f16(half, <vscale x 2 x half>)
declare half @llvm.vector.reduce.fmul.nxv4f16(half, <vscale x 4 x half>)
declare half @llvm.vector.reduce.fmul.nxv8f16(half, <vscale x 8 x half>)
declare float @llvm.vector.reduce.fmul.nxv2f32(float, <vscale x 2 x float>)
declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>)
declare double @llvm.vector.reduce.fmul.nxv2f64(double, <vscale x 2 x double>)
125 changes: 125 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-int-reduce.ll
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,131 @@ define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) {
ret i64 %res
}

; No MULV instruction so use knowledge about the architectural maximum size of
; an SVE register to "scalarise" the reduction.

; nxv16i8: expect 8 deinterleave+mul stages (log2 of the architectural
; maximum element count), then the scalar result is moved to w0.
define i8 @mulv_nxv16i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: mulv_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.b, #1 // =0x1
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.mul.nxv16i8(<vscale x 16 x i8> %a)
  ret i8 %res
}

; nxv8i16: expect 7 deinterleave+mul stages, then the scalar result is moved
; to w0.
define i16 @mulv_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK-LABEL: mulv_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.h, #1 // =0x1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.mul.nxv8i16(<vscale x 8 x i16> %a)
  ret i16 %res
}

; nxv4i32: expect 6 deinterleave+mul stages, then the scalar result is moved
; to w0.
define i32 @mulv_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: mulv_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.s, #1 // =0x1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %a)
  ret i32 %res
}

; nxv2i64: expect 5 deinterleave+mul stages, then the scalar result is moved
; to x0.
define i64 @mulv_nxv2i64(<vscale x 2 x i64> %a) {
; CHECK-LABEL: mulv_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, #1 // =0x1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.mul.nxv2i64(<vscale x 2 x i64> %a)
  ret i64 %res
}

; Test widen vector reduce type
declare i8 @llvm.vector.reduce.smin.nxv10i8(<vscale x 10 x i8>)

Expand Down