From 1af3f596f6c6b213cec9b3acd7099f8c4f11d0d0 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 8 Feb 2023 11:43:36 +0000 Subject: [PATCH] [DAG] Fold Op(vecreduce(a), vecreduce(b)) into vecreduce(Op(a,b)) So long as the operation is reassociative, we can reassociate the double vecreduce from, for example, fadd(vecreduce(a), vecreduce(b)) to vecreduce(fadd(a,b)). This will in general save a few instructions, but some architectures (MVE) require the opposite fold, so a shouldReassociateReduction target hook is added to account for it. Only targets that override shouldReassociateReduction opt out of the new fold. Differential Revision: https://reviews.llvm.org/D141870 --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 84 ++++++++ llvm/lib/Target/ARM/ARMISelLowering.h | 4 + llvm/test/CodeGen/AArch64/aarch64-addv.ll | 12 +- llvm/test/CodeGen/AArch64/double_reduct.ll | 104 +++------- llvm/test/CodeGen/AArch64/sve-doublereduct.ll | 84 +++----- llvm/test/CodeGen/AArch64/sve-fp-reduce.ll | 6 +- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 186 ++++++++---------- llvm/test/CodeGen/AArch64/vecreduce-fadd.ll | 24 +-- llvm/test/CodeGen/RISCV/double_reduct.ll | 116 ++++------- llvm/test/CodeGen/Thumb2/mve-doublereduct.ll | 114 ++++------- 11 files changed, 321 insertions(+), 419 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index e26540a684c178..2046f4b8a08542 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -444,6 +444,12 @@ class TargetLoweringBase { return true; } + // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to + // vecreduce(op(x, y)) for the reduction opcode RedOpc. + virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const { + return true; + } + /// Return true if it is profitable to convert a select of FP constants into /// a constant pool load whose address depends on the select condition. The /// parameter may be used to differentiate a select with FP compare from diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b6e0578dba9b44..7249c637028f32 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -550,6 +550,9 @@ namespace { SDValue N1); SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SDNodeFlags Flags); + SDValue reassociateReduction(unsigned RedOpc, unsigned Opc, const SDLoc &DL, + EVT VT, SDValue N0, SDValue N1, + SDNodeFlags Flags = SDNodeFlags()); SDValue visitShiftByConstant(SDNode *N); @@ -1310,6 +1313,25 @@ SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, return SDValue(); } +// Try to fold Opc(vecreduce(x), vecreduce(y)) -> vecreduce(Opc(x, y)) +// Note that we only expect Flags to be passed from FP operations. For integer +// operations they need to be dropped.
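+// For example, provided both reductions read vectors of the same type and +// each vecreduce has a single use (illustrative names): +//   fadd (vecreduce_fadd A), (vecreduce_fadd B) --> vecreduce_fadd (fadd A, B)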
+SDValue DAGCombiner::reassociateReduction(unsigned RedOpc, unsigned Opc, + const SDLoc &DL, EVT VT, SDValue N0, + SDValue N1, SDNodeFlags Flags) { + if (N0.getOpcode() == RedOpc && N1.getOpcode() == RedOpc && + N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() && + N0->hasOneUse() && N1->hasOneUse() && + TLI.isOperationLegalOrCustom(Opc, N0.getOperand(0).getValueType()) && + TLI.shouldReassociateReduction(RedOpc, N0.getOperand(0).getValueType())) { + SelectionDAG::FlagInserter FlagsInserter(DAG, Flags); + return DAG.getNode(RedOpc, DL, VT, + DAG.getNode(Opc, DL, N0.getOperand(0).getValueType(), + N0.getOperand(0), N1.getOperand(0))); + } + return SDValue(); +} + SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo) { assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); @@ -2650,6 +2672,11 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { return Add; if (SDValue Add = ReassociateAddOr(N1, N0)) return Add; + + // Fold add(vecreduce(x), vecreduce(y)) -> vecreduce(add(x, y)) + if (SDValue SD = + reassociateReduction(ISD::VECREDUCE_ADD, ISD::ADD, DL, VT, N0, N1)) + return SD; } // fold ((0-A) + B) -> B-A if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0))) @@ -4351,6 +4378,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags())) return RMUL; + // Fold mul(vecreduce(x), vecreduce(y)) -> vecreduce(mul(x, y)) + if (SDValue SD = + reassociateReduction(ISD::VECREDUCE_MUL, ISD::MUL, DL, VT, N0, N1)) + return SD; + // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -5486,6 +5518,25 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N0, N1, ISD::SETULT, DAG)) return S; + // Fold min/max(vecreduce(x), vecreduce(y)) -> vecreduce(min/max(x, y)) + auto ReductionOpcode = [](unsigned Opcode) { + switch (Opcode) { + case ISD::SMIN: + return ISD::VECREDUCE_SMIN; + case ISD::SMAX: + return ISD::VECREDUCE_SMAX; + case ISD::UMIN: + return ISD::VECREDUCE_UMIN; + case ISD::UMAX: + return ISD::VECREDUCE_UMAX; + default: + llvm_unreachable("Unexpected opcode"); + } + }; + if (SDValue SD = reassociateReduction(ReductionOpcode(Opcode), Opcode, + SDLoc(N), VT, N0, N1)) + return SD; + // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -6525,6 +6576,11 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags())) return RAND; + // Fold and(vecreduce(x), vecreduce(y)) -> vecreduce(and(x, y)) + if (SDValue SD = reassociateReduction(ISD::VECREDUCE_AND, ISD::AND, SDLoc(N), + VT, N0, N1)) + return SD; + // fold (and (or x, C), D) -> D if (C & D) == D auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); }; @@ -7419,6 +7475,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags())) return ROR; + // Fold or(vecreduce(x), vecreduce(y)) -> vecreduce(or(x, y)) + if (SDValue SD = reassociateReduction(ISD::VECREDUCE_OR, ISD::OR, SDLoc(N), + VT, N0, N1)) + return SD; + // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) // iff (c1 & c2) != 0 or c1/c2 are undef.
auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) { @@ -8903,6 +8964,11 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags())) return RXOR; + // Fold xor(vecreduce(x), vecreduce(y)) -> vecreduce(xor(x, y)) + if (SDValue SD = + reassociateReduction(ISD::VECREDUCE_XOR, ISD::XOR, DL, VT, N0, N1)) + return SD; + // fold (a^b) -> (a|b) iff a and b share no bits. if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && DAG.haveNoCommonBitsSet(N0, N1)) @@ -15621,6 +15687,11 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { DAG.getConstantFP(4.0, DL, VT)); } } + + // Fold fadd(vecreduce(x), vecreduce(y)) -> vecreduce(fadd(x, y)) + if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FADD, ISD::FADD, DL, + VT, N0, N1, Flags)) + return SD; } // enable-unsafe-fp-math // FADD -> FMA combines: @@ -15795,6 +15866,11 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1); return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts); } + + // Fold fmul(vecreduce(x), vecreduce(y)) -> vecreduce(fmul(x, y)) + if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FMUL, ISD::FMUL, DL, + VT, N0, N1, Flags)) + return SD; } // fold (fmul X, 2.0) -> (fadd X, X) @@ -16845,6 +16921,14 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { } } + const TargetOptions &Options = DAG.getTarget().Options; + if ((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) || + (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) + if (SDValue SD = reassociateReduction(IsMin ? ISD::VECREDUCE_FMIN + : ISD::VECREDUCE_FMAX, + Opc, SDLoc(N), VT, N0, N1, Flags)) + return SD; + return SDValue(); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 86ad9a4767964e..3bc936b6cce209 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -617,6 +617,10 @@ class VectorType; return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); } + bool shouldReassociateReduction(unsigned Opc, EVT VT) const override { + return Opc != ISD::VECREDUCE_ADD; + } + /// Returns true if an argument of type Ty needs to be passed in a /// contiguous block of registers in calling convention CallConv. 
bool functionArgumentNeedsConsecutiveRegisters( diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index 2b71126ee175bd..15736933b61f84 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -102,11 +102,9 @@ define i32 @oversized_ADDV_512(ptr %arr) { define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) { ; CHECK-LABEL: addv_combine_i8: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b ; CHECK-NEXT: addv b0, v0.8b -; CHECK-NEXT: addv b1, v1.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1) @@ -118,11 +116,9 @@ entry: define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) { ; CHECK-LABEL: addv_combine_i16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: addv h0, v0.4h -; CHECK-NEXT: addv h1, v1.4h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1) diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll index 1fd1eb6fc5dd42..78408ae10e0bac 100644 --- a/llvm/test/CodeGen/AArch64/double_reduct.ll +++ b/llvm/test/CodeGen/AArch64/double_reduct.ll @@ -5,11 +5,9 @@ define float @add_f32(<8 x float> %a, <4 x float> %b) { ; CHECK-LABEL: add_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: faddp v2.4s, v2.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s -; CHECK-NEXT: faddp s1, v2.2s ; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a) %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) @@ -20,14 +18,11 @@ define float @add_f32(<8 x float> %a, <4 x float> %b) { define float @fmul_f32(<8 x float> %a, <4 x float> %b) { ; CHECK-LABEL: fmul_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 ; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fmul v1.2s, v2.2s, v3.2s -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul s1, s1, v1.s[1] -; CHECK-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-NEXT: fmul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: fmul s0, s0, s1 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) @@ -81,11 +76,10 @@ define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) { define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: add_ext_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddlv h0, v0.16b -; CHECK-NEXT: uaddlv h1, v1.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: uaddlp v1.8h, v1.16b +; CHECK-NEXT: uadalp v1.8h, v0.16b +; CHECK-NEXT: addv h0, v1.8h +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %ae = zext <16 x i8> %a to <16 x i16> %be = zext <16 x i8> %b to <16 x i16> @@ -100,12 +94,10 @@ define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: uaddl2 v3.8h, v0.16b, v1.16b ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: 
uaddlv h2, v2.16b ; CHECK-NEXT: add v0.8h, v0.8h, v3.8h -; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: uadalp v0.8h, v2.16b ; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %ae = zext <32 x i8> %a to <32 x i16> %be = zext <16 x i8> %b to <16 x i16> @@ -118,18 +110,13 @@ define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: mul_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v1.2s, v2.2s, v3.2s -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mov w9, v1.s[1] -; CHECK-NEXT: fmov w11, s1 -; CHECK-NEXT: mul v0.2s, v0.2s, v2.2s -; CHECK-NEXT: mul w9, w11, w9 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: mul w8, w10, w8 -; CHECK-NEXT: mul w0, w8, w9 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mul w0, w9, w8 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b) @@ -141,16 +128,11 @@ define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: and_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v2.8b, v2.8b, v3.8b ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v2.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: fmov w11, s2 -; CHECK-NEXT: and w9, w10, w9 -; CHECK-NEXT: and w8, w11, w8 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a) @@ -163,16 +145,11 @@ define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: or_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v2.8b, v2.8b, v3.8b ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v2.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: fmov w11, s2 -; CHECK-NEXT: orr w9, w10, w9 -; CHECK-NEXT: orr w8, w11, w8 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a) @@ -185,16 +162,11 @@ define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: xor_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v2.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: fmov w11, s2 -; CHECK-NEXT: eor w9, w10, w9 -; CHECK-NEXT: eor w8, w11, w8 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: eor w0, w9, w8 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a) @@ -207,12 +179,9 @@ define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: umin_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uminv 
s2, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s ; CHECK-NEXT: uminv s0, v0.4s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b) @@ -224,12 +193,9 @@ define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: umax_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s -; CHECK-NEXT: umaxv s2, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s ; CHECK-NEXT: umaxv s0, v0.4s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w0, w9, w8, hi +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b) @@ -241,12 +207,9 @@ define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: smin_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sminv s2, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s ; CHECK-NEXT: sminv s0, v0.4s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w0, w9, w8, lt +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b) @@ -258,12 +221,9 @@ define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: smax_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s -; CHECK-NEXT: smaxv s2, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s ; CHECK-NEXT: smaxv s0, v0.4s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w0, w9, w8, gt +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b) diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll index c79c87b2950795..6a06d38e06712f 100644 --- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll +++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll @@ -4,11 +4,11 @@ define float @add_f32( %a, %b) { ; CHECK-LABEL: add_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fadd z0.s, z0.s, z1.s -; CHECK-NEXT: faddv s2, p0, z2.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fadd z0.s, z0.s, z2.s ; CHECK-NEXT: faddv s0, p0, z0.s -; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.f32.nxv8f32(float -0.0, %a) %r2 = call fast float @llvm.vector.reduce.fadd.f32.nxv4f32(float -0.0, %b) @@ -57,13 +57,12 @@ define float @fmax_f32( %a, %b) { define i32 @add_i32( %a, %b) { ; CHECK-LABEL: add_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: uaddv d2, p0, z2.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: uaddv d0, p0, z0.s -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: fmov x9, d2 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32( %a) %r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32( %b) @@ -78,14 +77,13 @@ define i16 @add_ext_i16( %a, %b) { ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpkhi z3.h, z1.b ; CHECK-NEXT: uunpklo 
z1.h, z1.b -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add z0.h, z0.h, z2.h ; CHECK-NEXT: add z1.h, z1.h, z3.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: uaddv d0, p0, z0.h -; CHECK-NEXT: uaddv d1, p0, z1.h -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %ae = zext %a to %be = zext %b to @@ -106,14 +104,13 @@ define i16 @add_ext_v32i16( %a, %b) { ; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: add z1.h, z4.h, z3.h -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: add z1.h, z2.h, z5.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: uaddv d0, p0, z0.h -; CHECK-NEXT: uaddv d1, p0, z1.h -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %ae = zext %a to %be = zext %b to @@ -133,13 +130,11 @@ define i16 @add_ext_v32i16( %a, %b) { define i32 @and_i32( %a, %b) { ; CHECK-LABEL: and_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d -; CHECK-NEXT: andv s2, p0, z2.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: andv s0, p0, z0.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: and w0, w8, w9 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.and.i32.nxv8i32( %a) %r2 = call i32 @llvm.vector.reduce.and.i32.nxv4i32( %b) @@ -150,13 +145,11 @@ define i32 @and_i32( %a, %b) { define i32 @or_i32( %a, %b) { ; CHECK-LABEL: or_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: orr z0.d, z0.d, z1.d -; CHECK-NEXT: orv s2, p0, z2.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: orr z0.d, z0.d, z2.d ; CHECK-NEXT: orv s0, p0, z0.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.or.i32.nxv8i32( %a) %r2 = call i32 @llvm.vector.reduce.or.i32.nxv4i32( %b) @@ -168,12 +161,9 @@ define i32 @xor_i32( %a, %b) { ; CHECK-LABEL: xor_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: eor z0.d, z0.d, z1.d -; CHECK-NEXT: eorv s2, p0, z2.s +; CHECK-NEXT: eor3 z0.d, z0.d, z1.d, z2.d ; CHECK-NEXT: eorv s0, p0, z0.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: eor w0, w8, w9 +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.xor.i32.nxv8i32( %a) %r2 = call i32 @llvm.vector.reduce.xor.i32.nxv4i32( %b) @@ -186,12 +176,9 @@ define i32 @umin_i32( %a, %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uminv s2, p0, z2.s +; CHECK-NEXT: umin z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: uminv s0, p0, z0.s -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w0, w8, w9, lo +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umin.i32.nxv8i32( %a) %r2 = call i32 @llvm.vector.reduce.umin.i32.nxv4i32( %b) @@ -204,12 +191,9 @@ define i32 @umax_i32( %a, %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: umaxv s2, p0, z2.s +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: umaxv s0, p0, z0.s -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w0, 
w8, w9, hi +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umax.i32.nxv8i32( %a) %r2 = call i32 @llvm.vector.reduce.umax.i32.nxv4i32( %b) @@ -222,12 +206,9 @@ define i32 @smin_i32( %a, %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: sminv s2, p0, z2.s +; CHECK-NEXT: smin z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: sminv s0, p0, z0.s -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w0, w8, w9, lt +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smin.i32.nxv8i32( %a) %r2 = call i32 @llvm.vector.reduce.smin.i32.nxv4i32( %b) @@ -240,12 +221,9 @@ define i32 @smax_i32( %a, %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: smaxv s2, p0, z2.s +; CHECK-NEXT: smax z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: smaxv s0, p0, z0.s -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w0, w8, w9, gt +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smax.i32.nxv8i32( %a) %r2 = call i32 @llvm.vector.reduce.smax.i32.nxv4i32( %b) diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll index 0106dc2e7f7f5c..4183a83ed01b1a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll @@ -357,11 +357,11 @@ define double @fminv_nxv2f64( %a) { define float @fadd_reduct_reassoc_v4v8f32( %a, %b) { ; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fadd z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fadd z0.s, z0.s, z1.s ; CHECK-NEXT: faddv s0, p0, z0.s -; CHECK-NEXT: faddv s1, p0, z1.s -; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.0, %a) %r2 = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.0, %b) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 7b4da9ecf3cd49..9e113be3148880 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -1219,9 +1219,9 @@ entry: define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: add_pair_v4i32_v4i64_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: uadalp v0.2d, v1.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: uaddlp v1.2d, v1.4s +; CHECK-NEXT: uadalp v1.2d, v0.4s +; CHECK-NEXT: addp d0, v1.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -1236,9 +1236,9 @@ entry: define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: add_pair_v4i32_v4i64_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddlp v0.2d, v0.4s -; CHECK-NEXT: sadalp v0.2d, v1.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: saddlp v1.2d, v1.4s +; CHECK-NEXT: sadalp v1.2d, v0.4s +; CHECK-NEXT: addp d0, v1.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -1285,9 +1285,9 @@ entry: define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: add_pair_v8i16_v8i32_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: uadalp v0.4s, v1.8h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: uaddlp v1.4s, v1.8h +; CHECK-NEXT: uadalp v1.4s, v0.8h +; CHECK-NEXT: addv s0, v1.4s ; CHECK-NEXT: fmov w0, s0 ; 
CHECK-NEXT: ret entry: @@ -1302,9 +1302,9 @@ entry: define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: add_pair_v8i16_v8i32_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddlp v0.4s, v0.8h -; CHECK-NEXT: sadalp v0.4s, v1.8h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: saddlp v1.4s, v1.8h +; CHECK-NEXT: sadalp v1.4s, v0.8h +; CHECK-NEXT: addv s0, v1.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -1351,12 +1351,9 @@ entry: define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: add_pair_v8i16_v8i16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h ; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: addv h1, v1.8h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) @@ -1420,11 +1417,11 @@ entry: define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) { ; CHECK-LABEL: add_pair_v4i16_v4i64_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: uadalp v0.2d, v1.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddlp v1.2d, v1.4s +; CHECK-NEXT: uadalp v1.2d, v0.4s +; CHECK-NEXT: addp d0, v1.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -1439,11 +1436,11 @@ entry: define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) { ; CHECK-LABEL: add_pair_v4i16_v4i64_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: saddlp v0.2d, v0.4s -; CHECK-NEXT: sadalp v0.2d, v1.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: saddlp v1.2d, v1.4s +; CHECK-NEXT: sadalp v1.2d, v0.4s +; CHECK-NEXT: addp d0, v1.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -1570,11 +1567,11 @@ entry: define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_zext: ; CHECK-BASE: // %bb.0: // %entry -; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h -; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: uaddlp v1.4s, v1.8h +; CHECK-BASE-NEXT: uadalp v1.4s, v0.8h +; CHECK-BASE-NEXT: addv s0, v1.4s ; CHECK-BASE-NEXT: fmov w0, s0 ; CHECK-BASE-NEXT: ret ; @@ -1599,11 +1596,11 @@ entry: define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_sext: ; CHECK-BASE: // %bb.0: // %entry -; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h -; CHECK-BASE-NEXT: sadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: addv s0, v0.4s +; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: saddlp v1.4s, v1.8h +; CHECK-BASE-NEXT: sadalp v1.4s, v0.8h +; CHECK-BASE-NEXT: addv s0, v1.4s ; CHECK-BASE-NEXT: fmov w0, s0 ; CHECK-BASE-NEXT: ret ; @@ -1667,12 +1664,10 @@ entry: define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_pair_v16i8_v16i16_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddlv h0, v0.16b -; CHECK-NEXT: uaddlv h1, v1.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: and w0, w8, #0xffff +; 
CHECK-NEXT: uaddlp v1.8h, v1.16b +; CHECK-NEXT: uadalp v1.8h, v0.16b +; CHECK-NEXT: addv h0, v1.8h +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %xx = zext <16 x i8> %x to <16 x i16> @@ -1686,12 +1681,10 @@ entry: define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_pair_v16i8_v16i16_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddlv h0, v0.16b -; CHECK-NEXT: saddlv h1, v1.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: sxth w0, w8 +; CHECK-NEXT: saddlp v1.8h, v1.16b +; CHECK-NEXT: sadalp v1.8h, v0.16b +; CHECK-NEXT: addv h0, v1.8h +; CHECK-NEXT: smov w0, v0.h[0] ; CHECK-NEXT: ret entry: %xx = sext <16 x i8> %x to <16 x i16> @@ -1705,14 +1698,9 @@ entry: define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-LABEL: add_pair_v8i8_v8i16_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b ; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: addv h1, v1.8h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %xx = zext <8 x i8> %x to <8 x i16> @@ -1726,14 +1714,9 @@ entry: define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-LABEL: add_pair_v8i8_v8i16_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b ; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: addv h1, v1.8h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: sxth w0, w8 +; CHECK-NEXT: smov w0, v0.h[0] ; CHECK-NEXT: ret entry: %xx = sext <8 x i8> %x to <8 x i16> @@ -1747,12 +1730,9 @@ entry: define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_pair_v16i8_v16i8: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b ; CHECK-NEXT: addv b0, v0.16b -; CHECK-NEXT: addv b1, v1.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) @@ -1904,13 +1884,13 @@ entry: define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: add_pair_v4i8_v4i64_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: uadalp v0.2d, v1.4s -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uaddlp v1.2d, v1.4s +; CHECK-NEXT: uadalp v1.2d, v0.4s +; CHECK-NEXT: addp d0, v1.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret entry: @@ -1995,15 +1975,15 @@ entry: define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) { ; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext: ; CHECK-BASE: // %bb.0: // %entry -; CHECK-BASE-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BASE-NEXT: saddlp v2.4s, v2.8h -; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h -; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-BASE-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: sadalp v2.4s, v3.8h 
-; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-BASE-NEXT: saddlp v3.4s, v3.8h +; CHECK-BASE-NEXT: uaddlp v1.4s, v1.8h +; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BASE-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-BASE-NEXT: uadalp v1.4s, v0.8h +; CHECK-BASE-NEXT: sadalp v3.4s, v2.8h +; CHECK-BASE-NEXT: add v0.4s, v3.4s, v1.4s ; CHECK-BASE-NEXT: addv s0, v0.4s ; CHECK-BASE-NEXT: fmov w0, s0 ; CHECK-BASE-NEXT: ret @@ -2091,48 +2071,48 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-BASE-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-BASE-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-BASE-NEXT: sxtw x8, w1 -; CHECK-BASE-NEXT: sxtw x9, w3 -; CHECK-BASE-NEXT: add x10, x0, x8 -; CHECK-BASE-NEXT: add x11, x2, x9 -; CHECK-BASE-NEXT: ldr d2, [x0] -; CHECK-BASE-NEXT: ldr d3, [x2] -; CHECK-BASE-NEXT: ldr d0, [x10] -; CHECK-BASE-NEXT: add x10, x10, x8 -; CHECK-BASE-NEXT: ldr d1, [x11] -; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: sxtw x10, w3 +; CHECK-BASE-NEXT: add x9, x0, x8 +; CHECK-BASE-NEXT: ldr d0, [x0] +; CHECK-BASE-NEXT: ldr d1, [x2] +; CHECK-BASE-NEXT: add x11, x2, x10 +; CHECK-BASE-NEXT: ldr d2, [x9] +; CHECK-BASE-NEXT: add x9, x9, x8 ; CHECK-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b -; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x10] -; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: add x10, x10, x8 +; CHECK-BASE-NEXT: ldr d1, [x11] +; CHECK-BASE-NEXT: add x11, x11, x10 ; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h -; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v1.8b +; CHECK-BASE-NEXT: ldr d2, [x9] +; CHECK-BASE-NEXT: ldr d3, [x11] +; CHECK-BASE-NEXT: add x9, x9, x8 +; CHECK-BASE-NEXT: add x11, x11, x10 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h ; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d2, [x9] ; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: add x10, x10, x8 -; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: add x9, x9, x8 +; CHECK-BASE-NEXT: add x11, x11, x10 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h ; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d2, [x9] ; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: add x10, x10, x8 -; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: add x9, x9, x8 +; CHECK-BASE-NEXT: add x11, x11, x10 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h ; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d2, [x9] ; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: add x10, x10, x8 -; CHECK-BASE-NEXT: add x11, x11, x9 +; CHECK-BASE-NEXT: add x9, x9, x8 +; CHECK-BASE-NEXT: add x11, x11, x10 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h ; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x10] +; CHECK-BASE-NEXT: ldr d2, [x9] ; CHECK-BASE-NEXT: ldr d3, [x11] ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: ldr d1, [x10, x8] +; CHECK-BASE-NEXT: ldr d1, [x9, x8] ; CHECK-BASE-NEXT: uabdl v2.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d3, [x11, x9] +; CHECK-BASE-NEXT: ldr d3, [x11, x10] ; CHECK-BASE-NEXT: uadalp v0.4s, v2.8h ; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v3.8b ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll index 452fc36571ef3c..ba44bc99ce8cd0 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ 
b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -438,13 +438,10 @@ exit: define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; FULLFP16-LABEL: fadd_reduct_reassoc_v8f16: ; FULLFP16: // %bb.0: -; FULLFP16-NEXT: faddp v2.8h, v0.8h, v0.8h -; FULLFP16-NEXT: faddp v3.8h, v1.8h, v1.8h -; FULLFP16-NEXT: faddp v0.8h, v2.8h, v0.8h -; FULLFP16-NEXT: faddp v1.8h, v3.8h, v1.8h +; FULLFP16-NEXT: fadd v0.8h, v0.8h, v1.8h +; FULLFP16-NEXT: faddp v1.8h, v0.8h, v0.8h +; FULLFP16-NEXT: faddp v0.8h, v1.8h, v0.8h ; FULLFP16-NEXT: faddp h0, v0.2h -; FULLFP16-NEXT: faddp h1, v1.2h -; FULLFP16-NEXT: fadd h0, h0, h1 ; FULLFP16-NEXT: ret ; ; CHECKNOFP16-LABEL: fadd_reduct_reassoc_v8f16: @@ -535,11 +532,9 @@ define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: fadd v2.4s, v2.4s, v3.4s ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: faddp v1.4s, v2.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s -; CHECK-NEXT: faddp s1, v1.2s ; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a) %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b) @@ -550,11 +545,9 @@ define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { define float @fadd_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: fadd_reduct_reassoc_v4f32: ; CHECK: // %bb.0: +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s -; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s ; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: faddp s1, v1.2s -; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) @@ -582,11 +575,9 @@ define float @fadd_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { ; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fadd v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s -; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s ; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: faddp s1, v1.2s -; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b) @@ -599,9 +590,8 @@ define double @fadd_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: fadd v2.2d, v2.2d, v3.2d ; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d -; CHECK-NEXT: faddp d1, v2.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d ; CHECK-NEXT: faddp d0, v0.2d -; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret %r1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a) %r2 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %b) diff --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll index bd910f1230a718..d5bde8bad7c3b7 100644 --- a/llvm/test/CodeGen/RISCV/double_reduct.ll +++ b/llvm/test/CodeGen/RISCV/double_reduct.ll @@ -8,12 +8,10 @@ define float @add_f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: add_f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: 
vfredusum.vs v8, v9, v10 -; CHECK-NEXT: vfmv.f.s ft1, v8 -; CHECK-NEXT: fadd.s fa0, ft0, ft1 +; CHECK-NEXT: vfadd.vv v8, v8, v9 +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vfredusum.vs v8, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) @@ -83,27 +81,14 @@ define float @fmax_f32(<4 x float> %a, <4 x float> %b) { define i32 @add_i32(<4 x i32> %a, <4 x i32> %b) { -; RV32-LABEL: add_i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.s.x v10, zero -; RV32-NEXT: vredsum.vs v8, v8, v10 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vredsum.vs v8, v9, v10 -; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: ret -; -; RV64-LABEL: add_i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vredsum.vs v8, v8, v10 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: vredsum.vs v8, v9, v10 -; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: addw a0, a0, a1 -; RV64-NEXT: ret +; CHECK-LABEL: add_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vredsum.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %a) %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b) %r = add i32 %r1, %r2 @@ -116,14 +101,10 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { ; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma ; CHECK-NEXT: vmv.s.x v10, zero ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v10 -; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v12, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vredsum.vs v8, v12, v10 ; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v9, v10 -; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: ret %ae = zext <16 x i8> %a to <16 x i16> %be = zext <16 x i8> %b to <16 x i16> @@ -200,12 +181,10 @@ define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: and_i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vredand.vs v8, v8, v10 +; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.i v9, -1 +; CHECK-NEXT: vredand.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: vredand.vs v8, v9, v10 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a) %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) @@ -218,11 +197,9 @@ define i32 @or_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vredor.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: vredor.vs v8, v9, v10 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: or a0, a0, a1 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a) %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) @@ -235,11 +212,9 @@ define i32 @xor_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.s.x 
v10, zero +; CHECK-NEXT: vxor.vv v8, v8, v9 ; CHECK-NEXT: vredxor.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: vredxor.vs v8, v9, v10 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a) %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) @@ -251,15 +226,10 @@ define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: umin_i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vredminu.vs v8, v8, v10 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.i v9, -1 +; CHECK-NEXT: vredminu.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: vredminu.vs v8, v9, v10 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: bltu a0, a1, .LBB11_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB11_2: ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a) %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b) @@ -272,14 +242,9 @@ define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vmaxu.vv v8, v8, v9 ; CHECK-NEXT: vredmaxu.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: vredmaxu.vs v8, v9, v10 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: bltu a1, a0, .LBB12_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB12_2: ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a) %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b) @@ -290,34 +255,24 @@ define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) { define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) { ; RV32-LABEL: smin_i32: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmin.vv v8, v8, v9 ; RV32-NEXT: lui a0, 524288 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.s.x v10, a0 -; RV32-NEXT: vredmin.vs v8, v8, v10 +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vredmin.vs v8, v8, v9 ; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vredmin.vs v8, v9, v10 -; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: blt a0, a1, .LBB13_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: .LBB13_2: ; RV32-NEXT: ret ; ; RV64-LABEL: smin_i32: ; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmin.vv v8, v8, v9 ; RV64-NEXT: lui a0, 524288 ; RV64-NEXT: addiw a0, a0, -1 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vredmin.vs v8, v8, v10 +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vredmin.vs v8, v8, v9 ; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: vredmin.vs v8, v9, v10 -; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: blt a0, a1, .LBB13_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB13_2: ; RV64-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a) %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b) @@ -331,14 +286,9 @@ define i32 @smax_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: vredmax.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: vredmax.vs v8, v9, v10 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: blt a1, a0, .LBB14_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB14_2: ; CHECK-NEXT: ret %r1 = call i32 
@llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a) %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b) diff --git a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll index 975f7b43067bb1..fad110df937fb3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll @@ -5,12 +5,9 @@ define float @add_f32(<8 x float> %a, <4 x float> %b) { ; CHECK-LABEL: add_f32: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vadd.f32 s4, s10, s11 +; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s6, s8, s9 -; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: vadd.f32 s2, s6, s4 ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: bx lr %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a) @@ -23,12 +20,9 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) { ; CHECK-LABEL: fmul_f32: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmul.f32 q0, q0, q1 -; CHECK-NEXT: vmul.f32 s4, s10, s11 +; CHECK-NEXT: vmul.f32 q0, q0, q2 ; CHECK-NEXT: vmul.f32 s2, s2, s3 ; CHECK-NEXT: vmul.f32 s0, s0, s1 -; CHECK-NEXT: vmul.f32 s6, s8, s9 -; CHECK-NEXT: vmul.f32 s0, s0, s2 -; CHECK-NEXT: vmul.f32 s2, s6, s4 ; CHECK-NEXT: vmul.f32 s0, s0, s2 ; CHECK-NEXT: bx lr %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) @@ -132,21 +126,14 @@ define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: mul_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r6, r3, d0 -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmul.i32 q0, q0, q2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: mul r2, r12, lr -; CHECK-NEXT: muls r3, r6, r3 -; CHECK-NEXT: mul r1, r4, r5 -; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: mul r1, r2, r3 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: muls r0, r2, r0 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: bx lr %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b) %r = mul i32 %r1, %r2 @@ -156,21 +143,14 @@ define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) { define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: and_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r6, r1, d5 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: ands r1, r6 -; CHECK-NEXT: ands r2, r3 -; CHECK-NEXT: and.w r0, r12, lr -; CHECK-NEXT: ands r0, r2 -; CHECK-NEXT: and.w r2, r4, r5 -; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: and.w r1, r2, r3 +; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: bx lr %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) %r = and i32 %r1, %r2 @@ -180,21 +160,14 @@ define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) { define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: or_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: 
vorr q0, q0, q1 -; CHECK-NEXT: vmov r6, r1, d5 +; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: orrs r1, r6 -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: orr.w r0, r12, lr -; CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: orr.w r2, r4, r5 -; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: orr.w r1, r2, r3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: bx lr %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) %r = or i32 %r1, %r2 @@ -204,21 +177,14 @@ define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) { define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: xor_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov r6, r1, d5 +; CHECK-NEXT: veor q0, q0, q2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: eors r1, r6 -; CHECK-NEXT: eors r2, r3 -; CHECK-NEXT: eor.w r0, r12, lr -; CHECK-NEXT: eors r0, r2 -; CHECK-NEXT: eor.w r2, r4, r5 -; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: eor.w r1, r2, r3 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: bx lr %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) %r = xor i32 %r1, %r2 @@ -228,13 +194,10 @@ define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) { define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: umin_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: vmin.u32 q0, q0, q1 -; CHECK-NEXT: mov.w r1, #-1 -; CHECK-NEXT: vminv.u32 r0, q2 -; CHECK-NEXT: vminv.u32 r1, q0 -; CHECK-NEXT: cmp r1, r0 -; CHECK-NEXT: csel r0, r1, r0, lo +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: vmin.u32 q0, q0, q2 +; CHECK-NEXT: vminv.u32 r0, q0 ; CHECK-NEXT: bx lr %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b) @@ -245,13 +208,10 @@ define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) { define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: umax_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: vmax.u32 q0, q0, q1 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: vmaxv.u32 r0, q2 -; CHECK-NEXT: vmaxv.u32 r1, q0 -; CHECK-NEXT: cmp r1, r0 -; CHECK-NEXT: csel r0, r1, r0, hi +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmax.u32 q0, q0, q2 +; CHECK-NEXT: vmaxv.u32 r0, q0 ; CHECK-NEXT: bx lr %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b) @@ -262,13 +222,10 @@ define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) { define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: smin_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: mvn r0, #-2147483648 ; CHECK-NEXT: vmin.s32 q0, q0, q1 -; CHECK-NEXT: mvn r1, #-2147483648 -; CHECK-NEXT: vminv.s32 r0, q2 -; CHECK-NEXT: vminv.s32 r1, q0 -; CHECK-NEXT: cmp r1, r0 -; CHECK-NEXT: csel r0, r1, r0, lt +; CHECK-NEXT: mvn r0, #-2147483648 +; CHECK-NEXT: vmin.s32 q0, q0, q2 +; CHECK-NEXT: vminv.s32 r0, q0 ; CHECK-NEXT: bx lr %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b) @@ -279,13 +236,10 @@ define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) { 
define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: smax_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov.w r0, #-2147483648 ; CHECK-NEXT: vmax.s32 q0, q0, q1 -; CHECK-NEXT: mov.w r1, #-2147483648 -; CHECK-NEXT: vmaxv.s32 r0, q2 -; CHECK-NEXT: vmaxv.s32 r1, q0 -; CHECK-NEXT: cmp r1, r0 -; CHECK-NEXT: csel r0, r1, r0, gt +; CHECK-NEXT: mov.w r0, #-2147483648 +; CHECK-NEXT: vmax.s32 q0, q0, q2 +; CHECK-NEXT: vmaxv.s32 r0, q0 ; CHECK-NEXT: bx lr %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)