From b58b094410f6ffc363e657e4c24073e8e45b97f9 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Fri, 12 Sep 2025 10:22:46 +0100
Subject: [PATCH 1/4] [AArch64] Refactor and refine cost-model for partial
 reductions

This cost-model takes into account any type-legalisation that would
happen on vectors, such as splitting and promotion. This results in
wider VFs being chosen for loops that can use partial reductions.

The cost-model now also assumes that when SVE is available, the SVE
dot instructions for i16 -> i64 dot products can be used for
fixed-length vectors. In practice this means that loops with
non-scalable VFs are vectorized using partial reductions where they
wouldn't be before, e.g.

  int64_t foo2(int8_t *src1, int8_t *src2, int N) {
    int64_t sum = 0;
    for (int i = 0; i < N; ++i)
      sum += src1[i] * src2[i];
    return sum;
  }
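NOTE (illustration only, not part of this patch): the new costing can be
mirrored by the following standalone C++ sketch, which models only the
two effects described above -- one base unit per 128-bit part the input
type legalises to (splitting), and a doubled cost for sub-128-bit
("half-full") inputs so that full VFs are preferred. The real logic is
in AArch64TTIImpl::getPartialReductionCost below and uses
getTypeLegalizationCost rather than this simplified arithmetic.

  // Simplified stand-in for the new partial-reduction costing,
  // assuming a 128-bit vector granule.
  #include <cstdio>

  static int partialReductionCost(int VFElts, int InputBits) {
    int InputVectorBits = VFElts * InputBits;
    // One base unit per 128-bit part the input splits into.
    int Cost = InputVectorBits > 128 ? InputVectorBits / 128 : 1;
    // Penalise half-full inputs (see the FIXME in the real code).
    if (InputVectorBits < 128)
      Cost *= 2;
    return Cost;
  }

  int main() {
    std::printf("VF 16 x i8: %d\n", partialReductionCost(16, 8)); // 1
    std::printf("VF  8 x i8: %d\n", partialReductionCost(8, 8));  // 2
    std::printf("VF 32 x i8: %d\n", partialReductionCost(32, 8)); // 2
  }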
---
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ ... @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
 InstructionCost AArch64TTIImpl::getPartialReductionCost(
     unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
     ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
     TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
     TTI::TargetCostKind CostKind) const {
   InstructionCost Invalid = InstructionCost::getInvalid();
-  InstructionCost Cost(TTI::TCC_Basic);
 
   if (CostKind != TTI::TCK_RecipThroughput)
     return Invalid;
 
-  // Sub opcodes currently only occur in chained cases.
-  // Independent partial reduction subtractions are still costed as an add
+  if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
+    return Invalid;
+
+  if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
+      (!ST->isNeonAvailable() || !ST->hasDotProd()))
+    return Invalid;
+
   if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
       OpAExtend == TTI::PR_None)
     return Invalid;
 
   // We only support multiply binary operations for now, and for muls we
   // require the types being extended to be the same.
-  // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but
-  // only if the i8mm or sve/streaming features are available.
-  if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB ||
-                OpBExtend == TTI::PR_None ||
-                (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
-                 !ST->isSVEorStreamingSVEAvailable())))
+  if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
     return Invalid;
 
   assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
          "Unexpected values for OpBExtend or InputTypeB");
 
-  EVT InputEVT = EVT::getEVT(InputTypeA);
-  EVT AccumEVT = EVT::getEVT(AccumType);
+  bool IsUSDot = OpBExtend && OpAExtend != OpBExtend;
+  if (IsUSDot && !ST->hasMatMulInt8())
+    return Invalid;
 
-  unsigned VFMinValue = VF.getKnownMinValue();
+  unsigned Ratio =
+      AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
+  if (VF.getKnownMinValue() < Ratio)
+    return Invalid;
 
-  if (VF.isScalable()) {
-    if (!ST->isSVEorStreamingSVEAvailable())
-      return Invalid;
+  VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
+  VectorType *AccumVectorType =
+      VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
 
-    // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
-    // since we can't lower that type.
-    unsigned Scale =
-        AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
-    if (VFMinValue == Scale)
-      return Invalid;
-  }
-  if (VF.isFixed() &&
-      (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
+  // We don't yet support widening for accumulators.
+  if (AccumVectorType->getElementCount() == ElementCount::getScalable(1))
     return Invalid;
 
-  if (InputEVT == MVT::i8) {
-    switch (VFMinValue) {
-    default:
-      return Invalid;
-    case 8:
-      if (AccumEVT == MVT::i32)
-        Cost *= 2;
-      else if (AccumEVT != MVT::i64)
-        return Invalid;
-      break;
-    case 16:
-      if (AccumEVT == MVT::i64)
-        Cost *= 2;
-      else if (AccumEVT != MVT::i32)
-        return Invalid;
-      break;
-    }
-  } else if (InputEVT == MVT::i16) {
-    // FIXME: Allow i32 accumulator but increase cost, as we would extend
-    // it to i64.
-    if (VFMinValue != 8 || AccumEVT != MVT::i64)
-      return Invalid;
-  } else
-    return Invalid;
+  // Check what kind of type-legalisation happens.
+  std::pair<InstructionCost, MVT> AccumLT =
+      getTypeLegalizationCost(AccumVectorType);
+  std::pair<InstructionCost, MVT> InputLT =
+      getTypeLegalizationCost(InputVectorType);
 
-  return Cost;
+  InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
+
+  // Prefer using full types by costing half-full input types as more
+  // expensive.
+  if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
+                          TypeSize::getScalable(128)))
+    // FIXME: This can be removed after the cost of the extends are folded into
+    // the dot-product expression in VPlan, after landing:
+    // https://github.com/llvm/llvm-project/pull/147302
+    Cost *= 2;
+
+  if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
+    // i16 -> i64 is natively supported for udot/sdot
+    if (AccumLT.second.getScalarType() == MVT::i64 &&
+        InputLT.second.getScalarType() == MVT::i16)
+      return Cost;
+    // i8 -> i64 is supported with an extra level of extends
+    if (AccumLT.second.getScalarType() == MVT::i64 &&
+        InputLT.second.getScalarType() == MVT::i8)
+      // FIXME: This cost should probably be a little higher, e.g. Cost + 2
+      // because it requires two extra extends on the inputs. But if we'd
+      // change that now, a regular reduction would be cheaper because the
+      // costs of the extends in the IR are still counted. This can be fixed
+      // after https://github.com/llvm/llvm-project/pull/147302 has landed.
+      return Cost;
+  }
+
+  // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and
+  // SVE.
+  if (ST->isSVEorStreamingSVEAvailable() ||
+      (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
+       ST->hasDotProd())) {
+    if (AccumLT.second.getScalarType() == MVT::i32 &&
+        InputLT.second.getScalarType() == MVT::i8)
+      return Cost;
+  }
+
+  // Add additional cost for the extends that would need to be inserted.
+  return Cost + 4;
 }
 
 InstructionCost
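NOTE (worked examples, not part of this patch; they assume the
legalisation behaviour of a 128-bit SVE/NEON target):

  * VF = 16 (fixed), i8 -> i32, NEON + dotprod: the input legalises to a
    single v16i8 part, so the returned cost is 1 (one udot/sdot).
  * VF = 8 (fixed), i8 -> i32: the v8i8 input is half a 128-bit
    register, so the cost is doubled to 2, steering the vectorizer
    towards the full VF of 16.
  * VF = vscale x 32, i8 -> i32, SVE: the input splits into two nxv16i8
    parts, so the cost is 2.
  * Any combination without a matching dot instruction falls through to
    Cost + 4, accounting for the extends that would have to be
    materialised.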
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index c3b0bc8c00a74..47f43339dc35f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -82,11 +82,11 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 27
+; CHECK: Cost for VF 8: 25
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 48
+; CHECK: Cost for VF 16: 41
 ; CHECK: LV: Selecting VF: 16
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 229209e98af78..5ae08393a1804 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -204,37 +204,33 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
-; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-SVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT:    [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT:    [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT:    [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT:    [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
+; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-SVE-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-SVE:       middle.block:
-; CHECK-SVE-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
 ; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK-SVE:       scalar.ph:
@@ -670,39 +666,35 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
-; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-SVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT:    [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT:    [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT:    [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-SVE-NEXT:    [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
-; CHECK-SVE-NEXT:    [[TMP21]] = add <vscale x 4 x i32> [[TMP19]], [[TMP20]]
-; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
+; CHECK-SVE-NEXT:    [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]])
+; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-SVE-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-SVE:       middle.block:
-; CHECK-SVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
+; CHECK-SVE-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
 ; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK-SVE:       scalar.ph:
@@ -996,36 +988,32 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
-; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-SVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT:    [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
-; CHECK-SVE-NEXT:    [[TMP18]] = add <vscale x 4 x i32> [[TMP17]], [[TMP15]]
-; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-SVE-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-SVE:       middle.block:
-; CHECK-SVE-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; CHECK-SVE-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
 ; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK-SVE:       scalar.ph:
@@ -1140,32 +1128,28 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
-; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-SVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP11]]
-; CHECK-SVE-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT:    [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]])
+; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-SVE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK-SVE:       middle.block:
-; CHECK-SVE-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]])
 ; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK-SVE:       scalar.ph:
@@ -1277,36 +1261,32 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
-; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-SVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-SVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-SVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP15]]
-; CHECK-SVE-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-NEXT:    [[TMP18]] = add <vscale x 4 x i32> [[TMP16]], [[TMP17]]
-; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT:    [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
+; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-SVE-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-SVE:       middle.block:
-; CHECK-SVE-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; CHECK-SVE-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
 ; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK-SVE:       scalar.ph:
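NOTE (illustration, not part of this patch): the chained tests above
correspond to source loops of roughly the following shape; the
authoritative inputs are the IR functions in the test file, so treat
the code below as a sketch only.

  // Shape of chained_partial_reduce_add_add: two sign-extending
  // multiplies feeding one i32 accumulator. With this patch the
  // CHECK-SVE output selects a fixed VF of 16 and emits two chained
  // llvm.vector.partial.reduce.add calls instead of widening adds.
  int chained_add_add(signed char *a, signed char *b, signed char *c,
                      int N) {
    int sum = 0;
    for (int i = 0; i < N / 2; i++)
      sum += a[i] * b[i] + a[i] * c[i];
    return sum;
  }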
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index dd239c023c686..263aea6816691 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -89,7 +89,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
@@ -98,23 +98,23 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <1 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi <4 x i32> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI9:%.*]] = phi <1 x i32> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT11]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE12]] = call <1 x i32> @llvm.vector.partial.reduce.add.v1i32.v4i32(<1 x i32> [[VEC_PHI9]], <4 x i32> [[TMP14]])
 ; CHECK-NEXT:    [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> [[PARTIAL_REDUCE12]])
 ; CHECK-NEXT:    [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
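NOTE (illustration, not part of this patch): the epilogue hunk above
shows why the new code only rejects *scalable* one-element
accumulators. At the epilogue VF of 4 with an i8 -> i32 dot product,
the accumulator ratio is 32 / 8 = 4, so the accumulator element count
becomes 4 / 4 = 1. A fixed <1 x i32> partial reduction is still
legalisable (as checked above), whereas the scalable equivalent
<vscale x 1 x i32> is not, which is what the
ElementCount::getScalable(1) guard rejects.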
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index 49e9989b65d2f..09b41fb551775 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -12,40 +12,40 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK:       scalar.ph:
@@ -62,8 +62,8 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-NOI8MM:       vector.body:
 ; CHECK-NOI8MM-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NOI8MM-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NOI8MM-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NOI8MM-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -82,14 +82,14 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
 ; CHECK-NOI8MM-NEXT:    [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
 ; CHECK-NOI8MM-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NOI8MM-NEXT:    [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT:    [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
 ; CHECK-NOI8MM-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-NOI8MM-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NOI8MM-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-NOI8MM:       middle.block:
-; CHECK-NOI8MM-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NOI8MM-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NOI8MM-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
 ; CHECK-NOI8MM-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NOI8MM-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK-NOI8MM:       scalar.ph:
@@ -123,40 +123,40 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
-; CHECK-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <vscale x 16 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul <vscale x 16 x i32> [[TMP18]], [[TMP7]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP19]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK:       scalar.ph:
@@ -173,8 +173,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-NOI8MM:       vector.body:
 ; CHECK-NOI8MM-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NOI8MM-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NOI8MM-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NOI8MM-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
@@ -193,14 +193,14 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
 ; CHECK-NOI8MM-NEXT:    [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP18]], [[TMP11]]
 ; CHECK-NOI8MM-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP12]]
-; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP20]])
-; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP21]])
+; CHECK-NOI8MM-NEXT:    [[TMP22]] = add <vscale x 8 x i32> [[TMP20]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT:    [[TMP23]] = add <vscale x 8 x i32> [[TMP21]], [[VEC_PHI1]]
 ; CHECK-NOI8MM-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-NOI8MM-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NOI8MM-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-NOI8MM:       middle.block:
-; CHECK-NOI8MM-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-NOI8MM-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NOI8MM-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
 ; CHECK-NOI8MM-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NOI8MM-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK-NOI8MM:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 6e11e559151da..3a882730f0379 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -12,74 +12,62 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1:       scalar.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1:       for.exit:
+; CHECK-INTERLEAVE1-NEXT:    ret i32 [[TMP6]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
-; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = shl nuw i64 [[TMP14]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP28]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP8]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED:       scalar.ph:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED:       for.exit:
+; CHECK-INTERLEAVED-NEXT:    ret i32 [[TMP11]]
 ;
 ; CHECK-MAXBW-LABEL: define i32 @dotp(
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -139,78 +127,52 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b) {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP9]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15]] = add <vscale x 2 x i64> [[TMP14]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1:       scalar.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1:       for.exit:
+; CHECK-INTERLEAVE1-NEXT:    ret i64 [[TMP4]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
 ; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
-; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4
-; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP16]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP24]] = add <vscale x 2 x i64> [[TMP22]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = add <vscale x 2 x i64> [[TMP23]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED:       scalar.ph:
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT:    br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED:       for.exit:
+; CHECK-INTERLEAVED-NEXT:    ret i64 [[TMP4]]
 ;
 ; CHECK-MAXBW-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
 ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
@@ -274,86 +236,66 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b) {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP10]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP20]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP10]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
 ; CHECK-INTERLEAVE1-NEXT:    [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD3:%.*]] = load
, ptr [[NEXT_GEP2]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nuw nsw [[TMP15]], [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add [[TMP16]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP17]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i64 [[TMP4]] ; ; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_i16_to_i64_has_neon_dotprod( ; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: 
[[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]] -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP]], align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP30]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[NEXT_GEP3]], align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP21]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD5]] to -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext [[WIDE_LOAD6]] to -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nuw nsw [[TMP22]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nuw nsw [[TMP23]], [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add [[TMP24]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add [[TMP25]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP7]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; 
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP27]], [[TMP26]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED:       scalar.ph:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED:       for.exit:
+; CHECK-INTERLEAVED-NEXT:    ret i64 [[TMP9]]
 ;
 ; CHECK-MAXBW-LABEL: define i64 @not_dotp_i16_to_i64_has_neon_dotprod(
 ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] {
@@ -497,7 +439,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
 ; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_EXIT:%.*]]
@@ -656,7 +598,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
@@ -803,7 +745,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
@@ -851,7 +793,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8
@@ -952,7 +894,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
@@ -990,7 +932,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8
@@ -1058,22 +1000,18 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
 ; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = shl nuw nsw i64 [[TMP13]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -1085,38 +1023,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP21]], [[TMP36]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP23]] = add <vscale x 4 x i32> [[TMP38]], [[VEC_PHI3]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[TMP25]], [[TMP42]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP30]] = add <vscale x 4 x i32> [[TMP28]], [[VEC_PHI2]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD7]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP33:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP34:%.*]] = mul nsw <vscale x 4 x i32> [[TMP31]], [[TMP33]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP35]] = add <vscale x 4 x i32> [[TMP34]], [[VEC_PHI1]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP40:%.*]] = mul nsw <vscale x 4 x i32> [[TMP37]], [[TMP39]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP41]] = add <vscale x 4 x i32> [[TMP40]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP12]], [[TMP23]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]])
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP21]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP41]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP35]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP30]])
-; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP23]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVE1:       scalar.ph:
@@ -1124,26 +1062,22 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
 ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = shl nuw nsw i64 [[TMP13]], 3
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 8
-; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -1155,90 +1089,74 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = shl nuw i64 [[TMP56]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = shl nuw i64 [[TMP25]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul nsw <vscale x 4 x i32> [[TMP28]], [[TMP66]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP82]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP50]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = shl nuw i64 [[TMP35]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = shl nuw i64 [[TMP41]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = mul nsw <vscale x 4 x i32> [[TMP38]], [[TMP44]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = shl nuw i64 [[TMP57]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP54]], [[TMP60]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI2]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = shl nuw i64 [[TMP67]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = shl nuw i64 [[TMP73]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = mul nsw <vscale x 4 x i32> [[TMP70]], [[TMP76]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = mul nsw <vscale x 4 x i32> [[TMP71]], [[TMP77]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP80]] = add <vscale x 4 x i32> [[TMP78]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP81]] = add <vscale x 4 x i32> [[TMP79]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP24]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP25]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP31]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP48]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP33]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP36]], [[TMP39]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP40]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP41]])
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP81]], [[TMP80]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX23:%.*]] = add <vscale x 4 x i32> [[TMP65]], [[TMP64]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX23]])
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX24:%.*]] = add <vscale x 4 x i32> [[TMP49]], [[TMP48]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX24]])
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX25:%.*]] = add <vscale x 4 x i32> [[TMP33]], [[TMP50]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX25]])
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE23]], [[PARTIAL_REDUCE22]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED:       scalar.ph:
@@ -1396,7 +1314,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = xor i1 [[TMP20]], true
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
 ; CHECK-INTERLEAVE1-NEXT:    br label [[EXIT:%.*]]
@@ -1434,7 +1352,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = xor i1 [[TMP20]], true
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
 ; CHECK-INTERLEAVED-NEXT:    br label [[EXIT:%.*]]
@@ -1525,7 +1443,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -1572,7 +1490,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
@@ -1607,7 +1525,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
 ; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -1666,7 +1584,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
@@ -1713,7 +1631,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
@@ -1748,7 +1666,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]])
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
@@ -1866,7 +1784,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add i32 [[TMP21]], [[TMP15]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]]
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -1978,7 +1896,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2016,7 +1934,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
@@ -2053,7 +1971,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-MAXBW-NEXT:    [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2111,7 +2029,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2149,7 +2067,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
@@ -2186,7 +2104,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-MAXBW-NEXT:    [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -2226,36 +2144,32 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK-INTERLEAVE1:       for.body.preheader:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
-; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP16]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP18]] = add <vscale x 2 x i64> [[TMP17]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP18]])
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVE1:       scalar.ph:
@@ -2267,50 +2181,32 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK-INTERLEAVED:       for.body.preheader:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
-; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
-; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = shl nuw i64 [[TMP20]], 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP2]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP23]], [[TMP17]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP24]], [[TMP18]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP27]] = add <vscale x 2 x i64> [[TMP25]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP28]] = add <vscale x 2 x i64> [[TMP26]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP28]], [[TMP27]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED:       scalar.ph:
@@ -2349,7 +2245,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-MAXBW-NEXT:    [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
 ; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP20]])
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -2471,7 +2367,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP36]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]])
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]])
@@ -2571,7 +2467,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
 ; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
 ; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
@@ -2671,7 +2567,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
 ; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]])
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]])
 ; CHECK-MAXBW-NEXT:    [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
index 11ff688cb53a9..7bb47157d7cf2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
@@ -12,77 +12,65 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP15]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: 
[[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sub zeroinitializer, [[TMP21]] -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sub zeroinitializer, [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add [[VEC_PHI1]], [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP15]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP26]], [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; 
CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index db3166cc0ec8d..3c2ae1c74d171 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -17,16 +17,16 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 { ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = add <16 x i32> [[TMP2]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; @@ -38,22 +38,22 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <16 x i32> [[TMP3]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI1]] 
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]])
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK-INTERLEAVED: scalar.ph:
 ;
@@ -199,16 +199,16 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1: vector.body:
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i64> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP3]])
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK-INTERLEAVE1: scalar.ph:
 ;
@@ -220,22 +220,22 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED: vector.body:
 ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i64> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i64> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]])
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i64> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK-INTERLEAVED: scalar.ph:
 ;
@@ -293,16 +293,16 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1: vector.body:
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <8 x i64> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK-INTERLEAVE1: scalar.ph:
 ;
@@ -314,22 +314,22 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED: vector.body:
 ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP1]], i32 8
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <8 x i64> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]])
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK-INTERLEAVED: scalar.ph:
 ;
@@ -764,16 +764,16 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1: vector.body:
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i32> [[TMP3]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK-INTERLEAVE1: scalar.ph:
 ;
@@ -785,22 +785,22 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED: vector.body:
 ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]])
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
 ; CHECK-INTERLEAVED: scalar.ph:
 ;
@@ -984,21 +984,21 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
 ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
 ; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
 ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1: vector.body:
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
 ; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVE1: scalar.ph:
@@ -1015,26 +1015,26 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
 ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0
 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED: vector.body:
 ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP12]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
 ; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
 ; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI2]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP3]])
 ; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32
 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP8]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index c61361bb3df76..25ee10077fb47 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -192,7 +192,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
 ; CHECK: LV(REG): VF = 16
 ; CHECK-NEXT: LV(REG): Found max usage: 2 item
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 12 registers
 ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index d4e5dea3d4aab..0cba88448af48 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-none-unknown-elf"
 ; Tests for printing VPlans that are enabled under AArch64
 define i32 @print_partial_reduction(ptr %a, ptr %b) {
-; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={4,8,16},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
@@ -71,7 +71,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
-; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' {
+; CHECK: VPlan 'Final VPlan for VF={4,8,16},UF={1}' {
 ; CHECK-NEXT: Live-in ir<1024> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:

From 0a91b0d92f1a0e45c382708b8484d28427534e1d Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Mon, 29 Sep 2025 10:33:15 +0000
Subject: [PATCH 2/4] Widening is not supported for <1 x ..> either

---
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  | 13 +++++++++++--
 .../AArch64/partial-reduce-dot-product-epilogue.ll |  8 ++++----
 .../LoopVectorize/AArch64/vplan-printing.ll        |  4 ++--
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a2ee509fe4cb7..ece53cd4af217 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5667,9 +5667,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   VectorType *AccumVectorType =
       VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
 
-  // We don't yet support widening for accumulators.
-  if (AccumVectorType->getElementCount() == ElementCount::getScalable(1))
+  // We don't yet support all kinds of legalization (e.g. widening
+  // of <[vscale x] 1 x ..> accumulators)
+  auto TA = TLI->getTypeAction(AccumVectorType->getContext(),
+                               EVT::getEVT(AccumVectorType));
+  switch (TA) {
+  default:
     return Invalid;
+  case TargetLowering::TypeLegal:
+  case TargetLowering::TypePromoteInteger:
+  case TargetLowering::TypeSplitVector:
+    break;
+  }
 
   // Check what kind of type-legalisation happens.
   std::pair<InstructionCost, MVT> AccumLT =
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 263aea6816691..1ae2e2da7f8b8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -98,23 +98,23 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <1 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
 ; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <1 x i32> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT11]] to <4 x i32>
 ; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[PARTIAL_REDUCE12]] = call <1 x i32> @llvm.vector.partial.reduce.add.v1i32.v4i32(<1 x i32> [[VEC_PHI9]], <4 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]]
 ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
 ; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> [[PARTIAL_REDUCE12]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
 ; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK: vec.epilog.scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 0cba88448af48..d4e5dea3d4aab 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-none-unknown-elf"
 ; Tests for printing VPlans that are enabled under AArch64
 define i32 @print_partial_reduction(ptr %a, ptr %b) {
-; CHECK: VPlan 'Initial VPlan for VF={4,8,16},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
 ; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
@@ -71,7 +71,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
-; CHECK: VPlan 'Final VPlan for VF={4,8,16},UF={1}' {
+; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' {
 ; CHECK-NEXT: Live-in ir<1024> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:

From bf2645aa9bac83c550493c189081188090e60bb9 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Tue, 30 Sep 2025 20:05:02 +0000
Subject: [PATCH 3/4] Address review comments

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp      | 6 +-----
 .../Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll | 2 +-
 .../AArch64/partial-reduce-dot-product-epilogue.ll          | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ece53cd4af217..1bde8ea51c695 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5636,9 +5636,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   if (CostKind != TTI::TCK_RecipThroughput)
     return Invalid;
 
-  if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
-    return Invalid;
-
   if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
       (!ST->isNeonAvailable() || !ST->hasDotProd()))
     return Invalid;
@@ -5660,13 +5657,12 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   unsigned Ratio =
       AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
-  if (VF.getKnownMinValue() < Ratio)
+  if (VF.getKnownMinValue() <= Ratio)
     return Invalid;
 
   VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
   VectorType *AccumVectorType =
       VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
-
   // We don't yet support all kinds of legalization (e.g. widening
   // of <[vscale x] 1 x ..> accumulators)
   auto TA = TLI->getTypeAction(AccumVectorType->getContext(),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 47f43339dc35f..27ca4143b5be5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -82,7 +82,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 25
+; CHECK: Cost for VF 8: 27
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 1ae2e2da7f8b8..8ece59aef90a9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -81,7 +81,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
 ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
@@ -89,7 +89,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
 ; CHECK: vec.epilog.iter.check:
 ; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK: vec.epilog.ph:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]

From 03c0f18fe37e6a4040e96961a729b2eb46fd14bc Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Thu, 2 Oct 2025 09:07:40 +0000
Subject: [PATCH 4/4] Address review comment

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1bde8ea51c695..50a8754d74075 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5644,14 +5644,16 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
       OpAExtend == TTI::PR_None)
     return Invalid;
 
+  assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
+         (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
+         "Unexpected values for OpBExtend or InputTypeB");
+
   // We only support multiply binary operations for now, and for muls we
   // require the types being extended to be the same.
   if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
     return Invalid;
 
-  assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
-         "Unexpected values for OpBExtend or InputTypeB");
-
-  bool IsUSDot = OpBExtend && OpAExtend != OpBExtend;
+  bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
   if (IsUSDot && !ST->hasMatMulInt8())
     return Invalid;
 
@@ -5663,8 +5665,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
   VectorType *AccumVectorType =
      VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
-  // We don't yet support all kinds of legalization (e.g. widening
-  // of <[vscale x] 1 x ..> accumulators)
+  // We don't yet support all kinds of legalization.
   auto TA = TLI->getTypeAction(AccumVectorType->getContext(),
                                EVT::getEVT(AccumVectorType));
   switch (TA) {
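[Editor's note — illustrative sketch, not part of the patch series.] The subtlest change in these patches is the Ratio guard that patch 3 tightens from `<` to `<=`: when the VF equals AccumBits/InputBits, the accumulator legalizes to a one-element vector (e.g. <vscale x 1 x i64>), which the lowering cannot handle. The standalone C++ below models just that guard; plain unsigned integers stand in for LLVM's ElementCount/MVT, and the name passesRatioGuard is invented for this example — it is a sketch of the check's logic, not the actual TTI implementation.

#include <cstdio>

// Models the VF-vs-extension-ratio check from getPartialReductionCost:
// reducing InputBits-wide lanes into AccumBits-wide accumulator elements
// needs strictly more than AccumBits/InputBits input lanes, otherwise the
// accumulator vector degenerates to a single element.
static bool passesRatioGuard(unsigned VFMin, unsigned InputBits,
                             unsigned AccumBits) {
  unsigned Ratio = AccumBits / InputBits;
  return VFMin > Ratio;
}

int main() {
  // i8 -> i64 dot product, Ratio = 8: VF 8 would need a one-element
  // accumulator and is rejected; VF 16 yields a two-element accumulator.
  std::printf("VF=8:  %s\n", passesRatioGuard(8, 8, 64) ? "ok" : "invalid");
  std::printf("VF=16: %s\n", passesRatioGuard(16, 8, 64) ? "ok" : "invalid");
  return 0;
}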