diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 30fcc9b7680ed..9ac2adc76dfa7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8197,8 +8197,11 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
-  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
-    return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
+  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
+    if (auto PartialRed =
+            tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
+      return PartialRed;
+  }
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8232,6 +8235,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
       isa<VPPartialReductionRecipe>(BinOpRecipe))
     std::swap(BinOp, Accumulator);
 
+  if (ScaleFactor !=
+      vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
+    return nullptr;
+
   unsigned ReductionOpcode = Reduction->getOpcode();
   if (ReductionOpcode == Instruction::Sub) {
     auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 46ab7712e2671..07bfe7a896d86 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -395,20 +395,6 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
   return Base::properlyDominates(ParentA, ParentB);
 }
 
-/// Get the VF scaling factor applied to the recipe's output, if the recipe has
-/// one.
-static unsigned getVFScaleFactor(VPValue *R) {
-  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
-    return RR->getVFScaleFactor();
-  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
-    return RR->getVFScaleFactor();
-  assert(
-      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
-                                     VPInstruction::ReductionStartVector) &&
-      "getting scaling factor of reduction-start-vector not implemented yet");
-  return 1;
-}
-
 bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
                                         unsigned OverrideMaxNumRegs) const {
   return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
@@ -571,7 +557,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       } else {
         // The output from scaled phis and scaled reductions actually has
         // fewer lanes than the VF.
-        unsigned ScaleFactor = getVFScaleFactor(VPV);
+        unsigned ScaleFactor =
+            vputils::getVFScaleFactor(VPV->getDefiningRecipe());
         ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
         LLVM_DEBUG(if (VF != VFs[J]) {
           dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 917aa01f8a926..eac0e705a877d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -141,6 +141,20 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
   return I == DepthFirst.end() ?
nullptr : cast<VPBasicBlock>(*I);
 }
 
+unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
+  if (!R)
+    return 1;
+  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
+    return RR->getVFScaleFactor();
+  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+    return RR->getVFScaleFactor();
+  assert(
+      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
+                                     VPInstruction::ReductionStartVector) &&
+      "getting scaling factor of reduction-start-vector not implemented yet");
+  return 1;
+}
+
 std::optional<VPValue *>
 vputils::getRecipesForUncountableExit(VPlan &Plan,
                                       SmallVectorImpl<VPRecipeBase *> &Recipes,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 33dd8efaec2db..0222b0aa81063 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -102,6 +102,10 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
 /// exist.
 VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
 
+/// Get the VF scaling factor applied to the recipe's output, if the recipe has
+/// one.
+unsigned getVFScaleFactor(VPRecipeBase *R);
+
 /// Returns the VPValue representing the uncountable exit comparison used by
 /// AnyOf if the recipes it depends on can be traced back to live-ins and
 /// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index f4784b6259ce1..229209e98af78 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1381,8 +1381,8 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
 }
 
-define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
-; CHECK-NEON-LABEL: define i32 @red_extended_add_chain(
+define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
+; CHECK-NEON-LABEL: define i32 
@red_extended_add_incomplete_chain( ; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEON-NEXT: entry: ; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 @@ -1404,7 +1404,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { ; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] ; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 ; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] ; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1415,7 +1415,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-NEON: scalar.ph: ; -; CHECK-SVE-LABEL: define i32 @red_extended_add_chain( +; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain( ; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-SVE-NEXT: entry: ; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 @@ -1452,7 +1452,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: ; -; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_chain( +; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain( ; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-SVE-MAXBW-NEXT: entry: ; 
CHECK-SVE-MAXBW-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1478,7 +1478,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-SVE-MAXBW-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <vscale x 8 x i32> @llvm.vector.partial.reduce.add.nxv8i32.nxv8i32(<vscale x 8 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP7]])
+; CHECK-SVE-MAXBW-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]