From 4ef6a5efbe8954309e2576825003bd375bf8ebb4 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Mon, 15 Sep 2025 12:00:24 +0100
Subject: [PATCH 1/3] [LV] Don't create partial reductions if factor doesn't match accumulator

Check if the scale factor of the accumulator is the same as the requested
ScaleFactor in tryToCreatePartialReduction. This prevents creating partial
reductions if not all instructions in the reduction chain form partial
reductions, e.g. because we do not form a partial reduction for the loop
exit instruction.

Currently code-gen works fine, because the scale factor of
VPPartialReductionRecipe is not used during ::execute, but it means we
compute an incorrect cost and register pressure, because the partial
reduction won't reduce by the specified scaling factor.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 11 +++++++++--
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 16 +---------------
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp    | 14 ++++++++++++++
 llvm/lib/Transforms/Vectorize/VPlanUtils.h      |  4 ++++
 .../AArch64/partial-reduce-chained.ll           |  4 ++--
 5 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 640a98c622f80..2caeb1bf8a695 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8141,8 +8141,11 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
-  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
-    return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
+  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
+    if (auto PartialRed =
+            tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
+      return PartialRed;
+  }
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8176,6 +8179,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
       isa<VPPartialReductionRecipe>(BinOpRecipe))
     std::swap(BinOp, Accumulator);
 
+  if (ScaleFactor !=
+      vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
+    return nullptr;
+
   unsigned ReductionOpcode = Reduction->getOpcode();
   if (ReductionOpcode == Instruction::Sub) {
     auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..b4e62d39173e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -394,20 +394,6 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
   return Base::properlyDominates(ParentA, ParentB);
 }
 
-/// Get the VF scaling factor applied to the recipe's output, if the recipe has
-/// one.
-static unsigned getVFScaleFactor(VPRecipeBase *R) {
-  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
-    return RR->getVFScaleFactor();
-  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
-    return RR->getVFScaleFactor();
-  assert(
-      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
-                      VPInstruction::ReductionStartVector) &&
-      "getting scaling factor of reduction-start-vector not implemented yet");
-  return 1;
-}
-
 bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
                                         unsigned OverrideMaxNumRegs) const {
   return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
@@ -569,7 +555,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       } else {
         // The output from scaled phis and scaled reductions actually has
         // fewer lanes than the VF.
-        unsigned ScaleFactor = getVFScaleFactor(R);
+        unsigned ScaleFactor = vputils::getVFScaleFactor(R);
         ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
         LLVM_DEBUG(if (VF != VFs[J]) {
           dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index ddc4ad1977401..94c492f52e89f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -141,3 +141,17 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
   });
   return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
 }
+
+unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
+  if (!R)
+    return 1;
+  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
+    return RR->getVFScaleFactor();
+  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+    return RR->getVFScaleFactor();
+  assert(
+      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
+                                     VPInstruction::ReductionStartVector) &&
+      "getting scaling factor of reduction-start-vector not implemented yet");
+  return 1;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 77c099b271717..0dd6885da6fdd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -101,6 +101,10 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
 /// Returns the header block of the first, top-level loop, or null if none
 /// exist.
 VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
+
+/// Get the VF scaling factor applied to the recipe's output, if the recipe has
+/// one.
+unsigned getVFScaleFactor(VPRecipeBase *R);
 } // namespace vputils
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index c0995ec150c8d..f8b9d5db07fce 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1404,7 +1404,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-NEON-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
 ; CHECK-NEON-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
 ; CHECK-NEON-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.experimental.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-NEON-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
 ; CHECK-NEON-NEXT:    [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
 ; CHECK-NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEON-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1478,7 +1478,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-SVE-MAXBW-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32(<vscale x 8 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP7]])
+; CHECK-SVE-MAXBW-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

From b9c39a13727c4c917fad8df04aaef28f697a8a8c Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 24 Sep 2025 11:36:53 +0100
Subject: [PATCH 2/3] !fixup update after merge, rename test

---
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp       | 2 +-
 .../LoopVectorize/AArch64/partial-reduce-chained.ll   | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 9605024a4f1ec..4de8d1cbc377c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -557,7 +557,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       } else {
         // The output from scaled phis and scaled reductions actually has
         // fewer lanes than the VF.
-        unsigned ScaleFactor = vputils::getVFScaleFactor(VPV);
+        unsigned ScaleFactor = vputils::getVFScaleFactor(VPV->getDefiningRecipe());
         ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
         LLVM_DEBUG(if (VF != VFs[J]) {
           dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 0e87bfbeac282..229209e98af78 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1381,8 +1381,8 @@ for.body: ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
 }
 
-define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
-; CHECK-NEON-LABEL: define i32 @red_extended_add_chain(
+define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
+; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
 ; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEON-NEXT:  entry:
 ; CHECK-NEON-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1415,7 +1415,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-NEON-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-NEON:       scalar.ph:
 ;
-; CHECK-SVE-LABEL: define i32 @red_extended_add_chain(
+; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
 ; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-SVE-NEXT:  entry:
 ; CHECK-SVE-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1452,7 +1452,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-SVE:       scalar.ph:
 ;
-; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_chain(
+; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
 ; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-SVE-MAXBW-NEXT:  entry:
 ; CHECK-SVE-MAXBW-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64

From 62405994f5aac1f983926f03f9fdcbc322f3b62d Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 24 Sep 2025 11:43:52 +0100
Subject: [PATCH 3/3] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4de8d1cbc377c..07bfe7a896d86 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -557,7 +557,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       } else {
         // The output from scaled phis and scaled reductions actually has
        // fewer lanes than the VF.
-        unsigned ScaleFactor = vputils::getVFScaleFactor(VPV->getDefiningRecipe());
+        unsigned ScaleFactor =
+            vputils::getVFScaleFactor(VPV->getDefiningRecipe());
         ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
         LLVM_DEBUG(if (VF != VFs[J]) {
           dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
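
Note (editorial, not part of the patches): the standalone C++ sketch below illustrates the guard PATCH 1/3 adds to tryToCreatePartialReduction. It is not LLVM code; Recipe, PartialReductionRecipe, and the scale factors used in main are simplified, hypothetical stand-ins for the VPlan recipes involved. The idea is that a partial reduction is only formed when the accumulator's VF scale factor matches the requested one; otherwise the builder returns nullptr and the instruction is widened instead.

  #include <iostream>
  #include <memory>

  struct Recipe {
    virtual ~Recipe() = default;
    // Most recipes produce one output lane per VF lane.
    virtual unsigned getVFScaleFactor() const { return 1; }
  };

  struct PartialReductionRecipe : Recipe {
    unsigned ScaleFactor;
    explicit PartialReductionRecipe(unsigned SF) : ScaleFactor(SF) {}
    // A partial reduction produces VF / ScaleFactor output lanes.
    unsigned getVFScaleFactor() const override { return ScaleFactor; }
  };

  // Mirrors the added guard: only form a partial reduction when the
  // accumulator already reduces by the requested factor; otherwise bail out
  // so the caller falls back to widening the instruction.
  std::unique_ptr<Recipe> tryToCreatePartialReduction(const Recipe *Accumulator,
                                                      unsigned ScaleFactor) {
    if (!Accumulator || Accumulator->getVFScaleFactor() != ScaleFactor)
      return nullptr;
    return std::make_unique<PartialReductionRecipe>(ScaleFactor);
  }

  int main() {
    Recipe PlainPhi;                     // scale factor 1, e.g. a widened phi
    PartialReductionRecipe ScaledPhi(4); // accumulator reducing 4 lanes to 1

    // Requesting a 4x partial reduction only succeeds for the scaled
    // accumulator; the plain phi would yield a mismatched scale factor and
    // hence an incorrect cost/register-pressure estimate.
    std::cout << std::boolalpha
              << (tryToCreatePartialReduction(&PlainPhi, 4) != nullptr) << '\n'
              << (tryToCreatePartialReduction(&ScaledPhi, 4) != nullptr) << '\n';
  }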