llvm · fhahn · Sep 24, 2025 · Sep 15, 2025 · Sep 24, 2025 · Sep 24, 2025
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8197,8 +8197,11 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
-  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
-    return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
+  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
+    if (auto PartialRed =
+            tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
+      return PartialRed;
+  }
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8232,6 +8235,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
       isa<VPPartialReductionRecipe>(BinOpRecipe))
     std::swap(BinOp, Accumulator);
 
+  if (ScaleFactor !=
+      vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
+    return nullptr;
+
   unsigned ReductionOpcode = Reduction->getOpcode();
   if (ReductionOpcode == Instruction::Sub) {
     auto *const Zero = ConstantInt::get(Reduction->getType(), 0);

diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -395,20 +395,6 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
   return Base::properlyDominates(ParentA, ParentB);
 }
 
-/// Get the VF scaling factor applied to the recipe's output, if the recipe has
-/// one.
-static unsigned getVFScaleFactor(VPValue *R) {
-  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
-    return RR->getVFScaleFactor();
-  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
-    return RR->getVFScaleFactor();
-  assert(
-      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
-                                     VPInstruction::ReductionStartVector) &&
-      "getting scaling factor of reduction-start-vector not implemented yet");
-  return 1;
-}
-
 bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
                                         unsigned OverrideMaxNumRegs) const {
   return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
@@ -571,7 +557,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
         } else {
           // The output from scaled phis and scaled reductions actually has
           // fewer lanes than the VF.
-          unsigned ScaleFactor = getVFScaleFactor(VPV);
+          unsigned ScaleFactor =
+              vputils::getVFScaleFactor(VPV->getDefiningRecipe());
           ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
           LLVM_DEBUG(if (VF != VFs[J]) {
             dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -141,6 +141,20 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
   return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
 }
 
+unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
+  if (!R)
+    return 1;
+  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
+    return RR->getVFScaleFactor();
+  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+    return RR->getVFScaleFactor();
+  assert(
+      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
+                                     VPInstruction::ReductionStartVector) &&
+      "getting scaling factor of reduction-start-vector not implemented yet");
+  return 1;
+}
+
 std::optional<VPValue *>
 vputils::getRecipesForUncountableExit(VPlan &Plan,
                                       SmallVectorImpl<VPRecipeBase *> &Recipes,

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -102,6 +102,10 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
 /// exist.
 VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
 
+/// Get the VF scaling factor applied to the recipe's output, if the recipe has
+/// one.
+unsigned getVFScaleFactor(VPRecipeBase *R);
+
 /// Returns the VPValue representing the uncountable exit comparison used by
 /// AnyOf if the recipes it depends on can be traced back to live-ins and
 /// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1381,8 +1381,8 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
 }
 
-define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
-; CHECK-NEON-LABEL: define i32 @red_extended_add_chain(
+define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
+; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
 ; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEON-NEXT:  entry:
 ; CHECK-NEON-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1404,7 +1404,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-NEON-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
 ; CHECK-NEON-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
 ; CHECK-NEON-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-NEON-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
 ; CHECK-NEON-NEXT:    [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
 ; CHECK-NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEON-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1415,7 +1415,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-NEON-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-NEON:       scalar.ph:
 ;
-; CHECK-SVE-LABEL: define i32 @red_extended_add_chain(
+; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
 ; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-SVE-NEXT:  entry:
 ; CHECK-SVE-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1452,7 +1452,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-SVE:       scalar.ph:
 ;
-; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_chain(
+; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
 ; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-SVE-MAXBW-NEXT:  entry:
 ; CHECK-SVE-MAXBW-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1478,7 +1478,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
 ; CHECK-SVE-MAXBW-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT:    [[PARTIAL_REDUCE:%.*]] = call <vscale x 8 x i32> @llvm.vector.partial.reduce.add.nxv8i32.nxv8i32(<vscale x 8 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP7]])
+; CHECK-SVE-MAXBW-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]