From 7869fe8f72e5993e09fc2fb842ddc1f3368eaa92 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Thu, 13 Nov 2025 18:51:55 +0800
Subject: [PATCH] [VPlan] Disable partial reductions again with EVL tail folding

VPPartialReductionRecipe doesn't yet support an EVL variant, and we guard
against this by not calling convertToAbstractRecipes when we're tail folding
with EVL.

However, some recent refactoring means we may still detect scaled reductions
in collectScaledReductions and store them in ScaledReductionMap, where
outside of convertToAbstractRecipes we may look them up and e.g. add a scale
factor to an otherwise regular VPReductionPHI.

Fix this by also skipping collectScaledReductions. Fixes #167861.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 .../RISCV/partial-reduce-dot-product.ll       | 125 ++++++++++++++++++
 2 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 83dee09f94b99..54974070d0672 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8382,7 +8382,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // ---------------------------------------------------------------------------
   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                 Builder, BlockMaskCache, LVer);
-  RecipeBuilder.collectScaledReductions(Range);
+  // TODO: Handle partial reductions with EVL tail folding.
+  if (!CM.foldTailWithEVL())
+    RecipeBuilder.collectScaledReductions(Range);
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 61e3a1848ceed..8d3026e63748a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -3,6 +3,7 @@
 ; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ
 ; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V
 ; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ
+; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -S < %s | FileCheck %s --check-prefixes=CHECK,TAILFOLD
 
 ; TODO: Remove -prefer-predicate-over-epilogue=scalar-epilogue when partial reductions with EVL tail folding is supported.
@@ -147,6 +148,37 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ:       for.exit:
 ; FIXED-ZVQDOTQ-NEXT:    ret i32 [[TMP13]]
 ;
+; TAILFOLD-LABEL: define i32 @vqdot(
+; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; TAILFOLD-NEXT:  entry:
+; TAILFOLD-NEXT:    br label [[VECTOR_PH:%.*]]
+; TAILFOLD:       vector.ph:
+; TAILFOLD-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TAILFOLD:       vector.body:
+; TAILFOLD-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TAILFOLD-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP2:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
+; TAILFOLD-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
+; TAILFOLD-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TAILFOLD:       middle.block:
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; TAILFOLD-NEXT:    br label [[FOR_EXIT:%.*]]
+; TAILFOLD:       for.exit:
+; TAILFOLD-NEXT:    ret i32 [[TMP10]]
+;
 entry:
   br label %for.body
 
@@ -309,6 +341,37 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ:       for.exit:
 ; FIXED-ZVQDOTQ-NEXT:    ret i32 [[TMP13]]
 ;
+; TAILFOLD-LABEL: define i32 @vqdotu(
+; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; TAILFOLD-NEXT:  entry:
+; TAILFOLD-NEXT:    br label [[VECTOR_PH:%.*]]
+; TAILFOLD:       vector.ph:
+; TAILFOLD-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TAILFOLD:       vector.body:
+; TAILFOLD-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TAILFOLD-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP2:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
+; TAILFOLD-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
+; TAILFOLD-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; TAILFOLD:       middle.block:
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; TAILFOLD-NEXT:    br label [[FOR_EXIT:%.*]]
+; TAILFOLD:       for.exit:
+; TAILFOLD-NEXT:    ret i32 [[TMP10]]
+;
 entry:
   br label %for.body
 
@@ -471,6 +534,37 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ:       for.exit:
 ; FIXED-ZVQDOTQ-NEXT:    ret i32 [[TMP13]]
 ;
+; TAILFOLD-LABEL: define i32 @vqdotsu(
+; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; TAILFOLD-NEXT:  entry:
+; TAILFOLD-NEXT:    br label [[VECTOR_PH:%.*]]
+; TAILFOLD:       vector.ph:
+; TAILFOLD-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TAILFOLD:       vector.body:
+; TAILFOLD-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TAILFOLD-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP2:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
+; TAILFOLD-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
+; TAILFOLD-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TAILFOLD:       middle.block:
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; TAILFOLD-NEXT:    br label [[FOR_EXIT:%.*]]
+; TAILFOLD:       for.exit:
+; TAILFOLD-NEXT:    ret i32 [[TMP10]]
+;
 entry:
   br label %for.body
 
@@ -632,6 +726,37 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ:       for.exit:
 ; FIXED-ZVQDOTQ-NEXT:    ret i32 [[TMP13]]
 ;
+; TAILFOLD-LABEL: define i32 @vqdotsu2(
+; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; TAILFOLD-NEXT:  entry:
+; TAILFOLD-NEXT:    br label [[VECTOR_PH:%.*]]
+; TAILFOLD:       vector.ph:
+; TAILFOLD-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TAILFOLD:       vector.body:
+; TAILFOLD-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TAILFOLD-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP2:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
+; TAILFOLD-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
+; TAILFOLD-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TAILFOLD:       middle.block:
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; TAILFOLD-NEXT:    br label [[FOR_EXIT:%.*]]
+; TAILFOLD:       for.exit:
+; TAILFOLD-NEXT:    ret i32 [[TMP10]]
+;
 entry:
   br label %for.body