From 1dc7bbcbbcd0c8c292a65e8cbb538e9c90116efb Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Mon, 28 Apr 2025 22:31:50 -0700 Subject: [PATCH 1/4] [LV] Pre-commit test case for condition VPBB. (NFC) --- .../RISCV/vplan-conditional-basic-block.ll | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll new file mode 100644 index 0000000000000..d43c598db5476 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s + +define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]] +; CHECK-NEXT: [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64 +; CHECK-NEXT: [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]] +; CHECK-NEXT: [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64 +; CHECK-NEXT: [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]] +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[SHL6]] +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], 
[[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4 +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]] +; CHECK-NEXT: [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]] +; CHECK-NEXT: br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]] +; CHECK: [[IF_THEN9]]: +; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]] +; CHECK-NEXT: store i64 [[XOR]], ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: br label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp1 = icmp sgt i32 %reg.4.val, 0 + br i1 %cmp1, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %sh_prom = zext nneg i32 %control1 to i64 + %shl = shl nuw i64 1, %sh_prom + %sh_prom5 = zext nneg i32 %control2 to i64 + %shl6 = shl nuw i64 1, %sh_prom5 + %sh_prom10 = zext nneg i32 %target to i64 + %shl11 = shl nuw nsw i64 1, %sh_prom10 + %wide.trip.count = zext nneg i32 %reg.4.val to i64 + %0 = freeze i64 %shl6 + %1 = or i64 %shl, %0 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv + %2 = load i64, ptr %arrayidx, align 8 + %3 = and i64 %2, %1 + %or.cond.not = icmp eq i64 %3, %1 + br i1 %or.cond.not, label %if.then9, label %for.inc + +if.then9: + %xor = xor i64 %2, %shl11 + store i64 %xor, ptr %arrayidx, align 8 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

From 99225c3a03e841f2b1cc7b730ec7a64f06fb26af Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Sun, 11 May 2025 20:02:56 -0700
Subject: [PATCH 2/4] [TTI] Add `preferControlFlow` for loop vectorizer. (NFC)

This patch adds a new TTI hook to let LV know which form is better. The
default value of preferControlFlow() is false to match the current TTI
implementation. If preferControlFlow() returns true, LV will try to generate
conditional VPBBs if possible.

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h     | 6 ++++++
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 ++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h             | 2 ++
 llvm/lib/Analysis/TargetTransformInfo.cpp            | 4 ++++
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h     | 2 ++
 5 files changed, 16 insertions(+)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5d3b233ed6b6a..e63889c9fd2a1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1852,6 +1852,12 @@ class TargetTransformInfo {
   /// maximum register pressure exceeds getNumberOfRegisters.
   LLVM_ABI bool shouldConsiderVectorizationRegPressure() const;
 
+  /// Return true if the loop vectorizer can generate control flow (conditional
+  /// blocks) inside the vector region. Otherwise, the loop vectorizer will
+  /// generate a single block for the vector region and handle control flow via
+  /// a mask.
+  LLVM_ABI bool preferControlFlow() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4cd607c0d0c8d..5dd418294dad0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1099,6 +1099,8 @@ class TargetTransformInfoImplBase {
   virtual bool shouldConsiderVectorizationRegPressure() const { return false; }
 
+  virtual bool preferControlFlow() const { return false; }
+
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
     return true;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 42ddb32d24093..f776dc64b89e7 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -793,6 +793,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::preferPredicateOverEpilogue(TFI);
   }
 
+  bool preferControlFlow() const override { return BaseT::preferControlFlow(); }
+
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
     return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bf62623099a97..8590f667d7e89 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -372,6 +372,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
   return TTIImpl->preferPredicateOverEpilogue(TFI);
 }
 
+bool TargetTransformInfo::preferControlFlow() const {
+  return TTIImpl->preferControlFlow();
+}
+
 TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
     bool IVUpdateMayOverflow) const {
   return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6886e8964e29e..e4db87065bbd3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -143,6 +143,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
 
   bool shouldConsiderVectorizationRegPressure() const override { return true; }
 
+  bool preferControlFlow() const override { return false; }
+
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,

From db1879d4af2b7a68dc63b6a66619b7afc4849122 Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Sun, 4 May 2025 16:01:34 -0700
Subject: [PATCH 3/4] [LV] Introduce conditional vector basic block.

This patch adds a transformation that converts flattened control flow into
conditional vector basic blocks. This transformation helps the program skip
masked operations when no lane of the mask is active.

First, the transformation collects all masked stores and their operands
bottom-up and puts these masked operations into a new vector basic block.
Second, it splits the original vector loop, inserts the new basic block
between the split blocks, and updates the conditional branches in the
original blocks.

E.g.

Before:
{
vector.loop:
  ...
  BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}

After:
{
vector.loop:
  ...
  %any.active.mask = any-of(%mask)
  BranchOnCond %any.active.mask
Successors vector.if.bb, vector.loop.split

vector.if.bb:
  ... (Masked operations)
Successors vector.loop.split

vector.loop.split:
  ...
  BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   6 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 157 ++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |  23 +++
 .../RISCV/vplan-conditional-basic-block.ll    |  17 +-
 4 files changed, 198 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index febdc54e666a9..fe0350f995219 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -345,6 +345,10 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));
 
+static cl::opt<bool> PreferControlFlow(
+    "prefer-control-flow", cl::init(false), cl::Hidden,
+    cl::desc("Generate control flow inside the vector region."));
+
 cl::opt<bool> llvm::EnableVPlanNativePath(
     "enable-vplan-native-path", cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -8198,6 +8202,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     if (CM.foldTailWithEVL())
       VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
                                *Plan, CM.getMaxSafeElements());
+    if (PreferControlFlow || TTI.preferControlFlow())
+      VPlanTransforms::optimizeConditionalVPBB(*Plan);
     assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
     VPlans.push_back(std::move(Plan));
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fa1fdaf7b5ce0..8f9d7e73f0480 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4486,3 +4486,160 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
     }
   }
 }
+
+void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
+  VPDominatorTree VPDT(Plan);
+
+  VPValue *HeaderMask = findHeaderMask(Plan);
+
+  // Get the mask from the store recipes.
+  auto GetMask = [&HeaderMask](VPRecipeBase &R) -> VPValue * {
+    using namespace llvm::VPlanPatternMatch;
+    if (isa<VPWidenStoreRecipe>(R)) {
+      VPValue *OrigMask = cast<VPWidenStoreRecipe>(R).getMask();
+      if (!OrigMask || OrigMask == HeaderMask ||
+          match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
+                              m_VPValue(), m_VPValue())))
+        return nullptr;
+
+      return OrigMask;
+    }
+    return nullptr;
+  };
+
+  // First, collect all masked stores.
+  SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (VPValue *Mask = GetMask(R))
+        MaskedStores.emplace_back(&R, Mask);
+    }
+  }
+
+  if (MaskedStores.empty())
+    return;
+
+  DenseSet<VPRecipeBase *> Candidates;
+  auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+    for (VPValue *Op : R->operands())
+      if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+        Candidates.insert(OpR);
+  };
+
+  SmallVector<SetVector<VPRecipeBase *>> Tries;
+  while (!MaskedStores.empty()) {
+    auto [SR, M] = MaskedStores.pop_back_val();
+    Candidates.clear();
+    AddOperandsToCandidates(SR);
+
+    SetVector<VPRecipeBase *> CurrentTree;
+    CurrentTree.insert(SR);
+
+    VPBasicBlock *MaskBlock =
+        M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+
+    // Don't move recipes before the mask and PHI recipes.
+    auto End = MaskBlock == SR->getParent()
+                   ? M->getDefiningRecipe()->getReverseIterator()
+                   : SR->getParent()->getFirstNonPhi()->getReverseIterator();
+    // Also don't move the recipes through any recipe that may have side
+    // effects or write to memory.
+    for (auto It = std::next(SR->getReverseIterator()); It != End; ++It) {
+      if (It->mayHaveSideEffects() || It->mayWriteToMemory()) {
+        End = It;
+        break;
+      }
+    }
+
+    // Greedily add all recipes that are used to compute the stored value to
+    // the tree. All users of the added recipe must dominate the store recipe.
+    for (VPRecipeBase &R : make_range(SR->getReverseIterator(), End)) {
+      // Recipe is not part of the tree.
+      if (!Candidates.contains(&R))
+        continue;
+
+      if (any_of(R.definedValues(), [&SR = SR, &VPDT](VPValue *Def) {
+            for (VPUser *U : Def->users()) {
+              if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+                if (UR == SR || VPDT.properlyDominates(UR, SR))
+                  continue;
+              }
+              return true;
+            }
+            return false;
+          }))
+        continue;
+
+      CurrentTree.insert(&R);
+      AddOperandsToCandidates(&R);
+    }
+
+    // The previous traversal could have added recipes that are used by
+    // non-added recipes, which need to be removed from the list.
+    SmallDenseSet<VPRecipeBase *> ToRemove;
+    bool Changed;
+    do {
+      Changed = false;
+      for (VPRecipeBase *R : CurrentTree) {
+        if (ToRemove.contains(R))
+          continue;
+        if (any_of(R->definedValues(), [&](VPValue *Def) {
+              for (VPUser *U : Def->users()) {
+                if (auto *UR = dyn_cast<VPRecipeBase>(U))
+                  if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+                    return true;
+              }
+              return false;
+            })) {
+          Changed = true;
+          ToRemove.insert(R);
+        }
+      }
+    } while (Changed);
+
+    for (VPRecipeBase *R : ToRemove)
+      CurrentTree.remove(R);
+
+    if (CurrentTree.size() > 1)
+      Tries.push_back(CurrentTree);
+  }
+
+  for (const auto &List : Tries) {
+    VPRecipeBase *SR = List.front();
+    VPValue *M = cast<VPWidenStoreRecipe>(SR)->getMask();
+    assert(M && "Mask VPValue must exist at this point");
+    auto Recipes = reverse(List.getArrayRef());
+
+    // Split the current basic block at the store recipe point so that
+    // a predicated block can be added in between.
+    VPBasicBlock *ParentBB = SR->getParent();
+    VPBasicBlock *ContBB = ParentBB->splitAt(SR->getIterator());
+
+    // Create the VPBB and insert it between ParentBB and ContBB.
+    VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+    VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+    if (ContBB->getNumSuccessors() == 0)
+      ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+    // Move the recipes into the conditional block.
+    for (VPRecipeBase *R : Recipes)
+      R->moveBefore(*IfBB, IfBB->end());
+
+    // Add the condition and branch in the parent block.
+    auto *ActiveLane =
+        new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
+
+    auto *BranchOnCond =
+        new VPInstruction(VPInstruction::BranchOnCond, ActiveLane);
+    ParentBB->appendRecipe(ActiveLane);
+    ParentBB->appendRecipe(BranchOnCond);
+
+    // Set proper predecessors and successors for the conditional block.
+    ParentBB->clearSuccessors();
+    ParentBB->setSuccessors({IfBB, ContBB});
+    ContBB->clearPredecessors();
+    ContBB->setPredecessors({ParentBB, IfBB});
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b620e13..56a75c1ddfc3d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -377,6 +377,29 @@ struct VPlanTransforms {
   /// users in the original exit block using the VPIRInstruction wrapping to the
   /// LCSSA phi.
   static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+  /// Try to convert flattened control flow into a conditional vector basic
+  /// block. If there are no active bits in the mask, it will skip all masked
+  /// operations.
+  /// This transformation will collect all masked operations bottom-up from
+  /// the masked stores and put them in a new vector basic block. The original
+  /// vector.loop will be split and the newly created basic block will be
+  /// inserted in between.
+  ///
+  ///
+  ///       [ ] <-- vector.loop
+  ///      ^ |        %any.active.mask = any-of(%Mask)
+  ///     /  |        BranchOnCond %any.active.mask
+  ///    /   |\
+  ///   | (T)| \ (F)
+  ///   |    |  v
+  ///   |    | [ ] <-- vector.if.bb (masked operations)
+  ///   |    |  |
+  ///   |    |  v
+  ///   |    +-->[ ] <-- vector.loop.split
+  ///   |         |
+  ///   +---------+ v
+  ///              [ ] <-- middle.block
+  static void optimizeConditionalVPBB(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
index d43c598db5476..99f03723c567b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-control-flow %s | FileCheck %s
 
 define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
 ; CHECK-LABEL: define void @test(
@@ -28,20 +28,27 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_SPLIT:.*]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[VECTOR_IF_BB:.*]], label %[[VECTOR_BODY_SPLIT]]
+; CHECK: [[VECTOR_IF_BB]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
 ; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
-; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP4]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    br label %[[VECTOR_BODY_SPLIT]]
+; CHECK: [[VECTOR_BODY_SPLIT]]:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

From b945b7e4760ceee87c2784d7d4185ca666845a71 Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Sun, 19 Oct 2025 18:36:20 -0700
Subject: [PATCH 4/4] Fixup, add the cost of the any-of to the legacy model.

The `any-of (%mask)` is generated when building the conditional VPBB in the
VPlan transforms. The cost of the any-of should also be accounted for by the
legacy model to prevent a mismatch between the legacy and VPlan-based cost
models.

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fe0350f995219..8c620bf14ae24 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4206,6 +4206,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       case VPInstruction::ExplicitVectorLength:
         C += VPI->cost(VF, CostCtx);
         break;
+      case VPInstruction::AnyOf:
+        if (!VPI->getUnderlyingValue())
+          C += VPI->cost(VF, CostCtx);
+        break;
       default:
         break;
       }