diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7a995f85ead15..0c3289afc7c5a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -27373,6 +27373,11 @@ class HorizontalReduction {
   }
 
 public:
+  /// \returns true if \p I has an opcode that could form an ordered reduction.
+  static bool isSupportedOrderedReductionOp(Instruction *I) {
+    return I->getOpcode() == Instruction::FAdd;
+  }
+
   static RecurKind getRdxKind(Value *V) {
     auto *I = dyn_cast<Instruction>(V);
     if (!I)
@@ -27619,6 +27624,93 @@ class HorizontalReduction {
     return RK != ReductionOrdering::None;
   }
 
+  /// Analyze whether \p Root forms a linearized ordered reduction chain.
+  /// If \p MatchLHS is true, analyzes LHS-associated (left-linearized) chains
+  /// where the chain recurses on the LHS and the RHS at each level is a leaf:
+  ///   ((((v0 op v1) op v2) op v3) op v4)
+  /// If \p MatchLHS is false, analyzes RHS-associated (right-linearized)
+  /// chains where the chain recurses on the RHS and the LHS at each level is
+  /// a leaf:
+  ///   (v0 op (v1 op (v2 op (v3 op v4))))
+  /// Leaf values are stored in ReducedVals.back() in accumulation order
+  /// (innermost pair first, outermost last), e.g. [v0,v1,v2,v3,v4] for
+  /// LHS-associated and [v4,v3,v2,v1,v0] for RHS-associated chains.
+  /// \returns true if the chain is a valid ordered reduction.
+  bool matchOrderedReduction(BoUpSLP &R, Instruction *Root, bool MatchLHS) {
+    ReducedVals.clear();
+    ReducedValsToOps.clear();
+    ReductionOps.clear();
+    RdxKind = getRdxKind(Root);
+    // Currently, only ordered fadd reductions are supported.
+    if (RdxKind != RecurKind::FAdd)
+      return false;
+    if (isVectorizable(RdxKind, Root) != ReductionOrdering::Ordered)
+      return false;
+
+    // Ordered reductions only support simple binary ops, not min/max
+    // select(cmp) patterns or poison-safe bool logic ops.
+    if (isCmpSelMinMax(Root) || isBoolLogicOp(Root))
+      return false;
+
+    Type *Ty = Root->getType();
+    if (!isValidElementType(Ty) || Ty->isPointerTy())
+      return false;
+
+    // This is an ordered (linearized) reduction regardless of whether the
+    // individual operations are associative.
+    RK = ReductionOrdering::Ordered;
+    ReductionRoot = Root;
+    unsigned FirstOpIdx = getFirstOperandIndex(Root);
+    unsigned ChainOpIdx = MatchLHS ? FirstOpIdx : FirstOpIdx + 1;
+    unsigned LeafOpIdx = MatchLHS ? FirstOpIdx + 1 : FirstOpIdx;
+    initReductionOps(Root);
+    ReducedVals.emplace_back();
+    SmallPtrSet<Instruction *, 8> Visited;
+    Instruction *TreeN = Root;
+    unsigned Depth = 0;
+    bool ChainComplete = false;
+    constexpr unsigned MaxReducedVals = 1024;
+    while (TreeN) {
+      if (Depth++ > RecursionMaxDepth)
+        break;
+      if (ReducedVals.back().size() >= MaxReducedVals)
+        break;
+      if (!Visited.insert(TreeN).second)
+        break;
+      if (getRdxKind(TreeN) != RdxKind)
+        break;
+      addReductionOps(TreeN);
+      Value *EdgeVal = getRdxOperand(TreeN, LeafOpIdx);
+      Value *ChainVal = getRdxOperand(TreeN, ChainOpIdx);
+      ReducedValsToOps[EdgeVal].push_back(TreeN);
+      ReducedValsToOps[ChainVal].push_back(TreeN);
+      ReducedVals.back().push_back(EdgeVal);
+      auto *ChainInst = dyn_cast<Instruction>(ChainVal);
+      if (!ChainInst || getRdxKind(ChainInst) != RdxKind ||
+          !hasRequiredNumberOfUses(/*IsCmpSelMinMax=*/false, ChainInst)) {
+        ReducedVals.back().push_back(ChainVal);
+        ChainComplete = true;
+        break;
+      }
+      TreeN = ChainInst;
+    }
+    // Leaves are collected outer-to-inner (top-down). Reverse to get the
+    // accumulation order (innermost first) needed by ordered reduction
+    // intrinsics. This is correct for both LHS- and RHS-associated chains
+    // because fadd is commutative: each step relies only on a+b == b+a,
+    // never on associativity.
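+    // For example, for the RHS-associated chain v0 + (v1 + (v2 + v3)) the
+    // walk above collects [v0,v1,v2,v3]; after the reverse, the ordered
+    // reduction computes ((v3 + v2) + v1) + v0, which equals the original
+    // value step for step by commuting each individual fadd.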
+    std::reverse(ReducedVals.back().begin(), ReducedVals.back().end());
+    if (!ChainComplete || ReducedVals.back().size() < ReductionLimit) {
+      for (ReductionOpsType &RdxOps : ReductionOps)
+        for (Value *RdxOp : RdxOps)
+          R.analyzedReductionRoot(cast<Instruction>(RdxOp));
+      ReducedVals.pop_back();
+      ReducedValsToOps.clear();
+      ReductionOps.clear();
+      return false;
+    }
+    return true;
+  }
+
   /// Try to find a reduction tree.
   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                  ScalarEvolution &SE, DominatorTree &DT,
@@ -28635,6 +28727,239 @@ class HorizontalReduction {
     return VectorizedTree;
   }
 
+  /// Attempt to vectorize an ordered (linearized) reduction chain.
+  /// Reduced values from matchOrderedReduction() are in accumulation order.
+  /// Vectorized subsets are immediately reduced via ordered reduction
+  /// intrinsics; non-vectorized values are folded linearly.
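+  /// For example (illustrative sketch): with a profitable 4-wide window over
+  /// all leaves of ((v0 + v1) + v2) + v3, the whole chain collapses into one
+  /// sequential @llvm.vector.reduce.fadd call over <v0,v1,v2,v3>; leaves
+  /// outside a narrower window are instead folded with scalar fadds before
+  /// (leading) or after (trailing) the intrinsic call.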
+  Value *tryToReduceOrdered(BoUpSLP &V, const DataLayout &DL,
+                            TargetTransformInfo *TTI,
+                            const TargetLibraryInfo &TLI, AssumptionCache *AC,
+                            DominatorTree &DT) {
+    constexpr unsigned RegMaxNumber = 4;
+    constexpr unsigned RedValsMaxNumber = 128;
+
+    assert(RK == ReductionOrdering::Ordered && "Expected ordered reduction");
+    assert(ReducedVals.size() == 1 &&
+           "Expected single group from matchOrderedReduction");
+
+    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
+                                    TargetFolder(DL));
+    Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+    Builder.SetInsertPoint(RdxRootInst);
+
+    SmallVector<Value *> Candidates(ReducedVals.back());
+
+    // Intersect the fast-math flags from all reduction operations.
+    FastMathFlags RdxFMF;
+    RdxFMF.set();
+    for (Value *RdxVal : Candidates)
+      for (Instruction *Op : ReducedValsToOps.at(RdxVal))
+        if (auto *FPMO = dyn_cast<FPMathOperator>(Op))
+          RdxFMF &= FPMO->getFastMathFlags();
+
+    unsigned MaxVecRegSize = V.getMaxVecRegSize();
+    unsigned EltSize = V.getVectorElementSize(Candidates[0]);
+    const unsigned MaxElts =
+        std::clamp(llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
+                   RegMaxNumber * RedValsMaxNumber);
+
+    unsigned ReduxWidth = 0;
+    auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
+      Type *ScalarTy = Candidates.front()->getType();
+      ReduxWidth =
+          getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
+      VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
+      unsigned NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy);
+      unsigned NumRegs =
+          TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
+      while (NumParts > NumRegs) {
+        assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
+        ReduxWidth = bit_floor(ReduxWidth - 1);
+        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
+        NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy);
+        NumRegs =
+            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
+      }
+      if (NumParts > NumRegs / 2)
+        ReduxWidth = bit_floor(ReduxWidth);
+      return ReduxWidth;
+    };
+    auto ShrinkReduxWidth = [&]() {
+      --ReduxWidth;
+      if (ReduxWidth > 1)
+        ReduxWidth = GetVectorFactor(ReduxWidth);
+    };
+
+    // Try to build, cost-check, and vectorize a window [Start, Start+Width).
+    unsigned SuccessStart = 0, SuccessWidth = 0;
+    Value *SuccessRoot = nullptr;
+    SmallMapVector<Value *, unsigned, 16> EmptySameValuesCounter;
+    auto TryWindow = [&](unsigned Start, unsigned Width) -> bool {
+      ArrayRef<Value *> VL = ArrayRef(Candidates).slice(Start, Width);
+      if (V.areAnalyzedReductionVals(VL))
+        return false;
+      if (any_of(VL, [&V](Value *RedVal) {
+            auto *RedValI = dyn_cast<Instruction>(RedVal);
+            return RedValI && V.isDeleted(RedValI);
+          }))
+        return false;
+
+      SmallDenseSet<Value *, 4> IgnoreList;
+      for (Value *RdxVal : VL)
+        for (Instruction *Op : ReducedValsToOps.at(RdxVal))
+          IgnoreList.insert(Op);
+
+      V.buildTree(VL, IgnoreList);
+      if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/false)) {
+        V.analyzedReductionVals(VL);
+        return false;
+      }
+      V.reorderTopToBottom();
+      V.reorderBottomToTop();
+
+      BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues;
+      LocalExternallyUsedValues.insert(ReductionRoot);
+      for (unsigned Cnt : seq<unsigned>(Candidates.size())) {
+        if (Cnt >= Start && Cnt < Start + Width)
+          continue;
+        if (isa<Instruction>(Candidates[Cnt]))
+          LocalExternallyUsedValues.insert(Candidates[Cnt]);
+      }
+
+      V.transformNodes();
+      V.computeMinimumValueSizes();
+      InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL);
+      V.buildExternalUses(LocalExternallyUsedValues);
+
+      InstructionCost ReductionCost =
+          getReductionCost(TTI, VL, EmptySameValuesCounter,
+                           /*IsCmpSelMinMax=*/false, RdxFMF, V, DT, DL, TLI);
+      InstructionCost Cost =
+          V.getTreeCost(TreeCost, VL, ReductionCost, RdxRootInst);
+      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
+                        << " for ordered reduction\n");
+      if (Cost > -SLPCostThreshold ||
+          (Cost == -SLPCostThreshold && V.getTreeSize() > 1)) {
+        if (Cost.isValid())
+          V.getORE()->emit([&]() {
+            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
+                                            ReducedValsToOps.at(VL[0]).front())
+                   << "Vectorizing ordered reduction is possible "
+                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
+                   << " and threshold "
+                   << ore::NV("Threshold", -SLPCostThreshold);
+          });
+        V.analyzedReductionVals(VL);
+        return false;
+      }
+
+      LLVM_DEBUG(dbgs() << "SLP: Vectorizing ordered reduction at cost:"
+                        << Cost << ". (HorRdx)\n");
+      V.getORE()->emit([&]() {
+        return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
+                                  ReducedValsToOps.at(VL[0]).front())
+               << "Vectorized ordered reduction with cost "
+               << ore::NV("Cost", Cost) << " and with tree size "
+               << ore::NV("TreeSize", V.getTreeSize());
+      });
+
+      Builder.setFastMathFlags(RdxFMF);
+      SuccessRoot = V.vectorizeTree(LocalExternallyUsedValues, RdxRootInst);
+      assert(SuccessRoot && "Expected vectorized tree");
+      SuccessStart = Start;
+      SuccessWidth = Width;
+      return true;
+    };
+
+    // Phase 1: front-anchored — try [0, W), shrinking W.
+    // Phase 2: back-anchored — try [N-W, N), shrinking W.
+    // We try one side fully before the other to limit compile time.
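+    // For example (illustrative numbers): with N = 9 leaves, a reduction
+    // limit of 4, and GetVectorFactor settling on widths 8 and then 4,
+    // phase 1 probes [0,8) and [0,4); if both fail, phase 2 probes [1,9)
+    // and [5,9).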
+    unsigned N = Candidates.size();
+    for (bool FromFront : {true, false}) {
+      ReduxWidth = N;
+      if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
+        ReduxWidth = GetVectorFactor(ReduxWidth);
+      ReduxWidth = std::min(ReduxWidth, MaxElts);
+
+      while (ReduxWidth >= ReductionLimit) {
+        unsigned Start = FromFront ? 0 : N - ReduxWidth;
+        if (!FromFront && Start == 0) {
+          // The full-width window was already probed in the first phase.
+          ShrinkReduxWidth();
+          continue;
+        }
+        if (TryWindow(Start, ReduxWidth))
+          break;
+        ShrinkReduxWidth();
+      }
+      if (SuccessRoot)
+        break;
+    }
+
+    if (!SuccessRoot) {
+      for (ReductionOpsType &RdxOps : ReductionOps)
+        for (Value *RdxOp : RdxOps)
+          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
+      return nullptr;
+    }
+
+    // Fold leading scalars [0, SuccessStart) into an accumulator.
+    Type *DestTy = ReductionRoot->getType();
+    Value *VectorizedTree = nullptr;
+    for (Value *RdxVal : ArrayRef(Candidates).take_front(SuccessStart)) {
+      Builder.SetCurrentDebugLocation(
+          ReducedValsToOps.at(RdxVal).front()->getDebugLoc());
+      if (!VectorizedTree)
+        VectorizedTree = RdxVal;
+      else
+        VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, RdxVal,
+                                  "op.rdx", ReductionOps);
+    }
+
+    // Emit the ordered reduction for the vectorized window.
+    Builder.SetCurrentDebugLocation(
+        cast<Instruction>(ReductionRoot)->getDebugLoc());
+    if (VectorizedTree)
+      VectorizedTree =
+          emitReduction(SuccessRoot, Builder, TTI, DestTy, VectorizedTree);
+    else
+      VectorizedTree = emitReduction(SuccessRoot, Builder, TTI, DestTy);
+
+    // Fold trailing scalars [SuccessStart+SuccessWidth, N).
+    for (Value *RdxVal :
+         ArrayRef(Candidates).drop_front(SuccessStart + SuccessWidth)) {
+      Builder.SetCurrentDebugLocation(
+          ReducedValsToOps.at(RdxVal).front()->getDebugLoc());
+      VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, RdxVal,
+                                "op.rdx", ReductionOps);
+    }
+
+    ReductionRoot->replaceAllUsesWith(VectorizedTree);
+
+#ifndef NDEBUG
+    SmallPtrSet<Value *, 4> IgnoreSet;
+    for (ArrayRef<Value *> RdxOps : ReductionOps)
+      IgnoreSet.insert_range(RdxOps);
+#endif
+    for (ArrayRef<Value *> RdxOps : ReductionOps) {
+      for (Value *Ignore : RdxOps) {
+        if (!Ignore)
+          continue;
+#ifndef NDEBUG
+        for (auto *U : Ignore->users()) {
+          assert(IgnoreSet.count(U) &&
+                 "All users must be in the reduction ops list.");
+        }
+#endif
+        if (!Ignore->use_empty()) {
+          Value *P = PoisonValue::get(Ignore->getType());
+          Ignore->replaceAllUsesWith(P);
+        }
+      }
+      V.removeInstructionsAndOperands(RdxOps, {});
+    }
+    return VectorizedTree;
+  }
+
 private:
   /// Creates the reduction from the given \p Vec vector value with the given
   /// scale \p Scale and signedness \p IsSigned.
@@ -29074,14 +29399,17 @@
   }
 
   /// Emit a horizontal reduction of the vectorized value.
+  /// If \p Start is non-null, emit an ordered reduction intrinsic that
+  /// sequentially accumulates into \p Start (only valid for FAdd/FMulAdd).
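+  /// For example (illustrative value names), for a <4 x float> input this
+  /// emits
+  ///   %res = call float @llvm.vector.reduce.fadd.v4f32(float %start,
+  ///                                                    <4 x float> %vec)
+  /// which, without the reassoc flag, folds the lanes into %start strictly
+  /// in lane order.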
   Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
-                       const TargetTransformInfo *TTI, Type *DestTy) {
+                       const TargetTransformInfo *TTI, Type *DestTy,
+                       Value *Start = nullptr) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(RdxKind != RecurKind::FMulAdd &&
            "A call to the llvm.fmuladd intrinsic is not handled yet");
 
     auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
-    if (FTy->getScalarType() == Builder.getInt1Ty() &&
+    if (!Start && FTy->getScalarType() == Builder.getInt1Ty() &&
         RdxKind == RecurKind::Add &&
         DestTy->getScalarType() != FTy->getScalarType()) {
       // Convert vector_reduce_add(ZExt(<n x i1>)) to
@@ -29092,6 +29420,8 @@
       return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
     }
     ++NumVectorInstructions;
+    if (Start)
+      return createOrderedReduction(Builder, RdxKind, VectorizedValue, Start);
     return createSimpleReduction(Builder, VectorizedValue, RdxKind);
   }
 
@@ -29513,9 +29843,26 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
-    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DT, *DL, *TTI, *TLI))
-      return nullptr;
-    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
+    Value *Res = nullptr;
+    if (HorRdx.matchAssociativeReduction(R, Inst, *SE, *DT, *DL, *TTI, *TLI))
+      if (Value *Red = HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT)) {
+        if (Red != Inst)
+          return Red;
+        Res = Red;
+      }
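+    // If the associative matcher did not fully rewrite the root, fall back
+    // to the ordered matchers: first assume a left-linearized chain, then a
+    // right-linearized one (see matchOrderedReduction).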
+    if (HorizontalReduction::isSupportedOrderedReductionOp(Inst)) {
+      if (HorRdx.matchOrderedReduction(R, Inst, /*MatchLHS=*/true))
+        if (Value *Red =
+                HorRdx.tryToReduceOrdered(R, *DL, TTI, *TLI, AC, *DT)) {
+          if (Red != Inst)
+            return Red;
+          Res = Red;
+        }
+      if (HorRdx.matchOrderedReduction(R, Inst, /*MatchLHS=*/false))
+        if (Value *Red = HorRdx.tryToReduceOrdered(R, *DL, TTI, *TLI, AC, *DT))
+          return Red;
+    }
+    return Res;
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 8f80b3ed187cb..9a501d2e7e3fa 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -168,13 +168,7 @@ define float @foo3(ptr nocapture readonly %A) #0 {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP16]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x float> [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x float> [[TMP17]], i32 1
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[TMP28]], [[TMP32]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP17]], i32 2
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP20]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[TMP17]], i32 3
-; CHECK-NEXT:    [[ADD33:%.*]] = fadd float [[ADD31]], [[TMP30]]
+; CHECK-NEXT:    [[ADD33:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP17]])
 ; CHECK-NEXT:    [[ADD32:%.*]] = fadd float [[ADD33]], [[ADD6]]
 ; CHECK-NEXT:    ret float [[ADD32]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
index 6ac77bd77fa02..6cc679f9dbc83 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
@@ -7,15 +7,14 @@ define double @test01() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr null, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <2 x ptr> zeroinitializer, <2 x i32> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 [[TMP2]], <2 x i1> splat (i1 true), <2 x double> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd double [[TMP5]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd double [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd double 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd double [[TMP5]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd double [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    ret double [[TMP11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> , <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP9]])
+; CHECK-NEXT:    ret double [[TMP10]]
 ;
   %1 = load i32, ptr null, align 8
   %2 = load i32, ptr getelementptr inbounds (i32, ptr null, i32 1), align 4