diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4df7aaca736235..beb54dc7822caa 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -895,7 +895,10 @@ class BoUpSLP {
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
   void buildTree(ArrayRef<Value *> Roots,
-                 ArrayRef<Value *> UserIgnoreLst = None);
+                 const SmallDenseSet<Value *> &UserIgnoreLst);
+
+  /// Construct a vectorizable tree that starts at \p Roots.
+  void buildTree(ArrayRef<Value *> Roots);
 
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
@@ -916,6 +919,7 @@ class BoUpSLP {
     }
     MinBWs.clear();
     InstrElementSize.clear();
+    UserIgnoreList = nullptr;
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -2073,8 +2077,8 @@ class BoUpSLP {
     AnalyzedReductionVals.clear();
   }
   /// Checks if the given value is gathered in one of the nodes.
-  bool isGathered(Value *V) const {
-    return MustGather.contains(V);
+  bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
+    return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
   }
 
   ~BoUpSLP();
@@ -3192,7 +3196,7 @@ class BoUpSLP {
   void scheduleBlock(BlockScheduling *BS);
 
   /// List of users to ignore during scheduling and that don't need extracting.
-  SmallPtrSet<Value *, 4> UserIgnoreList;
+  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
 
   /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
   /// sorted SmallVectors of unsigned.
@@ -4119,7 +4123,7 @@ void BoUpSLP::buildExternalUses(
         }
 
         // Ignore users in the user ignore list.
-        if (UserIgnoreList.contains(UserInst))
+        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;
 
         LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
@@ -4276,10 +4280,16 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
 }
 
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
-                        ArrayRef<Value *> UserIgnoreLst) {
+                        const SmallDenseSet<Value *> &UserIgnoreLst) {
+  deleteTree();
+  UserIgnoreList = &UserIgnoreLst;
+  if (!allSameType(Roots))
+    return;
+  buildTree_rec(Roots, 0, EdgeInfo());
+}
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
   deleteTree();
-  UserIgnoreList.clear();
-  UserIgnoreList.insert(UserIgnoreLst.begin(), UserIgnoreLst.end());
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
@@ -4595,12 +4605,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // the same block.
 
   // Don't vectorize ephemeral values.
-  for (Value *V : VL) {
-    if (EphValues.count(V)) {
-      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
-                        << ") is ephemeral.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
-      return;
+  if (!EphValues.empty()) {
+    for (Value *V : VL) {
+      if (EphValues.count(V)) {
+        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+                          << ") is ephemeral.\n");
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+        return;
+      }
     }
   }
 
@@ -4638,13 +4650,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
-  for (Value *V : VL) {
-    if (UserIgnoreList.contains(V)) {
-      LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      if (TryToFindDuplicates(S))
-        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
-      return;
+  if (UserIgnoreList && !UserIgnoreList->empty()) {
+    for (Value *V : VL) {
+      if (UserIgnoreList && UserIgnoreList->contains(V)) {
+        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+        if (TryToFindDuplicates(S))
+          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                       ReuseShuffleIndicies);
+        return;
+      }
     }
   }
 
@@ -8570,7 +8584,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
             LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
 
             // It is legal to delete users in the ignorelist.
-            assert((getTreeEntry(U) || UserIgnoreList.contains(U) ||
+            assert((getTreeEntry(U) ||
+                    (UserIgnoreList && UserIgnoreList->contains(U)) ||
                     (isa_and_nonnull<Instruction>(U) &&
                      isDeleted(cast<Instruction>(U)))) &&
                    "Deleting out-of-tree value");
@@ -10689,6 +10704,8 @@ class HorizontalReduction {
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
     constexpr int ReductionLimit = 4;
+    constexpr unsigned RegMaxNumber = 4;
+    constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
     // to a nearby power-of-2. We can safely generate oversized
     // vectors and rely on the backend to split them to legal sizes.
@@ -10726,12 +10743,12 @@ class HorizontalReduction {
     // The reduction root is used as the insertion point for new instructions,
     // so set it as externally used to prevent it from being deleted.
     ExternallyUsedValues[ReductionRoot];
-    SmallVector<Value *> IgnoreList;
+    SmallDenseSet<Value *> IgnoreList;
     for (ReductionOpsType &RdxOps : ReductionOps)
       for (Value *RdxOp : RdxOps) {
         if (!RdxOp)
          continue;
-        IgnoreList.push_back(RdxOp);
+        IgnoreList.insert(RdxOp);
      }
 
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
@@ -10793,7 +10810,12 @@ class HorizontalReduction {
       if (NumReducedVals < ReductionLimit)
         continue;
 
-      unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+      unsigned MaxVecRegSize = V.getMaxVecRegSize();
+      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
+      unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
+
+      unsigned ReduxWidth = std::min<unsigned>(
+          PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
       unsigned Start = 0;
       unsigned Pos = Start;
       // Restarts vectorization attempt with lower vector factor.
@@ -10803,10 +10825,7 @@ class HorizontalReduction {
           &CheckForReusedReductionOpsLocal, &PrevReduxWidth, &V,
           &IgnoreList](bool IgnoreVL = false) {
-        bool IsAnyRedOpGathered =
-            !IgnoreVL && any_of(IgnoreList, [&V](Value *RedOp) {
-              return V.isGathered(RedOp);
-            });
+        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
         if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
           // Check if any of the reduction ops are gathered. If so, worth
           // trying again with less number of reduction ops.
@@ -10871,13 +10890,37 @@ class HorizontalReduction {
             LocalExternallyUsedValues[TrackedVals[V]];
           });
         }
-        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
-          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
+        // Number of uses of the candidates in the vector of values.
+        SmallDenseMap<Value *, unsigned> NumUses;
+        for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
+          Value *V = Candidates[Cnt];
+          if (NumUses.count(V) > 0)
+            continue;
+          NumUses[V] = std::count(VL.begin(), VL.end(), V);
+        }
+        for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
+          Value *V = Candidates[Cnt];
+          if (NumUses.count(V) > 0)
+            continue;
+          NumUses[V] = std::count(VL.begin(), VL.end(), V);
+        }
+        // Gather externally used values.
+        SmallPtrSet<Value *, 4> Visited;
+        for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
+          Value *V = Candidates[Cnt];
+          if (!Visited.insert(V).second)
+            continue;
+          unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];
+          if (NumOps != ReducedValsToOps.find(V)->second.size())
+            LocalExternallyUsedValues[V];
+        }
+        for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
+          Value *V = Candidates[Cnt];
+          if (!Visited.insert(V).second)
             continue;
-          unsigned NumOps = VectorizedVals.lookup(Candidates[Cnt]) +
-                            std::count(VL.begin(), VL.end(), Candidates[Cnt]);
-          if (NumOps != ReducedValsToOps.find(Candidates[Cnt])->second.size())
-            LocalExternallyUsedValues[Candidates[Cnt]];
+          unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];
+          if (NumOps != ReducedValsToOps.find(V)->second.size())
+            LocalExternallyUsedValues[V];
         }
         V.buildExternalUses(LocalExternallyUsedValues);
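
Note on the last hunk (illustration only, not part of the patch): instead of re-running std::count over VL for every candidate inside the externally-used loop, the new code walks only the candidates outside the reduced window [Pos, Pos + ReduxWidth), caches each value's use count in a map, and uses a visited set so repeated candidates are examined once. The sketch below mirrors that caching pattern with plain STL containers; the names (collectExternallyUsed, TotalOps) are stand-ins, values are strings instead of llvm::Value pointers, and the VectorizedVals bookkeeping from the real code is omitted.

// Sketch only: caches per-value use counts for candidates outside the
// vectorized window instead of recomputing std::count for each one.
// All names are hypothetical stand-ins for the SLP reduction state.
#include <algorithm>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using Value = std::string; // stand-in for llvm::Value *

// Returns candidates outside [Pos, Pos + ReduxWidth) whose uses are not fully
// covered by the current attempt and therefore must stay live as scalars.
std::vector<Value>
collectExternallyUsed(const std::vector<Value> &Candidates,
                      const std::vector<Value> &VL, unsigned Pos,
                      unsigned ReduxWidth,
                      const std::unordered_map<Value, unsigned> &TotalOps) {
  // First pass: count how often each out-of-window candidate occurs in VL,
  // doing the linear std::count walk at most once per distinct value.
  std::unordered_map<Value, unsigned> NumUses;
  auto CountUses = [&](unsigned Begin, unsigned End) {
    for (unsigned Cnt = Begin; Cnt < End; ++Cnt) {
      const Value &V = Candidates[Cnt];
      if (NumUses.count(V) > 0)
        continue;
      NumUses[V] = std::count(VL.begin(), VL.end(), V);
    }
  };
  CountUses(0, Pos);
  CountUses(Pos + ReduxWidth, Candidates.size());

  // Second pass: report each distinct out-of-window value whose counted uses
  // do not account for all of its reduction operations.
  std::vector<Value> ExternallyUsed;
  std::unordered_set<Value> Visited;
  auto Gather = [&](unsigned Begin, unsigned End) {
    for (unsigned Cnt = Begin; Cnt < End; ++Cnt) {
      const Value &V = Candidates[Cnt];
      if (!Visited.insert(V).second)
        continue;
      if (NumUses[V] != TotalOps.at(V))
        ExternallyUsed.push_back(V);
    }
  };
  Gather(0, Pos);
  Gather(Pos + ReduxWidth, Candidates.size());
  return ExternallyUsed;
}

int main() {
  // "a".."d" form the reduced window; "e" and a duplicate "a" sit outside it.
  std::vector<Value> Candidates = {"a", "b", "c", "d", "e", "a"};
  std::vector<Value> VL = {"a", "b", "c", "d"};
  std::unordered_map<Value, unsigned> TotalOps = {
      {"a", 1}, {"b", 1}, {"c", 1}, {"d", 1}, {"e", 1}};
  for (const Value &V : collectExternallyUsed(Candidates, VL, /*Pos=*/0,
                                              /*ReduxWidth=*/4, TotalOps))
    std::printf("externally used: %s\n", V.c_str()); // prints only "e"
  return 0;
}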