diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index bb49e04f85d4f..d73ecd318f7d5 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -2393,6 +2393,21 @@ class ShuffleVectorInst : public Instruction {
   /// Return true if this shuffle mask is a replication mask.
   bool isReplicationMask(int &ReplicationFactor, int &VF) const;
 
+  /// Return true if this shuffle mask represents a "clustered" mask of size
+  /// VF, i.e. each index between [0..VF) is used exactly once in each submask
+  /// of size VF.
+  /// For example, the mask for \p VF=4 is:
+  /// 0, 1, 2, 3, 3, 2, 0, 1 - "clustered", because each submask of size 4
+  /// (0,1,2,3 and 3,2,0,1) uses indices [0..VF) exactly once.
+  /// 0, 1, 2, 3, 3, 3, 1, 0 - not "clustered", because
+  ///                          element 3 is used twice in the second submask
+  ///                          (3,3,1,0) and index 2 is not used at all.
+  static bool isOneUseSingleSourceMask(ArrayRef<int> Mask, int VF);
+
+  /// Return true if this shuffle mask is a one-use-single-source ("clustered")
+  /// mask.
+  bool isOneUseSingleSourceMask(int VF) const;
+
   /// Change values in a shuffle permute mask assuming the two vector operands
   /// of length InVecNumElts have swapped position.
   static void commuteShuffleMask(MutableArrayRef<int> Mask,
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 78c13e698d6ca..861a0628bb188 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -14,6 +14,7 @@
 #include "llvm/IR/Instructions.h"
 #include "LLVMContextImpl.h"
 #include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/Attributes.h"
@@ -2569,6 +2570,37 @@ bool ShuffleVectorInst::isReplicationMask(int &ReplicationFactor,
   return isReplicationMaskWithParams(ShuffleMask, ReplicationFactor, VF);
 }
 
+bool ShuffleVectorInst::isOneUseSingleSourceMask(ArrayRef<int> Mask, int VF) {
+  if (VF <= 0 || Mask.size() < static_cast<unsigned>(VF) ||
+      Mask.size() % VF != 0)
+    return false;
+  for (unsigned K = 0, Sz = Mask.size(); K < Sz; K += VF) {
+    ArrayRef<int> SubMask = Mask.slice(K, VF);
+    if (all_of(SubMask, [](int Idx) { return Idx == UndefMaskElem; }))
+      continue;
+    SmallBitVector Used(VF, false);
+    for_each(SubMask, [&Used, VF](int Idx) {
+      if (Idx != UndefMaskElem && Idx < VF)
+        Used.set(Idx);
+    });
+    if (!Used.all())
+      return false;
+  }
+  return true;
+}
+
+/// Return true if this shuffle mask is a one-use-single-source mask.
+bool ShuffleVectorInst::isOneUseSingleSourceMask(int VF) const {
+  // Not possible to express a shuffle mask for a scalable vector for this
+  // case.
+  if (isa<ScalableVectorType>(getType()))
+    return false;
+  if (!isSingleSourceMask(ShuffleMask))
+    return false;
+
+  return isOneUseSingleSourceMask(ShuffleMask, VF);
+}
+
 //===----------------------------------------------------------------------===//
 //                             InsertValueInst Class
 //===----------------------------------------------------------------------===//
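A quick illustration of the new helper's contract (a usage sketch only, not part of the patch; the function name clusteredMaskExamples is invented for the example): the two masks from the doxygen comment above are accepted and rejected as follows.

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns true: the first mask is "clustered" for VF = 4, the second is not.
bool clusteredMaskExamples() {
  // Each 4-wide submask (0,1,2,3 and 3,2,0,1) uses every index in [0, 4)
  // exactly once.
  int Clustered[] = {0, 1, 2, 3, 3, 2, 0, 1};
  // Here the second submask (3,3,1,0) uses index 3 twice and never uses 2.
  int NotClustered[] = {0, 1, 2, 3, 3, 3, 1, 0};
  return ShuffleVectorInst::isOneUseSingleSourceMask(Clustered, /*VF=*/4) &&
         !ShuffleVectorInst::isOneUseSingleSourceMask(NotClustered, /*VF=*/4);
}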
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c073ad85d0762..391dcea52d44c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3665,8 +3665,60 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
                                                          bool TopToBottom) {
   // No need to reorder if need to shuffle reuses, still need to shuffle the
   // node.
-  if (!TE.ReuseShuffleIndices.empty())
-    return None;
+  if (!TE.ReuseShuffleIndices.empty()) {
+    // Check if reuse shuffle indices can be improved by reordering.
+    // For this, check that reuse mask is "clustered", i.e. each scalar value
+    // is used once in each submask of size equal to the number of scalars.
+    // Example: 4 scalar values.
+    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
+    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
+    //                           element 3 is used twice in the second submask.
+    unsigned Sz = TE.Scalars.size();
+    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
+                                                     Sz))
+      return None;
+    unsigned VF = TE.getVectorFactor();
+    // Try to build the correct order for extractelement instructions.
+    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
+                                TE.ReuseShuffleIndices.end());
+    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
+        all_of(TE.Scalars, [Sz](Value *V) {
+          Optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
+          return Idx && *Idx < Sz;
+        })) {
+      SmallVector<int> ReorderMask(Sz, UndefMaskElem);
+      if (TE.ReorderIndices.empty())
+        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
+      else
+        inversePermutation(TE.ReorderIndices, ReorderMask);
+      for (unsigned I = 0; I < VF; ++I) {
+        int &Idx = ReusedMask[I];
+        if (Idx == UndefMaskElem)
+          continue;
+        Value *V = TE.Scalars[ReorderMask[Idx]];
+        Optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
+        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
+      }
+    }
+    // Build the order of size VF: the reuse shuffles need to be reordered too
+    // and they are always of size VF.
+    OrdersType ResOrder(VF);
+    std::iota(ResOrder.begin(), ResOrder.end(), 0);
+    auto *It = ResOrder.begin();
+    for (unsigned K = 0; K < VF; K += Sz) {
+      OrdersType CurrentOrder(TE.ReorderIndices);
+      SmallVector<int> SubMask(makeArrayRef(ReusedMask).slice(K, Sz));
+      if (SubMask.front() == UndefMaskElem)
+        std::iota(SubMask.begin(), SubMask.end(), 0);
+      reorderOrder(CurrentOrder, SubMask);
+      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
+      std::advance(It, Sz);
+    }
+    if (all_of(enumerate(ResOrder),
+               [](const auto &Data) { return Data.index() == Data.value(); }))
+      return {}; // Use identity order.
+    return ResOrder;
+  }
   if (TE.State == TreeEntry::Vectorize &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
@@ -3783,8 +3835,8 @@ void BoUpSLP::reorderTopToBottom() {
           UserTE = UserTE->UserTreeIndices.back().UserTE;
           ++Cnt;
         }
-      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-      if (TE->State != TreeEntry::Vectorize)
+      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
+      if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     }
   });
@@ -3808,12 +3860,13 @@ void BoUpSLP::reorderTopToBottom() {
     for (const TreeEntry *OpTE : OrderedEntries) {
       // No need to reorder this nodes, still need to extend and to use shuffle,
       // just need to merge reordering shuffle and the reuse shuffle.
-      if (!OpTE->ReuseShuffleIndices.empty())
+      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
       // Count number of orders uses.
      const auto &Order = [OpTE, &GathersToOrders,
                           &AltShufflesToOrders]() -> const OrdersType & {
-        if (OpTE->State == TreeEntry::NeedToGather) {
+        if (OpTE->State == TreeEntry::NeedToGather ||
+            !OpTE->ReuseShuffleIndices.empty()) {
           auto It = GathersToOrders.find(OpTE);
           if (It != GathersToOrders.end())
             return It->second;
@@ -3829,8 +3882,16 @@ void BoUpSLP::reorderTopToBottom() {
       auto It = ExternalUserReorderMap.find(OpTE);
       if (It != ExternalUserReorderMap.end()) {
         const auto &ExternalUserReorderIndices = It->second;
-        for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
-          ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+        // If the OpTE vector factor != number of scalars - use natural order,
+        // it is an attempt to reorder node with reused scalars but with
+        // external uses.
+        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
+          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+              ExternalUserReorderIndices.size();
+        } else {
+          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
+            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+        }
         // No other useful reorder data in this entry.
         if (Order.empty())
           continue;
@@ -3990,7 +4051,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     if (Optional<OrdersType> CurrentOrder =
             getReorderingData(*TE, /*TopToBottom=*/false)) {
       OrderedEntries.insert(TE.get());
-      if (TE->State != TreeEntry::Vectorize)
+      if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     }
   });
@@ -4062,10 +4123,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       TreeEntry *OpTE = Op.second;
       if (!VisitedOps.insert(OpTE).second)
         continue;
-      if (!OpTE->ReuseShuffleIndices.empty())
+      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
         continue;
       const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
-        if (OpTE->State == TreeEntry::NeedToGather)
+        if (OpTE->State == TreeEntry::NeedToGather ||
+            !OpTE->ReuseShuffleIndices.empty())
           return GathersToOrders.find(OpTE)->second;
         return OpTE->ReorderIndices;
       }();
@@ -6974,7 +7036,7 @@ template <typename T>
 static T *performExtractsShuffleAction(
     MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
     function_ref<unsigned(T *)> GetVF,
-    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction,
+    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
     function_ref<T *(ArrayRef<int>, ArrayRef<Value *>)> Action) {
   assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
   SmallVector<int> Mask(ShuffleMask.begin()->second);
@@ -6983,7 +7045,8 @@ static T *performExtractsShuffleAction(
   bool IsBaseNotUndef = !isUndefVector(Base);
   if (IsBaseNotUndef) {
     // Base is not undef, need to combine it with the next subvectors.
-    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+    std::pair<T *, bool> Res =
+        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
     for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
       if (Mask[Idx] == UndefMaskElem)
         Mask[Idx] = Idx;
@@ -6998,7 +7061,8 @@ static T *performExtractsShuffleAction(
   } else if (ShuffleMask.size() == 1) {
     // Base is undef and only 1 vector is shuffled - perform the action only for
     // single vector, if the mask is not the identity mask.
-    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
+                                            /*ForSingleMask=*/true);
     if (Res.second)
       // Identity mask is found.
       Prev = Res.first;
@@ -7022,9 +7086,10 @@ static T *performExtractsShuffleAction(
       Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
     } else {
       // Vectors of different sizes - resize and reshuffle.
-      std::pair<T *, bool> Res1 =
-          ResizeAction(ShuffleMask.begin()->first, Mask);
-      std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second);
+      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
+                                               /*ForSingleMask=*/false);
+      std::pair<T *, bool> Res2 =
+          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
       ArrayRef<int> SecMask = VMIt->second;
       for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
         if (Mask[I] != UndefMaskElem) {
@@ -7043,7 +7108,8 @@ static T *performExtractsShuffleAction(
   // Perform requested actions for the remaining masks/vectors.
   for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
     // Shuffle other input vectors, if any.
-    std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second);
+    std::pair<T *, bool> Res =
+        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
     ArrayRef<int> SecMask = VMIt->second;
     for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
       if (SecMask[I] != UndefMaskElem) {
@@ -7189,7 +7255,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   InstructionCost SpillCost = getSpillCost();
   Cost += SpillCost + ExtractCost;
 
-  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) {
+  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
+                                    bool) {
     InstructionCost C = 0;
     unsigned VF = Mask.size();
     unsigned VecVF = TE->getVectorFactor();
@@ -8805,7 +8872,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
     return Op;
   };
 
-  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask) {
+  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
+                                       bool ForSingleMask) {
     unsigned VF = Mask.size();
     unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
     if (VF != VecVF) {
@@ -8813,12 +8881,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
         Vec = CreateShuffle(Vec, nullptr, Mask);
         return std::make_pair(Vec, true);
       }
-      SmallVector<int> ResizeMask(VF, UndefMaskElem);
-      for (unsigned I = 0; I < VF; ++I) {
-        if (Mask[I] != UndefMaskElem)
-          ResizeMask[Mask[I]] = Mask[I];
+      if (!ForSingleMask) {
+        SmallVector<int> ResizeMask(VF, UndefMaskElem);
+        for (unsigned I = 0; I < VF; ++I) {
+          if (Mask[I] != UndefMaskElem)
+            ResizeMask[Mask[I]] = Mask[I];
+        }
+        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
       }
-      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
     }
 
     return std::make_pair(Vec, false);
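For context on the ForSingleMask flag threaded through ResizeAction and the ResizeToVF lambdas above: when only a single final mask will be applied anyway, the intermediate "keep the used lanes in place" shuffle can be skipped. Below is a minimal standalone sketch of that intermediate mask construction, not the SLP code itself: the helper name is hypothetical, it uses plain STL types instead of SmallVector, and it assumes every in-range index is smaller than Mask.size().

#include <vector>

constexpr int UndefMaskElem = -1; // Same sentinel LLVM uses for undef lanes.

// Mirrors the ResizeMask loop in the ResizeToVF lambda: each lane referenced
// by Mask keeps its own position, every other lane stays undef. Callers that
// pass ForSingleMask=true skip building and applying this mask entirely.
std::vector<int> buildKeepLanesMask(const std::vector<int> &Mask) {
  std::vector<int> ResizeMask(Mask.size(), UndefMaskElem);
  for (int Idx : Mask)
    if (Idx != UndefMaskElem)
      ResizeMask[Idx] = Idx; // Lane Idx is used, so keep it in place.
  return ResizeMask;
}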
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
index b3879bda22356..aabf8a6b1ebfd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
@@ -11,15 +11,15 @@ define { <2 x float>, <2 x float> } @foo(%struct.sw* %v) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* undef, align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[X]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 16
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], undef
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], undef
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[TMP8]], undef
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <2 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <2 x i32>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <2 x i32>
 ; CHECK-NEXT:    [[INS1:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP10]], 0
 ; CHECK-NEXT:    [[INS2:%.*]] = insertvalue { <2 x float>, <2 x float> } [[INS1]], <2 x float> [[TMP11]], 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 8d8cfb3b391ea..e51fd88cbdce1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -345,11 +345,11 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
 ; CHECK-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
 ; CHECK-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X0]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6
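For completeness, a hypothetical standalone snippet (not part of this patch or its tests) showing how the instance-level query added in Instructions.h behaves on an 8-lane single-source shuffle of 4 scalars, similar in shape to the reuse shuffles exercised by the tests above. All names here (shuffleIsClustered, the module and function scaffolding) are invented for the example.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Returns true: the mask {0,1,2,3,3,2,0,1} is single-source and each 4-lane
// cluster uses the indices 0..3 exactly once.
bool shuffleIsClustered() {
  LLVMContext Ctx;
  Module M("example", Ctx);
  auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  auto *FnTy =
      FunctionType::get(Type::getVoidTy(Ctx), {VecTy}, /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "f", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  // Shuffle a non-constant argument so a real ShuffleVectorInst is created.
  Value *Shuf = B.CreateShuffleVector(F->getArg(0), PoisonValue::get(VecTy),
                                      {0, 1, 2, 3, 3, 2, 0, 1});
  B.CreateRetVoid();
  return cast<ShuffleVectorInst>(Shuf)->isOneUseSingleSourceMask(/*VF=*/4);
}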