diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 326006fbb8803..4f99d171469e4 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -153,10 +153,15 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);

-  bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
-                           unsigned Idx, unsigned MinVF);
-
-  bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
+  std::optional<bool> vectorizeStoreChain(ArrayRef<Value *> Chain,
+                                          slpvectorizer::BoUpSLP &R,
+                                          unsigned Idx, unsigned MinVF,
+                                          unsigned &Size);
+
+  bool vectorizeStores(
+      ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R,
+      DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
+          &Visited);

   /// The store instructions in a basic block organized by base pointer.
   StoreListMap Stores;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0cd7bd7772226..fbece8c0109c3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1135,6 +1135,7 @@ class BoUpSLP {
     ScalarToTreeEntry.clear();
     MultiNodeScalars.clear();
     MustGather.clear();
+    NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
     ExternalUses.clear();
     ExternalUsesAsGEPs.clear();
@@ -1252,7 +1253,7 @@ class BoUpSLP {
   /// effectively impossible for the backend to undo.
   /// TODO: If load combining is allowed in the IR optimizer, this analysis
   /// may not be necessary.
-  bool isLoadCombineCandidate() const;
+  bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;

   /// Checks if the given array of loads can be represented as a vectorized,
   /// scatter or just simple gather.
@@ -2356,6 +2357,14 @@ class BoUpSLP {
   bool isAnyGathered(const SmallDenseSet<Value *, 4> &Vals) const {
     return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
   }
+  /// Checks if the given value is gathered in one of the nodes.
+  bool isGathered(const Value *V) const {
+    return MustGather.contains(V);
+  }
+  /// Checks if the specified value was not scheduled.
+  bool isNotScheduled(const Value *V) const {
+    return NonScheduledFirst.contains(V);
+  }

   /// Check if the value is vectorized in the tree.
   bool isVectorized(Value *V) const { return getTreeEntry(V); }
@@ -3071,6 +3080,9 @@ class BoUpSLP {
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

+  /// A set of first non-schedulable values.
+  ValueSet NonScheduledFirst;
+
   /// A map between the vectorized entries and the last instructions in the
   /// bundles. The bundles are built in use order, not in the def order of the
   /// instructions. So, we cannot rely directly on the last instruction in the
@@ -6646,6 +6658,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
              "tryScheduleBundle should cancelScheduling on failure");
       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                    ReuseShuffleIndicies);
+      NonScheduledFirst.insert(VL.front());
       return;
     }
     LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -9587,11 +9600,11 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
                                     /* MatchOr */ false);
 }

-bool BoUpSLP::isLoadCombineCandidate() const {
+bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
   // Peek through a final sequence of stores and check if all operations are
   // likely to be load-combined.
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  for (Value *Scalar : VectorizableTree[0]->Scalars) {
+  unsigned NumElts = Stores.size();
+  for (Value *Scalar : Stores) {
     Value *X;
     if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
         !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
@@ -15210,8 +15223,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
   return Changed;
 }

-bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
-                                            unsigned Idx, unsigned MinVF) {
+std::optional<bool>
+SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
+                                       unsigned Idx, unsigned MinVF,
+                                       unsigned &Size) {
+  Size = 0;
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                     << "\n");
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
@@ -15228,11 +15244,42 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                     << "\n");

+  SetVector<Value *> ValOps;
+  for (Value *V : Chain)
+    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
+  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
+  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
+  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
+    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
+    bool IsPowerOf2 =
+        isPowerOf2_32(ValOps.size()) ||
+        (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
+    if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
+         (!S.MainOp->isSafeToRemove() ||
+          any_of(ValOps.getArrayRef(),
+                 [&](Value *V) {
+                   return !isa<ExtractElementInst>(V) &&
+                          (V->getNumUses() > Chain.size() ||
+                           any_of(V->users(), [&](User *U) {
+                             return !Stores.contains(U);
+                           }));
+                 }))) ||
+        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
+      Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
+      return false;
+    }
+  }
+  if (R.isLoadCombineCandidate(Chain))
+    return true;
   R.buildTree(Chain);
-  if (R.isTreeTinyAndNotFullyVectorizable())
-    return false;
-  if (R.isLoadCombineCandidate())
+  // Check if tree tiny and store itself or its value is not vectorized.
+  if (R.isTreeTinyAndNotFullyVectorizable()) {
+    if (R.isGathered(Chain.front()) ||
+        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
+      return std::nullopt;
+    Size = R.getTreeSize();
     return false;
+  }
   R.reorderTopToBottom();
   R.reorderBottomToTop();
   R.buildExternalUses();
@@ -15240,6 +15287,9 @@
   R.computeMinimumValueSizes();
   R.transformNodes();

+  Size = R.getTreeSize();
+  if (S.getOpcode() == Instruction::Load)
+    Size = 2; // cut off masked gather small trees
   InstructionCost Cost = R.getTreeCost();

   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
@@ -15261,17 +15311,45 @@
   return false;
 }

-bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
-                                        BoUpSLP &R) {
+/// Checks if the quadratic mean deviation is less than 90% of the mean size.
+static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
+                           bool First) {
+  unsigned Num = 0;
+  uint64_t Sum = std::accumulate(
+      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
+      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
+        unsigned Size = First ? Val.first : Val.second;
+        if (Size == 1)
+          return V;
+        ++Num;
+        return V + Size;
+      });
+  if (Num == 0)
+    return true;
+  uint64_t Mean = Sum / Num;
+  if (Mean == 0)
+    return true;
+  uint64_t Dev = std::accumulate(
+                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
+                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
+                       unsigned P = First ? Val.first : Val.second;
+                       if (P == 1)
+                         return V;
+                       return V + (P - Mean) * (P - Mean);
+                     }) /
+                 Num;
+  return Dev * 81 / (Mean * Mean) == 0;
+}
+
+bool SLPVectorizerPass::vectorizeStores(
+    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
+    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
+        &Visited) {
   // We may run into multiple chains that merge into a single chain. We mark the
   // stores that we vectorized so that we don't visit the same store twice.
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;
-  // Stores the pair of stores (first_store, last_store) in a range, that were
-  // already tried to be vectorized. Allows to skip the store ranges that were
-  // already tried to be vectorized but the attempts were unsuccessful.
-  DenseSet<std::pair<Value *, Value *>> TriedSequences;
   struct StoreDistCompare {
     bool operator()(const std::pair<unsigned, int> &Op1,
                     const std::pair<unsigned, int> &Op2) const {
@@ -15299,7 +15377,14 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
        PrevDist = DataVar.second;
      });

-      if (Operands.size() <= 1)
+      if (Operands.size() <= 1 ||
+          !Visited
+               .insert({Operands.front(),
+                        cast<StoreInst>(Operands.front())->getValueOperand(),
+                        Operands.back(),
+                        cast<StoreInst>(Operands.back())->getValueOperand(),
+                        Operands.size()})
+               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
@@ -15308,13 +15393,19 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+      unsigned MaxRegVF = MaxVF;
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
-      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
-          R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
+      if (ValueTy == StoreTy &&
+          R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
+        MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
+      unsigned MinVF = std::max<unsigned>(
+          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
+                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
+                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -15329,7 +15420,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = Operands.size();
-        if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
+        if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
          NonPowerOf2VF = CandVF;
      }

@@ -15340,40 +15431,184 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
-      unsigned StartIdx = 0;
-      for (unsigned Size : CandidateVFs) {
-        for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-          ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
-          assert(
-              all_of(
-                  Slice,
-                  [&](Value *V) {
-                    return cast<StoreInst>(V)->getValueOperand()->getType() ==
-                           cast<StoreInst>(Slice.front())
-                               ->getValueOperand()
-                               ->getType();
-                  }) &&
-              "Expected all operands of same type.");
-          if (!VectorizedStores.count(Slice.front()) &&
-              !VectorizedStores.count(Slice.back()) &&
-              TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
-                  .second &&
-              vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
-            // Mark the vectorized stores so that we don't vectorize them again.
-            VectorizedStores.insert(Slice.begin(), Slice.end());
-            Changed = true;
-            // If we vectorized initial block, no need to try to vectorize it
-            // again.
-            if (Cnt == StartIdx)
-              StartIdx += Size;
-            Cnt += Size;
-            continue;
+      unsigned End = Operands.size();
+      unsigned Repeat = 0;
+      constexpr unsigned MaxAttempts = 4;
+      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
+      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
+        P.first = P.second = 1;
+      });
+      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
+      auto IsNotVectorized = [](bool First,
+                                const std::pair<unsigned, unsigned> &P) {
+        return First ? P.first > 0 : P.second > 0;
+      };
+      auto IsVectorized = [](bool First,
+                             const std::pair<unsigned, unsigned> &P) {
+        return First ? P.first == 0 : P.second == 0;
+      };
+      auto VFIsProfitable = [](bool First, unsigned Size,
+                               const std::pair<unsigned, unsigned> &P) {
+        return First ? Size >= P.first : Size >= P.second;
+      };
+      auto FirstSizeSame = [](unsigned Size,
+                              const std::pair<unsigned, unsigned> &P) {
+        return Size == P.first;
+      };
+      while (true) {
+        ++Repeat;
+        bool RepeatChanged = false;
+        bool AnyProfitableGraph;
+        for (unsigned Size : CandidateVFs) {
+          AnyProfitableGraph = false;
+          unsigned StartIdx = std::distance(
+              RangeSizes.begin(),
+              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
+                                            std::placeholders::_1)));
+          while (StartIdx < End) {
+            unsigned EndIdx =
+                std::distance(RangeSizes.begin(),
+                              find_if(RangeSizes.drop_front(StartIdx),
+                                      std::bind(IsVectorized, Size >= MaxRegVF,
+                                                std::placeholders::_1)));
+            unsigned Sz = EndIdx >= End ? End : EndIdx;
+            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
+              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
+                                  Size >= MaxRegVF)) {
+                ++Cnt;
+                continue;
+              }
+              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+              assert(all_of(Slice,
+                            [&](Value *V) {
+                              return cast<StoreInst>(V)
+                                         ->getValueOperand()
+                                         ->getType() ==
+                                     cast<StoreInst>(Slice.front())
+                                         ->getValueOperand()
+                                         ->getType();
+                            }) &&
+                     "Expected all operands of same type.");
+              if (!NonSchedulable.empty()) {
+                auto [NonSchedSizeMax, NonSchedSizeMin] =
+                    NonSchedulable.lookup(Slice.front());
+                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
+                  Cnt += NonSchedSizeMax;
+                  continue;
+                }
+              }
+              unsigned TreeSize;
+              std::optional<bool> Res =
+                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
+              if (!Res) {
+                NonSchedulable
+                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
+                    .first->getSecond()
+                    .second = Size;
+              } else if (*Res) {
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                VectorizedStores.insert(Slice.begin(), Slice.end());
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                AnyProfitableGraph = RepeatChanged = Changed = true;
+                // If we vectorized initial block, no need to try to vectorize
+                // it again.
+                for_each(RangeSizes.slice(Cnt, Size),
+                         [](std::pair<unsigned, unsigned> &P) {
+                           P.first = P.second = 0;
+                         });
+                if (Cnt < StartIdx + MinVF) {
+                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
+                           [](std::pair<unsigned, unsigned> &P) {
+                             P.first = P.second = 0;
+                           });
+                  StartIdx = Cnt + Size;
+                }
+                if (Cnt > Sz - Size - MinVF) {
+                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
+                           [](std::pair<unsigned, unsigned> &P) {
+                             P.first = P.second = 0;
+                           });
+                  if (Sz == End)
+                    End = Cnt;
+                  Sz = Cnt;
+                }
+                Cnt += Size;
+                continue;
+              }
+              if (Size > 2 && Res &&
+                  !all_of(RangeSizes.slice(Cnt, Size),
+                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
+                                    std::placeholders::_1))) {
+                Cnt += Size;
+                continue;
+              }
+              // Check for the very big VFs that we're not rebuilding same
+              // trees, just with larger number of elements.
+              if (Size > MaxRegVF && TreeSize > 1 &&
+                  all_of(RangeSizes.slice(Cnt, Size),
+                         std::bind(FirstSizeSame, TreeSize,
+                                   std::placeholders::_1))) {
+                Cnt += Size;
+                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
+                  ++Cnt;
+                continue;
+              }
+              if (TreeSize > 1)
+                for_each(RangeSizes.slice(Cnt, Size),
+                         [&](std::pair<unsigned, unsigned> &P) {
+                           if (Size >= MaxRegVF)
+                             P.second = std::max(P.second, TreeSize);
+                           else
+                             P.first = std::max(P.first, TreeSize);
+                         });
+              ++Cnt;
+              AnyProfitableGraph = true;
+            }
+            if (StartIdx >= End)
+              break;
+            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
+              AnyProfitableGraph = true;
+            StartIdx = std::distance(
+                RangeSizes.begin(),
+                find_if(RangeSizes.drop_front(Sz),
+                        std::bind(IsNotVectorized, Size >= MaxRegVF,
+                                  std::placeholders::_1)));
          }
-          ++Cnt;
+          if (!AnyProfitableGraph && Size >= MaxRegVF)
+            break;
        }
-        // Check if the whole array was vectorized already - exit.
-        if (StartIdx >= Operands.size())
+        // All values vectorized - exit.
+        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
+              return P.first == 0 && P.second == 0;
+            }))
          break;
+        // Check if tried all attempts or no need for the last attempts at all.
+        if (Repeat >= MaxAttempts ||
+            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
+          break;
+        constexpr unsigned StoresLimit = 64;
+        const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
+            Operands.size(),
+            static_cast<unsigned>(
+                End -
+                std::distance(
+                    RangeSizes.begin(),
+                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
+                                                  std::placeholders::_1))) +
+                1)));
+        unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
+        if (VF > MaxTotalNum || VF >= StoresLimit)
+          break;
+        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
+          if (P.first != 0)
+            P.first = std::max(P.second, P.first);
+        });
+        // Last attempt to vectorize max number of elements, if all previous
+        // attempts were unsuccessful because of the cost issues.
+        CandidateVFs.clear();
+        CandidateVFs.push_back(VF);
      }
    }
  };
@@ -18191,6 +18426,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  };

  // Attempt to sort and vectorize each of the store-groups.
+  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
   for (auto &Pair : Stores) {
     if (Pair.second.size() < 2)
       continue;
@@ -18208,8 +18444,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
                                 Pair.second.rend());
     Changed |= tryToVectorizeSequence(
         ReversedStores, StoreSorter, AreCompatibleStores,
-        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
-          return vectorizeStores(Candidates, R);
+        [&](ArrayRef<StoreInst *> Candidates, bool) {
+          return vectorizeStores(Candidates, R, Attempted);
         },
         /*MaxVFOnly=*/false, R);
   }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 47d918eabdfe2..9bbd314a27cb9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -537,24 +537,18 @@ entry:
 }

 define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) {
-; NON-POW2-LABEL: define void @vec3_extract(
-; NON-POW2-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    store <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], ptr [[CALL3_I536]], align 2
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: define void @vec3_extract(
-; POW2-ONLY-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
-; POW2-ONLY-NEXT:    [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
-; POW2-ONLY-NEXT:    store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
-; POW2-ONLY-NEXT:    [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
-; POW2-ONLY-NEXT:    [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
-; POW2-ONLY-NEXT:    store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
-; POW2-ONLY-NEXT:    [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
-; POW2-ONLY-NEXT:    store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: define void @vec3_extract(
+; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
+; CHECK-NEXT:    [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
+; CHECK-NEXT:    store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
+; CHECK-NEXT:    [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
+; CHECK-NEXT:    [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
+; CHECK-NEXT:    store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
+; CHECK-NEXT:    [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
+; CHECK-NEXT:    store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
+; CHECK-NEXT:    ret void
 ;
 entry:
   %pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 75505f632a43f..29021150ccd2e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -slp-threshold=-1 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s

 define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i32(
@@ -98,58 +98,19 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 }

 define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
-; SSE-LABEL: @store_i64(
-; SSE-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; SSE-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; SSE-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; SSE-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT:    store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; SSE-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; SSE-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; SSE-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; SSE-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; SSE-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT:    store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; SSE-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; SSE-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; SSE-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; SSE-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; SSE-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; SSE-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT:    store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; SSE-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; SSE-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; SSE-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; SSE-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; SSE-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT:    store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @store_i64(
-; AVX-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
-; AVX-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
-; AVX-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
-; AVX-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-; AVX-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
-; AVX-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @store_i64(
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
+; CHECK-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
+; CHECK-NEXT:    ret void
 ;
   %4 = zext i32 %1 to i64
   %5 = load i64, ptr %0, align 8, !tbaa !7