diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 13fbe884eddf9f..39acfd5bbbeeca 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -679,6 +679,15 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
+/// Returns the distance between the pointers \p PtrA and \p PtrB iff they are
+/// compatible and it is possible to calculate the distance between them. This
+/// is a simple API that does not depend on the analysis pass.
+/// \param StrictCheck Ensure that the calculated distance matches the
+/// type-based one after all the bitcasts removal in the provided pointers.
+Optional<int> getPointersDiff(Value *PtrA, Value *PtrB, const DataLayout &DL,
+                              ScalarEvolution &SE, bool StrictCheck = false,
+                              bool CheckType = true);
+
 /// Attempt to sort the pointers in \p VL and return the sorted indices
 /// in \p SortedIndices, if reordering is required.
 ///
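For orientation, a minimal caller-side sketch of the helper declared above (the wrapper name areAdjacent is hypothetical, not part of this patch; DL and SE are assumed to be available in the caller). The distance is reported in elements of the pointee type, and StrictCheck additionally rejects byte offsets that are not an exact multiple of the element size:

// Sketch only: true when PtrB points exactly one element past PtrA.
static bool areAdjacent(Value *PtrA, Value *PtrB, const DataLayout &DL,
                        ScalarEvolution &SE) {
  Optional<int> Dist =
      getPointersDiff(PtrA, PtrB, DL, SE, /*StrictCheck=*/true);
  return Dist && *Dist == 1;
}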
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index e632fe25c24c85..997d4474a44899 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1124,139 +1124,123 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
   return Stride;
 }
 
+Optional<int> llvm::getPointersDiff(Value *PtrA, Value *PtrB,
+                                    const DataLayout &DL, ScalarEvolution &SE,
+                                    bool StrictCheck, bool CheckType) {
+  assert(PtrA && PtrB && "Expected non-nullptr pointers.");
+  // The distance between a pointer and itself is zero.
+  if (PtrA == PtrB)
+    return 0;
+
+  // Make sure that PtrA and PtrB have the same type if required.
+  if (CheckType && PtrA->getType() != PtrB->getType())
+    return None;
+
+  unsigned ASA = PtrA->getType()->getPointerAddressSpace();
+  unsigned ASB = PtrB->getType()->getPointerAddressSpace();
+
+  // Check that the address spaces match.
+  if (ASA != ASB)
+    return None;
+  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
+
+  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
+  Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+  Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+  int Val;
+  if (PtrA1 == PtrB1) {
+    // Retrieve the address space again as pointer stripping now tracks through
+    // `addrspacecast`.
+    ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
+    ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
+    // Check that the address spaces match and that the pointers are valid.
+    if (ASA != ASB)
+      return None;
+
+    IdxWidth = DL.getIndexSizeInBits(ASA);
+    OffsetA = OffsetA.sextOrTrunc(IdxWidth);
+    OffsetB = OffsetB.sextOrTrunc(IdxWidth);
+
+    OffsetB -= OffsetA;
+    Val = OffsetB.getSExtValue();
+  } else {
+    // Otherwise compute the distance with SCEV between the base pointers.
+    const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+    const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+    const auto *Diff =
+        dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
+    if (!Diff)
+      return None;
+    Val = Diff->getAPInt().getSExtValue();
+  }
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  int Size = DL.getTypeStoreSize(Ty);
+  int Dist = Val / Size;
+
+  // Ensure that the calculated distance matches the type-based one after all
+  // the bitcasts removal in the provided pointers.
+  if (!StrictCheck || Dist * Size == Val)
+    return Dist;
+  return None;
+}
+
 bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
                            ScalarEvolution &SE,
                            SmallVectorImpl<unsigned> &SortedIndices) {
   assert(llvm::all_of(
              VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
          "Expected list of pointer operands.");
-  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
-  OffValPairs.reserve(VL.size());
-
   // Walk over the pointers, and map each of them to an offset relative to
   // first pointer in the array.
   Value *Ptr0 = VL[0];
-  const SCEV *Scev0 = SE.getSCEV(Ptr0);
-  Value *Obj0 = getUnderlyingObject(Ptr0);
-
-  llvm::SmallSet<int64_t, 4> Offsets;
-  for (auto *Ptr : VL) {
-    // TODO: Outline this code as a special, more time consuming, version of
-    // computeConstantDifference() function.
-    if (Ptr->getType()->getPointerAddressSpace() !=
-        Ptr0->getType()->getPointerAddressSpace())
-      return false;
-    // If a pointer refers to a different underlying object, bail - the
-    // pointers are by definition incomparable.
-    Value *CurrObj = getUnderlyingObject(Ptr);
-    if (CurrObj != Obj0)
-      return false;
-    const SCEV *Scev = SE.getSCEV(Ptr);
-    const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Scev, Scev0));
-    // The pointers may not have a constant offset from each other, or SCEV
-    // may just not be smart enough to figure out they do. Regardless,
-    // there's nothing we can do.
+
+  using DistOrdPair = std::pair<int64_t, int>;
+  auto Compare = [](const DistOrdPair &L, const DistOrdPair &R) {
+    return L.first < R.first;
+  };
+  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
+  Offsets.emplace(0, 0);
+  int Cnt = 1;
+  bool IsConsecutive = true;
+  for (auto *Ptr : VL.drop_front()) {
+    Optional<int> Diff = getPointersDiff(Ptr0, Ptr, DL, SE);
     if (!Diff)
       return false;
 
     // Check if the pointer with the same offset is found.
-    int64_t Offset = Diff->getAPInt().getSExtValue();
-    if (!Offsets.insert(Offset).second)
+    int64_t Offset = *Diff;
+    auto Res = Offsets.emplace(Offset, Cnt);
+    if (!Res.second)
       return false;
-    OffValPairs.emplace_back(Offset, Ptr);
+    // Consecutive order if the inserted element is the last one.
+    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
+    ++Cnt;
   }
   SortedIndices.clear();
-  SortedIndices.resize(VL.size());
-  std::iota(SortedIndices.begin(), SortedIndices.end(), 0);
-
-  // Sort the memory accesses and keep the order of their uses in UseOrder.
-  llvm::stable_sort(SortedIndices, [&](unsigned Left, unsigned Right) {
-    return OffValPairs[Left].first < OffValPairs[Right].first;
-  });
-
-  // Check if the order is consecutive already.
-  if (llvm::all_of(SortedIndices, [&SortedIndices](const unsigned I) {
-        return I == SortedIndices[I];
-      }))
-    SortedIndices.clear();
-
+  if (!IsConsecutive) {
+    // Fill SortedIndices only if the pointers are not already in consecutive
+    // order.
+    SortedIndices.resize(VL.size());
+    Cnt = 0;
+    for (const std::pair<int64_t, int> &Pair : Offsets) {
+      IsConsecutive = IsConsecutive && Cnt == Pair.second;
+      SortedIndices[Cnt] = Pair.second;
+      ++Cnt;
+    }
+  }
   return true;
 }
 
-/// Take the address space operand from the Load/Store instruction.
-/// Returns -1 if this is not a valid Load/Store instruction.
-static unsigned getAddressSpaceOperand(Value *I) {
-  if (LoadInst *L = dyn_cast<LoadInst>(I))
-    return L->getPointerAddressSpace();
-  if (StoreInst *S = dyn_cast<StoreInst>(I))
-    return S->getPointerAddressSpace();
-  return -1;
-}
-
 /// Returns true if the memory operations \p A and \p B are consecutive.
 bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                                ScalarEvolution &SE, bool CheckType) {
   Value *PtrA = getLoadStorePointerOperand(A);
   Value *PtrB = getLoadStorePointerOperand(B);
-  unsigned ASA = getAddressSpaceOperand(A);
-  unsigned ASB = getAddressSpaceOperand(B);
-
-  // Check that the address spaces match and that the pointers are valid.
-  if (!PtrA || !PtrB || (ASA != ASB))
-    return false;
-
-  // Make sure that A and B are different pointers.
-  if (PtrA == PtrB)
-    return false;
-
-  // Make sure that A and B have the same type if required.
-  if (CheckType && PtrA->getType() != PtrB->getType())
+  if (!PtrA || !PtrB)
     return false;
-
-  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
-  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
-
-  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
-  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
-  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
-
-  // Retrieve the address space again as pointer stripping now tracks through
-  // `addrspacecast`.
-  ASA = cast<PointerType>(PtrA->getType())->getAddressSpace();
-  ASB = cast<PointerType>(PtrB->getType())->getAddressSpace();
-  // Check that the address spaces match and that the pointers are valid.
-  if (ASA != ASB)
-    return false;
-
-  IdxWidth = DL.getIndexSizeInBits(ASA);
-  OffsetA = OffsetA.sextOrTrunc(IdxWidth);
-  OffsetB = OffsetB.sextOrTrunc(IdxWidth);
-
-  APInt Size(IdxWidth, DL.getTypeStoreSize(Ty));
-
-  // OffsetDelta = OffsetB - OffsetA;
-  const SCEV *OffsetSCEVA = SE.getConstant(OffsetA);
-  const SCEV *OffsetSCEVB = SE.getConstant(OffsetB);
-  const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
-  const APInt &OffsetDelta = cast<SCEVConstant>(OffsetDeltaSCEV)->getAPInt();
-
-  // Check if they are based on the same pointer. That makes the offsets
-  // sufficient.
-  if (PtrA == PtrB)
-    return OffsetDelta == Size;
-
-  // Compute the necessary base pointer delta to have the necessary final delta
-  // equal to the size.
-  // BaseDelta = Size - OffsetDelta;
-  const SCEV *SizeSCEV = SE.getConstant(Size);
-  const SCEV *BaseDelta = SE.getMinusSCEV(SizeSCEV, OffsetDeltaSCEV);
-
-  // Otherwise compute the distance with SCEV between the base pointers.
-  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
-  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
-  const SCEV *X = SE.getAddExpr(PtrSCEVA, BaseDelta);
-  return X == PtrSCEVB;
+  Optional<int> Diff =
+      getPointersDiff(PtrA, PtrB, DL, SE, /*StrictCheck=*/true, CheckType);
+  return Diff && *Diff == 1;
 }
 
 MemoryDepChecker::VectorizationSafetyStatus
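As a usage sketch for the reworked sortPtrAccesses (caller-side names such as Ptrs and Order are illustrative, not part of the patch): the function returns false if any pointer has no constant distance from VL[0] or if two pointers share the same offset, and it leaves SortedIndices empty when the input is already in consecutive order.

// Sketch only: Ptrs, DL and SE are assumed to exist in the caller.
SmallVector<unsigned, 4> Order;
if (sortPtrAccesses(Ptrs, DL, SE, Order)) {
  if (Order.empty()) {
    // Already consecutive: Ptrs[I + 1] is one element past Ptrs[I].
  } else {
    // Order[I] is the original index of the pointer that belongs at
    // sorted position I.
  }
}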
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 385b6f30dc0fc3..78d2ea0032db49 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -941,10 +941,16 @@ class BoUpSLP {
                                ScalarEvolution &SE) {
     auto *LI1 = dyn_cast<LoadInst>(V1);
     auto *LI2 = dyn_cast<LoadInst>(V2);
-    if (LI1 && LI2)
-      return isConsecutiveAccess(LI1, LI2, DL, SE)
-                 ? VLOperands::ScoreConsecutiveLoads
-                 : VLOperands::ScoreFail;
+    if (LI1 && LI2) {
+      if (LI1->getParent() != LI2->getParent())
+        return VLOperands::ScoreFail;
+
+      Optional<int> Dist =
+          getPointersDiff(LI1->getPointerOperand(), LI2->getPointerOperand(),
+                          DL, SE, /*StrictCheck=*/true);
+      return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
+                                  : VLOperands::ScoreFail;
+    }
 
     auto *C1 = dyn_cast<Constant>(V1);
     auto *C2 = dyn_cast<Constant>(V2);
@@ -2871,13 +2877,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           Ptr0 = PointerOps[CurrentOrder.front()];
           PtrN = PointerOps[CurrentOrder.back()];
         }
-        const SCEV *Scev0 = SE->getSCEV(Ptr0);
-        const SCEV *ScevN = SE->getSCEV(PtrN);
-        const auto *Diff =
-            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
-        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+        Optional<int> Diff = getPointersDiff(Ptr0, PtrN, *DL, *SE);
         // Check that the sorted loads are consecutive.
-        if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+        if (static_cast<unsigned>(*Diff) == VL.size() - 1) {
           if (CurrentOrder.empty()) {
             // Original loads are consecutive and does not require reordering.
             ++NumOpsWantToKeepOriginalOrder;
@@ -3150,13 +3152,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           Ptr0 = PointerOps[CurrentOrder.front()];
           PtrN = PointerOps[CurrentOrder.back()];
         }
-        const SCEV *Scev0 = SE->getSCEV(Ptr0);
-        const SCEV *ScevN = SE->getSCEV(PtrN);
-        const auto *Diff =
-            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
-        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+        Optional<int> Dist = getPointersDiff(Ptr0, PtrN, *DL, *SE);
         // Check that the sorted pointer operands are consecutive.
-        if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+        if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
           if (CurrentOrder.empty()) {
             // Original stores are consecutive and does not require reordering.
             ++NumOpsWantToKeepOriginalOrder;
@@ -6107,20 +6105,41 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
   int E = Stores.size();
   SmallBitVector Tails(E, false);
-  SmallVector<int, 16> ConsecutiveChain(E, E + 1);
   int MaxIter = MaxStoreLookup.getValue();
+  SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
+      E, std::make_pair(E, INT_MAX));
+  SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
   int IterCnt;
   auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
+                                  &CheckedPairs,
                                   &ConsecutiveChain](int K, int Idx) {
     if (IterCnt >= MaxIter)
       return true;
+    if (CheckedPairs[Idx].test(K))
+      return ConsecutiveChain[K].second == 1 &&
+             ConsecutiveChain[K].first == Idx;
     ++IterCnt;
-    if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+    CheckedPairs[Idx].set(K);
+    CheckedPairs[K].set(Idx);
+    Optional<int> Diff = getPointersDiff(Stores[K]->getPointerOperand(),
+                                         Stores[Idx]->getPointerOperand(), *DL,
+                                         *SE, /*StrictCheck=*/true);
+    if (!Diff || *Diff == 0)
+      return false;
+    int Val = *Diff;
+    if (Val < 0) {
+      if (ConsecutiveChain[Idx].second > -Val) {
+        Tails.set(K);
+        ConsecutiveChain[Idx] = std::make_pair(K, -Val);
+      }
+      return false;
+    }
+    if (ConsecutiveChain[K].second <= Val)
       return false;
 
     Tails.set(Idx);
-    ConsecutiveChain[K] = Idx;
-    return true;
+    ConsecutiveChain[K] = std::make_pair(Idx, Val);
+    return Val == 1;
   };
   // Do a quadratic search on all of the given stores in reverse order and find
   // all of the pairs of stores that follow each other.
@@ -6140,17 +6159,31 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
   // For stores that start but don't end a link in the chain:
   for (int Cnt = E; Cnt > 0; --Cnt) {
     int I = Cnt - 1;
-    if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
+    if (ConsecutiveChain[I].first == E || Tails.test(I))
       continue;
     // We found a store instr that starts a chain. Now follow the chain and try
     // to vectorize it.
     BoUpSLP::ValueList Operands;
     // Collect the chain into a list.
-    while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
+    while (I != E && !VectorizedStores.count(Stores[I])) {
       Operands.push_back(Stores[I]);
+      Tails.set(I);
+      if (ConsecutiveChain[I].second != 1) {
+        // Mark the new end in the chain and go back, if required. It might be
+        // required if the original stores come in reversed order, for example.
+        if (ConsecutiveChain[I].first != E &&
+            Tails.test(ConsecutiveChain[I].first) &&
+            !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
+          Tails.reset(ConsecutiveChain[I].first);
+          if (Cnt < ConsecutiveChain[I].first + 2)
+            Cnt = ConsecutiveChain[I].first + 2;
+        }
+        break;
+      }
       // Move to the next value in the chain.
-      I = ConsecutiveChain[I];
+      I = ConsecutiveChain[I].first;
     }
+    assert(!Operands.empty() && "Expected non-empty list of stores.");
 
     unsigned MaxVecRegSize = R.getMaxVecRegSize();
     unsigned EltSize = R.getVectorElementSize(Operands[0]);
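To restate the new store-chain bookkeeping outside the pass (a standalone sketch; Chain and updateChain are hypothetical names, not part of the patch): each ConsecutiveChain entry now pairs the index of the closest known successor store with its distance in elements, a nearer candidate found later replaces a farther one, and chain collection only follows links whose recorded distance is exactly 1.

#include <utility>
#include <vector>

// Sketch only: the successor-update rule of FindConsecutiveAccess restated
// with plain std types. Chain[K] = {index of closest successor, distance}.
void updateChain(std::vector<std::pair<int, int>> &Chain, int K, int Idx,
                 int Dist) {
  if (Dist <= 0)
    return; // Same address or a predecessor; not a successor of K.
  if (Chain[K].second <= Dist)
    return; // An equally close or closer successor is already recorded.
  Chain[K] = {Idx, Dist}; // Record the closer successor.
}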
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
index 267cf1a02c298f..e283628949107e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+avx -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX
 
 %class.1 = type { %class.2 }
 %class.2 = type { %"class.3" }
@@ -117,13 +117,10 @@ define void @pr35497() local_unnamed_addr #0 {
 ; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
 ; AVX-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
 ; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP11]], i32 0
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]],
-; AVX-NEXT:    [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]]
-; AVX-NEXT:    [[TMP16:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
-; AVX-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1
+; AVX-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]],
+; AVX-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
+; AVX-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
 ; AVX-NEXT:    ret void
 ;
 entry: