From 0eb966900d2001a7c5abff565fc218a0f93966e8 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Wed, 17 Sep 2025 15:32:39 +0000 Subject: [PATCH 1/8] [LoadStoreVectorizer] Fill gaps in loads/stores to enable vectorization --- .../llvm/Analysis/TargetTransformInfo.h | 6 + .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + llvm/lib/Analysis/TargetTransformInfo.cpp | 4 + .../Target/NVPTX/NVPTXTargetTransformInfo.h | 2 + .../Vectorize/LoadStoreVectorizer.cpp | 435 ++++++++++++-- .../test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 40 +- .../CodeGen/NVPTX/param-vectorize-device.ll | 6 +- llvm/test/CodeGen/NVPTX/variadics-backend.ll | 2 +- .../LoadStoreVectorizer/NVPTX/extend-chain.ll | 81 +++ .../NVPTX/gap-fill-cleanup.ll | 37 ++ .../NVPTX/gap-fill-invariant.ll | 83 +++ .../NVPTX/gap-fill-vectors.ll | 186 ++++++ .../LoadStoreVectorizer/NVPTX/gap-fill.ll | 194 +++++++ .../LoadStoreVectorizer/NVPTX/masked-store.ll | 541 ++++++++++++++++++ .../LoadStoreVectorizer/NVPTX/vectorize_i8.ll | 3 +- 15 files changed, 1544 insertions(+), 78 deletions(-) create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 41ff54f0781a2..f8f134c833ea2 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -817,6 +817,12 @@ class TargetTransformInfo { LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const; + /// Return true if it is legal to widen loads beyond their current width, + /// assuming the result is still well-aligned. For example, converting a load + /// i32 to a load i64, or vectorizing three continuous load i32s into a load + /// <4 x i32>. + LLVM_ABI bool isLegalToWidenLoads() const; + /// Return true if the target supports nontemporal store. LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const; /// Return true if the target supports nontemporal load. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 566e1cf51631a..55bd4bd709589 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -318,6 +318,8 @@ class TargetTransformInfoImplBase { return false; } + virtual bool isLegalToWidenLoads() const { return false; } + virtual bool isLegalNTStore(Type *DataType, Align Alignment) const { // By default, assume nontemporal memory stores are available for stores // that are aligned and have a size that is a power of 2. 
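(For reference, a minimal sketch of how a transform consults the new hook from the caller side. It assumes an existing TargetTransformInfo and LoadInst in scope; the helper name mayWidenLoad is invented for illustration and is not part of this patch.)

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

// Illustrative helper (hypothetical): widening is only attempted when the
// target opts in via isLegalToWidenLoads(); whether the widened access is
// still fast and well aligned is checked separately, as splitChainByAlignment
// below does with accessIsAllowedAndFast().
static bool mayWidenLoad(const llvm::TargetTransformInfo &TTI,
                         const llvm::LoadInst &LI) {
  return TTI.isLegalToWidenLoads() && LI.isSimple();
}
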
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 09b50c5270e57..89cda79558057 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -476,6 +476,10 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType, Align Alignment, return TTIImpl->isLegalMaskedLoad(DataType, Alignment, AddressSpace); } +bool TargetTransformInfo::isLegalToWidenLoads() const { + return TTIImpl->isLegalToWidenLoads(); +} + bool TargetTransformInfo::isLegalNTStore(Type *DataType, Align Alignment) const { return TTIImpl->isLegalNTStore(DataType, Alignment); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index b32d931bd3074..d56cff1ce3695 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -72,6 +72,8 @@ class NVPTXTTIImpl final : public BasicTTIImplBase { return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace); } + bool isLegalToWidenLoads() const override { return true; }; + // NVPTX has infinite registers of all kinds, but the actual machine doesn't. // We conservatively return 1 here which is just enough to enable the // vectorizers but disables heuristics based on the number of registers. diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 7b5137b0185ab..04f4e92826a52 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -119,6 +119,29 @@ using namespace llvm; #define DEBUG_TYPE "load-store-vectorizer" +cl::opt + ExtendLoads("vect-extend-loads", cl::Hidden, + cl::desc("Load more elements if the target VF is higher " + "than the chain length."), + cl::init(true)); + +cl::opt ExtendStores( + "vect-extend-stores", cl::Hidden, + cl::desc("Store more elements if the target VF is higher " + "than the chain length and we have access to masked stores."), + cl::init(true)); + +cl::opt FillLoadGaps( + "vect-fill-load-gaps", cl::Hidden, + cl::desc("Should Loads be introduced in gaps to enable vectorization."), + cl::init(true)); + +cl::opt + FillStoreGaps("vect-fill-store-gaps", cl::Hidden, + cl::desc("Should Stores be introduced in gaps to enable " + "vectorization into masked stores."), + cl::init(true)); + STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); @@ -246,12 +269,16 @@ class Vectorizer { const DataLayout &DL; IRBuilder<> Builder; - // We could erase instrs right after vectorizing them, but that can mess up - // our BB iterators, and also can make the equivalence class keys point to - // freed memory. This is fixable, but it's simpler just to wait until we're - // done with the BB and erase all at once. + /// We could erase instrs right after vectorizing them, but that can mess up + /// our BB iterators, and also can make the equivalence class keys point to + /// freed memory. This is fixable, but it's simpler just to wait until we're + /// done with the BB and erase all at once. SmallVector ToErase; + /// We insert load/store instructions and GEPs to fill gaps and extend chains + /// to enable vectorization. Keep track and delete them later. 
+ DenseSet ExtraElements; + public: Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC, DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI) @@ -344,6 +371,28 @@ class Vectorizer { /// Postcondition: For all i, ret[i][0].second == 0, because the first instr /// in the chain is the leader, and an instr touches distance 0 from itself. std::vector gatherChains(ArrayRef Instrs); + + /// Is a load/store with this alignment allowed by TTI and at least as fast + /// as an unvectorized load/store. + bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment, + unsigned VecElemBits) const; + + /// Before attempting to fill gaps, check if the chain is a candidate for + /// a masked store, to save compile time if it is not possible for the address + /// space and element type. + bool shouldAttemptMaskedStore(const ArrayRef C) const; + + /// Create a new GEP and a new Load/Store instruction such that the GEP + /// is pointing at PrevElem + Offset. In the case of stores, store poison. + /// Extra elements will either be combined into a vector/masked store or + /// deleted before the end of the pass. + ChainElem createExtraElementAfter(const ChainElem &PrevElem, APInt Offset, + StringRef Prefix, + Align Alignment = Align(1)); + + /// Delete dead GEPs and extra Load/Store instructions created by + /// createExtraElementAfter + void deleteExtraElements(); }; class LoadStoreVectorizerLegacyPass : public FunctionPass { @@ -457,12 +506,21 @@ bool Vectorizer::run() { Changed |= runOnPseudoBB(*It, *std::next(It)); for (Instruction *I : ToErase) { + // These will get deleted in deleteExtraElements. + // This is because ExtraElements will include both extra elements + // that *were* vectorized and extra elements that *were not* + // vectorized. ToErase will only include extra elements that *were* + // vectorized, so in order to avoid double deletion we skip them here and + // handle them in deleteExtraElements. + if (ExtraElements.contains(I)) + continue; auto *PtrOperand = getLoadStorePointerOperand(I); if (I->use_empty()) I->eraseFromParent(); RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); } ToErase.clear(); + deleteExtraElements(); } return Changed; @@ -623,6 +681,29 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { dumpChain(C); }); + // If the chain is not contiguous, we try to fill the gap with "extra" + // elements to artificially make it contiguous, to try to enable + // vectorization. + // - Filling gaps in loads is always ok if the target supports widening loads. + // - For stores, we only fill gaps if there is a potentially legal masked + // store for the target. If later on, we don't end up with a chain that + // could be vectorized into a legal masked store, the chains with extra + // elements will be filtered out in splitChainByAlignment. + bool TryFillGaps = isa(C[0].Inst) + ? (FillLoadGaps && TTI.isLegalToWidenLoads()) + : (FillStoreGaps && shouldAttemptMaskedStore(C)); + + unsigned ASPtrBits = + DL.getIndexSizeInBits(getLoadStoreAddressSpace(C[0].Inst)); + + // Compute the alignment of the leader of the chain (which every stored offset + // is based on) using the current first element of the chain. This is + // conservative, we may be able to derive better alignment by iterating over + // the chain and finding the leader. 
+ Align LeaderOfChainAlign = + commonAlignment(getLoadStoreAlignment(C[0].Inst), + C[0].OffsetFromLeader.abs().getLimitedValue()); + std::vector Ret; Ret.push_back({C.front()}); @@ -633,7 +714,8 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst)); assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by " "collectEquivalenceClass"); - APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8; + APInt PrevSzBytes = APInt(ASPtrBits, SzBits / 8); + APInt PrevReadEnd = Prev.OffsetFromLeader + PrevSzBytes; // Add this instruction to the end of the current chain, or start a new one. bool AreContiguous = It->OffsetFromLeader == PrevReadEnd; @@ -642,10 +724,54 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { << *Prev.Inst << " (ends at offset " << PrevReadEnd << ") -> " << *It->Inst << " (starts at offset " << It->OffsetFromLeader << ")\n"); - if (AreContiguous) + + if (AreContiguous) { CurChain.push_back(*It); - else - Ret.push_back({*It}); + continue; + } + + // For now, we aren't filling gaps between load/stores of different sizes. + // Additionally, as a conservative heuristic, we only fill gaps of 1-2 + // elements. Generating loads/stores with too many unused bytes has a side + // effect of increasing register pressure (on NVIDIA targets at least), + // which could cancel out the benefits of reducing number of load/stores. + if (TryFillGaps && + SzBits == DL.getTypeSizeInBits(getLoadStoreType(It->Inst))) { + APInt OffsetOfGapStart = Prev.OffsetFromLeader + PrevSzBytes; + APInt GapSzBytes = It->OffsetFromLeader - OffsetOfGapStart; + if (GapSzBytes == PrevSzBytes) { + // There is a single gap between Prev and Curr, create one extra element + ChainElem NewElem = createExtraElementAfter( + Prev, PrevSzBytes, "GapFill", + commonAlignment(LeaderOfChainAlign, + OffsetOfGapStart.abs().getLimitedValue())); + CurChain.push_back(NewElem); + CurChain.push_back(*It); + continue; + } + // There are two gaps between Prev and Curr, only create two extra + // elements if Prev is the first element in a sequence of four. + // This has the highest chance of resulting in a beneficial vectorization. + if ((GapSzBytes == 2 * PrevSzBytes) && (CurChain.size() % 4 == 1)) { + ChainElem NewElem1 = createExtraElementAfter( + Prev, PrevSzBytes, "GapFill", + commonAlignment(LeaderOfChainAlign, + OffsetOfGapStart.abs().getLimitedValue())); + ChainElem NewElem2 = createExtraElementAfter( + NewElem1, PrevSzBytes, "GapFill", + commonAlignment( + LeaderOfChainAlign, + (OffsetOfGapStart + PrevSzBytes).abs().getLimitedValue())); + CurChain.push_back(NewElem1); + CurChain.push_back(NewElem2); + CurChain.push_back(*It); + continue; + } + } + + // The chain is not contiguous and cannot be made contiguous with gap + // filling, so we need to start a new chain. + Ret.push_back({*It}); } // Filter out length-1 chains, these are uninteresting. @@ -721,6 +847,14 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { unsigned AS = getLoadStoreAddressSpace(C[0].Inst); unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8; + // For compile time reasons, we cache whether or not the superset + // of all candidate chains contains any extra stores from earlier gap + // filling. 
+ bool CandidateChainsMayContainExtraStores = + !IsLoadChain && any_of(C, [this](const ChainElem &E) { + return ExtraElements.contains(E.Inst); + }); + std::vector Ret; for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) { // Find candidate chains of size not greater than the largest vector reg. @@ -769,41 +903,6 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { continue; } - // Is a load/store with this alignment allowed by TTI and at least as fast - // as an unvectorized load/store? - // - // TTI and F are passed as explicit captures to WAR an MSVC misparse (??). - auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI, - &F = F](Align Alignment) { - if (Alignment.value() % SizeBytes == 0) - return true; - unsigned VectorizedSpeed = 0; - bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( - F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); - if (!AllowsMisaligned) { - LLVM_DEBUG(dbgs() - << "LSV: Access of " << SizeBytes << "B in addrspace " - << AS << " with alignment " << Alignment.value() - << " is misaligned, and therefore can't be vectorized.\n"); - return false; - } - - unsigned ElementwiseSpeed = 0; - (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, - Alignment, &ElementwiseSpeed); - if (VectorizedSpeed < ElementwiseSpeed) { - LLVM_DEBUG(dbgs() - << "LSV: Access of " << SizeBytes << "B in addrspace " - << AS << " with alignment " << Alignment.value() - << " has relative speed " << VectorizedSpeed - << ", which is lower than the elementwise speed of " - << ElementwiseSpeed - << ". Therefore this access won't be vectorized.\n"); - return false; - } - return true; - }; - // If we're loading/storing from an alloca, align it if possible. // // FIXME: We eagerly upgrade the alignment, regardless of whether TTI @@ -818,8 +917,7 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { isa(PtrOperand->stripPointerCasts()); Align Alignment = getLoadStoreAlignment(C[CBegin].Inst); Align PrefAlign = Align(StackAdjustedAlignment); - if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 && - IsAllowedAndFast(PrefAlign)) { + if (IsAllocaAccess && Alignment.value() % SizeBytes != 0) { Align NewAlign = getOrEnforceKnownAlignment( PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT); if (NewAlign >= Alignment) { @@ -831,7 +929,59 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { } } - if (!IsAllowedAndFast(Alignment)) { + Chain ExtendingLoadsStores; + bool ExtendChain = IsLoadChain + ? ExtendLoads + : ExtendStores; + if (ExtendChain && NumVecElems < TargetVF && NumVecElems % 2 != 0 && + VecElemBits >= 8) { + // TargetVF may be a lot higher than NumVecElems, + // so only extend to the next power of 2. + assert(VecElemBits % 8 == 0); + unsigned VecElemBytes = VecElemBits / 8; + unsigned NewNumVecElems = PowerOf2Ceil(NumVecElems); + unsigned NewSizeBytes = VecElemBytes * NewNumVecElems; + + assert(NewNumVecElems <= TargetVF); + + LLVM_DEBUG(dbgs() << "LSV: attempting to extend chain of " + << NumVecElems << " " + << (IsLoadChain ? "loads" : "stores") << " to " + << NewNumVecElems << " elements\n"); + // Do not artificially increase the chain if it becomes misaligned, + // otherwise we may unnecessary split the chain when the target actually + // supports non-pow2 VF. + if (accessIsAllowedAndFast(NewSizeBytes, AS, Alignment, VecElemBits) && + ((IsLoadChain ? 
TTI.isLegalToWidenLoads() + : TTI.isLegalMaskedStore( + FixedVectorType::get(VecElemTy, NewNumVecElems), + Alignment, AS, /*IsMaskConstant=*/true)))) { + LLVM_DEBUG(dbgs() + << "LSV: extending " << (IsLoadChain ? "load" : "store") + << " chain of " << NumVecElems << " " + << (IsLoadChain ? "loads" : "stores") + << " with total byte size of " << SizeBytes << " to " + << NewNumVecElems << " " + << (IsLoadChain ? "loads" : "stores") + << " with total byte size of " << NewSizeBytes + << ", TargetVF=" << TargetVF << " \n"); + + unsigned ASPtrBits = DL.getIndexSizeInBits(AS); + ChainElem Prev = C[CEnd]; + for (unsigned i = 0; i < (NewNumVecElems - NumVecElems); i++) { + ChainElem NewElem = createExtraElementAfter( + Prev, APInt(ASPtrBits, VecElemBytes), "Extend"); + ExtendingLoadsStores.push_back(NewElem); + Prev = ExtendingLoadsStores.back(); + } + + // Update the size and number of elements for upcoming checks. + SizeBytes = NewSizeBytes; + NumVecElems = NewNumVecElems; + } + } + + if (!accessIsAllowedAndFast(SizeBytes, AS, Alignment, VecElemBits)) { LLVM_DEBUG( dbgs() << "LSV: splitChainByAlignment discarding candidate chain " "because its alignment is not AllowedAndFast: " @@ -849,10 +999,41 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { continue; } + if (CandidateChainsMayContainExtraStores) { + // The legality of adding extra stores to ExtendingLoadsStores has + // already been checked, but if the candidate chain contains extra + // stores from an earlier optimization, confirm legality now. + // This filter is essential because, when filling gaps in + // splitChainByContinuity, we queried the API to check that (for a given + // element type and address space) there *may* be a legal masked store + // we can try to create. Now, we need to check if the actual chain we + // ended up with is legal to turn into a masked store. + // This is relevant for NVPTX targets, for example, where a masked store + // is only legal if we have ended up with a 256-bit vector. + bool CandidateChainContainsExtraStores = llvm::any_of( + ArrayRef(C).slice(CBegin, CEnd - CBegin + 1), + [this](const ChainElem &E) { + return ExtraElements.contains(E.Inst); + }); + + if (CandidateChainContainsExtraStores && + !TTI.isLegalMaskedStore( + FixedVectorType::get(VecElemTy, NumVecElems), Alignment, AS, + /*IsMaskConstant=*/true)) { + LLVM_DEBUG(dbgs() + << "LSV: splitChainByAlignment discarding candidate chain " + "because it contains extra stores that we cannot " + "legally vectorize into a masked store \n"); + continue; + } + } + // Hooray, we can vectorize this chain! Chain &NewChain = Ret.emplace_back(); for (unsigned I = CBegin; I <= CEnd; ++I) NewChain.emplace_back(C[I]); + for (ChainElem E : ExtendingLoadsStores) + NewChain.emplace_back(E); CBegin = CEnd; // Skip over the instructions we've added to the chain. break; } @@ -864,6 +1045,12 @@ bool Vectorizer::vectorizeChain(Chain &C) { if (C.size() < 2) return false; + // If we are left with a two-element chain, and one of the elements is an + // extra element, we don't want to vectorize + if (C.size() == 2 && (ExtraElements.contains(C[0].Inst) || + ExtraElements.contains(C[1].Inst))) + return false; + sortChainInOffsetOrder(C); LLVM_DEBUG({ @@ -983,12 +1170,41 @@ bool Vectorizer::vectorizeChain(Chain &C) { } } - // Chain is in offset order, so C[0] is the instr with the lowest offset, - // i.e. the root of the vector. 
- VecInst = Builder.CreateAlignedStore( - Vec, - getLoadStorePointerOperand(C[0].Inst), - Alignment); + // If the chain originates from extra stores, we need to vectorize into a + // masked store. + bool ChainContainsExtraStores = llvm::any_of(C, [this](const ChainElem &E) { + return ExtraElements.contains(E.Inst); + }); + if (ChainContainsExtraStores) { + assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS, + /*IsMaskConstant=*/true)); + unsigned MaskIdx = 0; + // loop through the chain and create a mask for the masked store + Value *Mask = PoisonValue::get(FixedVectorType::get( + Builder.getInt1Ty(), cast(VecTy)->getNumElements())); + for (const ChainElem &E : C) { + bool IsExtraStore = ExtraElements.contains(E.Inst); + if (FixedVectorType *VT = + dyn_cast(getLoadStoreType(E.Inst))) { + for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) { + Mask = Builder.CreateInsertElement(Mask, + Builder.getInt1(!IsExtraStore), + Builder.getInt32(MaskIdx++)); + } + } else { + Mask = + Builder.CreateInsertElement(Mask, Builder.getInt1(!IsExtraStore), + Builder.getInt32(MaskIdx++)); + } + } + VecInst = Builder.CreateMaskedStore( + Vec, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask); + } else { + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. + VecInst = Builder.CreateAlignedStore( + Vec, getLoadStorePointerOperand(C[0].Inst), Alignment); + } } propagateMetadata(VecInst, C); @@ -1641,3 +1857,118 @@ std::optional Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, .sextOrTrunc(OrigBitWidth); return std::nullopt; } + +bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, + Align Alignment, + unsigned VecElemBits) const { + if (Alignment.value() % SizeBytes == 0) + return true; + unsigned VectorizedSpeed = 0; + bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( + F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); + if (!AllowsMisaligned) { + LLVM_DEBUG( + dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS + << " with alignment " << Alignment.value() + << " is misaligned, and therefore can't be vectorized.\n"); + return false; + } + + unsigned ElementwiseSpeed = 0; + (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, + Alignment, &ElementwiseSpeed); + if (VectorizedSpeed < ElementwiseSpeed) { + LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " has relative speed " << VectorizedSpeed + << ", which is lower than the elementwise speed of " + << ElementwiseSpeed + << ". Therefore this access won't be vectorized.\n"); + return false; + } + return true; +} + +bool Vectorizer::shouldAttemptMaskedStore(const ArrayRef C) const { + assert(isa(C[0].Inst)); + + unsigned AS = getLoadStoreAddressSpace(C[0].Inst); + Type *ElementType = getLoadStoreType(C[0].Inst)->getScalarType(); + unsigned VecRegBits = TTI.getLoadStoreVecRegBitWidth(AS); + // Assume max alignment, splitChainByAlignment will legalize it later if the + // necessary alignment is not reached. + Align OptimisticAlign = Align(VecRegBits / 8); + unsigned int MaxVectorNumElems = + VecRegBits / DL.getTypeSizeInBits(ElementType); + + // Attempt to find the smallest power-of-two number of elements that, if + // well aligned, could be represented as a legal masked store. + // If one exists for a given element type and address space, it is worth + // attempting to fill gaps as we may be able to create a legal masked store. 
+ // If we do not end up with a legal masked store, chains with extra elements + // will be discarded. + const unsigned MinMaskedStoreNumElems = 4; + for (unsigned NumElems = MinMaskedStoreNumElems; + NumElems <= MaxVectorNumElems; NumElems *= 2) { + FixedVectorType *VectorType = FixedVectorType::get(ElementType, NumElems); + if (TTI.isLegalMaskedStore(VectorType, OptimisticAlign, AS, + /*IsMaskConstant=*/true)) + return true; + } + return false; +} + +ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev, + APInt Offset, StringRef Prefix, + Align Alignment) { + Instruction *NewElement = nullptr; + Builder.SetInsertPoint(Prev.Inst->getNextNode()); + if (LoadInst *PrevLoad = dyn_cast(Prev.Inst)) { + Value *NewGep = Builder.CreatePtrAdd( + PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP"); + LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n"); + NewElement = Builder.CreateAlignedLoad(PrevLoad->getType(), NewGep, + Alignment, Prefix); + } else { + StoreInst *PrevStore = cast(Prev.Inst); + + Value *NewGep = Builder.CreatePtrAdd( + PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP"); + LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n"); + NewElement = Builder.CreateAlignedStore( + PoisonValue::get(PrevStore->getValueOperand()->getType()), NewGep, + Alignment); + } + + // Attach all metadata to the new element. + // propagateMetadata will fold it into the final vector when applicable. + NewElement->copyMetadata(*Prev.Inst); + + // Cache created elements for tracking and cleanup + ExtraElements.insert(NewElement); + + APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset; + LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n" + << *NewElement + << " OffsetFromLeader: " << NewOffsetFromLeader << "\n"); + return ChainElem{NewElement, NewOffsetFromLeader}; +} + +void Vectorizer::deleteExtraElements() { + for (auto *ExtraElement : ExtraElements) { + if (isa(ExtraElement)) { + [[maybe_unused]] bool Deleted = + RecursivelyDeleteTriviallyDeadInstructions(ExtraElement); + assert(Deleted && "Extra Load should always be trivially dead"); + } else { + // Unlike Extra Loads, Extra Stores won't be "dead", but should all be + // deleted regardless. They will have either been combined into a masked + // store, or will be left behind and need to be cleaned up. 
+ auto *PtrOperand = getLoadStorePointerOperand(ExtraElement); + ExtraElement->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); + } + } + + ExtraElements.clear(); +} diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index dd9a472984c25..19ec2574e32b4 100644 --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -45,29 +45,31 @@ define half @fh(ptr %p) { ; ENABLED-LABEL: fh( ; ENABLED: { ; ENABLED-NEXT: .reg .b16 %rs<10>; -; ENABLED-NEXT: .reg .b32 %r<13>; +; ENABLED-NEXT: .reg .b32 %r<17>; ; ENABLED-NEXT: .reg .b64 %rd<2>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: ; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0]; -; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8]; -; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2; -; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1; -; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3; -; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4; -; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3; -; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6; -; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7; -; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6; -; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9; -; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8; -; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5; -; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12; +; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; } +; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2; +; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5; +; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4; +; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7; +; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3; +; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2; +; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10; +; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7; +; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6; +; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13; +; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8; +; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1; +; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16; ; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9; ; ENABLED-NEXT: ret; ; diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index 51f6b00601069..4870050dd2d43 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -171,8 +171,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) ; CHECK-LABEL: callee_St4x3( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] - ; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; - ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0]; ; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]]; ; CHECK-NEXT: ret; @@ -394,8 +393,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St4x7( ; CHECK-NEXT: 
.param .align 16 .b8 callee_St4x7_param_0[28] ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; - ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; - ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; + ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16]; ; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; ; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]]; diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 61ff80632c789..5499dbce61bae 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -110,7 +110,7 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; -; CHECK-PTX-NEXT: st.b64 [%SP], 4294967297; +; CHECK-PTX-NEXT: st.v2.b32 [%SP], {1, 1}; ; CHECK-PTX-NEXT: st.b32 [%SP+8], 1; ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1; ; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408; diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll new file mode 100644 index 0000000000000..24d0dea086ba8 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +;; Check that the vectorizer extends a Chain to the next power of two, +;; essentially loading more vector elements than the original +;; code. Alignment and other requirement for vectorization should +;; still be met. 
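(The element-count arithmetic these extend-chain tests exercise is a plain power-of-two round-up bounded by the target VF. A standalone sketch under those assumptions, mirroring the PowerOf2Ceil step in splitChainByAlignment above but not the pass code itself; the actual pass additionally requires an odd element count below the target VF and re-checks alignment and legality. The helper name is invented for illustration.)

// Round a chain length up to the next power of two, as long as the result
// still fits in the target vector factor; otherwise leave it unchanged.
// e.g. 3 x i32 -> 4 x i32 (load3to4), 5 x i16 -> 8 x i16 (load5to8).
static unsigned extendedNumElems(unsigned NumVecElems, unsigned TargetVF) {
  unsigned NewNumVecElems = 1;
  while (NewNumVecElems < NumVecElems) // PowerOf2Ceil equivalent
    NewNumVecElems *= 2;
  return NewNumVecElems <= TargetVF ? NewNumVecElems : NumVecElems;
}
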
+ +define void @load3to4(ptr %p) #0 { +; CHECK-LABEL: define void @load3to4( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P_0]], align 16 +; CHECK-NEXT: [[V01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[V12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[V23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[EXTEND4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i32, ptr %p, i32 0 + %p.1 = getelementptr i32, ptr %p, i32 1 + %p.2 = getelementptr i32, ptr %p, i32 2 + + %v0 = load i32, ptr %p.0, align 16 + %v1 = load i32, ptr %p.1, align 4 + %v2 = load i32, ptr %p.2, align 8 + + ret void +} + +define void @load5to8(ptr %p) #0 { +; CHECK-LABEL: define void @load5to8( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P_0]], align 16 +; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i16, ptr %p, i32 0 + %p.1 = getelementptr i16, ptr %p, i32 1 + %p.2 = getelementptr i16, ptr %p, i32 2 + %p.3 = getelementptr i16, ptr %p, i32 3 + %p.4 = getelementptr i16, ptr %p, i32 4 + + %v0 = load i16, ptr %p.0, align 16 + %v1 = load i16, ptr %p.1, align 2 + %v2 = load i16, ptr %p.2, align 4 + %v3 = load i16, ptr %p.3, align 8 + %v4 = load i16, ptr %p.4, align 2 + + ret void +} + +define void @load3to4_unaligned(ptr %p) #0 { +; CHECK-LABEL: define void @load3to4_unaligned( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0 +; CHECK-NEXT: [[P_2:%.*]] = getelementptr i32, ptr [[P]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P_0]], align 8 +; CHECK-NEXT: [[V01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[V12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[P_2]], align 8 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i32, ptr %p, i32 0 + %p.1 = getelementptr i32, ptr %p, i32 1 + %p.2 = getelementptr i32, ptr %p, i32 2 + + %v0 = load i32, ptr %p.0, align 8 + %v1 = load i32, ptr %p.1, align 4 + %v2 = load i32, ptr %p.2, align 8 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll new file mode 100644 index 0000000000000..e812f8750fa76 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Test that gap filled instructions get deleted if they are not used +%struct.S10 = type { i32, i32, i32, i32 } + +; First, confirm that gap instructions get generated and would be 
vectorized if the alignment is correct +define void @fillTwoGapsCanVectorize(ptr %in) { +; CHECK-LABEL: define void @fillTwoGapsCanVectorize( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16 +; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr %in, align 16 + %getElem = getelementptr i8, ptr %in, i64 12 + %load3 = load i32, ptr %getElem, align 4 + ret void +} + +; Then, confirm that gap instructions get deleted if the alignment prevents the vectorization +define void @fillTwoGapsCantVectorize(ptr %in) { +; CHECK-LABEL: define void @fillTwoGapsCantVectorize( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[IN]], align 4 +; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[IN]], i64 12 +; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEM]], align 4 +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr %in, align 4 + %getElem = getelementptr i8, ptr %in, i64 12 + %load3 = load i32, ptr %getElem, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll new file mode 100644 index 0000000000000..6d0dfc677780d --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Test that gap filled instructions don't lose invariant metadata +%struct.S10 = type { i32, i32, i32, i32 } + +; With no gaps, if every load is invariant, the vectorized load will be too. +define i32 @noGaps(ptr %in) { +; CHECK-LABEL: define i32 @noGaps( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]] +; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]] +; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]] +; CHECK-NEXT: ret i32 [[SUM0123]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1 + %load1 = load i32, ptr %getElem1, align 4, !invariant.load !0 + %getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2 + %load2 = load i32, ptr %getElem2, align 4, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4, !invariant.load !0 + %sum01 = add i32 %load0, %load1 + %sum012 = add i32 %sum01, %load2 + %sum0123 = add i32 %sum012, %load3 + ret i32 %sum0123 +} + +; If one of the loads is not invariant, the vectorized load will not be invariant. 
+define i32 @noGapsMissingInvariant(ptr %in) { +; CHECK-LABEL: define i32 @noGapsMissingInvariant( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16 +; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]] +; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]] +; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]] +; CHECK-NEXT: ret i32 [[SUM0123]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1 + %load1 = load i32, ptr %getElem1, align 4, !invariant.load !0 + %getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2 + %load2 = load i32, ptr %getElem2, align 4, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4 + %sum01 = add i32 %load0, %load1 + %sum012 = add i32 %sum01, %load2 + %sum0123 = add i32 %sum012, %load3 + ret i32 %sum0123 +} + +; With two gaps, if every real load is invariant, the vectorized load will be too. +define i32 @twoGaps(ptr %in) { +; CHECK-LABEL: define i32 @twoGaps( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0]] +; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LOAD03]], [[LOAD36]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4, !invariant.load !0 + %sum = add i32 %load0, %load3 + ret i32 %sum +} + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. 
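(A side note before the vector-element and masked-store tests: the masked-store tests later in this patch rely on the mask built in vectorizeChain, where an original store contributes a true lane and a gap-fill/extend store contributes a false lane, so its poison value is never written to memory. Below is a condensed sketch of that construction; the vector-typed-element case is omitted and the helper name buildStoreMask is invented for illustration.)

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

// One mask lane per chain element: true for an original store, false for a
// gap-fill/extend store, so the masked store leaves padded lanes untouched.
static llvm::Value *buildStoreMask(llvm::IRBuilder<> &B,
                                   llvm::ArrayRef<bool> IsExtraElement) {
  llvm::Value *Mask = llvm::PoisonValue::get(
      llvm::FixedVectorType::get(B.getInt1Ty(), IsExtraElement.size()));
  for (unsigned I = 0, E = IsExtraElement.size(); I != E; ++I)
    Mask = B.CreateInsertElement(Mask, B.getInt1(!IsExtraElement[I]),
                                 B.getInt32(I));
  return Mask;
}
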
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll new file mode 100644 index 0000000000000..fe7123898d450 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S < %s | FileCheck %s + +; The LSV can handle vector inputs, and gap filling can too, with one exception: +; currently, we do not gap fill when the loads enclosing the gap are different sizes +; Otherwise, vectors are treated the same as any other scalar types + +define void @i1x8_gap_gap_i1x8(ptr %ptr) { +; CHECK-LABEL: define void @i1x8_gap_gap_i1x8( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i1>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[L03:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> +; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> +; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> +; CHECK-NEXT: [[L36:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 3 + + %l0 = load <8 x i1>, ptr %ptr0, align 4 + %l3 = load <8 x i1>, ptr %ptr3, align 1 + + ret void +} + +; The chain elements are different sizes, gap filling won't kick in +define void @i1x8_gap_gap_i1x16(ptr %ptr) { +; CHECK-LABEL: define void @i1x8_gap_gap_i1x16( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[L0:%.*]] = load <8 x i1>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[L3:%.*]] = load <16 x i1>, ptr [[PTR3]], align 2 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 3 + + %l0 = load <8 x i1>, ptr %ptr0, align 4 + %l3 = load <16 x i1>, ptr %ptr3, align 2 + + ret void +} + +; Gap of two load <2 x i8>s gets filled +define void @i8x2_gap_gap_i8x2(ptr %ptr) { +; CHECK-LABEL: define void @i8x2_gap_gap_i8x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[L36:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 6 + + %l0 = load <2 x i8>, ptr %ptr0, align 8 + %l3 = load <2 x i8>, ptr %ptr3, align 2 + + ret void +} + +; The chain elements are different sizes, gap filling won't kick in +define void @i8x2_gap_gap_i8(ptr %ptr) { +; CHECK-LABEL: define void @i8x2_gap_gap_i8( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr 
[[PTR]], i64 6 +; CHECK-NEXT: [[L0:%.*]] = load <2 x i8>, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[L3:%.*]] = load i8, ptr [[PTR3]], align 1 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 6 + + %l0 = load <2 x i8>, ptr %ptr0, align 8 + %l3 = load i8, ptr %ptr3, align 1 + + ret void +} + + +define void @i16x2_gap_i16x2_i16x2(ptr %ptr) { +; CHECK-LABEL: define void @i16x2_gap_i16x2_i16x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[PTR0]], align 16 +; CHECK-NEXT: [[L01:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L23:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L34:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr2 = getelementptr i8, ptr %ptr, i64 8 + %ptr3 = getelementptr i8, ptr %ptr, i64 12 + + %l0 = load <2 x i16>, ptr %ptr0, align 16 + %l2 = load <2 x i16>, ptr %ptr2, align 2 + %l3 = load <2 x i16>, ptr %ptr3, align 2 + + ret void +} + +define void @i16x2_gap_gap_i16x2(ptr %ptr) { +; CHECK-LABEL: define void @i16x2_gap_gap_i16x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[PTR0]], align 16 +; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L36:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 12 + + %l0 = load <2 x i16>, ptr %ptr0, align 16 + %l3 = load <2 x i16>, ptr %ptr3, align 4 + + ret void +} + +define void @i32x2_i32x2_gap_i32x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i32x2_i32x2_gap_i32x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[VEC34:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i32>, ptr addrspace(1) %in, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %vec1 = load <2 x i32>, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %vec3 = load <2 x i32>, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; This gap is filled but then eventually discarded because the total size +; of the vector is larger than the target supports. 
+define void @i64x2_gap_i64x2_i64x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i64x2_gap_i64x2_i64x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VEC0:%.*]] = load <2 x i64>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 32 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[GETELEM3]], align 32 +; CHECK-NEXT: [[VEC31:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i64>, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %vec3 = load <2 x i64>, ptr addrspace(1) %getElem3, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + %vec1 = load <2 x i64>, ptr addrspace(1) %getElem1, align 16 + ret void +} + +; This gap is filled but then eventually discarded because the total size +; of the vector is larger than the target supports. +define void @i64x2_i64x2_gap_i64x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i64x2_i64x2_gap_i64x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[VEC32:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48 +; CHECK-NEXT: [[VEC1:%.*]] = load <2 x i64>, ptr addrspace(1) [[GETELEM1]], align 8 +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i64>, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %vec3 = load <2 x i64>, ptr addrspace(1) %getElem3, align 16 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + %vec1 = load <2 x i64>, ptr addrspace(1) %getElem1, align 8 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll new file mode 100644 index 0000000000000..82ebffed7f765 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Load elements 0, 1, and 3, filling the gap with a generated load of element 2 +define void @test(ptr %ptr) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; Load elements 0, 2, and 3, filling the gap with a generated load of element 1 +define void @test2(ptr %ptr) { +; CHECK-LABEL: define void @test2( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; 
CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LD23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8 + %ld2 = load i32, ptr %gep2, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; This gap can be filled, but the types are too large to do a v4 load, +; So we should end up with a v2 load and a single scalar load +define void @test3(ptr %ptr) { +; CHECK-LABEL: define void @test3( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 24 +; CHECK-NEXT: [[LD3:%.*]] = load i64, ptr [[GEP3]], align 4 +; CHECK-NEXT: ret void +; + %ld0 = load i64, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 8 + %ld1 = load i64, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld3 = load i64, ptr %gep3, align 4 + ret void +} + +; This gap can be filled, but the types are too large to do a v4 load, +; So we should end up with a v2 load and a single scalar load +define void @test4(ptr %ptr) { +; CHECK-LABEL: define void @test4( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[PTR]], align 16 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[GEP2]], align 16 +; CHECK-NEXT: [[LD21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD32:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: ret void +; + %ld0 = load i64, ptr %ptr, align 16 + %gep2 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld2 = load i64, ptr %gep2, align 16 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld3 = load i64, ptr %gep3, align 4 + ret void +} + +; Load elements 0 and 3, filling the gap with a generated load of element 1 and 2 +define void @test5(ptr %ptr) { +; CHECK-LABEL: define void @test5( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; Load elements 0, 1, 3, 4, 6, and 7, filling gaps at elements 2 and 5. 
+define void @test6(ptr %ptr) { +; CHECK-LABEL: define void @test6( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP4]], align 16 +; CHECK-NEXT: [[LD47:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[GAPFILL28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD710:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep4 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld4 = load i32, ptr %gep4, align 16 + %gep6 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld6 = load i32, ptr %gep6, align 4 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} + +; Load elements 0, 1, 3, 4 and 7, elements 2, 5, and 6 will be filled +define void @test7(ptr %ptr) { +; CHECK-LABEL: define void @test7( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD05:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD16:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL7:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD38:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP4]], align 16 +; CHECK-NEXT: [[LD49:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[GAPFILL210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[GAPFILL411:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD712:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep4 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld4 = load i32, ptr %gep4, align 16 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} + +; Load elements 0, 1, 3, 5, 6, and 7. Elements 2 and 4 will be filled. +; Element 4 will be created and well-aligned because of its +; distance from the first load. 
+define void @test8(ptr %ptr) { +; CHECK-LABEL: define void @test8( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 12 +; CHECK-NEXT: [[GAPFILLGEP1:%.*]] = getelementptr i8, ptr [[GEP3]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GAPFILLGEP1]], align 16 +; CHECK-NEXT: [[GAPFILL27:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[LD58:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD710:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep5 = getelementptr inbounds i8, ptr %ptr, i32 20 + %ld5 = load i32, ptr %gep5, align 16 + %gep6 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld6 = load i32, ptr %gep6, align 4 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll new file mode 100644 index 0000000000000..1346bd0a3fc26 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll @@ -0,0 +1,541 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=infer-alignment,load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S -o - %s | FileCheck %s + +; POSITIVE TESTS + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @singleGap(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGap( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[OUT]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @singleGapDouble(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapDouble( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p1(<4 x double> , ptr addrspace(1) [[OUT]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store double 1.0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store double 2.0, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store double 4.0, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 3, filling the gaps with generated stores of elements 1 and 2 +define void 
@multipleGaps(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @multipleGaps( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[OUT]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 3, 4, 7, filling the gaps with generated stores of elements 1, 2, 5, 6 +define void @multipleGaps8xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @multipleGaps8xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 2, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 4, ptr addrspace(1) %getElem4, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i32 8, ptr addrspace(1) %getElem7, align 4 + ret void +} + +; store elements 0, 1, 2, 3, 5, 6, 7, filling the gap with a generated store of element 4, +; resulting in two 4xi64 stores with the second one led by a gap filled store. +define void @singleGapLongerChain(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapLongerChain( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store <4 x i64> , ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GAPFILLGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[GETELEM3]], i64 8 +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[GAPFILLGEP]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i64 3, ptr addrspace(1) %getElem2, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + %getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 40 + store i64 6, ptr addrspace(1) %getElem5, align 8 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 48 + store i64 7, ptr addrspace(1) %getElem6, align 8 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 56 + store i64 8, ptr addrspace(1) %getElem7, align 8 + ret void +} + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @vectorElements(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @vectorElements( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; CHECK-NEXT: ret void +; + store <2 x i32> , ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store <2 x i32> , ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store <2 x i32> , ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 1, 3. 
2 should not end up filled because 8xi64 is not legal. +define void @vectorElements64(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @vectorElements64( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <4 x i64> , ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48 +; CHECK-NEXT: store <2 x i64> , ptr addrspace(1) [[GETELEM1]], align 16 +; CHECK-NEXT: ret void +; + store <2 x i64> , ptr addrspace(1) %in, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + store <2 x i64> , ptr addrspace(1) %getElem1, align 16 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + store <2 x i64> , ptr addrspace(1) %getElem3, align 16 + ret void +} + +; store elements 0, 1, 2, extending element 3 +define void @extendStores(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStores( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[OUT]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i64 3, ptr addrspace(1) %getElem2, align 8 + ret void +} + +; store elements 0, 1, 2, 3, 4 extending elements 5, 6, 7 +define void @extendStores8xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStores8xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 2, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 4, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 5, ptr addrspace(1) %getElem4, align 4 + ret void +} + +; store elements 0, 1, 2, 3, 4 extending elements 5, 6, 7 +define void @extendStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStoresFromLoads8xi32( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], 
i32 [[LOAD27]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr addrspace(1) %in, align 32 + %loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4 + %loadGetElem2 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load2 = load i32, ptr addrspace(1) %loadGetElem2, align 4 + %loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4 + %loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4 + + store i32 %load0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 %load1, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 %load2, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 %load3, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 %load4, ptr addrspace(1) %getElem4, align 4 + ret void +} + +; store elements 0, 1, 3, 4, gap fill element 2, extend elements 5, 6, 7 +define void @extendAndGapFillStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendAndGapFillStoresFromLoads8xi32( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 poison, i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; 
CHECK-NEXT: ret void +; + %load0 = load i32, ptr addrspace(1) %in, align 32 + %loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4 + %loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4 + %loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4 + + store i32 %load0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 %load1, ptr addrspace(1) %getElem1, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 %load3, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 %load4, ptr addrspace(1) %getElem4, align 4 + ret void +} + + +; NEGATIVE TESTS + +; Wrong address space, no gap filling +define void @singleGapWrongAddrSpace(ptr addrspace(3) %out) { +; CHECK-LABEL: define void @singleGapWrongAddrSpace( +; CHECK-SAME: ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i64> , ptr addrspace(3) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[OUT]], i32 24 +; CHECK-NEXT: store i64 4, ptr addrspace(3) [[GETELEM3]], align 8 +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(3) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 8 + store i64 2, ptr addrspace(3) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 24 + store i64 4, ptr addrspace(3) %getElem3, align 8 + ret void +} + +; Not enough alignment for masked store, but we still vectorize the smaller vector +define void @singleGapMisaligned(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapMisaligned( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i64> , ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i64 4, ptr addrspace(1) [[GETELEM3]], align 8 +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 16 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; Not enough bytes to meet the minimum masked store size for the target +define void @singleGap4xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGap4xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 1, ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[GETELEM2]], align 8 +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 4, ptr addrspace(1) %getElem3, align 4 + ret void +} + +; store elements 0, 1, 2, 5, 6, 7. 
3 and 4 don't get filled because the heuristic +; only fills 2-element gaps that are in the middle of a multiple of 4 +define void @gapInWrongLocation(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @gapInWrongLocation( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i32 3, ptr addrspace(1) [[GETELEM2]], align 8 +; CHECK-NEXT: [[GETELEM5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20 +; CHECK-NEXT: store i32 5, ptr addrspace(1) [[GETELEM5]], align 4 +; CHECK-NEXT: [[GETELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[GETELEM6]], align 8 +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 2, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20 + store i32 5, ptr addrspace(1) %getElem5, align 4 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i32 6, ptr addrspace(1) %getElem6, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i32 7, ptr addrspace(1) %getElem7, align 4 + ret void +} + +; This test has 32-bytes of i8s with a 2-element gap in the middle of each 4-byte chunk. +; i8s are not supported by masked stores on the target, so the stores will not be vectorized. +; The loads, on the other hand, get gap filled. +define void @cantMaski8(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @cantMaski8( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[LOAD031:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL32:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL233:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD334:%.*]] = extractelement <32 x i8> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD435:%.*]] = extractelement <32 x i8> [[TMP1]], i32 4 +; CHECK-NEXT: [[GAPFILL436:%.*]] = extractelement <32 x i8> [[TMP1]], i32 5 +; CHECK-NEXT: [[GAPFILL637:%.*]] = extractelement <32 x i8> [[TMP1]], i32 6 +; CHECK-NEXT: [[LOAD738:%.*]] = extractelement <32 x i8> [[TMP1]], i32 7 +; CHECK-NEXT: [[LOAD839:%.*]] = extractelement <32 x i8> [[TMP1]], i32 8 +; CHECK-NEXT: [[GAPFILL840:%.*]] = extractelement <32 x i8> [[TMP1]], i32 9 +; CHECK-NEXT: [[GAPFILL1041:%.*]] = extractelement <32 x i8> [[TMP1]], i32 10 +; CHECK-NEXT: [[LOAD1142:%.*]] = extractelement <32 x i8> [[TMP1]], i32 11 +; CHECK-NEXT: [[LOAD1243:%.*]] = extractelement <32 x i8> [[TMP1]], i32 12 +; CHECK-NEXT: [[GAPFILL1244:%.*]] = extractelement <32 x i8> [[TMP1]], i32 13 +; CHECK-NEXT: [[GAPFILL1445:%.*]] = extractelement <32 x i8> [[TMP1]], i32 14 +; CHECK-NEXT: [[LOAD1546:%.*]] = extractelement <32 x i8> [[TMP1]], i32 15 +; CHECK-NEXT: [[LOAD1647:%.*]] = extractelement <32 x i8> [[TMP1]], i32 16 +; CHECK-NEXT: [[GAPFILL1648:%.*]] = extractelement <32 x i8> [[TMP1]], i32 17 +; CHECK-NEXT: [[GAPFILL1849:%.*]] = extractelement <32 x i8> [[TMP1]], i32 18 +; CHECK-NEXT: [[LOAD1950:%.*]] = extractelement 
<32 x i8> [[TMP1]], i32 19 +; CHECK-NEXT: [[LOAD2051:%.*]] = extractelement <32 x i8> [[TMP1]], i32 20 +; CHECK-NEXT: [[GAPFILL2052:%.*]] = extractelement <32 x i8> [[TMP1]], i32 21 +; CHECK-NEXT: [[GAPFILL2253:%.*]] = extractelement <32 x i8> [[TMP1]], i32 22 +; CHECK-NEXT: [[LOAD2354:%.*]] = extractelement <32 x i8> [[TMP1]], i32 23 +; CHECK-NEXT: [[LOAD2455:%.*]] = extractelement <32 x i8> [[TMP1]], i32 24 +; CHECK-NEXT: [[GAPFILL2456:%.*]] = extractelement <32 x i8> [[TMP1]], i32 25 +; CHECK-NEXT: [[GAPFILL2657:%.*]] = extractelement <32 x i8> [[TMP1]], i32 26 +; CHECK-NEXT: [[LOAD2758:%.*]] = extractelement <32 x i8> [[TMP1]], i32 27 +; CHECK-NEXT: [[LOAD2859:%.*]] = extractelement <32 x i8> [[TMP1]], i32 28 +; CHECK-NEXT: [[GAPFILL2860:%.*]] = extractelement <32 x i8> [[TMP1]], i32 29 +; CHECK-NEXT: [[GAPFILL3061:%.*]] = extractelement <32 x i8> [[TMP1]], i32 30 +; CHECK-NEXT: [[LOAD3162:%.*]] = extractelement <32 x i8> [[TMP1]], i32 31 +; CHECK-NEXT: store i8 [[LOAD031]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[OUTELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 3 +; CHECK-NEXT: store i8 [[LOAD334]], ptr addrspace(1) [[OUTELEM3]], align 1 +; CHECK-NEXT: [[OUTELEM4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 4 +; CHECK-NEXT: store i8 [[LOAD435]], ptr addrspace(1) [[OUTELEM4]], align 4 +; CHECK-NEXT: [[OUTELEM7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 7 +; CHECK-NEXT: store i8 [[LOAD738]], ptr addrspace(1) [[OUTELEM7]], align 1 +; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i8 [[LOAD839]], ptr addrspace(1) [[OUTELEM8]], align 8 +; CHECK-NEXT: [[OUTELEM11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 11 +; CHECK-NEXT: store i8 [[LOAD1142]], ptr addrspace(1) [[OUTELEM11]], align 1 +; CHECK-NEXT: [[OUTELEM12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 12 +; CHECK-NEXT: store i8 [[LOAD1243]], ptr addrspace(1) [[OUTELEM12]], align 4 +; CHECK-NEXT: [[OUTELEM15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 15 +; CHECK-NEXT: store i8 [[LOAD1546]], ptr addrspace(1) [[OUTELEM15]], align 1 +; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16 +; CHECK-NEXT: store i8 [[LOAD1647]], ptr addrspace(1) [[OUTELEM16]], align 16 +; CHECK-NEXT: [[OUTELEM19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 19 +; CHECK-NEXT: store i8 [[LOAD1950]], ptr addrspace(1) [[OUTELEM19]], align 1 +; CHECK-NEXT: [[OUTELEM20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20 +; CHECK-NEXT: store i8 [[LOAD2051]], ptr addrspace(1) [[OUTELEM20]], align 4 +; CHECK-NEXT: [[OUTELEM23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 23 +; CHECK-NEXT: store i8 [[LOAD2354]], ptr addrspace(1) [[OUTELEM23]], align 1 +; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i8 [[LOAD2455]], ptr addrspace(1) [[OUTELEM24]], align 8 +; CHECK-NEXT: [[OUTELEM27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 27 +; CHECK-NEXT: store i8 [[LOAD2758]], ptr addrspace(1) [[OUTELEM27]], align 1 +; CHECK-NEXT: [[OUTELEM28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 28 +; CHECK-NEXT: store i8 [[LOAD2859]], ptr addrspace(1) [[OUTELEM28]], align 4 +; CHECK-NEXT: [[OUTELEM31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 31 +; CHECK-NEXT: store i8 
[[LOAD3162]], ptr addrspace(1) [[OUTELEM31]], align 1 +; CHECK-NEXT: ret void +; + %load0 = load i8, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 3 + %load3 = load i8, ptr addrspace(1) %getElem3, align 1 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load4 = load i8, ptr addrspace(1) %getElem4, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 7 + %load7 = load i8, ptr addrspace(1) %getElem7, align 1 + %getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load8 = load i8, ptr addrspace(1) %getElem8, align 8 + %getElem11 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 11 + %load11 = load i8, ptr addrspace(1) %getElem11, align 1 + %getElem12 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load12 = load i8, ptr addrspace(1) %getElem12, align 4 + %getElem15 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 15 + %load15 = load i8, ptr addrspace(1) %getElem15, align 1 + %getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load16 = load i8, ptr addrspace(1) %getElem16, align 16 + %getElem19 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 19 + %load19 = load i8, ptr addrspace(1) %getElem19, align 1 + %getElem20 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 20 + %load20 = load i8, ptr addrspace(1) %getElem20, align 4 + %getElem23 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 23 + %load23 = load i8, ptr addrspace(1) %getElem23, align 1 + %getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %load24 = load i8, ptr addrspace(1) %getElem24, align 8 + %getElem27 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 27 + %load27 = load i8, ptr addrspace(1) %getElem27, align 1 + %getElem28 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 28 + %load28 = load i8, ptr addrspace(1) %getElem28, align 4 + %getElem31 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 31 + %load31 = load i8, ptr addrspace(1) %getElem31, align 1 + + store i8 %load0, ptr addrspace(1) %out, align 32 + %outElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 3 + store i8 %load3, ptr addrspace(1) %outElem3, align 1 + %outElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i8 %load4, ptr addrspace(1) %outElem4, align 4 + %outElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 7 + store i8 %load7, ptr addrspace(1) %outElem7, align 1 + %outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i8 %load8, ptr addrspace(1) %outElem8, align 8 + %outElem11 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 11 + store i8 %load11, ptr addrspace(1) %outElem11, align 1 + %outElem12 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i8 %load12, ptr addrspace(1) %outElem12, align 4 + %outElem15 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 15 + store i8 %load15, ptr addrspace(1) %outElem15, align 1 + %outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i8 %load16, ptr addrspace(1) %outElem16, align 16 + %outElem19 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 19 + store i8 %load19, ptr addrspace(1) %outElem19, align 1 + %outElem20 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20 + store i8 %load20, ptr addrspace(1) %outElem20, align 4 + %outElem23 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 23 + store i8 %load23, ptr addrspace(1) %outElem23, align 1 + %outElem24 = getelementptr inbounds i8, ptr 
addrspace(1) %out, i32 24 + store i8 %load24, ptr addrspace(1) %outElem24, align 8 + %outElem27 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 27 + store i8 %load27, ptr addrspace(1) %outElem27, align 1 + %outElem28 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i8 %load28, ptr addrspace(1) %outElem28, align 4 + %outElem31 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 31 + store i8 %load31, ptr addrspace(1) %outElem31, align 1 + + ret void +} + +; This test has 32-bytes of i16s with a 2-element gap in the middle of each 4-element chunk. +; i16s are not supported by masked stores on the target, so the stores will not be vectorized. +; The loads, on the other hand, get gap filled. +define void @cantMaski16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @cantMaski16( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[LOAD015:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL217:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD318:%.*]] = extractelement <16 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD419:%.*]] = extractelement <16 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[GAPFILL420:%.*]] = extractelement <16 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[GAPFILL621:%.*]] = extractelement <16 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[LOAD722:%.*]] = extractelement <16 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: [[LOAD823:%.*]] = extractelement <16 x i16> [[TMP1]], i32 8 +; CHECK-NEXT: [[GAPFILL824:%.*]] = extractelement <16 x i16> [[TMP1]], i32 9 +; CHECK-NEXT: [[GAPFILL1025:%.*]] = extractelement <16 x i16> [[TMP1]], i32 10 +; CHECK-NEXT: [[LOAD1126:%.*]] = extractelement <16 x i16> [[TMP1]], i32 11 +; CHECK-NEXT: [[LOAD1227:%.*]] = extractelement <16 x i16> [[TMP1]], i32 12 +; CHECK-NEXT: [[GAPFILL1228:%.*]] = extractelement <16 x i16> [[TMP1]], i32 13 +; CHECK-NEXT: [[GAPFILL1429:%.*]] = extractelement <16 x i16> [[TMP1]], i32 14 +; CHECK-NEXT: [[LOAD1530:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15 +; CHECK-NEXT: store i16 [[LOAD015]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[OUTELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 6 +; CHECK-NEXT: store i16 [[LOAD318]], ptr addrspace(1) [[OUTELEM6]], align 2 +; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i16 [[LOAD419]], ptr addrspace(1) [[OUTELEM8]], align 8 +; CHECK-NEXT: [[OUTELEM14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 14 +; CHECK-NEXT: store i16 [[LOAD722]], ptr addrspace(1) [[OUTELEM14]], align 2 +; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16 +; CHECK-NEXT: store i16 [[LOAD823]], ptr addrspace(1) [[OUTELEM16]], align 16 +; CHECK-NEXT: [[OUTELEM22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 22 +; CHECK-NEXT: store i16 [[LOAD1126]], ptr addrspace(1) [[OUTELEM22]], align 2 +; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i16 [[LOAD1227]], ptr addrspace(1) [[OUTELEM24]], align 8 +; CHECK-NEXT: [[OUTELEM30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 30 +; CHECK-NEXT: store i16 [[LOAD1530]], ptr addrspace(1) [[OUTELEM30]], align 2 +; CHECK-NEXT: ret void +; 
+ %load0 = load i16, ptr addrspace(1) %in, align 32 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 6 + %load3 = load i16, ptr addrspace(1) %getElem6, align 2 + %getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load4 = load i16, ptr addrspace(1) %getElem8, align 8 + %getElem14 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 14 + %load7 = load i16, ptr addrspace(1) %getElem14, align 2 + %getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load8 = load i16, ptr addrspace(1) %getElem16, align 16 + %getElem22 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 22 + %load11 = load i16, ptr addrspace(1) %getElem22, align 2 + %getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %load12 = load i16, ptr addrspace(1) %getElem24, align 8 + %getElem30 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 30 + %load15 = load i16, ptr addrspace(1) %getElem30, align 2 + + store i16 %load0, ptr addrspace(1) %out, align 32 + %outElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 6 + store i16 %load3, ptr addrspace(1) %outElem6, align 2 + %outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i16 %load4, ptr addrspace(1) %outElem8, align 8 + %outElem14 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 14 + store i16 %load7, ptr addrspace(1) %outElem14, align 2 + %outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i16 %load8, ptr addrspace(1) %outElem16, align 16 + %outElem22 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 22 + store i16 %load11, ptr addrspace(1) %outElem22, align 2 + %outElem24 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i16 %load12, ptr addrspace(1) %outElem24, align 8 + %outElem30 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 30 + store i16 %load15, ptr addrspace(1) %outElem30, align 2 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll index 2d3c289c2a12b..e031daab6d786 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll @@ -40,8 +40,7 @@ define void @int8x3a4(ptr nocapture align 4 %ptr) { ret void ; CHECK-LABEL: @int8x3a4 -; CHECK: load <2 x i8> -; CHECK: load i8 +; CHECK: load <4 x i8> ; CHECK: store <2 x i8> ; CHECK: store i8 } From 68a88d16ed381b1ebbd713566d781814e3284204 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Wed, 17 Sep 2025 15:48:02 +0000 Subject: [PATCH 2/8] Clang format --- llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 04f4e92826a52..d452e1609957a 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -930,9 +930,7 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { } Chain ExtendingLoadsStores; - bool ExtendChain = IsLoadChain - ? ExtendLoads - : ExtendStores; + bool ExtendChain = IsLoadChain ? 
ExtendLoads : ExtendStores; if (ExtendChain && NumVecElems < TargetVF && NumVecElems % 2 != 0 && VecElemBits >= 8) { // TargetVF may be a lot higher than NumVecElems, @@ -1047,8 +1045,8 @@ bool Vectorizer::vectorizeChain(Chain &C) { // If we are left with a two-element chain, and one of the elements is an // extra element, we don't want to vectorize - if (C.size() == 2 && (ExtraElements.contains(C[0].Inst) || - ExtraElements.contains(C[1].Inst))) + if (C.size() == 2 && + (ExtraElements.contains(C[0].Inst) || ExtraElements.contains(C[1].Inst))) return false; sortChainInOffsetOrder(C); From 001b4095a931db40264e782afaa443af5cce3ed6 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Thu, 18 Sep 2025 15:37:56 +0000 Subject: [PATCH 3/8] Remove cl opts --- .../Vectorize/LoadStoreVectorizer.cpp | 32 ++----------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index d452e1609957a..b0f7f12b157f3 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -119,29 +119,6 @@ using namespace llvm; #define DEBUG_TYPE "load-store-vectorizer" -cl::opt - ExtendLoads("vect-extend-loads", cl::Hidden, - cl::desc("Load more elements if the target VF is higher " - "than the chain length."), - cl::init(true)); - -cl::opt ExtendStores( - "vect-extend-stores", cl::Hidden, - cl::desc("Store more elements if the target VF is higher " - "than the chain length and we have access to masked stores."), - cl::init(true)); - -cl::opt FillLoadGaps( - "vect-fill-load-gaps", cl::Hidden, - cl::desc("Should Loads be introduced in gaps to enable vectorization."), - cl::init(true)); - -cl::opt - FillStoreGaps("vect-fill-store-gaps", cl::Hidden, - cl::desc("Should Stores be introduced in gaps to enable " - "vectorization into masked stores."), - cl::init(true)); - STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); @@ -689,9 +666,8 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { // store for the target. If later on, we don't end up with a chain that // could be vectorized into a legal masked store, the chains with extra // elements will be filtered out in splitChainByAlignment. - bool TryFillGaps = isa(C[0].Inst) - ? (FillLoadGaps && TTI.isLegalToWidenLoads()) - : (FillStoreGaps && shouldAttemptMaskedStore(C)); + bool TryFillGaps = isa(C[0].Inst) ? TTI.isLegalToWidenLoads() + : shouldAttemptMaskedStore(C); unsigned ASPtrBits = DL.getIndexSizeInBits(getLoadStoreAddressSpace(C[0].Inst)); @@ -930,9 +906,7 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { } Chain ExtendingLoadsStores; - bool ExtendChain = IsLoadChain ? ExtendLoads : ExtendStores; - if (ExtendChain && NumVecElems < TargetVF && NumVecElems % 2 != 0 && - VecElemBits >= 8) { + if (NumVecElems < TargetVF && NumVecElems % 2 != 0 && VecElemBits >= 8) { // TargetVF may be a lot higher than NumVecElems, // so only extend to the next power of 2. 
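      // E.g., a chain of three i32 loads is extended to a 4-element access (the
      // next power of 2) rather than all the way up to TargetVF; see the
      // load3to4 test in extend-chain.ll.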
assert(VecElemBits % 8 == 0); From da7391b6e6d22b96ded1e6cc71fe2141965d936a Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Thu, 18 Sep 2025 15:58:56 +0000 Subject: [PATCH 4/8] Add context argument to TTI API --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 2 +- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 4 +++- llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 7 ++++--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index f8f134c833ea2..aaf37c25939c8 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -821,7 +821,7 @@ class TargetTransformInfo { /// assuming the result is still well-aligned. For example, converting a load /// i32 to a load i64, or vectorizing three continuous load i32s into a load /// <4 x i32>. - LLVM_ABI bool isLegalToWidenLoads() const; + LLVM_ABI bool isLegalToWidenLoads(LLVMContext &Context) const; /// Return true if the target supports nontemporal store. LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 55bd4bd709589..69dd9bffdf8d9 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -318,7 +318,7 @@ class TargetTransformInfoImplBase { return false; } - virtual bool isLegalToWidenLoads() const { return false; } + virtual bool isLegalToWidenLoads(LLVMContext &Context) const { return false; } virtual bool isLegalNTStore(Type *DataType, Align Alignment) const { // By default, assume nontemporal memory stores are available for stores diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index d56cff1ce3695..9ed9f10f770a7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -72,7 +72,9 @@ class NVPTXTTIImpl final : public BasicTTIImplBase { return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace); } - bool isLegalToWidenLoads() const override { return true; }; + bool isLegalToWidenLoads(LLVMContext &Context) const override { + return true; + }; // NVPTX has infinite registers of all kinds, but the actual machine doesn't. // We conservatively return 1 here which is just enough to enable the diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index b0f7f12b157f3..bfdf18b582e7f 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -666,8 +666,9 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { // store for the target. If later on, we don't end up with a chain that // could be vectorized into a legal masked store, the chains with extra // elements will be filtered out in splitChainByAlignment. - bool TryFillGaps = isa(C[0].Inst) ? TTI.isLegalToWidenLoads() - : shouldAttemptMaskedStore(C); + bool TryFillGaps = isa(C[0].Inst) + ? 
TTI.isLegalToWidenLoads(F.getContext()) + : shouldAttemptMaskedStore(C); unsigned ASPtrBits = DL.getIndexSizeInBits(getLoadStoreAddressSpace(C[0].Inst)); @@ -924,7 +925,7 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { // otherwise we may unnecessary split the chain when the target actually // supports non-pow2 VF. if (accessIsAllowedAndFast(NewSizeBytes, AS, Alignment, VecElemBits) && - ((IsLoadChain ? TTI.isLegalToWidenLoads() + ((IsLoadChain ? TTI.isLegalToWidenLoads(F.getContext()) : TTI.isLegalMaskedStore( FixedVectorType::get(VecElemTy, NewNumVecElems), Alignment, AS, /*IsMaskConstant=*/true)))) { From 838017430d514878d37421cfe9eec6ad1a7b9a50 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Thu, 25 Sep 2025 11:21:06 -0500 Subject: [PATCH 5/8] Update llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp Co-authored-by: Matt Arsenault --- llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index bfdf18b582e7f..bf8dd2580ff80 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -365,7 +365,7 @@ class Vectorizer { /// deleted before the end of the pass. ChainElem createExtraElementAfter(const ChainElem &PrevElem, APInt Offset, StringRef Prefix, - Align Alignment = Align(1)); + Align Alignment = Align()); /// Delete dead GEPs and extra Load/Store instructions created by /// createExtraElementAfter From f07f6301bfa1fe8567a2c9cb9a0239814c5b2f4b Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Thu, 16 Oct 2025 19:08:21 +0000 Subject: [PATCH 6/8] Update tests to test for masked load generation in the LSV --- llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 1 + .../CodeGen/NVPTX/param-vectorize-device.ll | 2 ++ .../LoadStoreVectorizer/NVPTX/extend-chain.ll | 4 ++-- .../NVPTX/gap-fill-cleanup.ll | 2 +- .../NVPTX/gap-fill-invariant.ll | 2 +- .../NVPTX/gap-fill-vectors.ll | 10 +++++----- .../LoadStoreVectorizer/NVPTX/gap-fill.ll | 18 +++++++++--------- .../LoadStoreVectorizer/NVPTX/masked-store.ll | 8 ++++---- .../LoadStoreVectorizer/NVPTX/vectorize_i8.ll | 2 +- 9 files changed, 26 insertions(+), 23 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index 19ec2574e32b4..21b18555371dc 100644 --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -50,6 +50,7 @@ define half @fh(ptr %p) { ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: ; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0]; +; ENABLED-NEXT: .pragma "used_bytes_mask 1023"; ; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; } ; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2; diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index 4870050dd2d43..67dd29b1b6ca6 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -171,6 +171,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) ; CHECK-LABEL: callee_St4x3( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] + ; CHECK: .pragma "used_bytes_mask 4095"; ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], 
%{{.*}}}, [callee_St4x3_param_0]; ; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]]; @@ -393,6 +394,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St4x7( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28] ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; + ; CHECK: .pragma "used_bytes_mask 4095"; ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16]; ; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll index 24d0dea086ba8..2207d5b471d20 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll @@ -10,7 +10,7 @@ define void @load3to4(ptr %p) #0 { ; CHECK-LABEL: define void @load3to4( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P_0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[P_0]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[V01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[V12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[V23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 @@ -32,7 +32,7 @@ define void @load5to8(ptr %p) #0 { ; CHECK-LABEL: define void @load5to8( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P_0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[P_0]], i32 16, <8 x i1> , <8 x i16> poison) ; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 ; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 ; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll index e812f8750fa76..e92ffe8eadbc3 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll @@ -8,7 +8,7 @@ define void @fillTwoGapsCanVectorize(ptr %in) { ; CHECK-LABEL: define void @fillTwoGapsCanVectorize( ; CHECK-SAME: ptr [[IN:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[IN]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll index 6d0dfc677780d..7a28faf8b4810 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll @@ -62,7 +62,7 @@ define i32 @noGapsMissingInvariant(ptr %in) { 
define i32 @twoGaps(ptr %in) { ; CHECK-LABEL: define i32 @twoGaps( ; CHECK-SAME: ptr [[IN:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[IN]], i32 16, <4 x i1> , <4 x i32> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll index fe7123898d450..303ec7e564d49 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll @@ -9,7 +9,7 @@ define void @i1x8_gap_gap_i1x8(ptr %ptr) { ; CHECK-LABEL: define void @i1x8_gap_gap_i1x8( ; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i1>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i1> @llvm.masked.load.v32i1.p0(ptr [[PTR0]], i32 4, <32 x i1> , <32 x i1> poison) ; CHECK-NEXT: [[L03:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> ; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> ; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> @@ -49,7 +49,7 @@ define void @i8x2_gap_gap_i8x2(ptr %ptr) { ; CHECK-LABEL: define void @i8x2_gap_gap_i8x2( ; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[PTR0]], i32 8, <8 x i1> , <8 x i8> poison) ; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> ; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> ; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> @@ -89,7 +89,7 @@ define void @i16x2_gap_i16x2_i16x2(ptr %ptr) { ; CHECK-LABEL: define void @i16x2_gap_i16x2_i16x2( ; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[PTR0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[PTR0]], i32 16, <8 x i1> , <8 x i16> poison) ; CHECK-NEXT: [[L01:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> ; CHECK-NEXT: [[GAPFILL2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> ; CHECK-NEXT: [[L23:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> @@ -111,7 +111,7 @@ define void @i16x2_gap_gap_i16x2(ptr %ptr) { ; CHECK-LABEL: define void @i16x2_gap_gap_i16x2( ; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[PTR0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[PTR0]], i32 16, <8 x i1> , <8 x i16> poison) ; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> ; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> 
poison, <2 x i32> ; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> @@ -130,7 +130,7 @@ define void @i16x2_gap_gap_i16x2(ptr %ptr) { define void @i32x2_i32x2_gap_i32x2(ptr addrspace(1) %in) { ; CHECK-LABEL: define void @i32x2_i32x2_gap_i32x2( ; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[IN]], i32 32, <8 x i1> , <8 x i32> poison) ; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[GAPFILL3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll index 82ebffed7f765..aae1a5f7266c4 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll @@ -5,7 +5,7 @@ define void @test(ptr %ptr) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[LD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 @@ -24,7 +24,7 @@ define void @test(ptr %ptr) { define void @test2(ptr %ptr) { ; CHECK-LABEL: define void @test2( ; CHECK-SAME: ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[GAPFILL2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[LD23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 @@ -83,7 +83,7 @@ define void @test4(ptr %ptr) { define void @test5(ptr %ptr) { ; CHECK-LABEL: define void @test5( ; CHECK-SAME: ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 @@ -100,13 +100,13 @@ define void @test5(ptr %ptr) { define void @test6(ptr %ptr) { ; CHECK-LABEL: define void @test6( ; CHECK-SAME: ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 ; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP4]], align 16 +; CHECK-NEXT: 
[[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[GEP4]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LD47:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[GAPFILL28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 @@ -132,13 +132,13 @@ define void @test6(ptr %ptr) { define void @test7(ptr %ptr) { ; CHECK-LABEL: define void @test7( ; CHECK-SAME: ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LD05:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[LD16:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL7:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 ; CHECK-NEXT: [[LD38:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP4]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[GEP4]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LD49:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[GAPFILL210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[GAPFILL411:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 @@ -164,14 +164,14 @@ define void @test7(ptr %ptr) { define void @test8(ptr %ptr) { ; CHECK-LABEL: define void @test8( ; CHECK-SAME: ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 ; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 12 ; CHECK-NEXT: [[GAPFILLGEP1:%.*]] = getelementptr i8, ptr [[GEP3]], i64 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GAPFILLGEP1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[GAPFILLGEP1]], i32 16, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[GAPFILL27:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[LD58:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll index 1346bd0a3fc26..75d9c4b6e3125 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll @@ -160,7 +160,7 @@ define void @extendStores8xi32(ptr addrspace(1) %out) { define void @extendStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CHECK-LABEL: define void @extendStoresFromLoads8xi32( ; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[IN]], i32 32, <8 x i1> , <8 x i32> poison) ; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 ; 
CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 @@ -206,7 +206,7 @@ define void @extendStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) % define void @extendAndGapFillStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CHECK-LABEL: define void @extendAndGapFillStoresFromLoads8xi32( ; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[IN]], i32 32, <8 x i1> , <8 x i32> poison) ; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 @@ -332,7 +332,7 @@ define void @gapInWrongLocation(ptr addrspace(1) %out) { define void @cantMaski8(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CHECK-LABEL: define void @cantMaski8( ; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p1(ptr addrspace(1) [[IN]], i32 32, <32 x i1> , <32 x i8> poison) ; CHECK-NEXT: [[LOAD031:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0 ; CHECK-NEXT: [[GAPFILL32:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL233:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2 @@ -471,7 +471,7 @@ define void @cantMaski8(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @cantMaski16(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CHECK-LABEL: define void @cantMaski16( ; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.masked.load.v16i16.p1(ptr addrspace(1) [[IN]], i32 32, <16 x i1> , <16 x i16> poison) ; CHECK-NEXT: [[LOAD015:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0 ; CHECK-NEXT: [[GAPFILL16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1 ; CHECK-NEXT: [[GAPFILL217:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll index e031daab6d786..03c7f31b40d85 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll @@ -40,7 +40,7 @@ define void @int8x3a4(ptr nocapture align 4 %ptr) { ret void ; CHECK-LABEL: @int8x3a4 -; CHECK: load <4 x i8> +; CHECK: call <4 x i8> @llvm.masked.load.v4i8.p0 ; CHECK: store <2 x i8> ; CHECK: store i8 } From 73441cc36f14c601d95782b5fd4966fd17db88c1 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Thu, 16 Oct 2025 19:48:39 +0000 Subject: [PATCH 7/8] Remove isLegalToWidenLoads API --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 6 ------ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 -- llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ---- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 4 ---- 4 files changed, 16 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index aaf37c25939c8..41ff54f0781a2 100644 --- 
a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -817,12 +817,6 @@ class TargetTransformInfo { LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const; - /// Return true if it is legal to widen loads beyond their current width, - /// assuming the result is still well-aligned. For example, converting a load - /// i32 to a load i64, or vectorizing three continuous load i32s into a load - /// <4 x i32>. - LLVM_ABI bool isLegalToWidenLoads(LLVMContext &Context) const; - /// Return true if the target supports nontemporal store. LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const; /// Return true if the target supports nontemporal load. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 69dd9bffdf8d9..566e1cf51631a 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -318,8 +318,6 @@ class TargetTransformInfoImplBase { return false; } - virtual bool isLegalToWidenLoads(LLVMContext &Context) const { return false; } - virtual bool isLegalNTStore(Type *DataType, Align Alignment) const { // By default, assume nontemporal memory stores are available for stores // that are aligned and have a size that is a power of 2. diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 89cda79558057..09b50c5270e57 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -476,10 +476,6 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType, Align Alignment, return TTIImpl->isLegalMaskedLoad(DataType, Alignment, AddressSpace); } -bool TargetTransformInfo::isLegalToWidenLoads() const { - return TTIImpl->isLegalToWidenLoads(); -} - bool TargetTransformInfo::isLegalNTStore(Type *DataType, Align Alignment) const { return TTIImpl->isLegalNTStore(DataType, Alignment); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 9ed9f10f770a7..b32d931bd3074 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -72,10 +72,6 @@ class NVPTXTTIImpl final : public BasicTTIImplBase { return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace); } - bool isLegalToWidenLoads(LLVMContext &Context) const override { - return true; - }; - // NVPTX has infinite registers of all kinds, but the actual machine doesn't. // We conservatively return 1 here which is just enough to enable the // vectorizers but disables heuristics based on the number of registers. 
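Before the final patch in the series, a rough illustration of the rewrite it performs may help. The IR below is a hand-written sketch in the spirit of the gap-fill tests earlier in the series, not taken from the patch; the function names @gap_sketch and @use are illustrative only, while @llvm.masked.load.v4i32.p0 is the real intrinsic the tests check for. A chain of i32 loads at offsets 0, 4, and 12 has a one-element gap at offset 8; after gap filling, the whole chain is emitted as a single <4 x i32> masked load whose mask disables the filled lane, and the original loads become extractelements:

  ; before the pass: offsets 0, 4 and 12 are loaded, offset 8 is not
  declare void @use(i32, i32, i32)

  define void @gap_sketch(ptr %p) {
    %l0 = load i32, ptr %p, align 16
    %g1 = getelementptr inbounds i8, ptr %p, i64 4
    %l1 = load i32, ptr %g1, align 4
    %g3 = getelementptr inbounds i8, ptr %p, i64 12
    %l3 = load i32, ptr %g3, align 4
    call void @use(i32 %l0, i32 %l1, i32 %l3)
    ret void
  }

  ; conceptually, after gap filling and vectorization
  %v  = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 16, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> poison)
  %l0 = extractelement <4 x i32> %v, i32 0
  %l1 = extractelement <4 x i32> %v, i32 1
  %l3 = extractelement <4 x i32> %v, i32 3
  call void @use(i32 %l0, i32 %l1, i32 %l3)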
From 34a5cdf80242b6211ef636cbc1c1932625d49d09 Mon Sep 17 00:00:00 2001
From: Drew Kersnar
Date: Thu, 16 Oct 2025 20:12:01 +0000
Subject: [PATCH 8/8] Change LSV to create masked loads

---
 .../Vectorize/LoadStoreVectorizer.cpp         | 173 ++++++++++--------
 1 file changed, 98 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index bf8dd2580ff80..155d4119ea1fe 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -355,18 +355,23 @@ class Vectorizer {
                                  unsigned VecElemBits) const;

   /// Before attempting to fill gaps, check if the chain is a candidate for
-  /// a masked store, to save compile time if it is not possible for the address
-  /// space and element type.
-  bool shouldAttemptMaskedStore(const ArrayRef<ChainElem> C) const;
+  /// a masked load/store, to save compile time if it is not possible for the
+  /// address space and element type.
+  bool shouldAttemptMaskedLoadStore(const ArrayRef<ChainElem> C) const;

   /// Create a new GEP and a new Load/Store instruction such that the GEP
   /// is pointing at PrevElem + Offset. In the case of stores, store poison.
-  /// Extra elements will either be combined into a vector/masked store or
+  /// Extra elements will either be combined into a masked load/store or
   /// deleted before the end of the pass.
   ChainElem createExtraElementAfter(const ChainElem &PrevElem, APInt Offset,
                                     StringRef Prefix,
                                     Align Alignment = Align());

+  /// Create a mask that masks off the extra elements in the chain, to be used
+  /// for the creation of a masked load/store.
+  Value *createMaskForExtraElements(const ArrayRef<ChainElem> C, Type *VecTy,
+                                    Align Alignment, unsigned AS);
+
   /// Delete dead GEPs and extra Load/Store instructions created by
   /// createExtraElementAfter
   void deleteExtraElements();
@@ -660,15 +665,11 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {

   // If the chain is not contiguous, we try to fill the gap with "extra"
   // elements to artificially make it contiguous, to try to enable
-  // vectorization.
-  // - Filling gaps in loads is always ok if the target supports widening loads.
-  // - For stores, we only fill gaps if there is a potentially legal masked
-  //   store for the target. If later on, we don't end up with a chain that
-  //   could be vectorized into a legal masked store, the chains with extra
-  //   elements will be filtered out in splitChainByAlignment.
-  bool TryFillGaps = isa<LoadInst>(C[0].Inst)
-                         ? TTI.isLegalToWidenLoads(F.getContext())
-                         : shouldAttemptMaskedStore(C);
+  // vectorization. We only fill gaps if there is a potentially legal masked
+  // load/store for the target. If later on, we don't end up with a chain that
+  // could be vectorized into a legal masked load/store, the chains with extra
+  // elements will be filtered out in splitChainByAlignment.
+  bool TryFillGaps = shouldAttemptMaskedLoadStore(C);

   unsigned ASPtrBits =
       DL.getIndexSizeInBits(getLoadStoreAddressSpace(C[0].Inst));
@@ -826,11 +827,9 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {

   // For compile time reasons, we cache whether or not the superset
-  // of all candidate chains contains any extra stores from earlier gap
-  // filling.
-  bool CandidateChainsMayContainExtraStores =
-      !IsLoadChain && any_of(C, [this](const ChainElem &E) {
-        return ExtraElements.contains(E.Inst);
-      });
+  // of all candidate chains contains any extra loads/stores from earlier gap
+  // filling.
+  bool CandidateChainsMayContainExtraLoadsStores = any_of(
+      C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });

   std::vector<Chain> Ret;
   for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
@@ -925,10 +924,14 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
       // otherwise we may unnecessarily split the chain when the target actually
       // supports non-pow2 VF.
       if (accessIsAllowedAndFast(NewSizeBytes, AS, Alignment, VecElemBits) &&
-          ((IsLoadChain ? TTI.isLegalToWidenLoads(F.getContext())
-                        : TTI.isLegalMaskedStore(
-                              FixedVectorType::get(VecElemTy, NewNumVecElems),
-                              Alignment, AS, /*IsMaskConstant=*/true)))) {
+          ((IsLoadChain &&
+            TTI.isLegalMaskedLoad(
+                FixedVectorType::get(VecElemTy, NewNumVecElems), Alignment,
+                AS, /*IsMaskConstant=*/true)) ||
+           (!IsLoadChain &&
+            TTI.isLegalMaskedStore(
+                FixedVectorType::get(VecElemTy, NewNumVecElems), Alignment,
+                AS, /*IsMaskConstant=*/true)))) {
         LLVM_DEBUG(dbgs() << "LSV: extending "
                           << (IsLoadChain ? "load" : "store") << " chain of "
                           << NumVecElems << " "
@@ -972,31 +975,34 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
         continue;
       }

-      if (CandidateChainsMayContainExtraStores) {
-        // The legality of adding extra stores to ExtendingLoadsStores has
+      if (CandidateChainsMayContainExtraLoadsStores) {
+        // The legality of adding extra loads/stores to ExtendingLoadsStores has
         // already been checked, but if the candidate chain contains extra
-        // stores from an earlier optimization, confirm legality now.
+        // loads/stores from an earlier optimization, confirm legality now.
        // This filter is essential because, when filling gaps in
        // splitChainByContiguity, we queried the API to check that (for a given
-        // element type and address space) there *may* be a legal masked store
-        // we can try to create. Now, we need to check if the actual chain we
-        // ended up with is legal to turn into a masked store.
-        // This is relevant for NVPTX targets, for example, where a masked store
-        // is only legal if we have ended up with a 256-bit vector.
+        // element type and address space) there *may* be a legal masked
+        // load/store we can aspire to create. Now, we need to check if the
+        // actual chain we ended up with is legal to turn into a masked
+        // load/store. This is relevant for NVPTX, for example, where a masked
+        // store is only legal if we have ended up with a 256-bit vector.
-      bool CandidateChainContainsExtraStores = llvm::any_of(
+      bool CandidateChainContainsExtraLoadsStores = llvm::any_of(
           ArrayRef(C).slice(CBegin, CEnd - CBegin + 1),
           [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });
-      if (CandidateChainContainsExtraStores &&
-          !TTI.isLegalMaskedStore(
-              FixedVectorType::get(VecElemTy, NumVecElems), Alignment, AS,
-              /*IsMaskConstant=*/true)) {
+      if (CandidateChainContainsExtraLoadsStores &&
+          ((IsLoadChain && !TTI.isLegalMaskedLoad(
+                               FixedVectorType::get(VecElemTy, NumVecElems),
+                               Alignment, AS, /*IsMaskConstant=*/true)) ||
+           (!IsLoadChain && !TTI.isLegalMaskedStore(
+                                FixedVectorType::get(VecElemTy, NumVecElems),
+                                Alignment, AS, /*IsMaskConstant=*/true)))) {
        LLVM_DEBUG(dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
-                      "because it contains extra stores that we cannot "
-                      "legally vectorize into a masked store \n");
+                      "because it contains extra loads/stores that we cannot "
+                      "legally vectorize into a masked load/store\n");
         continue;
       }
     }
@@ -1024,6 +1030,9 @@ bool Vectorizer::vectorizeChain(Chain &C) {
       (ExtraElements.contains(C[0].Inst) || ExtraElements.contains(C[1].Inst)))
     return false;

+  bool ChainContainsExtraLoadsStores = llvm::any_of(
+      C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });
+
   sortChainInOffsetOrder(C);

   LLVM_DEBUG({
@@ -1070,11 +1079,19 @@ bool Vectorizer::vectorizeChain(Chain &C) {
                          return A.Inst->comesBefore(B.Inst);
                        })->Inst);

-    // Chain is in offset order, so C[0] is the instr with the lowest offset,
-    // i.e. the root of the vector.
-    VecInst = Builder.CreateAlignedLoad(VecTy,
-                                        getLoadStorePointerOperand(C[0].Inst),
-                                        Alignment);
+    // If the chain contains extra loads, we need to vectorize into a
+    // masked load.
+    if (ChainContainsExtraLoadsStores) {
+      assert(TTI.isLegalMaskedLoad(VecTy, Alignment, AS, /*IsMaskConstant=*/true));
+      Value *Mask = createMaskForExtraElements(C, VecTy, Alignment, AS);
+      VecInst = Builder.CreateMaskedLoad(
+          VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
+    } else {
+      // Chain is in offset order, so C[0] is the instr with the lowest offset,
+      // i.e. the root of the vector.
+      VecInst = Builder.CreateAlignedLoad(
+          VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment);
+    }

     unsigned VecIdx = 0;
     for (const ChainElem &E : C) {
@@ -1145,31 +1162,10 @@ bool Vectorizer::vectorizeChain(Chain &C) {

     // If the chain originates from extra stores, we need to vectorize into a
     // masked store.
-    bool ChainContainsExtraStores = llvm::any_of(C, [this](const ChainElem &E) {
-      return ExtraElements.contains(E.Inst);
-    });
-    if (ChainContainsExtraStores) {
-      assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS,
-                                    /*IsMaskConstant=*/true));
-      unsigned MaskIdx = 0;
-      // loop through the chain and create a mask for the masked store
-      Value *Mask = PoisonValue::get(FixedVectorType::get(
-          Builder.getInt1Ty(), cast<FixedVectorType>(VecTy)->getNumElements()));
-      for (const ChainElem &E : C) {
-        bool IsExtraStore = ExtraElements.contains(E.Inst);
-        if (FixedVectorType *VT =
-                dyn_cast<FixedVectorType>(getLoadStoreType(E.Inst))) {
-          for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
-            Mask = Builder.CreateInsertElement(Mask,
-                                               Builder.getInt1(!IsExtraStore),
-                                               Builder.getInt32(MaskIdx++));
-          }
-        } else {
-          Mask =
-              Builder.CreateInsertElement(Mask, Builder.getInt1(!IsExtraStore),
-                                          Builder.getInt32(MaskIdx++));
-        }
-      }
+    if (ChainContainsExtraLoadsStores) {
+      assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS,
+                                    /*IsMaskConstant=*/true));
+      Value *Mask =
+          createMaskForExtraElements(C, Vec->getType(), Alignment, AS);
       VecInst = Builder.CreateMaskedStore(
           Vec, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
     } else {
@@ -1862,8 +1858,9 @@ bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS,
   return true;
 }

-bool Vectorizer::shouldAttemptMaskedStore(const ArrayRef<ChainElem> C) const {
-  assert(isa<StoreInst>(C[0].Inst));
+bool Vectorizer::shouldAttemptMaskedLoadStore(
+    const ArrayRef<ChainElem> C) const {
+  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
   unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
   Type *ElementType = getLoadStoreType(C[0].Inst)->getScalarType();
@@ -1875,17 +1872,20 @@ bool Vectorizer::shouldAttemptMaskedStore(const ArrayRef<ChainElem> C) const {
       VecRegBits / DL.getTypeSizeInBits(ElementType);

   // Attempt to find the smallest power-of-two number of elements that, if
-  // well aligned, could be represented as a legal masked store.
+  // well aligned, could be represented as a legal masked load/store.
   // If one exists for a given element type and address space, it is worth
-  // attempting to fill gaps as we may be able to create a legal masked store.
-  // If we do not end up with a legal masked store, chains with extra elements
-  // will be discarded.
+  // attempting to fill gaps as we may be able to create a legal masked
+  // load/store. If we do not end up with a legal masked load/store, chains with
+  // extra elements will be discarded.
   const unsigned MinMaskedStoreNumElems = 4;
   for (unsigned NumElems = MinMaskedStoreNumElems;
        NumElems <= MaxVectorNumElems; NumElems *= 2) {
     FixedVectorType *VectorType = FixedVectorType::get(ElementType, NumElems);
-    if (TTI.isLegalMaskedStore(VectorType, OptimisticAlign, AS,
-                               /*IsMaskConstant=*/true))
+    bool IsLegalMaskedInstruction =
+        IsLoadChain
+            ? TTI.isLegalMaskedLoad(VectorType, OptimisticAlign, AS,
+                                    /*IsMaskConstant=*/true)
+            : TTI.isLegalMaskedStore(VectorType, OptimisticAlign, AS,
+                                     /*IsMaskConstant=*/true);
+    if (IsLegalMaskedInstruction)
       return true;
   }
   return false;
@@ -1927,6 +1927,29 @@ ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev,
   return ChainElem{NewElement, NewOffsetFromLeader};
 }

+Value *Vectorizer::createMaskForExtraElements(const ArrayRef<ChainElem> C,
+                                              Type *VecTy, Align Alignment,
+                                              unsigned AS) {
+  unsigned MaskIdx = 0;
+  Value *Mask = PoisonValue::get(FixedVectorType::get(
+      Builder.getInt1Ty(), cast<FixedVectorType>(VecTy)->getNumElements()));
+  for (const ChainElem &E : C) {
+    bool IsExtraElement = ExtraElements.contains(E.Inst);
+    if (FixedVectorType *VT =
+            dyn_cast<FixedVectorType>(getLoadStoreType(E.Inst))) {
+      for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
+        Mask =
+            Builder.CreateInsertElement(Mask, Builder.getInt1(!IsExtraElement),
+                                        Builder.getInt32(MaskIdx++));
+      }
+    } else {
+      Mask = Builder.CreateInsertElement(Mask, Builder.getInt1(!IsExtraElement),
+                                         Builder.getInt32(MaskIdx++));
+    }
+  }
+  return Mask;
+}
+
 void Vectorizer::deleteExtraElements() {
   for (auto *ExtraElement : ExtraElements) {
     if (isa(ExtraElement)) {