diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 41ff54f0781a2..aaf37c25939c8 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -817,6 +817,12 @@ class TargetTransformInfo {
   LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment,
                                   unsigned AddressSpace) const;
 
+  /// Return true if it is legal to widen loads beyond their current width,
+  /// assuming the result is still well-aligned. For example, converting a load
+  /// i32 to a load i64, or vectorizing three contiguous load i32s into a load
+  /// <4 x i32>.
+  LLVM_ABI bool isLegalToWidenLoads(LLVMContext &Context) const;
+
   /// Return true if the target supports nontemporal store.
   LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const;
   /// Return true if the target supports nontemporal load.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 566e1cf51631a..69dd9bffdf8d9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -318,6 +318,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  virtual bool isLegalToWidenLoads(LLVMContext &Context) const { return false; }
+
   virtual bool isLegalNTStore(Type *DataType, Align Alignment) const {
     // By default, assume nontemporal memory stores are available for stores
     // that are aligned and have a size that is a power of 2.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 09b50c5270e57..89cda79558057 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -476,6 +476,10 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType, Align Alignment,
   return TTIImpl->isLegalMaskedLoad(DataType, Alignment, AddressSpace);
 }
 
+bool TargetTransformInfo::isLegalToWidenLoads(LLVMContext &Context) const {
+  return TTIImpl->isLegalToWidenLoads(Context);
+}
+
 bool TargetTransformInfo::isLegalNTStore(Type *DataType,
                                          Align Alignment) const {
   return TTIImpl->isLegalNTStore(DataType, Alignment);
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index b32d931bd3074..9ed9f10f770a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -72,6 +72,10 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
     return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
   }
 
+  bool isLegalToWidenLoads(LLVMContext &Context) const override {
+    return true;
+  }
+
   // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
   // We conservatively return 1 here which is just enough to enable the
   // vectorizers but disables heuristics based on the number of registers.
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7b5137b0185ab..bf8dd2580ff80 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -246,12 +246,16 @@ class Vectorizer {
   const DataLayout &DL;
   IRBuilder<> Builder;
 
-  // We could erase instrs right after vectorizing them, but that can mess up
-  // our BB iterators, and also can make the equivalence class keys point to
-  // freed memory. This is fixable, but it's simpler just to wait until we're
-  // done with the BB and erase all at once.
+  /// We could erase instrs right after vectorizing them, but that can mess up
+  /// our BB iterators, and also can make the equivalence class keys point to
+  /// freed memory. This is fixable, but it's simpler just to wait until we're
+  /// done with the BB and erase all at once.
   SmallVector<Instruction *, 128> ToErase;
 
+  /// We insert load/store instructions and GEPs to fill gaps and extend chains
+  /// to enable vectorization. Keep track of them and delete them later.
+  DenseSet<Instruction *> ExtraElements;
+
 public:
   Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
              DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
@@ -344,6 +348,28 @@ class Vectorizer {
   /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
   /// in the chain is the leader, and an instr touches distance 0 from itself.
   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+  /// Is a load/store with this alignment allowed by TTI and at least as fast
+  /// as an unvectorized load/store?
+  bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment,
+                              unsigned VecElemBits) const;
+
+  /// Before attempting to fill gaps, check if the chain is a candidate for
+  /// a masked store, to save compile time if it is not possible for the address
+  /// space and element type.
+  bool shouldAttemptMaskedStore(const ArrayRef<ChainElem> C) const;
+
+  /// Create a new GEP and a new Load/Store instruction such that the GEP
+  /// is pointing at PrevElem + Offset. In the case of stores, store poison.
+  /// Extra elements will either be combined into a vector/masked store or
+  /// deleted before the end of the pass.
+  ChainElem createExtraElementAfter(const ChainElem &PrevElem, APInt Offset,
+                                    StringRef Prefix,
+                                    Align Alignment = Align());
+
+  /// Delete dead GEPs and extra Load/Store instructions created by
+  /// createExtraElementAfter.
+  void deleteExtraElements();
 };
 
 class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -457,12 +483,21 @@ bool Vectorizer::run() {
     Changed |= runOnPseudoBB(*It, *std::next(It));
 
     for (Instruction *I : ToErase) {
+      // These will get deleted in deleteExtraElements.
+      // This is because ExtraElements will include both extra elements
+      // that *were* vectorized and extra elements that *were not*
+      // vectorized. ToErase will only include extra elements that *were*
+      // vectorized, so in order to avoid double deletion we skip them here and
+      // handle them in deleteExtraElements.
+      if (ExtraElements.contains(I))
+        continue;
       auto *PtrOperand = getLoadStorePointerOperand(I);
       if (I->use_empty())
         I->eraseFromParent();
       RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
     }
     ToErase.clear();
+    deleteExtraElements();
   }
 
   return Changed;
@@ -623,6 +658,29 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
     dumpChain(C);
   });
 
+  // If the chain is not contiguous, we try to fill the gap with "extra"
+  // elements to artificially make it contiguous, to try to enable
+  // vectorization.
+  // - Filling gaps in loads is always ok if the target supports widening loads.
+  // - For stores, we only fill gaps if there is a potentially legal masked
+  //   store for the target. If later on, we don't end up with a chain that
+  //   could be vectorized into a legal masked store, the chains with extra
+  //   elements will be filtered out in splitChainByAlignment.
+  bool TryFillGaps = isa<LoadInst>(C[0].Inst)
+                         ?
TTI.isLegalToWidenLoads(F.getContext()) + : shouldAttemptMaskedStore(C); + + unsigned ASPtrBits = + DL.getIndexSizeInBits(getLoadStoreAddressSpace(C[0].Inst)); + + // Compute the alignment of the leader of the chain (which every stored offset + // is based on) using the current first element of the chain. This is + // conservative, we may be able to derive better alignment by iterating over + // the chain and finding the leader. + Align LeaderOfChainAlign = + commonAlignment(getLoadStoreAlignment(C[0].Inst), + C[0].OffsetFromLeader.abs().getLimitedValue()); + std::vector Ret; Ret.push_back({C.front()}); @@ -633,7 +691,8 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst)); assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by " "collectEquivalenceClass"); - APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8; + APInt PrevSzBytes = APInt(ASPtrBits, SzBits / 8); + APInt PrevReadEnd = Prev.OffsetFromLeader + PrevSzBytes; // Add this instruction to the end of the current chain, or start a new one. bool AreContiguous = It->OffsetFromLeader == PrevReadEnd; @@ -642,10 +701,54 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { << *Prev.Inst << " (ends at offset " << PrevReadEnd << ") -> " << *It->Inst << " (starts at offset " << It->OffsetFromLeader << ")\n"); - if (AreContiguous) + + if (AreContiguous) { CurChain.push_back(*It); - else - Ret.push_back({*It}); + continue; + } + + // For now, we aren't filling gaps between load/stores of different sizes. + // Additionally, as a conservative heuristic, we only fill gaps of 1-2 + // elements. Generating loads/stores with too many unused bytes has a side + // effect of increasing register pressure (on NVIDIA targets at least), + // which could cancel out the benefits of reducing number of load/stores. + if (TryFillGaps && + SzBits == DL.getTypeSizeInBits(getLoadStoreType(It->Inst))) { + APInt OffsetOfGapStart = Prev.OffsetFromLeader + PrevSzBytes; + APInt GapSzBytes = It->OffsetFromLeader - OffsetOfGapStart; + if (GapSzBytes == PrevSzBytes) { + // There is a single gap between Prev and Curr, create one extra element + ChainElem NewElem = createExtraElementAfter( + Prev, PrevSzBytes, "GapFill", + commonAlignment(LeaderOfChainAlign, + OffsetOfGapStart.abs().getLimitedValue())); + CurChain.push_back(NewElem); + CurChain.push_back(*It); + continue; + } + // There are two gaps between Prev and Curr, only create two extra + // elements if Prev is the first element in a sequence of four. + // This has the highest chance of resulting in a beneficial vectorization. + if ((GapSzBytes == 2 * PrevSzBytes) && (CurChain.size() % 4 == 1)) { + ChainElem NewElem1 = createExtraElementAfter( + Prev, PrevSzBytes, "GapFill", + commonAlignment(LeaderOfChainAlign, + OffsetOfGapStart.abs().getLimitedValue())); + ChainElem NewElem2 = createExtraElementAfter( + NewElem1, PrevSzBytes, "GapFill", + commonAlignment( + LeaderOfChainAlign, + (OffsetOfGapStart + PrevSzBytes).abs().getLimitedValue())); + CurChain.push_back(NewElem1); + CurChain.push_back(NewElem2); + CurChain.push_back(*It); + continue; + } + } + + // The chain is not contiguous and cannot be made contiguous with gap + // filling, so we need to start a new chain. + Ret.push_back({*It}); } // Filter out length-1 chains, these are uninteresting. 
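To make the gap-filling above concrete, here is a sketch of the single-gap case (illustrative only; it mirrors the NVPTX tests added later in this patch, and %p plus the value names are assumptions, not excerpts from the pass output): a chain of i32 loads at offsets 0, 4 and 12 from a 16-byte-aligned pointer has a one-element hole at offset 8, an extra load is materialized there, and the now-contiguous chain can be turned into a single <4 x i32> load by splitChainByAlignment.

; Before gap filling: i32 loads at offsets 0, 4 and 12 (hole at offset 8).
%v0 = load i32, ptr %p, align 16
%gep1 = getelementptr inbounds i8, ptr %p, i32 4
%v1 = load i32, ptr %gep1, align 4
%gep3 = getelementptr inbounds i8, ptr %p, i32 12
%v3 = load i32, ptr %gep3, align 4

; After gap filling and vectorization: one wide load; the extract for the
; filled lane (element 2) has no users and is cleaned up by later passes.
%vec = load <4 x i32>, ptr %p, align 16
%e0 = extractelement <4 x i32> %vec, i32 0
%e1 = extractelement <4 x i32> %vec, i32 1
%e3 = extractelement <4 x i32> %vec, i32 3

If the filled chain is later rejected (for example by the alignment checks in splitChainByAlignment), the extra load and its GEP are removed again by deleteExtraElements rather than being left behind.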
@@ -721,6 +824,14 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { unsigned AS = getLoadStoreAddressSpace(C[0].Inst); unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8; + // For compile time reasons, we cache whether or not the superset + // of all candidate chains contains any extra stores from earlier gap + // filling. + bool CandidateChainsMayContainExtraStores = + !IsLoadChain && any_of(C, [this](const ChainElem &E) { + return ExtraElements.contains(E.Inst); + }); + std::vector Ret; for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) { // Find candidate chains of size not greater than the largest vector reg. @@ -769,41 +880,6 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { continue; } - // Is a load/store with this alignment allowed by TTI and at least as fast - // as an unvectorized load/store? - // - // TTI and F are passed as explicit captures to WAR an MSVC misparse (??). - auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI, - &F = F](Align Alignment) { - if (Alignment.value() % SizeBytes == 0) - return true; - unsigned VectorizedSpeed = 0; - bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( - F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); - if (!AllowsMisaligned) { - LLVM_DEBUG(dbgs() - << "LSV: Access of " << SizeBytes << "B in addrspace " - << AS << " with alignment " << Alignment.value() - << " is misaligned, and therefore can't be vectorized.\n"); - return false; - } - - unsigned ElementwiseSpeed = 0; - (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, - Alignment, &ElementwiseSpeed); - if (VectorizedSpeed < ElementwiseSpeed) { - LLVM_DEBUG(dbgs() - << "LSV: Access of " << SizeBytes << "B in addrspace " - << AS << " with alignment " << Alignment.value() - << " has relative speed " << VectorizedSpeed - << ", which is lower than the elementwise speed of " - << ElementwiseSpeed - << ". Therefore this access won't be vectorized.\n"); - return false; - } - return true; - }; - // If we're loading/storing from an alloca, align it if possible. // // FIXME: We eagerly upgrade the alignment, regardless of whether TTI @@ -818,8 +894,7 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { isa(PtrOperand->stripPointerCasts()); Align Alignment = getLoadStoreAlignment(C[CBegin].Inst); Align PrefAlign = Align(StackAdjustedAlignment); - if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 && - IsAllowedAndFast(PrefAlign)) { + if (IsAllocaAccess && Alignment.value() % SizeBytes != 0) { Align NewAlign = getOrEnforceKnownAlignment( PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT); if (NewAlign >= Alignment) { @@ -831,7 +906,55 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { } } - if (!IsAllowedAndFast(Alignment)) { + Chain ExtendingLoadsStores; + if (NumVecElems < TargetVF && NumVecElems % 2 != 0 && VecElemBits >= 8) { + // TargetVF may be a lot higher than NumVecElems, + // so only extend to the next power of 2. + assert(VecElemBits % 8 == 0); + unsigned VecElemBytes = VecElemBits / 8; + unsigned NewNumVecElems = PowerOf2Ceil(NumVecElems); + unsigned NewSizeBytes = VecElemBytes * NewNumVecElems; + + assert(NewNumVecElems <= TargetVF); + + LLVM_DEBUG(dbgs() << "LSV: attempting to extend chain of " + << NumVecElems << " " + << (IsLoadChain ? 
"loads" : "stores") << " to " + << NewNumVecElems << " elements\n"); + // Do not artificially increase the chain if it becomes misaligned, + // otherwise we may unnecessary split the chain when the target actually + // supports non-pow2 VF. + if (accessIsAllowedAndFast(NewSizeBytes, AS, Alignment, VecElemBits) && + ((IsLoadChain ? TTI.isLegalToWidenLoads(F.getContext()) + : TTI.isLegalMaskedStore( + FixedVectorType::get(VecElemTy, NewNumVecElems), + Alignment, AS, /*IsMaskConstant=*/true)))) { + LLVM_DEBUG(dbgs() + << "LSV: extending " << (IsLoadChain ? "load" : "store") + << " chain of " << NumVecElems << " " + << (IsLoadChain ? "loads" : "stores") + << " with total byte size of " << SizeBytes << " to " + << NewNumVecElems << " " + << (IsLoadChain ? "loads" : "stores") + << " with total byte size of " << NewSizeBytes + << ", TargetVF=" << TargetVF << " \n"); + + unsigned ASPtrBits = DL.getIndexSizeInBits(AS); + ChainElem Prev = C[CEnd]; + for (unsigned i = 0; i < (NewNumVecElems - NumVecElems); i++) { + ChainElem NewElem = createExtraElementAfter( + Prev, APInt(ASPtrBits, VecElemBytes), "Extend"); + ExtendingLoadsStores.push_back(NewElem); + Prev = ExtendingLoadsStores.back(); + } + + // Update the size and number of elements for upcoming checks. + SizeBytes = NewSizeBytes; + NumVecElems = NewNumVecElems; + } + } + + if (!accessIsAllowedAndFast(SizeBytes, AS, Alignment, VecElemBits)) { LLVM_DEBUG( dbgs() << "LSV: splitChainByAlignment discarding candidate chain " "because its alignment is not AllowedAndFast: " @@ -849,10 +972,41 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { continue; } + if (CandidateChainsMayContainExtraStores) { + // The legality of adding extra stores to ExtendingLoadsStores has + // already been checked, but if the candidate chain contains extra + // stores from an earlier optimization, confirm legality now. + // This filter is essential because, when filling gaps in + // splitChainByContinuity, we queried the API to check that (for a given + // element type and address space) there *may* be a legal masked store + // we can try to create. Now, we need to check if the actual chain we + // ended up with is legal to turn into a masked store. + // This is relevant for NVPTX targets, for example, where a masked store + // is only legal if we have ended up with a 256-bit vector. + bool CandidateChainContainsExtraStores = llvm::any_of( + ArrayRef(C).slice(CBegin, CEnd - CBegin + 1), + [this](const ChainElem &E) { + return ExtraElements.contains(E.Inst); + }); + + if (CandidateChainContainsExtraStores && + !TTI.isLegalMaskedStore( + FixedVectorType::get(VecElemTy, NumVecElems), Alignment, AS, + /*IsMaskConstant=*/true)) { + LLVM_DEBUG(dbgs() + << "LSV: splitChainByAlignment discarding candidate chain " + "because it contains extra stores that we cannot " + "legally vectorize into a masked store \n"); + continue; + } + } + // Hooray, we can vectorize this chain! Chain &NewChain = Ret.emplace_back(); for (unsigned I = CBegin; I <= CEnd; ++I) NewChain.emplace_back(C[I]); + for (ChainElem E : ExtendingLoadsStores) + NewChain.emplace_back(E); CBegin = CEnd; // Skip over the instructions we've added to the chain. 
break; } @@ -864,6 +1018,12 @@ bool Vectorizer::vectorizeChain(Chain &C) { if (C.size() < 2) return false; + // If we are left with a two-element chain, and one of the elements is an + // extra element, we don't want to vectorize + if (C.size() == 2 && + (ExtraElements.contains(C[0].Inst) || ExtraElements.contains(C[1].Inst))) + return false; + sortChainInOffsetOrder(C); LLVM_DEBUG({ @@ -983,12 +1143,41 @@ bool Vectorizer::vectorizeChain(Chain &C) { } } - // Chain is in offset order, so C[0] is the instr with the lowest offset, - // i.e. the root of the vector. - VecInst = Builder.CreateAlignedStore( - Vec, - getLoadStorePointerOperand(C[0].Inst), - Alignment); + // If the chain originates from extra stores, we need to vectorize into a + // masked store. + bool ChainContainsExtraStores = llvm::any_of(C, [this](const ChainElem &E) { + return ExtraElements.contains(E.Inst); + }); + if (ChainContainsExtraStores) { + assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS, + /*IsMaskConstant=*/true)); + unsigned MaskIdx = 0; + // loop through the chain and create a mask for the masked store + Value *Mask = PoisonValue::get(FixedVectorType::get( + Builder.getInt1Ty(), cast(VecTy)->getNumElements())); + for (const ChainElem &E : C) { + bool IsExtraStore = ExtraElements.contains(E.Inst); + if (FixedVectorType *VT = + dyn_cast(getLoadStoreType(E.Inst))) { + for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) { + Mask = Builder.CreateInsertElement(Mask, + Builder.getInt1(!IsExtraStore), + Builder.getInt32(MaskIdx++)); + } + } else { + Mask = + Builder.CreateInsertElement(Mask, Builder.getInt1(!IsExtraStore), + Builder.getInt32(MaskIdx++)); + } + } + VecInst = Builder.CreateMaskedStore( + Vec, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask); + } else { + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. + VecInst = Builder.CreateAlignedStore( + Vec, getLoadStorePointerOperand(C[0].Inst), Alignment); + } } propagateMetadata(VecInst, C); @@ -1641,3 +1830,118 @@ std::optional Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, .sextOrTrunc(OrigBitWidth); return std::nullopt; } + +bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, + Align Alignment, + unsigned VecElemBits) const { + if (Alignment.value() % SizeBytes == 0) + return true; + unsigned VectorizedSpeed = 0; + bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( + F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); + if (!AllowsMisaligned) { + LLVM_DEBUG( + dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS + << " with alignment " << Alignment.value() + << " is misaligned, and therefore can't be vectorized.\n"); + return false; + } + + unsigned ElementwiseSpeed = 0; + (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, + Alignment, &ElementwiseSpeed); + if (VectorizedSpeed < ElementwiseSpeed) { + LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " has relative speed " << VectorizedSpeed + << ", which is lower than the elementwise speed of " + << ElementwiseSpeed + << ". 
Therefore this access won't be vectorized.\n");
+    return false;
+  }
+  return true;
+}
+
+bool Vectorizer::shouldAttemptMaskedStore(const ArrayRef<ChainElem> C) const {
+  assert(isa<StoreInst>(C[0].Inst));
+
+  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+  Type *ElementType = getLoadStoreType(C[0].Inst)->getScalarType();
+  unsigned VecRegBits = TTI.getLoadStoreVecRegBitWidth(AS);
+  // Assume max alignment, splitChainByAlignment will legalize it later if the
+  // necessary alignment is not reached.
+  Align OptimisticAlign = Align(VecRegBits / 8);
+  unsigned int MaxVectorNumElems =
+      VecRegBits / DL.getTypeSizeInBits(ElementType);
+
+  // Attempt to find the smallest power-of-two number of elements that, if
+  // well aligned, could be represented as a legal masked store.
+  // If one exists for a given element type and address space, it is worth
+  // attempting to fill gaps as we may be able to create a legal masked store.
+  // If we do not end up with a legal masked store, chains with extra elements
+  // will be discarded.
+  const unsigned MinMaskedStoreNumElems = 4;
+  for (unsigned NumElems = MinMaskedStoreNumElems;
+       NumElems <= MaxVectorNumElems; NumElems *= 2) {
+    FixedVectorType *VectorType = FixedVectorType::get(ElementType, NumElems);
+    if (TTI.isLegalMaskedStore(VectorType, OptimisticAlign, AS,
+                               /*IsMaskConstant=*/true))
+      return true;
+  }
+  return false;
+}
+
+ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev,
+                                              APInt Offset, StringRef Prefix,
+                                              Align Alignment) {
+  Instruction *NewElement = nullptr;
+  Builder.SetInsertPoint(Prev.Inst->getNextNode());
+  if (LoadInst *PrevLoad = dyn_cast<LoadInst>(Prev.Inst)) {
+    Value *NewGep = Builder.CreatePtrAdd(
+        PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
+    LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
+    NewElement = Builder.CreateAlignedLoad(PrevLoad->getType(), NewGep,
+                                           Alignment, Prefix);
+  } else {
+    StoreInst *PrevStore = cast<StoreInst>(Prev.Inst);
+
+    Value *NewGep = Builder.CreatePtrAdd(
+        PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
+    LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
+    NewElement = Builder.CreateAlignedStore(
+        PoisonValue::get(PrevStore->getValueOperand()->getType()), NewGep,
+        Alignment);
+  }
+
+  // Attach all metadata to the new element.
+  // propagateMetadata will fold it into the final vector when applicable.
+  NewElement->copyMetadata(*Prev.Inst);
+
+  // Cache created elements for tracking and cleanup
+  ExtraElements.insert(NewElement);
+
+  APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset;
+  LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n"
+                    << *NewElement
+                    << " OffsetFromLeader: " << NewOffsetFromLeader << "\n");
+  return ChainElem{NewElement, NewOffsetFromLeader};
+}
+
+void Vectorizer::deleteExtraElements() {
+  for (auto *ExtraElement : ExtraElements) {
+    if (isa<LoadInst>(ExtraElement)) {
+      [[maybe_unused]] bool Deleted =
+          RecursivelyDeleteTriviallyDeadInstructions(ExtraElement);
+      assert(Deleted && "Extra Load should always be trivially dead");
+    } else {
+      // Unlike Extra Loads, Extra Stores won't be "dead", but should all be
+      // deleted regardless. They will have either been combined into a masked
+      // store, or will be left behind and need to be cleaned up.
+ auto *PtrOperand = getLoadStorePointerOperand(ExtraElement); + ExtraElement->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); + } + } + + ExtraElements.clear(); +} diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index dd9a472984c25..19ec2574e32b4 100644 --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -45,29 +45,31 @@ define half @fh(ptr %p) { ; ENABLED-LABEL: fh( ; ENABLED: { ; ENABLED-NEXT: .reg .b16 %rs<10>; -; ENABLED-NEXT: .reg .b32 %r<13>; +; ENABLED-NEXT: .reg .b32 %r<17>; ; ENABLED-NEXT: .reg .b64 %rd<2>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: ; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0]; -; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8]; -; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2; -; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1; -; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3; -; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4; -; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3; -; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6; -; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7; -; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6; -; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9; -; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8; -; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5; -; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12; +; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; } +; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2; +; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5; +; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4; +; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7; +; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3; +; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2; +; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10; +; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7; +; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6; +; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13; +; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8; +; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1; +; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16; ; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9; ; ENABLED-NEXT: ret; ; diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index 51f6b00601069..4870050dd2d43 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -171,8 +171,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) ; CHECK-LABEL: callee_St4x3( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] - ; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; - ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0]; ; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]]; ; CHECK-NEXT: ret; @@ -394,8 +393,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St4x7( ; CHECK-NEXT: 
.param .align 16 .b8 callee_St4x7_param_0[28] ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; - ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; - ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; + ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16]; ; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; ; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]]; diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 61ff80632c789..5499dbce61bae 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -110,7 +110,7 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; -; CHECK-PTX-NEXT: st.b64 [%SP], 4294967297; +; CHECK-PTX-NEXT: st.v2.b32 [%SP], {1, 1}; ; CHECK-PTX-NEXT: st.b32 [%SP+8], 1; ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1; ; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408; diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll new file mode 100644 index 0000000000000..24d0dea086ba8 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +;; Check that the vectorizer extends a Chain to the next power of two, +;; essentially loading more vector elements than the original +;; code. Alignment and other requirement for vectorization should +;; still be met. 
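One limit of the extension is worth sketching here, since the tests below only exercise the alignment side (the function below is illustrative, not one of the committed tests; its name and the align values are assumptions): extension stops at the target's load/store vector register width, so with NVPTX's default 128-bit width a chain of three i64 loads is not padded out to <4 x i64>. The expected outcome is a <2 x i64> load for the first two elements and an untouched scalar load for the third.

define void @load3xi64_not_extended(ptr %p) {
  %p.0 = getelementptr i64, ptr %p, i32 0
  %p.1 = getelementptr i64, ptr %p, i32 1
  %p.2 = getelementptr i64, ptr %p, i32 2

  ; 24 bytes in total; widening to <4 x i64> would exceed the 16-byte vector
  ; register, so only the first two loads are expected to be combined.
  %v0 = load i64, ptr %p.0, align 16
  %v1 = load i64, ptr %p.1, align 8
  %v2 = load i64, ptr %p.2, align 16

  ret void
}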
+ +define void @load3to4(ptr %p) #0 { +; CHECK-LABEL: define void @load3to4( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P_0]], align 16 +; CHECK-NEXT: [[V01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[V12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[V23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[EXTEND4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i32, ptr %p, i32 0 + %p.1 = getelementptr i32, ptr %p, i32 1 + %p.2 = getelementptr i32, ptr %p, i32 2 + + %v0 = load i32, ptr %p.0, align 16 + %v1 = load i32, ptr %p.1, align 4 + %v2 = load i32, ptr %p.2, align 8 + + ret void +} + +define void @load5to8(ptr %p) #0 { +; CHECK-LABEL: define void @load5to8( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P_0]], align 16 +; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i16, ptr %p, i32 0 + %p.1 = getelementptr i16, ptr %p, i32 1 + %p.2 = getelementptr i16, ptr %p, i32 2 + %p.3 = getelementptr i16, ptr %p, i32 3 + %p.4 = getelementptr i16, ptr %p, i32 4 + + %v0 = load i16, ptr %p.0, align 16 + %v1 = load i16, ptr %p.1, align 2 + %v2 = load i16, ptr %p.2, align 4 + %v3 = load i16, ptr %p.3, align 8 + %v4 = load i16, ptr %p.4, align 2 + + ret void +} + +define void @load3to4_unaligned(ptr %p) #0 { +; CHECK-LABEL: define void @load3to4_unaligned( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0 +; CHECK-NEXT: [[P_2:%.*]] = getelementptr i32, ptr [[P]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P_0]], align 8 +; CHECK-NEXT: [[V01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[V12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[P_2]], align 8 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i32, ptr %p, i32 0 + %p.1 = getelementptr i32, ptr %p, i32 1 + %p.2 = getelementptr i32, ptr %p, i32 2 + + %v0 = load i32, ptr %p.0, align 8 + %v1 = load i32, ptr %p.1, align 4 + %v2 = load i32, ptr %p.2, align 8 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll new file mode 100644 index 0000000000000..e812f8750fa76 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Test that gap filled instructions get deleted if they are not used +%struct.S10 = type { i32, i32, i32, i32 } + +; First, confirm that gap instructions get generated and would be 
vectorized if the alignment is correct +define void @fillTwoGapsCanVectorize(ptr %in) { +; CHECK-LABEL: define void @fillTwoGapsCanVectorize( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16 +; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr %in, align 16 + %getElem = getelementptr i8, ptr %in, i64 12 + %load3 = load i32, ptr %getElem, align 4 + ret void +} + +; Then, confirm that gap instructions get deleted if the alignment prevents the vectorization +define void @fillTwoGapsCantVectorize(ptr %in) { +; CHECK-LABEL: define void @fillTwoGapsCantVectorize( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[IN]], align 4 +; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[IN]], i64 12 +; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEM]], align 4 +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr %in, align 4 + %getElem = getelementptr i8, ptr %in, i64 12 + %load3 = load i32, ptr %getElem, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll new file mode 100644 index 0000000000000..6d0dfc677780d --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Test that gap filled instructions don't lose invariant metadata +%struct.S10 = type { i32, i32, i32, i32 } + +; With no gaps, if every load is invariant, the vectorized load will be too. +define i32 @noGaps(ptr %in) { +; CHECK-LABEL: define i32 @noGaps( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]] +; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]] +; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]] +; CHECK-NEXT: ret i32 [[SUM0123]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1 + %load1 = load i32, ptr %getElem1, align 4, !invariant.load !0 + %getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2 + %load2 = load i32, ptr %getElem2, align 4, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4, !invariant.load !0 + %sum01 = add i32 %load0, %load1 + %sum012 = add i32 %sum01, %load2 + %sum0123 = add i32 %sum012, %load3 + ret i32 %sum0123 +} + +; If one of the loads is not invariant, the vectorized load will not be invariant. 
+define i32 @noGapsMissingInvariant(ptr %in) { +; CHECK-LABEL: define i32 @noGapsMissingInvariant( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16 +; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]] +; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]] +; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]] +; CHECK-NEXT: ret i32 [[SUM0123]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1 + %load1 = load i32, ptr %getElem1, align 4, !invariant.load !0 + %getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2 + %load2 = load i32, ptr %getElem2, align 4, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4 + %sum01 = add i32 %load0, %load1 + %sum012 = add i32 %sum01, %load2 + %sum0123 = add i32 %sum012, %load3 + ret i32 %sum0123 +} + +; With two gaps, if every real load is invariant, the vectorized load will be too. +define i32 @twoGaps(ptr %in) { +; CHECK-LABEL: define i32 @twoGaps( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0]] +; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LOAD03]], [[LOAD36]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4, !invariant.load !0 + %sum = add i32 %load0, %load3 + ret i32 %sum +} + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. 
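For store chains, the counterpart of load widening is a masked store with a compile-time-constant mask. The shape the vectorizer aims for looks roughly like the call below (an assumed sketch that mirrors the singleGap test in masked-store.ll further down; %out and the stored values are placeholders, not a copied CHECK line), for a chain that stores i64 values to elements 0, 1 and 3 of a 32-byte-aligned region in addrspace(1):

; The filled lane (element 2) stores poison and its mask bit is false. The
; mask is a constant, which is why the pass queries isLegalMaskedStore with
; /*IsMaskConstant=*/true before keeping a chain that contains extra stores.
call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 1, i64 2, i64 poison, i64 4>, ptr addrspace(1) %out, i32 32, <4 x i1> <i1 true, i1 true, i1 false, i1 true>)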
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll new file mode 100644 index 0000000000000..fe7123898d450 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S < %s | FileCheck %s + +; The LSV can handle vector inputs, and gap filling can too, with one exception: +; currently, we do not gap fill when the loads enclosing the gap are different sizes +; Otherwise, vectors are treated the same as any other scalar types + +define void @i1x8_gap_gap_i1x8(ptr %ptr) { +; CHECK-LABEL: define void @i1x8_gap_gap_i1x8( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i1>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[L03:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> +; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> +; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> +; CHECK-NEXT: [[L36:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 3 + + %l0 = load <8 x i1>, ptr %ptr0, align 4 + %l3 = load <8 x i1>, ptr %ptr3, align 1 + + ret void +} + +; The chain elements are different sizes, gap filling won't kick in +define void @i1x8_gap_gap_i1x16(ptr %ptr) { +; CHECK-LABEL: define void @i1x8_gap_gap_i1x16( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[L0:%.*]] = load <8 x i1>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[L3:%.*]] = load <16 x i1>, ptr [[PTR3]], align 2 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 3 + + %l0 = load <8 x i1>, ptr %ptr0, align 4 + %l3 = load <16 x i1>, ptr %ptr3, align 2 + + ret void +} + +; Gap of two load <2 x i8>s gets filled +define void @i8x2_gap_gap_i8x2(ptr %ptr) { +; CHECK-LABEL: define void @i8x2_gap_gap_i8x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[L36:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 6 + + %l0 = load <2 x i8>, ptr %ptr0, align 8 + %l3 = load <2 x i8>, ptr %ptr3, align 2 + + ret void +} + +; The chain elements are different sizes, gap filling won't kick in +define void @i8x2_gap_gap_i8(ptr %ptr) { +; CHECK-LABEL: define void @i8x2_gap_gap_i8( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr 
[[PTR]], i64 6 +; CHECK-NEXT: [[L0:%.*]] = load <2 x i8>, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[L3:%.*]] = load i8, ptr [[PTR3]], align 1 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 6 + + %l0 = load <2 x i8>, ptr %ptr0, align 8 + %l3 = load i8, ptr %ptr3, align 1 + + ret void +} + + +define void @i16x2_gap_i16x2_i16x2(ptr %ptr) { +; CHECK-LABEL: define void @i16x2_gap_i16x2_i16x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[PTR0]], align 16 +; CHECK-NEXT: [[L01:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L23:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L34:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr2 = getelementptr i8, ptr %ptr, i64 8 + %ptr3 = getelementptr i8, ptr %ptr, i64 12 + + %l0 = load <2 x i16>, ptr %ptr0, align 16 + %l2 = load <2 x i16>, ptr %ptr2, align 2 + %l3 = load <2 x i16>, ptr %ptr3, align 2 + + ret void +} + +define void @i16x2_gap_gap_i16x2(ptr %ptr) { +; CHECK-LABEL: define void @i16x2_gap_gap_i16x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[PTR0]], align 16 +; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L36:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 12 + + %l0 = load <2 x i16>, ptr %ptr0, align 16 + %l3 = load <2 x i16>, ptr %ptr3, align 4 + + ret void +} + +define void @i32x2_i32x2_gap_i32x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i32x2_i32x2_gap_i32x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[VEC34:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i32>, ptr addrspace(1) %in, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %vec1 = load <2 x i32>, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %vec3 = load <2 x i32>, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; This gap is filled but then eventually discarded because the total size +; of the vector is larger than the target supports. 
+define void @i64x2_gap_i64x2_i64x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i64x2_gap_i64x2_i64x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VEC0:%.*]] = load <2 x i64>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 32 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[GETELEM3]], align 32 +; CHECK-NEXT: [[VEC31:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i64>, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %vec3 = load <2 x i64>, ptr addrspace(1) %getElem3, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + %vec1 = load <2 x i64>, ptr addrspace(1) %getElem1, align 16 + ret void +} + +; This gap is filled but then eventually discarded because the total size +; of the vector is larger than the target supports. +define void @i64x2_i64x2_gap_i64x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i64x2_i64x2_gap_i64x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[VEC32:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48 +; CHECK-NEXT: [[VEC1:%.*]] = load <2 x i64>, ptr addrspace(1) [[GETELEM1]], align 8 +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i64>, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %vec3 = load <2 x i64>, ptr addrspace(1) %getElem3, align 16 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + %vec1 = load <2 x i64>, ptr addrspace(1) %getElem1, align 8 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll new file mode 100644 index 0000000000000..82ebffed7f765 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Load elements 0, 1, and 3, filling the gap with a generated load of element 2 +define void @test(ptr %ptr) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; Load elements 0, 2, and 3, filling the gap with a generated load of element 1 +define void @test2(ptr %ptr) { +; CHECK-LABEL: define void @test2( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; 
CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LD23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8 + %ld2 = load i32, ptr %gep2, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; This gap can be filled, but the types are too large to do a v4 load, +; So we should end up with a v2 load and a single scalar load +define void @test3(ptr %ptr) { +; CHECK-LABEL: define void @test3( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 24 +; CHECK-NEXT: [[LD3:%.*]] = load i64, ptr [[GEP3]], align 4 +; CHECK-NEXT: ret void +; + %ld0 = load i64, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 8 + %ld1 = load i64, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld3 = load i64, ptr %gep3, align 4 + ret void +} + +; This gap can be filled, but the types are too large to do a v4 load, +; So we should end up with a v2 load and a single scalar load +define void @test4(ptr %ptr) { +; CHECK-LABEL: define void @test4( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[PTR]], align 16 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[GEP2]], align 16 +; CHECK-NEXT: [[LD21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD32:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: ret void +; + %ld0 = load i64, ptr %ptr, align 16 + %gep2 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld2 = load i64, ptr %gep2, align 16 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld3 = load i64, ptr %gep3, align 4 + ret void +} + +; Load elements 0 and 3, filling the gap with a generated load of element 1 and 2 +define void @test5(ptr %ptr) { +; CHECK-LABEL: define void @test5( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; Load elements 0, 1, 3, 4, 6, and 7, filling gaps at elements 2 and 5. 
+define void @test6(ptr %ptr) { +; CHECK-LABEL: define void @test6( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP4]], align 16 +; CHECK-NEXT: [[LD47:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[GAPFILL28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD710:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep4 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld4 = load i32, ptr %gep4, align 16 + %gep6 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld6 = load i32, ptr %gep6, align 4 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} + +; Load elements 0, 1, 3, 4 and 7, elements 2, 5, and 6 will be filled +define void @test7(ptr %ptr) { +; CHECK-LABEL: define void @test7( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD05:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD16:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL7:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD38:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP4]], align 16 +; CHECK-NEXT: [[LD49:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[GAPFILL210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[GAPFILL411:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD712:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep4 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld4 = load i32, ptr %gep4, align 16 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} + +; Load elements 0, 1, 3, 5, 6, and 7. Elements 2 and 4 will be filled. +; Element 4 will be created and well-aligned because of its +; distance from the first load. 
+define void @test8(ptr %ptr) { +; CHECK-LABEL: define void @test8( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 12 +; CHECK-NEXT: [[GAPFILLGEP1:%.*]] = getelementptr i8, ptr [[GEP3]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GAPFILLGEP1]], align 16 +; CHECK-NEXT: [[GAPFILL27:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[LD58:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD710:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep5 = getelementptr inbounds i8, ptr %ptr, i32 20 + %ld5 = load i32, ptr %gep5, align 16 + %gep6 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld6 = load i32, ptr %gep6, align 4 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll new file mode 100644 index 0000000000000..1346bd0a3fc26 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll @@ -0,0 +1,541 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=infer-alignment,load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S -o - %s | FileCheck %s + +; POSITIVE TESTS + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @singleGap(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGap( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[OUT]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @singleGapDouble(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapDouble( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p1(<4 x double> , ptr addrspace(1) [[OUT]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store double 1.0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store double 2.0, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store double 4.0, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 3, filling the gaps with generated stores of elements 1 and 2 +define void 
@multipleGaps(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @multipleGaps( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[OUT]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 3, 4, 7, filling the gaps with generated stores of elements 1, 2, 5, 6 +define void @multipleGaps8xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @multipleGaps8xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 2, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 4, ptr addrspace(1) %getElem4, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i32 8, ptr addrspace(1) %getElem7, align 4 + ret void +} + +; store elements 0, 1, 2, 3, 5, 6, 7, filling the gap with a generated store of element 4, +; resulting in two 4xi64 stores with the second one led by a gap filled store. +define void @singleGapLongerChain(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapLongerChain( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store <4 x i64> , ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GAPFILLGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[GETELEM3]], i64 8 +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[GAPFILLGEP]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i64 3, ptr addrspace(1) %getElem2, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + %getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 40 + store i64 6, ptr addrspace(1) %getElem5, align 8 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 48 + store i64 7, ptr addrspace(1) %getElem6, align 8 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 56 + store i64 8, ptr addrspace(1) %getElem7, align 8 + ret void +} + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @vectorElements(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @vectorElements( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; CHECK-NEXT: ret void +; + store <2 x i32> , ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store <2 x i32> , ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store <2 x i32> , ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 1, 3. 
2 should not end up filled because 8xi64 is not legal. +define void @vectorElements64(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @vectorElements64( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <4 x i64> , ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48 +; CHECK-NEXT: store <2 x i64> , ptr addrspace(1) [[GETELEM1]], align 16 +; CHECK-NEXT: ret void +; + store <2 x i64> , ptr addrspace(1) %in, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + store <2 x i64> , ptr addrspace(1) %getElem1, align 16 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + store <2 x i64> , ptr addrspace(1) %getElem3, align 16 + ret void +} + +; store elements 0, 1, 2, extending element 3 +define void @extendStores(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStores( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[OUT]], i32 32, <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i64 3, ptr addrspace(1) %getElem2, align 8 + ret void +} + +; store elements 0, 1, 2, 3, 4 extending elements 5, 6, 7 +define void @extendStores8xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStores8xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 2, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 4, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 5, ptr addrspace(1) %getElem4, align 4 + ret void +} + +; store elements 0, 1, 2, 3, 4 extending elements 5, 6, 7 +define void @extendStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStoresFromLoads8xi32( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], 
i32 [[LOAD27]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr addrspace(1) %in, align 32 + %loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4 + %loadGetElem2 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load2 = load i32, ptr addrspace(1) %loadGetElem2, align 4 + %loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4 + %loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4 + + store i32 %load0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 %load1, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 %load2, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 %load3, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 %load4, ptr addrspace(1) %getElem4, align 4 + ret void +} + +; store elements 0, 1, 3, 4, gap fill element 2, extend elements 5, 6, 7 +define void @extendAndGapFillStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendAndGapFillStoresFromLoads8xi32( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 poison, i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) [[OUT]], i32 32, <8 x i1> ) +; 
CHECK-NEXT: ret void +; + %load0 = load i32, ptr addrspace(1) %in, align 32 + %loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4 + %loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4 + %loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4 + + store i32 %load0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 %load1, ptr addrspace(1) %getElem1, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 %load3, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 %load4, ptr addrspace(1) %getElem4, align 4 + ret void +} + + +; NEGATIVE TESTS + +; Wrong address space, no gap filling +define void @singleGapWrongAddrSpace(ptr addrspace(3) %out) { +; CHECK-LABEL: define void @singleGapWrongAddrSpace( +; CHECK-SAME: ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i64> , ptr addrspace(3) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[OUT]], i32 24 +; CHECK-NEXT: store i64 4, ptr addrspace(3) [[GETELEM3]], align 8 +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(3) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 8 + store i64 2, ptr addrspace(3) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 24 + store i64 4, ptr addrspace(3) %getElem3, align 8 + ret void +} + +; Not enough alignment for masked store, but we still vectorize the smaller vector +define void @singleGapMisaligned(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapMisaligned( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i64> , ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i64 4, ptr addrspace(1) [[GETELEM3]], align 8 +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 16 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; Not enough bytes to meet the minimum masked store size for the target +define void @singleGap4xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGap4xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 1, ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[GETELEM2]], align 8 +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 4, ptr addrspace(1) %getElem3, align 4 + ret void +} + +; store elements 0, 1, 2, 5, 6, 7. 
3 and 4 don't get filled because the heuristic +; only fills 2-element gaps that are in the middle of a multiple of 4 +define void @gapInWrongLocation(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @gapInWrongLocation( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i32 3, ptr addrspace(1) [[GETELEM2]], align 8 +; CHECK-NEXT: [[GETELEM5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20 +; CHECK-NEXT: store i32 5, ptr addrspace(1) [[GETELEM5]], align 4 +; CHECK-NEXT: [[GETELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[GETELEM6]], align 8 +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 2, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20 + store i32 5, ptr addrspace(1) %getElem5, align 4 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i32 6, ptr addrspace(1) %getElem6, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i32 7, ptr addrspace(1) %getElem7, align 4 + ret void +} + +; This test has 32-bytes of i8s with a 2-element gap in the middle of each 4-byte chunk. +; i8s are not supported by masked stores on the target, so the stores will not be vectorized. +; The loads, on the other hand, get gap filled. +define void @cantMaski8(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @cantMaski8( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[LOAD031:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL32:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL233:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD334:%.*]] = extractelement <32 x i8> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD435:%.*]] = extractelement <32 x i8> [[TMP1]], i32 4 +; CHECK-NEXT: [[GAPFILL436:%.*]] = extractelement <32 x i8> [[TMP1]], i32 5 +; CHECK-NEXT: [[GAPFILL637:%.*]] = extractelement <32 x i8> [[TMP1]], i32 6 +; CHECK-NEXT: [[LOAD738:%.*]] = extractelement <32 x i8> [[TMP1]], i32 7 +; CHECK-NEXT: [[LOAD839:%.*]] = extractelement <32 x i8> [[TMP1]], i32 8 +; CHECK-NEXT: [[GAPFILL840:%.*]] = extractelement <32 x i8> [[TMP1]], i32 9 +; CHECK-NEXT: [[GAPFILL1041:%.*]] = extractelement <32 x i8> [[TMP1]], i32 10 +; CHECK-NEXT: [[LOAD1142:%.*]] = extractelement <32 x i8> [[TMP1]], i32 11 +; CHECK-NEXT: [[LOAD1243:%.*]] = extractelement <32 x i8> [[TMP1]], i32 12 +; CHECK-NEXT: [[GAPFILL1244:%.*]] = extractelement <32 x i8> [[TMP1]], i32 13 +; CHECK-NEXT: [[GAPFILL1445:%.*]] = extractelement <32 x i8> [[TMP1]], i32 14 +; CHECK-NEXT: [[LOAD1546:%.*]] = extractelement <32 x i8> [[TMP1]], i32 15 +; CHECK-NEXT: [[LOAD1647:%.*]] = extractelement <32 x i8> [[TMP1]], i32 16 +; CHECK-NEXT: [[GAPFILL1648:%.*]] = extractelement <32 x i8> [[TMP1]], i32 17 +; CHECK-NEXT: [[GAPFILL1849:%.*]] = extractelement <32 x i8> [[TMP1]], i32 18 +; CHECK-NEXT: [[LOAD1950:%.*]] = extractelement 
<32 x i8> [[TMP1]], i32 19 +; CHECK-NEXT: [[LOAD2051:%.*]] = extractelement <32 x i8> [[TMP1]], i32 20 +; CHECK-NEXT: [[GAPFILL2052:%.*]] = extractelement <32 x i8> [[TMP1]], i32 21 +; CHECK-NEXT: [[GAPFILL2253:%.*]] = extractelement <32 x i8> [[TMP1]], i32 22 +; CHECK-NEXT: [[LOAD2354:%.*]] = extractelement <32 x i8> [[TMP1]], i32 23 +; CHECK-NEXT: [[LOAD2455:%.*]] = extractelement <32 x i8> [[TMP1]], i32 24 +; CHECK-NEXT: [[GAPFILL2456:%.*]] = extractelement <32 x i8> [[TMP1]], i32 25 +; CHECK-NEXT: [[GAPFILL2657:%.*]] = extractelement <32 x i8> [[TMP1]], i32 26 +; CHECK-NEXT: [[LOAD2758:%.*]] = extractelement <32 x i8> [[TMP1]], i32 27 +; CHECK-NEXT: [[LOAD2859:%.*]] = extractelement <32 x i8> [[TMP1]], i32 28 +; CHECK-NEXT: [[GAPFILL2860:%.*]] = extractelement <32 x i8> [[TMP1]], i32 29 +; CHECK-NEXT: [[GAPFILL3061:%.*]] = extractelement <32 x i8> [[TMP1]], i32 30 +; CHECK-NEXT: [[LOAD3162:%.*]] = extractelement <32 x i8> [[TMP1]], i32 31 +; CHECK-NEXT: store i8 [[LOAD031]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[OUTELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 3 +; CHECK-NEXT: store i8 [[LOAD334]], ptr addrspace(1) [[OUTELEM3]], align 1 +; CHECK-NEXT: [[OUTELEM4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 4 +; CHECK-NEXT: store i8 [[LOAD435]], ptr addrspace(1) [[OUTELEM4]], align 4 +; CHECK-NEXT: [[OUTELEM7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 7 +; CHECK-NEXT: store i8 [[LOAD738]], ptr addrspace(1) [[OUTELEM7]], align 1 +; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i8 [[LOAD839]], ptr addrspace(1) [[OUTELEM8]], align 8 +; CHECK-NEXT: [[OUTELEM11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 11 +; CHECK-NEXT: store i8 [[LOAD1142]], ptr addrspace(1) [[OUTELEM11]], align 1 +; CHECK-NEXT: [[OUTELEM12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 12 +; CHECK-NEXT: store i8 [[LOAD1243]], ptr addrspace(1) [[OUTELEM12]], align 4 +; CHECK-NEXT: [[OUTELEM15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 15 +; CHECK-NEXT: store i8 [[LOAD1546]], ptr addrspace(1) [[OUTELEM15]], align 1 +; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16 +; CHECK-NEXT: store i8 [[LOAD1647]], ptr addrspace(1) [[OUTELEM16]], align 16 +; CHECK-NEXT: [[OUTELEM19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 19 +; CHECK-NEXT: store i8 [[LOAD1950]], ptr addrspace(1) [[OUTELEM19]], align 1 +; CHECK-NEXT: [[OUTELEM20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20 +; CHECK-NEXT: store i8 [[LOAD2051]], ptr addrspace(1) [[OUTELEM20]], align 4 +; CHECK-NEXT: [[OUTELEM23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 23 +; CHECK-NEXT: store i8 [[LOAD2354]], ptr addrspace(1) [[OUTELEM23]], align 1 +; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i8 [[LOAD2455]], ptr addrspace(1) [[OUTELEM24]], align 8 +; CHECK-NEXT: [[OUTELEM27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 27 +; CHECK-NEXT: store i8 [[LOAD2758]], ptr addrspace(1) [[OUTELEM27]], align 1 +; CHECK-NEXT: [[OUTELEM28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 28 +; CHECK-NEXT: store i8 [[LOAD2859]], ptr addrspace(1) [[OUTELEM28]], align 4 +; CHECK-NEXT: [[OUTELEM31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 31 +; CHECK-NEXT: store i8 
[[LOAD3162]], ptr addrspace(1) [[OUTELEM31]], align 1 +; CHECK-NEXT: ret void +; + %load0 = load i8, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 3 + %load3 = load i8, ptr addrspace(1) %getElem3, align 1 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load4 = load i8, ptr addrspace(1) %getElem4, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 7 + %load7 = load i8, ptr addrspace(1) %getElem7, align 1 + %getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load8 = load i8, ptr addrspace(1) %getElem8, align 8 + %getElem11 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 11 + %load11 = load i8, ptr addrspace(1) %getElem11, align 1 + %getElem12 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load12 = load i8, ptr addrspace(1) %getElem12, align 4 + %getElem15 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 15 + %load15 = load i8, ptr addrspace(1) %getElem15, align 1 + %getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load16 = load i8, ptr addrspace(1) %getElem16, align 16 + %getElem19 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 19 + %load19 = load i8, ptr addrspace(1) %getElem19, align 1 + %getElem20 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 20 + %load20 = load i8, ptr addrspace(1) %getElem20, align 4 + %getElem23 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 23 + %load23 = load i8, ptr addrspace(1) %getElem23, align 1 + %getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %load24 = load i8, ptr addrspace(1) %getElem24, align 8 + %getElem27 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 27 + %load27 = load i8, ptr addrspace(1) %getElem27, align 1 + %getElem28 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 28 + %load28 = load i8, ptr addrspace(1) %getElem28, align 4 + %getElem31 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 31 + %load31 = load i8, ptr addrspace(1) %getElem31, align 1 + + store i8 %load0, ptr addrspace(1) %out, align 32 + %outElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 3 + store i8 %load3, ptr addrspace(1) %outElem3, align 1 + %outElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i8 %load4, ptr addrspace(1) %outElem4, align 4 + %outElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 7 + store i8 %load7, ptr addrspace(1) %outElem7, align 1 + %outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i8 %load8, ptr addrspace(1) %outElem8, align 8 + %outElem11 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 11 + store i8 %load11, ptr addrspace(1) %outElem11, align 1 + %outElem12 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i8 %load12, ptr addrspace(1) %outElem12, align 4 + %outElem15 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 15 + store i8 %load15, ptr addrspace(1) %outElem15, align 1 + %outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i8 %load16, ptr addrspace(1) %outElem16, align 16 + %outElem19 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 19 + store i8 %load19, ptr addrspace(1) %outElem19, align 1 + %outElem20 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20 + store i8 %load20, ptr addrspace(1) %outElem20, align 4 + %outElem23 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 23 + store i8 %load23, ptr addrspace(1) %outElem23, align 1 + %outElem24 = getelementptr inbounds i8, ptr 
addrspace(1) %out, i32 24 + store i8 %load24, ptr addrspace(1) %outElem24, align 8 + %outElem27 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 27 + store i8 %load27, ptr addrspace(1) %outElem27, align 1 + %outElem28 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i8 %load28, ptr addrspace(1) %outElem28, align 4 + %outElem31 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 31 + store i8 %load31, ptr addrspace(1) %outElem31, align 1 + + ret void +} + +; This test has 32-bytes of i16s with a 2-element gap in the middle of each 4-element chunk. +; i16s are not supported by masked stores on the target, so the stores will not be vectorized. +; The loads, on the other hand, get gap filled. +define void @cantMaski16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @cantMaski16( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[LOAD015:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL217:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD318:%.*]] = extractelement <16 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD419:%.*]] = extractelement <16 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[GAPFILL420:%.*]] = extractelement <16 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[GAPFILL621:%.*]] = extractelement <16 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[LOAD722:%.*]] = extractelement <16 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: [[LOAD823:%.*]] = extractelement <16 x i16> [[TMP1]], i32 8 +; CHECK-NEXT: [[GAPFILL824:%.*]] = extractelement <16 x i16> [[TMP1]], i32 9 +; CHECK-NEXT: [[GAPFILL1025:%.*]] = extractelement <16 x i16> [[TMP1]], i32 10 +; CHECK-NEXT: [[LOAD1126:%.*]] = extractelement <16 x i16> [[TMP1]], i32 11 +; CHECK-NEXT: [[LOAD1227:%.*]] = extractelement <16 x i16> [[TMP1]], i32 12 +; CHECK-NEXT: [[GAPFILL1228:%.*]] = extractelement <16 x i16> [[TMP1]], i32 13 +; CHECK-NEXT: [[GAPFILL1429:%.*]] = extractelement <16 x i16> [[TMP1]], i32 14 +; CHECK-NEXT: [[LOAD1530:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15 +; CHECK-NEXT: store i16 [[LOAD015]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[OUTELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 6 +; CHECK-NEXT: store i16 [[LOAD318]], ptr addrspace(1) [[OUTELEM6]], align 2 +; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i16 [[LOAD419]], ptr addrspace(1) [[OUTELEM8]], align 8 +; CHECK-NEXT: [[OUTELEM14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 14 +; CHECK-NEXT: store i16 [[LOAD722]], ptr addrspace(1) [[OUTELEM14]], align 2 +; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16 +; CHECK-NEXT: store i16 [[LOAD823]], ptr addrspace(1) [[OUTELEM16]], align 16 +; CHECK-NEXT: [[OUTELEM22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 22 +; CHECK-NEXT: store i16 [[LOAD1126]], ptr addrspace(1) [[OUTELEM22]], align 2 +; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i16 [[LOAD1227]], ptr addrspace(1) [[OUTELEM24]], align 8 +; CHECK-NEXT: [[OUTELEM30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 30 +; CHECK-NEXT: store i16 [[LOAD1530]], ptr addrspace(1) [[OUTELEM30]], align 2 +; CHECK-NEXT: ret void +; 
+ %load0 = load i16, ptr addrspace(1) %in, align 32 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 6 + %load3 = load i16, ptr addrspace(1) %getElem6, align 2 + %getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load4 = load i16, ptr addrspace(1) %getElem8, align 8 + %getElem14 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 14 + %load7 = load i16, ptr addrspace(1) %getElem14, align 2 + %getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load8 = load i16, ptr addrspace(1) %getElem16, align 16 + %getElem22 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 22 + %load11 = load i16, ptr addrspace(1) %getElem22, align 2 + %getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %load12 = load i16, ptr addrspace(1) %getElem24, align 8 + %getElem30 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 30 + %load15 = load i16, ptr addrspace(1) %getElem30, align 2 + + store i16 %load0, ptr addrspace(1) %out, align 32 + %outElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 6 + store i16 %load3, ptr addrspace(1) %outElem6, align 2 + %outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i16 %load4, ptr addrspace(1) %outElem8, align 8 + %outElem14 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 14 + store i16 %load7, ptr addrspace(1) %outElem14, align 2 + %outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i16 %load8, ptr addrspace(1) %outElem16, align 16 + %outElem22 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 22 + store i16 %load11, ptr addrspace(1) %outElem22, align 2 + %outElem24 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i16 %load12, ptr addrspace(1) %outElem24, align 8 + %outElem30 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 30 + store i16 %load15, ptr addrspace(1) %outElem30, align 2 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll index 2d3c289c2a12b..e031daab6d786 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll @@ -40,8 +40,7 @@ define void @int8x3a4(ptr nocapture align 4 %ptr) { ret void ; CHECK-LABEL: @int8x3a4 -; CHECK: load <2 x i8> -; CHECK: load i8 +; CHECK: load <4 x i8> ; CHECK: store <2 x i8> ; CHECK: store i8 }
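The int8x3a4 check update above is the load-widening case enabled by the new isLegalToWidenLoads() hook for NVPTX: three contiguous i8 loads from a 4-byte-aligned pointer are now widened into a single <4 x i8> load instead of a <2 x i8> load plus a scalar i8 load. A minimal sketch of the expected shape (the value names here are illustrative, not taken from the test's autogenerated output):

  %wide = load <4 x i8>, ptr %ptr, align 4    ; widened load covering bytes 0..3
  %b0 = extractelement <4 x i8> %wide, i32 0  ; replaces the first scalar i8 load
  %b1 = extractelement <4 x i8> %wide, i32 1  ; replaces the second scalar i8 load
  %b2 = extractelement <4 x i8> %wide, i32 2  ; replaces the third scalar i8 load
  ; the fourth element is the extra element created by the vectorizer; its value is unused

The stores in that test keep their <2 x i8> plus i8 form, consistent with the masked-store tests above where i8 element types are not masked-store candidates on this target.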