diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index bb95265a794a0..f431535c722ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,6 +85,42 @@ static cl::opt
              "when sorting profitable allocas"),
     cl::init(4));
 
+// We support vector indices of the form (A * stride) + B
+// All parts are optional.
+struct GEPToVectorIndex {
+  Value *VarIndex = nullptr;         // defaults to 0
+  ConstantInt *VarMul = nullptr;     // defaults to 1
+  ConstantInt *ConstIndex = nullptr; // defaults to 0
+  Value *Full = nullptr;
+};
+
+struct MemTransferInfo {
+  ConstantInt *SrcIndex = nullptr;
+  ConstantInt *DestIndex = nullptr;
+};
+
+// Analysis for planning the different strategies of alloca promotion.
+struct AllocaAnalysis {
+  AllocaInst *Alloca = nullptr;
+  SmallVector Pointers;
+  SmallVector Uses;
+  unsigned Score = 0;
+  bool HaveSelectOrPHI = false;
+  struct {
+    FixedVectorType *Ty = nullptr;
+    SmallVector Worklist;
+    SmallVector UsersToRemove;
+    MapVector GEPVectorIdx;
+    MapVector TransferInfo;
+  } Vector;
+  struct {
+    bool Enable = false;
+    SmallVector Worklist;
+  } LDS;
+
+  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
+};
+
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
 private:
@@ -106,10 +142,7 @@ class AMDGPUPromoteAllocaImpl {
   std::pair getLocalSizeYZ(IRBuilder<> &Builder);
   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
 
-  /// BaseAlloca is the alloca root the search started from.
-  /// Val may be that alloca or a recursive user of it.
-  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
-                               std::vector &WorkList) const;
+  bool collectAllocaUses(AllocaAnalysis &AA) const;
 
   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
@@ -122,10 +155,13 @@ class AMDGPUPromoteAllocaImpl {
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
 
-  bool tryPromoteAllocaToVector(AllocaInst &I);
-  bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
+  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
+  void analyzePromoteToVector(AllocaAnalysis &AA) const;
+  void promoteAllocaToVector(AllocaAnalysis &AA);
+  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
+  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS);
 
-  void sortAllocasToPromote(SmallVectorImpl &Allocas);
+  void scoreAlloca(AllocaAnalysis &AA) const;
 
   void setFunctionLimits(const Function &F);
 
@@ -236,53 +272,77 @@ FunctionPass *llvm::createAMDGPUPromoteAlloca() {
   return new AMDGPUPromoteAlloca();
 }
 
-static void collectAllocaUses(AllocaInst &Alloca,
-                              SmallVectorImpl &Uses) {
-  SmallVector WorkList({&Alloca});
+bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
+  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
+    LLVM_DEBUG(dbgs() << "  Cannot promote alloca: " << Msg << "\n"
+                      << "    " << *Inst << "\n");
+    return false;
+  };
+
+  SmallVector WorkList({AA.Alloca});
   while (!WorkList.empty()) {
     auto *Cur = WorkList.pop_back_val();
+    if (find(AA.Pointers, Cur) != AA.Pointers.end())
+      continue;
+    AA.Pointers.push_back(Cur);
     for (auto &U : Cur->uses()) {
-      Uses.push_back(&U);
+      auto *Inst = cast(U.getUser());
+      if (isa(Inst)) {
+        if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
+          return RejectUser(Inst, "pointer escapes via store");
+        }
+      }
+      AA.Uses.push_back(&U);
 
-      if (isa(U.getUser()))
-        WorkList.push_back(cast(U.getUser()));
+      if (isa(U.getUser())) {
+        WorkList.push_back(Inst);
+      } else if (auto *SI = dyn_cast(Inst)) {
+        // Only promote a select if we know that the other select operand is
+        // from another pointer that will also be promoted.
+        if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
+          return RejectUser(Inst, "select from mixed objects");
+        WorkList.push_back(Inst);
+        AA.HaveSelectOrPHI = true;
+      } else if (auto *Phi = dyn_cast(Inst)) {
+        // Repeat for phis.
+
+        // TODO: Handle more complex cases. We should be able to replace loops
+        // over arrays.
+        switch (Phi->getNumIncomingValues()) {
+        case 1:
+          break;
+        case 2:
+          if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
+            return RejectUser(Inst, "phi from mixed objects");
+          break;
+        default:
+          return RejectUser(Inst, "phi with too many operands");
+        }
+
+        WorkList.push_back(Inst);
+        AA.HaveSelectOrPHI = true;
+      }
     }
   }
+  return true;
 }
 
-void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
-    SmallVectorImpl &Allocas) {
-  DenseMap Scores;
-
-  for (auto *Alloca : Allocas) {
-    LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
-    unsigned &Score = Scores[Alloca];
-    // Increment score by one for each user + a bonus for users within loops.
-    SmallVector Uses;
-    collectAllocaUses(*Alloca, Uses);
-    for (auto *U : Uses) {
-      Instruction *Inst = cast(U->getUser());
-      if (isa(Inst))
-        continue;
-      unsigned UserScore =
-          1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
-      LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
-      Score += UserScore;
-    }
-    LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
+void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
+  LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
+  unsigned Score = 0;
+  // Increment score by one for each user + a bonus for users within loops.
+ for (auto *U : AA.Uses) { + Instruction *Inst = cast(U->getUser()); + if (isa(Inst) || isa(Inst) || + isa(Inst)) + continue; + unsigned UserScore = + 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); + LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); + Score += UserScore; } - - stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) { - return Scores.at(A) > Scores.at(B); - }); - - // clang-format off - LLVM_DEBUG( - dbgs() << "Sorted Worklist:\n"; - for (auto *A: Allocas) - dbgs() << " " << *A << "\n"; - ); - // clang-format on + LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); + AA.Score = Score; } void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { @@ -319,27 +379,48 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { : (MaxVGPRs * 32)) / VGPRBudgetRatio; - SmallVector Allocas; + std::vector Allocas; for (Instruction &I : F.getEntryBlock()) { if (AllocaInst *AI = dyn_cast(&I)) { // Array allocations are probably not worth handling, since an allocation // of the array type is the canonical form. if (!AI->isStaticAlloca() || AI->isArrayAllocation()) continue; - Allocas.push_back(AI); + + LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n'); + + AllocaAnalysis AA{AI}; + if (collectAllocaUses(AA)) { + analyzePromoteToVector(AA); + if (PromoteToLDS) + analyzePromoteToLDS(AA); + if (AA.Vector.Ty || AA.LDS.Enable) { + scoreAlloca(AA); + Allocas.push_back(std::move(AA)); + } + } } } - sortAllocasToPromote(Allocas); + stable_sort(Allocas, + [](const auto &A, const auto &B) { return A.Score > B.Score; }); + + // clang-format off + LLVM_DEBUG( + dbgs() << "Sorted Worklist:\n"; + for (const auto &AA : Allocas) + dbgs() << " " << *AA.Alloca << "\n"; + ); + // clang-format on bool Changed = false; - for (AllocaInst *AI : Allocas) { - const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType()); - // First, check if we have enough budget to vectorize this alloca. - if (AllocaCost <= VectorizationBudget) { - // If we do, attempt vectorization, otherwise, fall through and try - // promoting to LDS instead. - if (tryPromoteAllocaToVector(*AI)) { + for (AllocaAnalysis &AA : Allocas) { + if (AA.Vector.Ty) { + const unsigned AllocaCost = + DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()); + // First, check if we have enough budget to vectorize this alloca. + if (AllocaCost <= VectorizationBudget) { + promoteAllocaToVector(AA); Changed = true; assert((VectorizationBudget - AllocaCost) < VectorizationBudget && "Underflow!"); @@ -347,14 +428,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" << VectorizationBudget << "\n"); continue; + } else { + LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" + << AllocaCost << ", budget:" << VectorizationBudget + << "): " << *AA.Alloca << "\n"); } - } else { - LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" - << AllocaCost << ", budget:" << VectorizationBudget - << "): " << *AI << "\n"); } - if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS)) + if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS)) Changed = true; } @@ -365,11 +446,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { return Changed; } -struct MemTransferInfo { - ConstantInt *SrcIndex = nullptr; - ConstantInt *DestIndex = nullptr; -}; - // Checks if the instruction I is a memset user of the alloca AI that we can // deal with. 
Currently, only non-volatile memsets that affect the whole alloca // are handled. @@ -387,23 +463,49 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile(); } -static Value *calculateVectorIndex( - Value *Ptr, const std::map &GEPIdx) { - auto *GEP = dyn_cast(Ptr->stripPointerCasts()); - if (!GEP) - return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext())); +static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) { + IRBuilder<> B(Ptr->getContext()); + + Ptr = Ptr->stripPointerCasts(); + if (Ptr == AA.Alloca) + return B.getInt32(0); + + auto *GEP = cast(Ptr); + auto I = AA.Vector.GEPVectorIdx.find(GEP); + assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!"); - auto I = GEPIdx.find(GEP); - assert(I != GEPIdx.end() && "Must have entry for GEP!"); + if (!I->second.Full) { + Value *Result = nullptr; + B.SetInsertPoint(GEP); + + if (I->second.VarIndex) { + Result = I->second.VarIndex; + Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty()); + + if (I->second.VarMul) + Result = B.CreateMul(Result, I->second.VarMul); + } - Value *IndexValue = I->second; - assert(IndexValue && "index value missing from GEP index map"); - return IndexValue; + if (I->second.ConstIndex) { + if (Result) { + Result = B.CreateAdd(Result, I->second.ConstIndex); + } else { + Result = I->second.ConstIndex; + } + } + + if (!Result) + Result = B.getInt32(0); + + I->second.Full = Result; + } + + return I->second.Full; } -static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, - Type *VecElemTy, const DataLayout &DL, - SmallVector &NewInsts) { +static std::optional +computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, + Type *VecElemTy, const DataLayout &DL) { // TODO: Extracting a "multiple of X" from a GEP might be a useful generic // helper. LLVMContext &Ctx = GEP->getContext(); @@ -431,7 +533,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, Value *CurPtr = GEP; while (auto *CurGEP = dyn_cast(CurPtr)) { if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) - return nullptr; + return {}; // Move to the next outer pointer. 
CurPtr = CurGEP->getPointerOperand(); @@ -441,87 +543,64 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy); if (VarOffsets.size() > 1) - return nullptr; + return {}; APInt IndexQuot; int64_t Rem; APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem); if (Rem != 0) - return nullptr; - if (VarOffsets.size() == 0) - return ConstantInt::get(Ctx, IndexQuot); + return {}; + + GEPToVectorIndex Result; - IRBuilder<> Builder(GEP); + if (!ConstOffset.isZero()) + Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW)); + + if (VarOffsets.empty()) + return Result; const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); if (Rem != 0 || OffsetQuot.isZero()) - return nullptr; + return {}; - Value *Offset = VarOffset.first; - auto *OffsetType = dyn_cast(Offset->getType()); + Result.VarIndex = VarOffset.first; + auto *OffsetType = dyn_cast(Result.VarIndex->getType()); if (!OffsetType) - return nullptr; + return {}; if (!OffsetQuot.isOne()) { - ConstantInt *ConstMul = - ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth())); - Offset = Builder.CreateMul(Offset, ConstMul); - if (Instruction *NewInst = dyn_cast(Offset)) - NewInsts.push_back(NewInst); + Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW)); } - if (ConstOffset.isZero()) - return Offset; - - ConstantInt *ConstIndex = - ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth())); - Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex); - if (Instruction *NewInst = dyn_cast(IndexAdd)) - NewInsts.push_back(NewInst); - return IndexAdd; + + return Result; } /// Promotes a single user of the alloca to a vector form. /// /// \param Inst Instruction to be promoted. /// \param DL Module Data Layout. -/// \param VectorTy Vectorized Type. +/// \param AA Alloca Analysis. /// \param VecStoreSize Size of \p VectorTy in bytes. /// \param ElementSize Size of \p VectorTy element type in bytes. -/// \param TransferInfo MemTransferInst info map. -/// \param GEPVectorIdx GEP -> VectorIdx cache. /// \param CurVal Current value of the vector (e.g. last stored value) /// \param[out] DeferredLoads \p Inst is added to this vector if it can't /// be promoted now. This happens when promoting requires \p /// CurVal, but \p CurVal is nullptr. /// \return the stored value if \p Inst would have written to the alloca, or /// nullptr otherwise. -static Value *promoteAllocaUserToVector( - Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, - unsigned VecStoreSize, unsigned ElementSize, - DenseMap &TransferInfo, - std::map &GEPVectorIdx, Value *CurVal, - SmallVectorImpl &DeferredLoads) { +static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, + AllocaAnalysis &AA, + unsigned VecStoreSize, + unsigned ElementSize, + function_ref GetCurVal) { // Note: we use InstSimplifyFolder because it can leverage the DataLayout // to do more folding, especially in the case of vector splats. IRBuilder Builder(Inst->getContext(), InstSimplifyFolder(DL)); Builder.SetInsertPoint(Inst); - const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { - if (CurVal) - return CurVal; - - // If the current value is not known, insert a dummy load and lower it on - // the second pass. 
- LoadInst *Dummy = - Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()), - "promotealloca.dummyload"); - DeferredLoads.push_back(Dummy); - return Dummy; - }; - const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, Type *PtrTy) -> Value * { assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); @@ -537,18 +616,13 @@ static Value *promoteAllocaUserToVector( Val, FixedVectorType::get(EltTy, NumPtrElts)); }; - Type *VecEltTy = VectorTy->getElementType(); + Type *VecEltTy = AA.Vector.Ty->getElementType(); switch (Inst->getOpcode()) { case Instruction::Load: { - // Loads can only be lowered if the value is known. - if (!CurVal) { - DeferredLoads.push_back(cast(Inst)); - return nullptr; - } - - Value *Index = calculateVectorIndex( - cast(Inst)->getPointerOperand(), GEPVectorIdx); + Value *CurVal = GetCurVal(); + Value *Index = + calculateVectorIndex(cast(Inst)->getPointerOperand(), AA); // We're loading the full vector. Type *AccessTy = Inst->getType(); @@ -604,7 +678,7 @@ static Value *promoteAllocaUserToVector( // to know the current value. If this is a store of a single element, we // need to know the value. StoreInst *SI = cast(Inst); - Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx); + Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA); Value *Val = SI->getValueOperand(); // We're storing the full vector, we can handle this without knowing CurVal. @@ -614,9 +688,9 @@ static Value *promoteAllocaUserToVector( if (CI->isZeroValue() && AccessSize == VecStoreSize) { if (AccessTy->isPtrOrPtrVectorTy()) Val = CreateTempPtrIntCast(Val, AccessTy); - else if (VectorTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, VectorTy); - return Builder.CreateBitOrPointerCast(Val, VectorTy); + else if (AA.Vector.Ty->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, AA.Vector.Ty); + return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty); } } @@ -625,7 +699,7 @@ static Value *promoteAllocaUserToVector( assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); const unsigned NumWrittenElts = AccessSize / DL.getTypeStoreSize(VecEltTy); - const unsigned NumVecElts = VectorTy->getNumElements(); + const unsigned NumVecElts = AA.Vector.Ty->getNumElements(); auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); @@ -636,7 +710,7 @@ static Value *promoteAllocaUserToVector( Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); - Value *CurVec = GetOrLoadCurrentVectorValue(); + Value *CurVec = GetCurVal(); for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts); K < NumElts; ++K) { Value *CurIdx = @@ -649,22 +723,21 @@ static Value *promoteAllocaUserToVector( if (Val->getType() != VecEltTy) Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); - return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, - Index); + return Builder.CreateInsertElement(GetCurVal(), Val, Index); } case Instruction::Call: { if (auto *MTI = dyn_cast(Inst)) { // For memcpy, we need to know curval. 
ConstantInt *Length = cast(MTI->getLength()); unsigned NumCopied = Length->getZExtValue() / ElementSize; - MemTransferInfo *TI = &TransferInfo[MTI]; + MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI]; unsigned SrcBegin = TI->SrcIndex->getZExtValue(); unsigned DestBegin = TI->DestIndex->getZExtValue(); SmallVector Mask; - for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { + for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) { if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { - Mask.push_back(SrcBegin < VectorTy->getNumElements() + Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements() ? SrcBegin++ : PoisonMaskElem); } else { @@ -672,7 +745,7 @@ static Value *promoteAllocaUserToVector( } } - return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); + return Builder.CreateShuffleVector(GetCurVal(), Mask); } if (auto *MSI = dyn_cast(Inst)) { @@ -693,14 +766,14 @@ static Value *promoteAllocaUserToVector( Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } - return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); + return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt); } if (auto *Intr = dyn_cast(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { Intr->replaceAllUsesWith( Builder.getIntN(Intr->getType()->getIntegerBitWidth(), - DL.getTypeAllocSize(VectorTy))); + DL.getTypeAllocSize(AA.Vector.Ty))); return nullptr; } } @@ -791,16 +864,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, return I; } -// FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { - LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); - +FixedVectorType * +AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const { if (DisablePromoteAllocaToVector) { - LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n"); - return false; + LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n"); + return nullptr; } - Type *AllocaTy = Alloca.getAllocatedType(); auto *VectorTy = dyn_cast(AllocaTy); if (auto *ArrayTy = dyn_cast(AllocaTy)) { uint64_t NumElems = 1; @@ -832,10 +902,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } } } - if (!VectorTy) { LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); - return false; + return nullptr; } const unsigned MaxElements = @@ -845,46 +914,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { VectorTy->getNumElements() < 2) { LLVM_DEBUG(dbgs() << " " << *VectorTy << " has an unsupported number of elements\n"); - return false; + return nullptr; } - std::map GEPVectorIdx; - SmallVector WorkList; - SmallVector UsersToRemove; - SmallVector DeferredInsts; - SmallVector NewGEPInsts; - DenseMap TransferInfo; - - const auto RejectUser = [&](Instruction *Inst, Twine Msg) { - LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" - << " " << *Inst << "\n"); - for (auto *Inst : reverse(NewGEPInsts)) - Inst->eraseFromParent(); - return false; - }; - - SmallVector Uses; - collectAllocaUses(Alloca, Uses); - - LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); - Type *VecEltTy = VectorTy->getElementType(); unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " "does not match the type's size\n"); - 
return false; + return nullptr; + } + + return VectorTy; +} + +void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const { + if (AA.HaveSelectOrPHI) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n"); + return; } - unsigned ElementSize = ElementSizeInBits / 8; + + Type *AllocaTy = AA.Alloca->getAllocatedType(); + AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy); + if (!AA.Vector.Ty) + return; + + const auto RejectUser = [&](Instruction *Inst, Twine Msg) { + LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" + << " " << *Inst << "\n"); + AA.Vector.Ty = nullptr; + }; + + Type *VecEltTy = AA.Vector.Ty->getElementType(); + unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; assert(ElementSize > 0); - for (auto *U : Uses) { + for (auto *U : AA.Uses) { Instruction *Inst = cast(U->getUser()); if (Value *Ptr = getLoadStorePointerOperand(Inst)) { - // This is a store of the pointer, not to the pointer. - if (isa(Inst) && - U->getOperandNo() != StoreInst::getPointerOperandIndex()) - return RejectUser(Inst, "pointer is being stored"); + assert(!isa(Inst) || + U->getOperandNo() == StoreInst::getPointerOperandIndex()); Type *AccessTy = getLoadStoreType(Inst); if (AccessTy->isAggregateType()) @@ -900,34 +969,35 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Ptr = Ptr->stripPointerCasts(); // Alloca already accessed as vector. - if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == - DL->getTypeStoreSize(AccessTy)) { - WorkList.push_back(Inst); + if (Ptr == AA.Alloca && + DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) == + DL->getTypeStoreSize(AccessTy)) { + AA.Vector.Worklist.push_back(Inst); continue; } - if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) + if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL)) return RejectUser(Inst, "not a supported access type"); - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } if (auto *GEP = dyn_cast(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. 
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); + auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); - GEPVectorIdx[GEP] = Index; - UsersToRemove.push_back(Inst); + AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value()); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (MemSetInst *MSI = dyn_cast(Inst); - MSI && isSupportedMemset(MSI, &Alloca, *DL)) { - WorkList.push_back(Inst); + MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) { + AA.Vector.Worklist.push_back(Inst); continue; } @@ -940,21 +1010,22 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "mem transfer inst length is non-constant or " "not a multiple of the vector element size"); - if (TransferInfo.try_emplace(TransferInst).second) { - DeferredInsts.push_back(Inst); - WorkList.push_back(Inst); - } - auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { - GetElementPtrInst *GEP = dyn_cast(Ptr); - if (Ptr != &Alloca && !GEPVectorIdx.count(GEP)) - return nullptr; + if (Ptr == AA.Alloca) + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); - return dyn_cast(calculateVectorIndex(Ptr, GEPVectorIdx)); + GetElementPtrInst *GEP = cast(Ptr); + const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second; + if (GEPI.VarIndex) + return nullptr; + if (GEPI.ConstIndex) + return GEPI.ConstIndex; + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); }; + MemTransferInfo *TI = + &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second; unsigned OpNum = U->getOperandNo(); - MemTransferInfo *TI = &TransferInfo[TransferInst]; if (OpNum == 0) { Value *Dest = TransferInst->getDest(); ConstantInt *Index = getPointerIndexOfAlloca(Dest); @@ -974,7 +1045,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *Intr = dyn_cast(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } } @@ -983,97 +1054,106 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (isAssumeLikeIntrinsic(Inst)) { if (!Inst->use_empty()) return RejectUser(Inst, "assume-like intrinsic cannot have any users"); - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (isa(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast(U)); })) { - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } return RejectUser(Inst, "unhandled alloca user"); } - while (!DeferredInsts.empty()) { - Instruction *Inst = DeferredInsts.pop_back_val(); - MemTransferInst *TransferInst = cast(Inst); - // TODO: Support the case if the pointers are from different alloca or - // from different address spaces. - MemTransferInfo &Info = TransferInfo[TransferInst]; - if (!Info.SrcIndex || !Info.DestIndex) - return RejectUser( - Inst, "mem transfer inst is missing constant src and/or dst index"); + // Follow-up check to ensure we've seen both sides of all transfer insts. 
+ for (const auto &Entry : AA.Vector.TransferInfo) { + const MemTransferInfo &TI = Entry.second; + if (!TI.SrcIndex || !TI.DestIndex) + return RejectUser(Entry.first, + "mem transfer inst between different objects"); + AA.Vector.Worklist.push_back(Entry.first); } +} + +void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) { + LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n'); + LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType() + << " -> " << *AA.Vector.Ty << '\n'); + const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty); - LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " - << *VectorTy << '\n'); - const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy); + Type *VecEltTy = AA.Vector.Ty->getElementType(); + const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; // Alloca is uninitialized memory. Imitate that by making the first value // undef. SSAUpdater Updater; - Updater.Initialize(VectorTy, "promotealloca"); + Updater.Initialize(AA.Vector.Ty, "promotealloca"); - BasicBlock *EntryBB = Alloca.getParent(); + BasicBlock *EntryBB = AA.Alloca->getParent(); BasicBlock::iterator InitInsertPos = - skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator()); - // Alloca memory is undefined to begin, not poison. - Value *AllocaInitValue = - new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos); - AllocaInitValue->takeName(&Alloca); + skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator()); + IRBuilder<> Builder(&*InitInsertPos); + Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty)); + AllocaInitValue->takeName(AA.Alloca); - Updater.AddAvailableValue(EntryBB, AllocaInitValue); + Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue); - // First handle the initial worklist. - SmallVector DeferredLoads; - forEachWorkListItem(WorkList, [&](Instruction *I) { + // First handle the initial worklist, in basic block order. + // + // Insert a placeholder whenever we need the vector value at the top of a + // basic block. + SmallVector Placeholders; + forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) { BasicBlock *BB = I->getParent(); - // On the first pass, we only take values that are trivially known, i.e. - // where AddAvailableValue was already called in this block. - Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.FindValueForBlock(BB), DeferredLoads); + auto GetCurVal = [&]() -> Value * { + if (Value *CurVal = Updater.FindValueForBlock(BB)) + return CurVal; + + // If the current value in the basic block is not yet known, insert a + // placeholder that we will replace later. + IRBuilder<> Builder(I); + auto *Placeholder = cast(Builder.CreateFreeze( + PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder")); + Placeholders.push_back(Placeholder); + Updater.AddAvailableValue(BB, Placeholder); + return Placeholder; + }; + + Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize, + ElementSize, GetCurVal); if (Result) Updater.AddAvailableValue(BB, Result); }); - // Then handle deferred loads. - forEachWorkListItem(DeferredLoads, [&](Instruction *I) { - SmallVector NewDLs; - BasicBlock *BB = I->getParent(); - // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always - // get a value, inserting PHIs as needed. 
- Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs); - if (Result) - Updater.AddAvailableValue(BB, Result); - assert(NewDLs.empty() && "No more deferred loads should be queued!"); - }); + // Now fixup the placeholders. + for (Instruction *Placeholder : Placeholders) { + Placeholder->replaceAllUsesWith( + Updater.GetValueInMiddleOfBlock(Placeholder->getParent())); + Placeholder->eraseFromParent(); + } // Delete all instructions. On the first pass, new dummy loads may have been // added so we need to collect them too. - DenseSet InstsToDelete(WorkList.begin(), WorkList.end()); - InstsToDelete.insert_range(DeferredLoads); + DenseSet InstsToDelete(AA.Vector.Worklist.begin(), + AA.Vector.Worklist.end()); for (Instruction *I : InstsToDelete) { assert(I->use_empty()); I->eraseFromParent(); } // Delete all the users that are known to be removeable. - for (Instruction *I : reverse(UsersToRemove)) { + for (Instruction *I : reverse(AA.Vector.UsersToRemove)) { I->dropDroppableUses(); assert(I->use_empty()); I->eraseFromParent(); } // Alloca should now be dead too. - assert(Alloca.use_empty()); - Alloca.eraseFromParent(); - return true; + assert(AA.Alloca->use_empty()); + AA.Alloca->eraseFromParent(); } std::pair @@ -1247,61 +1327,78 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( return true; } -bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( - Value *BaseAlloca, Value *Val, std::vector &WorkList) const { +void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const { + if (DisablePromoteAllocaToLDS) { + LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); + return; + } - for (User *User : Val->users()) { - if (is_contained(WorkList, User)) - continue; + // Don't promote the alloca to LDS for shader calling conventions as the work + // item ID intrinsics are not supported for these calling conventions. + // Furthermore not all LDS is available for some of the stages. + const Function &ContainingFunction = *AA.Alloca->getFunction(); + CallingConv::ID CC = ContainingFunction.getCallingConv(); + + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + default: + LLVM_DEBUG( + dbgs() + << " promote alloca to LDS not supported with calling convention.\n"); + return; + } + + for (Use *Use : AA.Uses) { + auto *User = Use->getUser(); if (CallInst *CI = dyn_cast(User)) { if (!isCallPromotable(CI)) - return false; + return; - WorkList.push_back(User); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); continue; } Instruction *UseInst = cast(User); if (UseInst->getOpcode() == Instruction::PtrToInt) - return false; + return; if (LoadInst *LI = dyn_cast(UseInst)) { if (LI->isVolatile()) - return false; + return; continue; } if (StoreInst *SI = dyn_cast(UseInst)) { if (SI->isVolatile()) - return false; - - // Reject if the stored value is not the pointer operand. - if (SI->getPointerOperand() != Val) - return false; + return; continue; } if (AtomicRMWInst *RMW = dyn_cast(UseInst)) { if (RMW->isVolatile()) - return false; + return; continue; } if (AtomicCmpXchgInst *CAS = dyn_cast(UseInst)) { if (CAS->isVolatile()) - return false; + return; continue; } // Only promote a select if we know that the other select operand // is from another pointer that will also be promoted. 
if (ICmpInst *ICmp = dyn_cast(UseInst)) { - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) - return false; + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1)) + return; // May need to rewrite constant operands. - WorkList.push_back(ICmp); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(ICmp); continue; } @@ -1309,28 +1406,8 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // Be conservative if an address could be computed outside the bounds of // the alloca. if (!GEP->isInBounds()) - return false; - } else if (SelectInst *SI = dyn_cast(UseInst)) { - // Only promote a select if we know that the other select operand is from - // another pointer that will also be promoted. - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) - return false; - } else if (PHINode *Phi = dyn_cast(UseInst)) { - // Repeat for phis. - - // TODO: Handle more complex cases. We should be able to replace loops - // over arrays. - switch (Phi->getNumIncomingValues()) { - case 1: - break; - case 2: - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) - return false; - break; - default: - return false; - } - } else if (!isa(User)) { + return; + } else if (!isa(User)) { // Do not promote vector/aggregate type instructions. It is hard to track // their users. @@ -1338,15 +1415,14 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // // TODO: If we know the address is only observed through flat pointers, we // could still promote. - return false; + return; } - WorkList.push_back(User); - if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) - return false; + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); } - return true; + AA.LDS.Enable = true; } bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { @@ -1477,44 +1553,23 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } // FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, +bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS) { - LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); - - if (DisablePromoteAllocaToLDS) { - LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); - return false; - } - - const DataLayout &DL = Mod->getDataLayout(); - IRBuilder<> Builder(&I); - - const Function &ContainingFunction = *I.getFunction(); - CallingConv::ID CC = ContainingFunction.getCallingConv(); - - // Don't promote the alloca to LDS for shader calling conventions as the work - // item ID intrinsics are not supported for these calling conventions. - // Furthermore not all LDS is available for some of the stages. - switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - break; - default: - LLVM_DEBUG( - dbgs() - << " promote alloca to LDS not supported with calling convention.\n"); - return false; - } + LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n'); // Not likely to have sufficient local memory for promotion. 
if (!SufficientLDS) return false; + const DataLayout &DL = Mod->getDataLayout(); + IRBuilder<> Builder(AA.Alloca); + + const Function &ContainingFunction = *AA.Alloca->getParent()->getParent(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; - Align Alignment = - DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType()); + Align Alignment = DL.getValueOrABITypeAlignment( + AA.Alloca->getAlign(), AA.Alloca->getAllocatedType()); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. @@ -1524,7 +1579,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = - WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType()); + WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType()); NewSize += AllocSize; if (NewSize > LocalMemLimit) { @@ -1535,24 +1590,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, CurrentLocalMemUsage = NewSize; - std::vector WorkList; - - if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { - LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); - Function *F = I.getFunction(); + Function *F = AA.Alloca->getFunction(); - Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); + Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy), - Twine(F->getName()) + Twine('.') + I.getName(), nullptr, + Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(I.getAlign()); + GV->setAlignment(AA.Alloca->getAlign()); Value *TCntY, *TCntZ; @@ -1571,15 +1619,15 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID}; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); - I.mutateType(Offset->getType()); - I.replaceAllUsesWith(Offset); - I.eraseFromParent(); + AA.Alloca->mutateType(Offset->getType()); + AA.Alloca->replaceAllUsesWith(Offset); + AA.Alloca->eraseFromParent(); SmallVector DeferredIntrs; PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS); - for (Value *V : WorkList) { + for (Value *V : AA.LDS.Worklist) { CallInst *Call = dyn_cast(V); if (!Call) { if (ICmpInst *CI = dyn_cast(V)) { diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll index 63622e67e7d0b..7b64d8728cc24 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll @@ -262,14 +262,15 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(ptr %out) { ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 3, i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 4, i32 4 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 5, i32 5 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[SEL3]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP7]], 3 +; 
CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP17]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2 ; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2 ; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8 @@ -311,15 +312,16 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out) ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 3, i32 3 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 6 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[SEL3]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP17]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP8]], 6 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP18]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP19]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP18]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP20]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2 ; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2 ; CHECK-NEXT: store i64 [[ELEM]], ptr [[OUT]], align 8 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll index a865bf5058d6a..7ebb4ca262614 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll @@ -11,8 +11,9 @@ define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP5]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 
[[TMP7]] ; CHECK-NEXT: store i8 [[TMP6]], ptr [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void ; @@ -39,8 +40,9 @@ define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP5]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 [[TMP7]] ; CHECK-NEXT: store i32 [[TMP6]], ptr [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll index ab03177d1edc5..ae6157af2cf4c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll @@ -1,14 +1,16 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4 +; CHECK-LABEL: Analyzing: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %simpleuser, align 8 ; CHECK-NEXT: => Final Score:1 +; CHECK-LABEL: Analyzing: %manyusers = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4 -; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1 -; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4 -; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1 +; CHECK-NEXT: [+1]: store i64 %v0.add, ptr addrspace(5) %manyusers.1, align 8 +; CHECK-NEXT: [+1]: %v0 = load i64, ptr addrspace(5) %manyusers.1, align 8 +; CHECK-NEXT: [+1]: store i64 %v1.add, ptr addrspace(5) %manyusers.2, align 8 +; CHECK-NEXT: [+1]: %v1 = load i64, ptr addrspace(5) %manyusers.2, align 8 ; CHECK-NEXT: => Final Score:4 ; CHECK-NEXT: Sorted Worklist: ; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5) @@ -20,50 +22,52 @@ entry: ; should get a score of 4 %manyusers = alloca [4 x i64], align 4, addrspace(5) - store i32 42, ptr addrspace(5) %simpleuser + store i64 42, ptr addrspace(5) %simpleuser - %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2 - %v0 = load i8, ptr addrspace(5) %manyusers.1 - %v0.ext = zext i8 %v0 to i32 - store i32 %v0.ext, ptr addrspace(5) %manyusers.1 + %manyusers.1 = getelementptr i64, ptr addrspace(5) %manyusers, i64 2 + %v0 = load i64, ptr addrspace(5) %manyusers.1 + %v0.add = add i64 %v0, 1 + store i64 %v0.add, ptr addrspace(5) %manyusers.1 - %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1 - %v1 = load i8, ptr addrspace(5) %manyusers.2 - %v1.ext = zext i8 %v0 to i32 - store i32 %v1.ext, ptr addrspace(5) %manyusers.2 + %manyusers.2 = getelementptr i64, ptr addrspace(5) %manyusers, i64 1 + %v1 = load i64, ptr addrspace(5) %manyusers.2 + 
%v1.add = add i64 %v0, 1 + store i64 %v1.add, ptr addrspace(5) %manyusers.2 ret void } -; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4 -; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4 -; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4 -; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1 -; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4 -; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1 +; CHECK-LABEL: Analyzing: %stack = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+5]: store i64 32, ptr addrspace(5) %stack, align 8 +; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %stack, align 8 +; CHECK-NEXT: [+9]: store i64 32, ptr addrspace(5) %stack.1, align 8 +; CHECK-NEXT: [+5]: %outer = load i64, ptr addrspace(5) %stack.1, align 8 +; CHECK-NEXT: [+1]: store i64 64, ptr addrspace(5) %stack.2, align 8 +; CHECK-NEXT: [+9]: %inner = load i64, ptr addrspace(5) %stack.2, align 8 ; CHECK-NEXT: => Final Score:30 define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 { entry: ; should get a score of 1 %stack = alloca [4 x i64], align 4, addrspace(5) - %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4 - %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8 + %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 8 + %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 16 - store i32 42, ptr addrspace(5) %stack + store i64 42, ptr addrspace(5) %stack br label %loop.outer loop.outer: - store i32 32, ptr addrspace(5) %stack - %outer.cmp = load i1, ptr addrspace(5) %stack.1 + store i64 32, ptr addrspace(5) %stack + %outer = load i64, ptr addrspace(5) %stack.1 br label %loop.inner loop.inner: - store i32 32, ptr addrspace(5) %stack.1 - %inner.cmp = load i1, ptr addrspace(5) %stack.2 + store i64 32, ptr addrspace(5) %stack.1 + %inner = load i64, ptr addrspace(5) %stack.2 + %inner.cmp = icmp sge i64 %inner, 0 br i1 %inner.cmp, label %loop.inner, label %loop.outer exit: - store i32 64, ptr addrspace(5) %stack.2 + store i64 64, ptr addrspace(5) %stack.2 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll index 9fb73963153a2..aaec725f85890 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s ; GCN-LABEL: {{^}}float4_alloca_store4: ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4 diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll index 8e4cc2b0236c0..a7090960518af 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll @@ -1,11 +1,11 @@ -; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck 
-check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s ; Show that what the alloca promotion pass will do for non-atomic load/store. ; OPT-LABEL: @vector_alloca_not_atomic( ; -; OPT: extractelement <3 x i32> , i64 %index -define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) { +; OPT: extractelement <3 x i32> , i32 %index +define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i32 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 @@ -13,7 +13,7 @@ entry: store i32 0, ptr addrspace(5) %alloca store i32 1, ptr addrspace(5) %a1 store i32 2, ptr addrspace(5) %a2 - %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index %data = load i32, ptr addrspace(5) %tmp store i32 %data, ptr addrspace(1) %out ret void @@ -26,7 +26,7 @@ entry: ; OPT: store i32 1, ptr addrspace(5) ; OPT: store i32 2, ptr addrspace(5) ; OPT: load atomic i32, ptr addrspace(5) -define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) { +define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i32 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 @@ -34,7 +34,7 @@ entry: store i32 0, ptr addrspace(5) %alloca store i32 1, ptr addrspace(5) %a1 store i32 2, ptr addrspace(5) %a2 - %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index %data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4 store i32 %data, ptr addrspace(1) %out ret void @@ -47,7 +47,7 @@ entry: ; OPT: store atomic i32 1, ptr addrspace(5) ; OPT: store atomic i32 2, ptr addrspace(5) ; OPT: load i32, ptr addrspace(5) -define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) { +define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i32 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 @@ -55,7 +55,7 @@ entry: store atomic i32 0, ptr addrspace(5) %alloca release, align 4 store atomic i32 1, ptr addrspace(5) %a1 release, align 4 store atomic i32 2, ptr addrspace(5) %a2 release, align 4 - %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index %data = load i32, ptr addrspace(5) %tmp store i32 %data, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll index 9c05f4d16cb4e..4a29f7e53e93a 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -72,7 +72,8 @@ entry: ; OPT-NOT: alloca ; OPT: bb2: ; OPT: %promotealloca = phi <6 x float> [ zeroinitializer, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10 +; OPT: [[TMP:%tmp7.*]] = load float, ptr addrspace(1) %tmp5, align 4 +; OPT: %0 = insertelement <6 x float> %promotealloca, float [[TMP]], i32 %tmp10 ; OPT: .preheader: ; OPT: %bc = bitcast <6 x float> %0 to <6 x i32> ; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20 @@ -132,7 +133,8 @@ bb15: ; preds = 
%.preheader ; OPT-NOT: alloca ; OPT: bb2: ; OPT: %promotealloca = phi <6 x double> [ zeroinitializer, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10 +; OPT: [[TMP:%tmp7.*]] = load double, ptr addrspace(1) %tmp5, align 8 +; OPT: %0 = insertelement <6 x double> %promotealloca, double [[TMP]], i32 %tmp10 ; OPT: .preheader: ; OPT: %bc = bitcast <6 x double> %0 to <6 x i64> ; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
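
For readability, a minimal standalone sketch of how the decomposed index carried by GEPToVectorIndex (VarIndex, VarMul, ConstIndex) is recombined into a single i32 vector index, mirroring what calculateVectorIndex() in this patch does lazily via the Full field. The free-standing helper and the name buildVectorIndex are assumptions for illustration only, not part of the change:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative helper (not in the patch): rebuild the full vector index from
// the optional parts of a GEPToVectorIndex-style decomposition.
static Value *buildVectorIndex(IRBuilder<> &B, Value *VarIndex,
                               ConstantInt *VarMul, ConstantInt *ConstIndex) {
  Value *Result = nullptr;
  if (VarIndex) {
    // Normalize the variable part to i32 before doing any index arithmetic.
    Result = B.CreateSExtOrTrunc(VarIndex, B.getInt32Ty());
    if (VarMul) // stride in vector elements; absent means a stride of 1
      Result = B.CreateMul(Result, VarMul);
  }
  if (ConstIndex) // constant element offset; absent means 0
    Result = Result ? B.CreateAdd(Result, ConstIndex) : ConstIndex;
  // With neither part present, the pointer addresses element 0.
  return Result ? Result : B.getInt32(0);
}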