diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e628b122c9d848..b73a2377f1b03e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -635,7 +635,7 @@ class BoUpSLP { /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(); + InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -1549,10 +1549,12 @@ class BoUpSLP { private: /// Checks if all users of \p I are the part of the vectorization tree. - bool areAllUsersVectorized(Instruction *I) const; + bool areAllUsersVectorized(Instruction *I, + ArrayRef<Value *> VectorizedVals) const; /// \returns the cost of the vectorizable entry. - InstructionCost getEntryCost(const TreeEntry *E); + InstructionCost getEntryCost(const TreeEntry *E, + ArrayRef<Value *> VectorizedVals); /// This is the recursive part of buildTree. 
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, @@ -3505,8 +3507,10 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, return ShouldKeepOrder; } -bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { - return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) { +bool BoUpSLP::areAllUsersVectorized(Instruction *I, + ArrayRef<Value *> VectorizedVals) const { + return (I->hasOneUse() && is_contained(VectorizedVals, I)) || + llvm::all_of(I->users(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0; }); } @@ -3597,7 +3601,8 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, return Cost; } -InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) { +InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, + ArrayRef<Value *> VectorizedVals) { ArrayRef<Value *> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); @@ -3626,16 +3631,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) { } // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; - auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, - VecTy](InstructionCost &Cost, bool IsGather) { + auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, + VectorizedVals](InstructionCost &Cost, + bool IsGather) { DenseMap<Value *, int> ExtractVectorsTys; for (auto *V : VL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. 
- if (IsGather && (!areAllUsersVectorized(cast<Instruction>(V)) || - ScalarToTreeEntry.count(V))) + if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || + (IsGather && ScalarToTreeEntry.count(V))) continue; auto *EE = cast<ExtractElementInst>(V); unsigned Idx = *getExtractIndex(EE); @@ -4389,7 +4395,7 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } -InstructionCost BoUpSLP::getTreeCost() { +InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -4399,7 +4405,7 @@ InstructionCost BoUpSLP::getTreeCost() { for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I].get(); - InstructionCost C = getEntryCost(&TE); + InstructionCost C = getEntryCost(&TE, VectorizedVals); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] @@ -4429,6 +4435,11 @@ InstructionCost BoUpSLP::getTreeCost() { if (isa<FixedVectorType>(EU.Scalar->getType())) continue; + // Already counted the cost for external uses when tried to adjust the cost + // for extractelements, no need to add it again. + if (isa<ExtractElementInst>(EU.Scalar)) + continue; + // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. if (EU.User && isa<InsertElementInst>(EU.User)) { @@ -5566,7 +5577,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Value *Lane = Builder.getInt32(ExternalUse.Lane); auto ExtractAndExtendIfNeeded = [&](Value *Vec) { if (Scalar->getType() != Vec->getType()) { - Value *Ex = Builder.CreateExtractElement(Vec, Lane); + Value *Ex; + // "Reuse" the existing extract to improve final codegen. 
+ if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { + Ex = Builder.CreateExtractElement(ES->getOperand(0), + ES->getOperand(1)); + } else { + Ex = Builder.CreateExtractElement(Vec, Lane); + } // If necessary, sign-extend or zero-extend ScalarRoot // to the larger type. if (!MinBWs.count(ScalarRoot)) @@ -5574,12 +5592,11 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { if (MinBWs[ScalarRoot].second) return Builder.CreateSExt(Ex, Scalar->getType()); return Builder.CreateZExt(Ex, Scalar->getType()); - } else { - assert(isa<FixedVectorType>(Scalar->getType()) && - isa<InsertElementInst>(Scalar) && - "In-tree scalar of vector type is not insertelement?"); - return Vec; } + assert(isa<FixedVectorType>(Scalar->getType()) && + isa<InsertElementInst>(Scalar) && + "In-tree scalar of vector type is not insertelement?"); + return Vec; }; // If User == nullptr, the Scalar is used as extra arg. Generate // ExtractElement instruction and update the record for this scalar in @@ -7651,7 +7668,8 @@ class HorizontalReduction { V.computeMinimumValueSizes(); // Estimate cost. 
- InstructionCost TreeCost = V.getTreeCost(); + InstructionCost TreeCost = + V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); InstructionCost ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); InstructionCost Cost = TreeCost + ReductionCost; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll index d1754c0bbc54a0..add973306f6b5d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -8,7 +8,7 @@ define void @f1(<2 x i16> %x, i16* %a) { ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[X]], i32 0 ; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 @@ -40,7 +40,7 @@ define void @f2(<2 x i16> %x, i16* %a) { ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 0 ; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 @@ -87,7 +87,7 @@ define void @f3(<2 x i16> %x, i16* %a) { ; CHECK-NEXT: [[PTR1:%.*]] = 
getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll index 756676d7ee14ad..cfd6f59cb774ae 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -143,9 +143,9 @@ define void @extract_reverse_order(<2 x double>* %ptr.1, <4 x double>* %ptr.2) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0 ; CHECK-NEXT: call void @use(double [[TMP3]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: call void @use(double [[TMP4]]) ; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8 ; CHECK-NEXT: ret void