Skip to content

Commit

Permalink
[SLP]Improve cost estimation/emission of externally used extractelements.
Browse files Browse the repository at this point in the history

There is no need to recalculate the cost of extractelements. Instead of
compensating for the cost of all extractelements, check beforehand whether
each one is actually going to be removed by the vectorization. Also, there
is no need to generate a new extractelement instruction; we may simply
regenerate the original one. This may improve the final vectorization.

Differential Revision: https://reviews.llvm.org/D102933
  • Loading branch information
alexey-bataev committed Jun 3, 2021
1 parent bb5e1c6 commit 8c48d77
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 24 deletions.
56 changes: 37 additions & 19 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Expand Up @@ -635,7 +635,7 @@ class BoUpSLP {

/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
InstructionCost getTreeCost();
InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);

/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
Expand Down Expand Up @@ -1549,10 +1549,12 @@ class BoUpSLP {

private:
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I) const;
bool areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const;

/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E);
InstructionCost getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals);

/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
Expand Down Expand Up @@ -3505,8 +3507,10 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
return ShouldKeepOrder;
}

bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
bool BoUpSLP::areAllUsersVectorized(Instruction *I,
ArrayRef<Value *> VectorizedVals) const {
return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
llvm::all_of(I->users(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0;
});
}
Expand Down Expand Up @@ -3597,7 +3601,8 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
return Cost;
}

InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals) {
ArrayRef<Value*> VL = E->Scalars;

Type *ScalarTy = VL[0]->getType();
Expand Down Expand Up @@ -3626,16 +3631,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
}
// FIXME: it tries to fix a problem with MSVC buildbots.
TargetTransformInfo &TTIRef = *TTI;
auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL,
VecTy](InstructionCost &Cost, bool IsGather) {
auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
VectorizedVals](InstructionCost &Cost,
bool IsGather) {
DenseMap<Value *, int> ExtractVectorsTys;
for (auto *V : VL) {
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
if (IsGather && (!areAllUsersVectorized(cast<Instruction>(V)) ||
ScalarToTreeEntry.count(V)))
if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
(IsGather && ScalarToTreeEntry.count(V)))
continue;
auto *EE = cast<ExtractElementInst>(V);
unsigned Idx = *getExtractIndex(EE);
Expand Down Expand Up @@ -4389,7 +4395,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
return Cost;
}

InstructionCost BoUpSLP::getTreeCost() {
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
Expand All @@ -4399,7 +4405,7 @@ InstructionCost BoUpSLP::getTreeCost() {
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I].get();

InstructionCost C = getEntryCost(&TE);
InstructionCost C = getEntryCost(&TE, VectorizedVals);
Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
Expand Down Expand Up @@ -4429,6 +4435,11 @@ InstructionCost BoUpSLP::getTreeCost() {
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;

// Already counted the cost for external uses when tried to adjust the cost
// for extractelements, no need to add it again.
if (isa<ExtractElementInst>(EU.Scalar))
continue;

// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
if (EU.User && isa<InsertElementInst>(EU.User)) {
Expand Down Expand Up @@ -5566,20 +5577,26 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Value *Lane = Builder.getInt32(ExternalUse.Lane);
auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
if (Scalar->getType() != Vec->getType()) {
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Value *Ex;
// "Reuse" the existing extract to improve final codegen.
if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
Ex = Builder.CreateExtractElement(ES->getOperand(0),
ES->getOperand(1));
} else {
Ex = Builder.CreateExtractElement(Vec, Lane);
}
// If necessary, sign-extend or zero-extend ScalarRoot
// to the larger type.
if (!MinBWs.count(ScalarRoot))
return Ex;
if (MinBWs[ScalarRoot].second)
return Builder.CreateSExt(Ex, Scalar->getType());
return Builder.CreateZExt(Ex, Scalar->getType());
} else {
assert(isa<FixedVectorType>(Scalar->getType()) &&
isa<InsertElementInst>(Scalar) &&
"In-tree scalar of vector type is not insertelement?");
return Vec;
}
assert(isa<FixedVectorType>(Scalar->getType()) &&
isa<InsertElementInst>(Scalar) &&
"In-tree scalar of vector type is not insertelement?");
return Vec;
};
// If User == nullptr, the Scalar is used as extra arg. Generate
// ExtractElement instruction and update the record for this scalar in
Expand Down Expand Up @@ -7651,7 +7668,8 @@ class HorizontalReduction {
V.computeMinimumValueSizes();

// Estimate cost.
InstructionCost TreeCost = V.getTreeCost();
InstructionCost TreeCost =
V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
InstructionCost ReductionCost =
getReductionCost(TTI, ReducedVals[i], ReduxWidth);
InstructionCost Cost = TreeCost + ReductionCost;
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
Expand Up @@ -8,7 +8,7 @@ define void @f1(<2 x i16> %x, i16* %a) {
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[X]], i32 0
; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2
Expand Down Expand Up @@ -40,7 +40,7 @@ define void @f2(<2 x i16> %x, i16* %a) {
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 0
; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
Expand Down Expand Up @@ -87,7 +87,7 @@ define void @f3(<2 x i16> %x, i16* %a) {
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 1
; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
Expand Down
Expand Up @@ -143,9 +143,9 @@ define void @extract_reverse_order(<2 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0
; CHECK-NEXT: call void @use(double [[TMP3]])
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1
; CHECK-NEXT: call void @use(double [[TMP4]])
; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
Expand Down

0 comments on commit 8c48d77

Please sign in to comment.