[TTI] Add DemandedElts to getScalarizationOverhead
The improvements to the x86 vector insert/extract element costs in D74976 caused the estimated costs of vector initialization and scalarization to rise higher than they should. This is particularly noticeable on pre-SSE4 targets, where the availability of legal INSERT_VECTOR_ELT ops is more limited.

This patch does 2 things:
1 - it implements X86TTIImpl::getScalarizationOverhead to more accurately represent the typical costs of an ISD::BUILD_VECTOR pattern.
2 - it adds a DemandedElts mask to getScalarizationOverhead, permitting SLP's BoUpSLP::getGatherCost to be rewritten to use it directly instead of accumulating raw vector insertion costs (a rough sketch of the new interface follows below).
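
As a rough, standalone sketch of the idea (illustrative C++ only, not LLVM code; the unit costs and helper signatures here are made-up assumptions): a DemandedElts mask lets a gather-cost query price only the lanes that actually need inserting, mirroring the BoUpSLP::getGatherCost rewrite in this patch.

  #include <bitset>
  #include <iostream>
  #include <set>

  constexpr unsigned MaxElts = 64;

  // Hypothetical stand-in for getScalarizationOverhead(Ty, DemandedElts,
  // Insert, Extract): charge one unit per demanded lane inserted and one
  // unit per demanded lane extracted.
  unsigned getScalarizationOverhead(const std::bitset<MaxElts> &DemandedElts,
                                    bool Insert, bool Extract) {
    unsigned Cost = 0;
    if (Insert)
      Cost += DemandedElts.count();
    if (Extract)
      Cost += DemandedElts.count();
    return Cost;
  }

  // Mirrors the shape of BoUpSLP::getGatherCost: lanes already covered by a
  // shuffle are not demanded, so they add no insertion cost.
  unsigned getGatherCost(unsigned NumElts,
                         const std::set<unsigned> &ShuffledIndices) {
    std::bitset<MaxElts> DemandedElts;
    for (unsigned I = 0; I < NumElts; ++I)
      if (!ShuffledIndices.count(I))
        DemandedElts.set(I);
    return getScalarizationOverhead(DemandedElts, /*Insert=*/true,
                                    /*Extract=*/false);
  }

  int main() {
    // A 4-element gather where lane 2 comes from a shuffle: only 3 inserts.
    std::cout << getGatherCost(4, {2}) << '\n'; // prints 3
  }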

This fixes PR45418, where a v4i8 (zext'd to v4i32) was no longer vectorizing.
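
For illustration only (a hypothetical example of the kind of pattern described, not the actual PR45418 reproducer): four adjacent i8 values widened to i32, which SLP vectorization would handle as a v4i8 gather zero-extended to v4i32.

  #include <cstdint>

  // Sum four bytes, widening each to 32 bits. With the D74976 costs alone,
  // the estimated gather cost on pre-SSE4 targets could block SLP
  // vectorization of patterns like this; this patch brings it back down.
  uint32_t sum4(const uint8_t *P) {
    return static_cast<uint32_t>(P[0]) + static_cast<uint32_t>(P[1]) +
           static_cast<uint32_t>(P[2]) + static_cast<uint32_t>(P[3]);
  }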

A future patch should extend X86TTIImpl::getScalarizationOverhead to tweak the EXTRACT_VECTOR_ELT scalarization costs as well.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D78216
RKSimon committed Apr 29, 2020
1 parent 42a56bf commit 090cae8
Showing 22 changed files with 502 additions and 398 deletions.
21 changes: 14 additions & 7 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -611,8 +611,15 @@ class TargetTransformInfo {
/// should use coldcc calling convention.
bool useColdCCForColdCall(Function &F) const;

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) const;

/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The types of the arguments are ordinarily
/// scalar, in which case the costs are multiplied by VF.
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) const;

@@ -1231,8 +1238,8 @@ class TargetTransformInfo::Concept {
virtual bool shouldBuildLookupTables() = 0;
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
virtual bool useColdCCForColdCall(Function &F) = 0;
virtual unsigned getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) = 0;
virtual unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) = 0;
virtual unsigned
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) = 0;
@@ -1556,9 +1563,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.useColdCCForColdCall(F);
}

unsigned getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) override {
return Impl.getScalarizationOverhead(Ty, Insert, Extract);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) override {
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) override {
3 changes: 2 additions & 1 deletion llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -239,7 +239,8 @@ class TargetTransformInfoImplBase {

bool useColdCCForColdCall(Function &F) { return false; }

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) {
return 0;
}

19 changes: 17 additions & 2 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -548,12 +548,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
unsigned getRegisterBitWidth(bool Vector) const { return 32; }

/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract) {
auto *VTy = cast<VectorType>(Ty);
assert(DemandedElts.getBitWidth() == VTy->getNumElements() &&
"Vector size mismatch");

unsigned Cost = 0;

for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
if (!DemandedElts[i])
continue;
if (Insert)
Cost += static_cast<T *>(this)->getVectorInstrCost(
Instruction::InsertElement, VTy, i);
@@ -565,6 +572,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}

/// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
auto *VTy = cast<VectorType>(Ty);
APInt DemandedElts = APInt::getAllOnesValue(VTy->getNumElements());
return static_cast<T *>(this)->getScalarizationOverhead(Ty, DemandedElts,
Insert, Extract);
}

/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The types of the arguments are ordinarily
/// scalar, in which case the costs are multiplied by VF.
6 changes: 3 additions & 3 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -368,9 +368,9 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
return TTIImpl->useColdCCForColdCall(F);
}

unsigned TargetTransformInfo::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);
unsigned TargetTransformInfo::getScalarizationOverhead(
Type *Ty, const APInt &DemandedElts, bool Insert, bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}

unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
7 changes: 4 additions & 3 deletions llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -115,9 +115,10 @@ unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
return (8 * ST.getVectorLength()) / ElemWidth;
}

unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) {
return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) {
return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}

unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
7 changes: 4 additions & 3 deletions llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -101,9 +101,10 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
return true;
}

unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,
unsigned VF);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract);
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF);
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Value *> Args, FastMathFlags FMF,
75 changes: 67 additions & 8 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2868,9 +2868,62 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}

unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
bool Extract) {
return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) {
auto* VecTy = cast<VectorType>(Ty);
unsigned Cost = 0;

// For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
// cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
if (Insert) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MScalarTy = LT.second.getScalarType();

if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
(MScalarTy.isInteger() && ST->hasSSE41()) ||
(MScalarTy == MVT::f32 && ST->hasSSE41())) {
// For types we can insert directly, insertion into 128-bit sub vectors is
// cheap, followed by a cheap chain of concatenations.
if (LT.second.getSizeInBits() <= 128) {
Cost +=
BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
} else {
unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
Cost += DemandedElts.countPopulation();

// For vXf32 cases, insertion into the 0'th index in each v4f32
// 128-bit vector is free.
// NOTE: This assumes legalization widens vXf32 vectors.
if (MScalarTy == MVT::f32)
for (unsigned i = 0, e = VecTy->getNumElements(); i < e; i += 4)
if (DemandedElts[i])
Cost--;
}
} else if (LT.second.isVector()) {
// Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
// integer element as a SCALAR_TO_VECTOR, then we build the vector as a
// series of UNPCK followed by CONCAT_VECTORS - all of these can be
// considered cheap.
if (Ty->isIntOrIntVectorTy())
Cost += DemandedElts.countPopulation();

// Get the smaller of the legalized or original pow2-extended number of
// vector elements, which represents the number of unpacks we'll end up
// performing.
unsigned NumElts = LT.second.getVectorNumElements();
unsigned Pow2Elts = PowerOf2Ceil(VecTy->getNumElements());
Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
}
}

// TODO: Use default extraction for now, but we should investigate extending this
// to handle repeated subvector extraction.
if (Extract)
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);

return Cost;
}

int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
@@ -2893,9 +2946,11 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,

// Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
AddressSpace);
int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
int SplitCost = getScalarizationOverhead(Src, DemandedElts,
Opcode == Instruction::Load,
Opcode == Instruction::Store);
return NumElem * Cost + SplitCost;
}
@@ -2935,13 +2990,15 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
(IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
!isPowerOf2_32(NumElem)) {
// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int MaskSplitCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
int BranchCost = getCFInstrCost(Instruction::Br);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
int ValueSplitCost =
getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
MaybeAlign(Alignment), AddressSpace);
@@ -3795,12 +3852,14 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
bool VariableMask, unsigned Alignment,
unsigned AddressSpace) {
unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
APInt DemandedElts = APInt::getAllOnesValue(VF);

int MaskUnpackCost = 0;
if (VariableMask) {
VectorType *MaskTy =
VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
MaskUnpackCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost =
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
nullptr);
3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,7 +132,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
14 changes: 9 additions & 5 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5703,8 +5703,9 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
true, false);
ScalarCost +=
TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
APInt::getAllOnesValue(VF), true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
}

@@ -5720,7 +5721,8 @@
Worklist.push_back(J);
else if (needsExtract(J, VF))
ScalarCost += TTI.getScalarizationOverhead(
ToVectorTy(J->getType(),VF), false, true);
ToVectorTy(J->getType(), VF), APInt::getAllOnesValue(VF), false,
true);
}

// Scale the total scalar cost by block probability.
@@ -6004,7 +6006,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(RetTy, true, false);
Cost += TTI.getScalarizationOverhead(RetTy, APInt::getAllOnesValue(VF),
true, false);

// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6210,7 +6213,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// Return cost for branches around scalarized and predicated blocks.
Type *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
false, true) +
(TTI.getCFInstrCost(Instruction::Br) * VF));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
// The back-edge branch will remain, as will all scalar branches.
37 changes: 34 additions & 3 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3877,10 +3877,13 @@ int BoUpSLP::getTreeCost() {

int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
int Cost = 0;
for (unsigned i = 0, e = Ty->getNumElements(); i < e; ++i)
unsigned NumElts = Ty->getNumElements();
APInt DemandedElts = APInt::getNullValue(NumElts);
for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
DemandedElts.setBit(i);
int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
@@ -7018,6 +7021,34 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
"Expected insertelement or insertvalue instruction!");
UserCost = 0;
do {
// TODO: Use TTI's getScalarizationOverhead for a sequence of inserts rather
// than the sum of single inserts, as the latter may overestimate the cost.
// This work should also improve the cost estimation for extracts that are
// added in for external (to the vectorization tree) users.
// For example, in the following case all of the extracts are added in order
// to feed external users (the inserts), which in turn form the sequence that
// builds an aggregate and that we do match here:
//  %4 = extractelement <4 x i64> %3, i32 0
//  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
//  %5 = extractelement <4 x i64> %3, i32 1
//  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
//  %6 = extractelement <4 x i64> %3, i32 2
//  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
//  %7 = extractelement <4 x i64> %3, i32 3
//  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
//
// The cost of this entire sequence is currently estimated as the sum of the
// single extracts (as this aggregate build sequence is a user external to
// the vectorization tree) minus the cost of the aggregate build.
// As this whole sequence will be optimized away we want the cost to be
// zero. But that is not quite achievable with the current approach (at
// least on X86), because inserts can be more expensive than extracts for
// longer vector lengths, so the difference turns out to be non-zero in such
// cases.
// Ideally we want to match this entire sequence and treat it as a no-op
// (i.e. not count it towards the final cost at all).
// Currently the difference tends to be negative, thus adding a bias toward
// favoring vectorization. If we switch to using the TTI interface the bias
// tendency will remain, but it will be lower.
Value *InsertedOperand;
if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
InsertedOperand = IE->getOperand(1);