[NFC] Make remaining cost functions in LoopVectorize.cpp use InstructionCost

A previous patch has already changed getInstructionCost to return
an InstructionCost type. This patch changes the various other
getXXXCost functions to return an InstructionCost too. This is a
non-functional change: I've added a few asserts that the costs
are valid in places where we're selecting between the vector call
and intrinsic costs. However, since we don't yet return invalid
costs from any of the TTI implementations, these asserts should
not fire.

See this patch for the introduction of the type: https://reviews.llvm.org/D91174
See this thread for context: http://lists.llvm.org/pipermail/llvm-dev/2020-November/146408.html

Differential Revision: https://reviews.llvm.org/D94065
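
For context on the type itself, below is a minimal, hypothetical sketch of the two properties this patch relies on: a cost can carry an invalid state queryable via isValid(), while still supporting addition, scaling, and comparison. SimpleInstructionCost is an editorial stand-in, not the real llvm::InstructionCost, and the costs in main() are made up; it only illustrates the call-versus-intrinsic selection pattern that the new asserts guard.

#include <algorithm>
#include <cassert>

// Hypothetical stand-in for llvm::InstructionCost: a cost value that can
// also be marked invalid, e.g. when a target cannot express an operation.
class SimpleInstructionCost {
  long long Value = 0;
  bool Valid = true;

public:
  SimpleInstructionCost(long long V = 0) : Value(V) {}

  static SimpleInstructionCost getInvalid() {
    SimpleInstructionCost C;
    C.Valid = false;
    return C;
  }

  bool isValid() const { return Valid; }

  SimpleInstructionCost &operator+=(const SimpleInstructionCost &RHS) {
    // Any invalid operand poisons the sum.
    Valid = Valid && RHS.Valid;
    Value += RHS.Value;
    return *this;
  }

  friend SimpleInstructionCost operator+(SimpleInstructionCost LHS,
                                         const SimpleInstructionCost &RHS) {
    LHS += RHS;
    return LHS;
  }

  friend SimpleInstructionCost operator*(SimpleInstructionCost LHS,
                                         long long N) {
    LHS.Value *= N;
    return LHS;
  }

  // Order invalid costs after every valid cost so that comparisons such as
  // `IntrinsicCost <= CallCost` and std::min() prefer valid alternatives.
  friend bool operator<(const SimpleInstructionCost &L,
                        const SimpleInstructionCost &R) {
    if (L.Valid != R.Valid)
      return L.Valid; // a valid cost sorts before an invalid one
    return L.Value < R.Value;
  }

  friend bool operator<=(const SimpleInstructionCost &L,
                         const SimpleInstructionCost &R) {
    return !(R < L);
  }
};

int main() {
  // Made-up costs mirroring the widened-call decision in the diff:
  // scalarized call = per-lane call cost * VF + scalarization overhead.
  SimpleInstructionCost CallCost =
      SimpleInstructionCost(8) * 4 + SimpleInstructionCost(3);
  SimpleInstructionCost IntrinsicCost(6);

  bool UseVectorIntrinsic = IntrinsicCost <= CallCost;
  assert(IntrinsicCost.isValid() && CallCost.isValid() &&
         "Cannot have invalid costs while widening");

  SimpleInstructionCost Chosen = std::min(CallCost, IntrinsicCost);
  (void)UseVectorIntrinsic;
  (void)Chosen;
  return 0;
}

Since no TTI implementation returns an invalid cost yet, the asserts added by this patch are not expected to fire; the sketch simply shows why they are cheap to add now.
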
david-arm committed Jan 19, 2021
1 parent 7e1d222 commit c3ce262
Showing 1 changed file with 75 additions and 60 deletions.
135 changes: 75 additions & 60 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1385,15 +1385,16 @@ class LoopVectorizationCostModel {
/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// instruction \p I and vector width \p VF.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
unsigned Cost) {
InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
}

/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// interleaving group \p Grp and vector width \p VF.
void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
ElementCount VF, InstWidening W, unsigned Cost) {
ElementCount VF, InstWidening W,
InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
/// Broadcast this decicion to all instructions inside the group.
/// But the cost will be assigned to one instruction only.
@@ -1426,7 +1427,7 @@ class LoopVectorizationCostModel {

/// Return the vectorization cost for the given instruction \p I and vector
/// width \p VF.
unsigned getWideningCost(Instruction *I, ElementCount VF) {
InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
assert(VF.isVector() && "Expected VF >=2");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
@@ -1604,15 +1605,15 @@ class LoopVectorizationCostModel {
/// Estimate cost of an intrinsic call instruction CI if it were vectorized
/// with factor VF. Return the cost of the instruction, including
/// scalarization overhead if it's needed.
unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);

/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed. The flag NeedToScalarize shows if the call needs to be
/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.
unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize);
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize);

/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
@@ -1655,30 +1656,30 @@ class LoopVectorizationCostModel {
Type *&VectorTy);

/// Calculate vectorization cost of memory instruction \p I.
unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

/// The cost computation for scalarized memory instruction.
unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

/// The cost computation for interleaving group of memory instructions.
unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

/// The cost computation for Gather/Scatter instruction.
unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

/// The cost computation for widening instruction \p I with consecutive
/// memory access.
unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

/// The cost calculation for Load/Store instruction \p I with uniform pointer -
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
/// element)
unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);

/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
@@ -1766,7 +1767,7 @@ class LoopVectorizationCostModel {
/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
std::pair<InstWidening, unsigned>>;
std::pair<InstWidening, InstructionCost>>;

DecisionList WideningDecisions;

@@ -3701,9 +3702,9 @@ static void cse(BasicBlock *BB) {
}
}

unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
ElementCount VF,
bool &NeedToScalarize) {
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
@@ -3715,8 +3716,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
TTI::TCK_RecipThroughput);
InstructionCost ScalarCallCost =
TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
if (VF.isScalar())
return ScalarCallCost;

@@ -3727,9 +3728,10 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,

// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
InstructionCost Cost =
ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
@@ -3741,17 +3743,18 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
return Cost;

// If the corresponding vector cost is cheaper, return its cost.
unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
TTI::TCK_RecipThroughput);
InstructionCost VectorCallCost =
TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
return VectorCallCost;
Cost = VectorCallCost;
}
return Cost;
}

unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) {
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");

@@ -4880,11 +4883,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;
unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
bool UseVectorIntrinsic =
ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert((UseVectorIntrinsic || !NeedToScalarize) &&
"Instruction should be scalarized elsewhere.");
assert(IntrinsicCost.isValid() && CallCost.isValid() &&
"Cannot have invalid costs while widening");

for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
@@ -6647,7 +6652,7 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
Legal->hasStride(I->getOperand(1));
}

unsigned
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
ElementCount VF) {
assert(VF.isVector() &&
@@ -6665,7 +6670,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

// Get the cost of the scalar memory instruction and address computation.
unsigned Cost =
InstructionCost Cost =
VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

// Don't pass *I here, since it is scalar but will actually be part of a
@@ -6694,8 +6699,9 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
return Cost;
}

unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) {
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
@@ -6706,7 +6712,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
unsigned Cost = 0;
InstructionCost Cost = 0;
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind);
@@ -6720,8 +6726,9 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
return Cost;
}

unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
assert(Legal->isUniformMemOp(*I));

Type *ValTy = getMemInstValueType(I);
@@ -6747,8 +6754,9 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
VF.getKnownMinValue() - 1));
}

unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) {
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
@@ -6760,8 +6768,9 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
TargetTransformInfo::TCK_RecipThroughput, I);
}

unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) {
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) {
Type *ValTy = getMemInstValueType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6785,7 +6794,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
unsigned Cost = TTI.getInterleavedMemoryOpCost(
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

@@ -6799,8 +6808,9 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}

unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
ElementCount VF) {
InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
ElementCount VF) {
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
if (VF.isScalar()) {
@@ -6846,15 +6856,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return VectorizationCostTy(C, TypeNotScalarized);
}

unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
ElementCount VF) {
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
ElementCount VF) {

assert(!VF.isScalable() &&
"cannot compute scalarization overhead for scalable vectorization");
if (VF.isScalar())
return 0;

unsigned Cost = 0;
InstructionCost Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
@@ -6903,14 +6914,14 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// relying on instcombine to remove them.
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
unsigned Cost = getUniformMemOpCost(&I, VF);
InstructionCost Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
continue;
}

// We assume that widening is the best solution when possible.
if (memoryInstructionCanBeWidened(&I, VF)) {
unsigned Cost = getConsecutiveMemOpCost(&I, VF);
InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
int ConsecutiveStride =
Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -6922,7 +6933,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}

// Choose between Interleaving, Gather/Scatter or Scalarization.
unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
InstructionCost InterleaveCost = std::numeric_limits<int>::max();
unsigned NumAccesses = 1;
if (isAccessInterleaved(&I)) {
auto Group = getInterleavedAccessGroup(&I);
@@ -6937,17 +6948,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
InterleaveCost = getInterleaveGroupCost(&I, VF);
}

unsigned GatherScatterCost =
InstructionCost GatherScatterCost =
isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
: std::numeric_limits<unsigned>::max();
: std::numeric_limits<int>::max();

unsigned ScalarizationCost =
InstructionCost ScalarizationCost =
getMemInstScalarizationCost(&I, VF) * NumAccesses;

// Choose better solution for the current VF,
// write down this decision and use it during vectorization.
unsigned Cost;
InstructionCost Cost;
InstWidening Decision;
if (InterleaveCost <= GatherScatterCost &&
InterleaveCost < ScalarizationCost) {
@@ -7112,7 +7123,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
if (VF.isVector() && isScalarWithPredication(I)) {
unsigned Cost = 0;
InstructionCost Cost = 0;

// These instructions have a non-void type, so account for the phi nodes
// that we will create. This cost is likely to be zero. The phi node
@@ -7304,9 +7315,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::Call: {
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
if (getVectorIntrinsicIDForCall(CI, TLI))
return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
if (getVectorIntrinsicIDForCall(CI, TLI)) {
InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
return std::min(CallCost, IntrinsicCost);
}
return CallCost;
}
case Instruction::ExtractValue:
@@ -8222,9 +8235,11 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;
unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
bool UseVectorIntrinsic =
ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert(IntrinsicCost.isValid() && CallCost.isValid() &&
"Cannot have invalid costs while widening");
return UseVectorIntrinsic || !NeedToScalarize;
};

