@@ -3277,7 +3277,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   // to be vectors, so we need to extract individual elements from there,
   // execute VF scalar calls, and then gather the result into the vector return
   // value.
-  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
+                                                 TTI::TCK_RecipThroughput);
 
   if (VF == 1)
     return ScalarCallCost;
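Every hunk in this patch threads the same new argument through the TTI hooks: a TTI::TargetCostKind that tells the backend which notion of cost to report. Paraphrased from llvm/include/llvm/Analysis/TargetTransformInfo.h (exact members vary by LLVM version), the enum looks roughly like this:

enum TargetCostKind {
  TCK_RecipThroughput, // Reciprocal throughput (what the vectorizer wants).
  TCK_Latency,         // The latency of the instruction.
  TCK_CodeSize         // Code size of the instruction.
};

The loop vectorizer compares the per-iteration throughput of the scalar and vector loops, so every call site here passes TCK_RecipThroughput, making explicit the cost kind these hooks previously assumed.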
@@ -3302,7 +3303,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
     return Cost;
 
   // If the corresponding vector cost is cheaper, return its cost.
-  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
+                                                 TTI::TCK_RecipThroughput);
   if (VectorCallCost < Cost) {
     NeedToScalarize = false;
     return VectorCallCost;
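Together, the two hunks above implement a pick-the-cheaper decision: VF scalar calls plus the extract/insert traffic around them, or a single wide call when the target provides one. A stand-alone sketch of that decision in plain C++ (illustrative names, not the LLVM API):

// Hypothetical sketch of getVectorCallCost's scalarize-vs-widen choice.
unsigned vectorCallCost(unsigned ScalarCallCost, unsigned VF,
                        unsigned ScalarizationOverhead,
                        unsigned WideCallCost, bool &NeedToScalarize) {
  // VF scalar calls plus per-lane extract/insert traffic.
  unsigned Scalarized = ScalarCallCost * VF + ScalarizationOverhead;
  NeedToScalarize = WideCallCost >= Scalarized;
  return NeedToScalarize ? Scalarized : WideCallCost;
}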
@@ -3320,7 +3322,9 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
     FMF = FPMO->getFastMathFlags();
 
   SmallVector<Value *, 4> Operands(CI->arg_operands());
-  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
+  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF,
+                                   TargetTransformInfo::TCK_RecipThroughput,
+                                   CI);
 }
 
 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -5832,7 +5836,8 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // vectorized loop where the user of it is a vectorized instruction.
   const MaybeAlign Alignment = getLoadStoreAlignment(I);
   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
-                                   Alignment, AS);
+                                   Alignment, AS,
+                                   TTI::TCK_RecipThroughput);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
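The comments spell out the scalarization model: each of the VF scalar memory ops is priced individually, then the extractelement/insertelement traffic is added on top. As a plain-C++ formula sketch (illustrative names):

// Illustrative core of getMemInstScalarizationCost: VF scalar accesses,
// each with its own address computation, plus lane-movement overhead.
unsigned scalarizedMemCost(unsigned VF, unsigned AddrCost,
                           unsigned ScalarMemOpCost,
                           unsigned ExtractInsertOverhead) {
  return VF * (AddrCost + ScalarMemOpCost) + ExtractInsertOverhead;
}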
@@ -5860,16 +5865,19 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   Value *Ptr = getLoadStorePointerOperand(I);
   unsigned AS = getLoadStoreAddressSpace(I);
   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
          "Stride should be 1 or -1 for consecutive memory access");
   const MaybeAlign Alignment = getLoadStoreAlignment(I);
   unsigned Cost = 0;
   if (Legal->isMaskRequired(I))
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
-                                      Alignment ? Alignment->value() : 0, AS);
+                                      Alignment ? Alignment->value() : 0, AS,
+                                      CostKind);
   else
-    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+                                CostKind, I);
 
   bool Reverse = ConsecutiveStride < 0;
   if (Reverse)
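A consecutive access is priced as one wide memory op, switching to the masked variant under predication, and a stride of -1 additionally pays for shuffling the lanes back into source order. Sketched in plain C++ (illustrative, not the TTI interface):

// Illustrative shape of getConsecutiveMemOpCost.
unsigned consecutiveMemCost(bool MaskRequired, int ConsecutiveStride,
                            unsigned WideMemOpCost, unsigned MaskedMemOpCost,
                            unsigned ReverseShuffleCost) {
  unsigned Cost = MaskRequired ? MaskedMemOpCost : WideMemOpCost;
  if (ConsecutiveStride < 0) // Reverse access needs a reverse shuffle.
    Cost += ReverseShuffleCost;
  return Cost;
}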
@@ -5883,16 +5891,19 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
   const MaybeAlign Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
+  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
+                               CostKind) +
            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
   }
   StoreInst *SI = cast<StoreInst>(I);
 
   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
   return TTI.getAddressComputationCost(ValTy) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
+                             CostKind) +
          (isLoopInvariantStoreValue
               ? 0
               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
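A uniform load is modeled as one scalar load broadcast to all lanes; a uniform store is a scalar store plus, when the stored value varies per iteration, an extract of the lane that must be written. Plain-C++ sketch (illustrative names and numbers):

// Illustrative shape of getUniformMemOpCost. With AddrCost=1,
// ScalarLoadCost=4, BroadcastCost=1, a uniform load totals 6.
unsigned uniformLoadCost(unsigned AddrCost, unsigned ScalarLoadCost,
                         unsigned BroadcastCost) {
  return AddrCost + ScalarLoadCost + BroadcastCost;
}
unsigned uniformStoreCost(unsigned AddrCost, unsigned ScalarStoreCost,
                          bool LoopInvariantValue, unsigned ExtractCost) {
  // A loop-invariant stored value needs no extract from the vector.
  return AddrCost + ScalarStoreCost + (LoopInvariantValue ? 0 : ExtractCost);
}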
@@ -5909,7 +5920,9 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
   return TTI.getAddressComputationCost(VectorTy) +
          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                     Legal->isMaskRequired(I),
-                                    Alignment ? Alignment->value() : 0, I);
+                                    Alignment ? Alignment->value() : 0,
+                                    TargetTransformInfo::TCK_RecipThroughput,
+                                    I);
 }
 
 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
@@ -5938,7 +5951,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
   unsigned Cost = TTI.getInterleavedMemoryOpCost(
       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-      Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
+      Group->getAlign().value(), AS, TTI::TCK_RecipThroughput,
+      Legal->isMaskRequired(I), UseMaskForGaps);
 
   if (Group->isReverse()) {
     // TODO: Add support for reversed masked interleaved access.
@@ -5960,7 +5974,8 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
     unsigned AS = getLoadStoreAddressSpace(I);
 
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
+                               TTI::TCK_RecipThroughput, I);
   }
   return getWideningCost(I, VF);
 }
@@ -6182,6 +6197,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
   auto SE = PSE.getSE();
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
@@ -6238,7 +6254,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       return (Phi->getNumIncomingValues() - 1) *
              TTI.getCmpSelInstrCost(
                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
-                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
+                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
+                 CostKind);
 
     return TTI.getCFInstrCost(Instruction::PHI);
   }
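A blend PHI with k incoming values lowers to a chain of k - 1 vector selects, which is exactly what the (getNumIncomingValues() - 1) factor above prices: for example, three incoming values need two selects. As a trivial plain-C++ sketch:

// Illustrative: 3 incoming values -> 2 vector selects, and so on.
unsigned blendPhiCost(unsigned NumIncoming, unsigned VectorSelectCost) {
  return (NumIncoming - 1) * VectorSelectCost;
}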
@@ -6260,7 +6277,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
 
       // The cost of the non-predicated instruction.
-      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
+      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
 
       // The cost of insertelement and extractelement instructions needed for
       // scalarization.
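For an instruction scalarized under predication, each lane pays for its control flow (the PHI term here; a branch term is accumulated earlier in this function, outside the hunk) plus the non-predicated operation, and the insert/extract traffic is then added as in the other scalarization paths. Plain-C++ sketch (illustrative names):

// Illustrative shape of the predicated-scalarization cost.
unsigned predicatedScalarCost(unsigned VF, unsigned BranchCost,
                              unsigned PhiCost, unsigned ScalarOpCost,
                              unsigned InsertExtractOverhead) {
  return VF * (BranchCost + PhiCost + ScalarOpCost) + InsertExtractOverhead;
}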
@@ -6301,13 +6318,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     SmallVector<const Value *, 4> Operands(I->operand_values());
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
     return N * TTI.getArithmeticInstrCost(
-                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+                   I->getOpcode(), VectorTy, CostKind,
+                   TargetTransformInfo::OK_AnyValue,
                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
   }
   case Instruction::FNeg: {
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
     return N * TTI.getArithmeticInstrCost(
-                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+                   I->getOpcode(), VectorTy, CostKind,
+                   TargetTransformInfo::OK_AnyValue,
                    TargetTransformInfo::OK_AnyValue,
                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                    I->getOperand(0), I);
@@ -6320,7 +6339,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     if (!ScalarCond)
       CondTy = VectorType::get(CondTy, VF);
 
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
+                                  CostKind, I);
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
@@ -6329,7 +6349,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
     VectorTy = ToVectorTy(ValTy, VF);
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
+                                  I);
   }
   case Instruction::Store:
   case Instruction::Load: {
@@ -6362,7 +6383,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     if (isOptimizableIVTruncate(I, VF)) {
       auto *Trunc = cast<TruncInst>(I);
       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
-                                  Trunc->getSrcTy(), Trunc);
+                                  Trunc->getSrcTy(), CostKind, Trunc);
     }
 
     Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6388,7 +6409,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     }
 
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
-    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
+    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
+                                    CostKind, I);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
@@ -6401,7 +6423,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   default:
     // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
-    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
+                                           CostKind) +
            getScalarizationOverhead(I, VF);
   } // end of switch.
 }
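The default case is the model's catch-all: an opcode with no special case is priced as if each of its VF scalar copies were a 'mul', plus the usual scalarization overhead. Plain-C++ sketch (illustrative names):

// Illustrative fallback: VF copies priced like 'mul' plus lane traffic.
unsigned unknownOpcodeCost(unsigned VF, unsigned MulCost,
                           unsigned ScalarizationOverhead) {
  return VF * MulCost + ScalarizationOverhead;
}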