169 changes: 106 additions & 63 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Large diffs are not rendered by default.

41 changes: 29 additions & 12 deletions llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -119,6 +119,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -128,66 +129,82 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
VectorType *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts,
bool Insert, bool Extract);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
unsigned AddressSpace,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
bool VariableMask, unsigned Alignment,
TTI::TargetCostKind CostKind,
const Instruction *I);
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
const SCEV *Ptr);

unsigned getAtomicMemIntrinsicMaxElementSize() const;

int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys, FastMathFlags FMF,
unsigned ScalarizationCostPassed = UINT_MAX,
const Instruction *I = nullptr);
int getIntrinsicInstrCost(
Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys,
FastMathFlags FMF, unsigned ScalarizationCostPassed = UINT_MAX,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
const Instruction *I = nullptr);

int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Value *> Args, FastMathFlags FMF,
unsigned VF = 1, const Instruction *I = nullptr);
unsigned VF = 1,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
const Instruction *I = nullptr);

int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
bool IsPairwiseForm);
bool IsPairwiseForm,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);

int getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);

int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsPairwiseForm, bool IsUnsigned);
bool IsPairwiseForm, bool IsUnsigned,
TTI::TargetCostKind CostKind);

int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);

int getIntImmCost(int64_t);

int getIntImmCost(const APInt &Imm, Type *Ty);
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);

unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
TTI::TargetCostKind);

int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);
Type *Ty, TTI::TargetCostKind CostKind);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool canMacroFuseCmp();
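
Illustrative sketch (not part of this patch): with the widened signatures above, a caller states which cost model it wants instead of relying on the callee's default. The helper name below and its use of the generic TargetTransformInfo wrapper are assumptions made for this sketch only.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Hypothetical helper: forwards an explicit TargetCostKind to the cost query,
// mirroring the CostKind parameters added to the X86TTIImpl methods above
// (which default the kind to TCK_RecipThroughput when the caller omits it).
static int mulCostForKind(const TargetTransformInfo &TTI, Type *Ty,
                          TargetTransformInfo::TargetCostKind Kind) {
  return TTI.getArithmeticInstrCost(Instruction::Mul, Ty, Kind);
}

A vectorizer-style caller would pass TCK_RecipThroughput here, while size-driven passes such as constant hoisting pass TCK_SizeAndLatency, as the call-site updates below show.
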
12 changes: 8 additions & 4 deletions llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -363,10 +363,12 @@ void ConstantHoistingPass::collectConstantCandidates(
// instruction and operand index.
if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx,
ConstInt->getValue(), ConstInt->getType());
ConstInt->getValue(), ConstInt->getType(),
TargetTransformInfo::TCK_SizeAndLatency);
else
Cost = TTI->getIntImmCostInst(Inst->getOpcode(), Idx, ConstInt->getValue(),
ConstInt->getType());
ConstInt->getType(),
TargetTransformInfo::TCK_SizeAndLatency);

// Ignore cheap integer constants.
if (Cost > TargetTransformInfo::TCC_Basic) {
@@ -416,7 +418,8 @@ void ConstantHoistingPass::collectConstantCandidates(
// usually lowered to a load from constant pool. Such operation is unlikely
// to be cheaper than compute it by <Base + Offset>, which can be lowered to
// an ADD instruction or folded into Load/Store instruction.
int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy);
int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy,
TargetTransformInfo::TCK_SizeAndLatency);
ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV];
ConstCandMapType::iterator Itr;
bool Inserted;
@@ -582,7 +585,8 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
for (auto User : ConstCand->Uses) {
unsigned Opcode = User.Inst->getOpcode();
unsigned OpndIdx = User.OpndIdx;
Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty);
Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty,
TargetTransformInfo::TCK_SizeAndLatency);
LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");

for (auto C2 = S; C2 != E; ++C2) {
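
A minimal sketch (the helper name is invented, not taken from this patch) of the pattern the call-site updates above follow: constant hoisting queries an immediate's materialization cost under an explicit TCK_SizeAndLatency and keeps only constants that cost more than TCC_Basic.

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Hypothetical helper: decide whether an immediate operand is worth hoisting
// by pricing it under the size-and-latency cost model, as the pass above does.
static bool isHoistingCandidate(const TargetTransformInfo &TTI,
                                unsigned Opcode, unsigned OpndIdx,
                                const APInt &Imm, Type *Ty) {
  int Cost = TTI.getIntImmCostInst(Opcode, OpndIdx, Imm, Ty,
                                   TargetTransformInfo::TCK_SizeAndLatency);
  // Cheap immediates (cost <= TCC_Basic) stay in place, as in the code above.
  return Cost > TargetTransformInfo::TCC_Basic;
}
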
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1990,7 +1990,9 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
"non noop cast is found during rematerialization");

Type *SrcTy = CI->getOperand(0)->getType();
Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy,
TargetTransformInfo::TCK_SizeAndLatency,
CI);

} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
12 changes: 8 additions & 4 deletions llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -232,7 +232,8 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
continue;

int &MatCost = InsertResult.first->second.MatCost;
MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType(),
TargetTransformInfo::TCK_SizeAndLatency);
NonFreeMat |= MatCost != TTI.TCC_Free;
}
if (!NonFreeMat) {
@@ -283,12 +284,15 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
if (IID)
FoldedCost += TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
IncomingC->getType());
FoldedCost +=
TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
IncomingC->getType(),
TargetTransformInfo::TCK_SizeAndLatency);
else
FoldedCost +=
TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
IncomingC->getValue(), IncomingC->getType());
IncomingC->getValue(), IncomingC->getType(),
TargetTransformInfo::TCK_SizeAndLatency);

// If we accumulate more folded cost for this incoming constant than
// materialized cost, then we'll regress any edge with this constant so
63 changes: 43 additions & 20 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3277,7 +3277,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
TTI::TCK_RecipThroughput);
if (VF == 1)
return ScalarCallCost;

@@ -3302,7 +3303,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
return Cost;

// If the corresponding vector cost is cheaper, return its cost.
unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
TTI::TCK_RecipThroughput);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
return VectorCallCost;
@@ -3320,7 +3322,9 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
FMF = FPMO->getFastMathFlags();

SmallVector<Value *, 4> Operands(CI->arg_operands());
return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF,
TargetTransformInfo::TCK_RecipThroughput,
CI);
}

static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -5832,7 +5836,8 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// vectorized loop where the user of it is a vectorized instruction.
const MaybeAlign Alignment = getLoadStoreAlignment(I);
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
Alignment, AS);
Alignment, AS,
TTI::TCK_RecipThroughput);

// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5860,16 +5865,19 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
const MaybeAlign Alignment = getLoadStoreAlignment(I);
unsigned Cost = 0;
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
Alignment ? Alignment->value() : 0, AS);
Alignment ? Alignment->value() : 0, AS,
CostKind);
else
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind, I);

bool Reverse = ConsecutiveStride < 0;
if (Reverse)
@@ -5883,16 +5891,19 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
const MaybeAlign Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
}
StoreInst *SI = cast<StoreInst>(I);

bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
CostKind) +
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
@@ -5909,7 +5920,9 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
Legal->isMaskRequired(I),
Alignment ? Alignment->value() : 0, I);
Alignment ? Alignment->value() : 0,
TargetTransformInfo::TCK_RecipThroughput,
I);
}

unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
@@ -5938,7 +5951,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
unsigned Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
Group->getAlign().value(), AS, TTI::TCK_RecipThroughput,
Legal->isMaskRequired(I), UseMaskForGaps);

if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
Expand All @@ -5960,7 +5974,8 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
unsigned AS = getLoadStoreAddressSpace(I);

return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
TTI::TCK_RecipThroughput, I);
}
return getWideningCost(I, VF);
}
@@ -6182,6 +6197,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
@@ -6238,7 +6254,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return (Phi->getNumIncomingValues() - 1) *
TTI.getCmpSelInstrCost(
Instruction::Select, ToVectorTy(Phi->getType(), VF),
ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
CostKind);

return TTI.getCFInstrCost(Instruction::PHI);
}
@@ -6260,7 +6277,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

// The cost of the non-predicated instruction.
Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);

// The cost of insertelement and extractelement instructions needed for
// scalarization.
@@ -6301,13 +6318,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
SmallVector<const Value *, 4> Operands(I->operand_values());
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
I->getOpcode(), VectorTy, CostKind,
TargetTransformInfo::OK_AnyValue,
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
I->getOpcode(), VectorTy, CostKind,
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
I->getOperand(0), I);
@@ -6320,7 +6339,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);

return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
CostKind, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -6329,7 +6349,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
I);
}
case Instruction::Store:
case Instruction::Load: {
@@ -6362,7 +6383,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (isOptimizableIVTruncate(I, VF)) {
auto *Trunc = cast<TruncInst>(I);
return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
Trunc->getSrcTy(), Trunc);
Trunc->getSrcTy(), CostKind, Trunc);
}

Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6388,7 +6409,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}

unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
CostKind, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -6401,7 +6423,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
default:
// The cost of executing VF copies of the scalar instruction. This opcode
// is unknown. Assume that it is the same as 'mul'.
return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
CostKind) +
getScalarizationOverhead(I, VF);
} // end of switch.
}
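
For illustration only (the function and parameter names here are assumed): the vectorizer changes above all take the same shape, namely choosing one TargetCostKind per cost-model path, typically TCK_RecipThroughput, and threading it through every TTI query on that path.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Hypothetical cost helper in the spirit of the vectorizer code above:
// one cost kind chosen up front and reused for each query.
static int widenedCmpSelCost(const TargetTransformInfo &TTI, Type *VecValTy,
                             Type *VecCondTy) {
  const TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  int Cost = TTI.getCmpSelInstrCost(Instruction::ICmp, VecValTy,
                                    /*CondTy=*/nullptr, CostKind);
  Cost += TTI.getCmpSelInstrCost(Instruction::Select, VecValTy, VecCondTy,
                                 CostKind);
  return Cost;
}
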
64 changes: 39 additions & 25 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3259,7 +3259,8 @@ getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
VectorType::get(Arg->getType(), VecTy->getNumElements()));

// If the corresponding vector call is cheaper, return its cost.
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys);
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
TTI::TCK_RecipThroughput);
}
return {IntrinsicCost, LibCost};
}
@@ -3273,6 +3274,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
ScalarTy = CI->getOperand(0)->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
@@ -3380,7 +3382,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Ext->getOpcode(), Ext->getType(), VecTy, i);
// Add back the cost of s|zext which is subtracted separately.
DeadCost += TTI->getCastInstrCost(
Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
Ext);
continue;
}
}
@@ -3404,7 +3407,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
int ScalarEltCost =
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0);
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
@@ -3417,7 +3421,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost = ReuseShuffleCost +
TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0);
TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
CostKind, VL0);
}
return VecCost - ScalarCost;
}
@@ -3426,13 +3431,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::Select: {
// Calculate the cost of this instruction.
int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
Builder.getInt1Ty(), VL0);
Builder.getInt1Ty(),
CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0);
int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
CostKind, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
@@ -3493,13 +3500,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {

SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarEltCost = TTI->getArithmeticInstrCost(
E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getArithmeticInstrCost(
E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
Operands, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3509,26 +3518,30 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OK_UniformConstantValue;

int ScalarEltCost =
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
Op1VK, Op2VK);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
Op1VK, Op2VK);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment());
int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
int VecLdCost =
TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
CostKind, VL0);
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecLdCost += TTI->getShuffleCost(
@@ -3543,12 +3556,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
MaybeAlign Alignment(SI->getAlignment());
int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
CostKind, VL0);
if (NeedToShuffleReuses)
ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
VecTy, Alignment, 0, VL0);
VecTy, Alignment, 0, CostKind, VL0);
if (IsReorder) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecStCost += TTI->getShuffleCost(
@@ -3570,7 +3584,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
FMF = FPMO->getFastMathFlags();

int ScalarEltCost =
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF, 1, CostKind);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
@@ -3596,34 +3610,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
ReuseShuffleCost -= TTI->getInstructionCost(
I, TargetTransformInfo::TCK_RecipThroughput);
ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
ReuseShuffleCost += TTI->getInstructionCost(
I, TargetTransformInfo::TCK_RecipThroughput);
ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
}
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
ScalarCost += TTI->getInstructionCost(
I, TargetTransformInfo::TCK_RecipThroughput);
ScalarCost += TTI->getInstructionCost(I, CostKind);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
int VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy);
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty);
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty);
VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
CostKind);
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
CostKind);
}
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost;