Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,9 @@ class TargetTransformInfo {
LLVM_ABI InstructionCost getOperandsScalarizationOverhead(
ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const;

/// Estimate the additional, target-specific overhead of scalarizing the call
/// \p CI when vectorizing with factor \p VF. Callers add this on top of the
/// cost of unpacking the call's arguments and packing its results.
/// The query is forwarded to the target's TTI implementation.
LLVM_ABI InstructionCost getCallScalarizationOverhead(CallInst *CI,
ElementCount VF) const;

/// If target has efficient vector element load/store instructions, it can
/// return true here so that insertion/extraction costs are not added to
/// the scalarization cost of a load/store.
Expand Down
5 changes: 5 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,11 @@ class TargetTransformInfoImplBase {
return 0;
}

/// Default implementation: report no additional overhead (a cost of 0) for
/// scalarizing the call \p CI at vectorization factor \p VF. Targets may
/// override this hook to add a penalty.
virtual InstructionCost getCallScalarizationOverhead(CallInst *CI,
ElementCount VF) const {
return 0;
}

virtual bool supportsEfficientVectorElementLoadStore() const { return false; }

virtual bool supportsTailCalls() const { return true; }
Expand Down
48 changes: 28 additions & 20 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,30 +304,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
RTLIB::Libcall LC,
std::optional<unsigned> CallRetElementIndex = {}) const {
Type *RetTy = ICA.getReturnType();
// Vector variants of the intrinsic can be mapped to a vector library call.
auto const *LibInfo = ICA.getLibInfo();
if (!LibInfo || !isa<StructType>(RetTy) ||
!isVectorizedStructTy(cast<StructType>(RetTy)))
return std::nullopt;

// Find associated libcall.
const char *LCName = getTLI()->getLibcallName(LC);
if (!LCName)
return std::nullopt;

// Search for a corresponding vector variant.
LLVMContext &Ctx = RetTy->getContext();
ElementCount VF = getVectorizedTypeVF(RetTy);
VecDesc const *VD = nullptr;
for (bool Masked : {false, true}) {
if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
break;
}
VecDesc const *VD = getMultipleResultIntrinsicVectorLibCallDesc(ICA, LC);
if (!VD)
return std::nullopt;

// Cost the call + mask.
Type *RetTy = ICA.getReturnType();
ElementCount VF = getVectorizedTypeVF(RetTy);
LLVMContext &Ctx = RetTy->getContext();
auto Cost =
thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
if (VD->isMasked()) {
Expand Down Expand Up @@ -371,6 +355,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
using TargetTransformInfoImplBase::DL;

public:
/// Find the vector library function that can implement the multiple-result
/// intrinsic described by \p ICA, starting from the scalar libcall \p LC.
///
/// \returns the matching VecDesc (an unmasked variant is preferred over a
/// masked one), or nullptr when \p ICA carries no library info, its return
/// type is not a vectorized struct type, \p LC has no libcall name, or no
/// vector mapping is registered for the return type's VF.
VecDesc const *getMultipleResultIntrinsicVectorLibCallDesc(
    const IntrinsicCostAttributes &ICA, RTLIB::Libcall LC) const {
  // Vector variants of the intrinsic can be mapped to a vector library call,
  // but only if library info is available and the result is a vectorized
  // struct type.
  auto const *VecLibInfo = ICA.getLibInfo();
  if (!VecLibInfo)
    return nullptr;

  Type *ResultTy = ICA.getReturnType();
  auto *ResultStructTy = dyn_cast<StructType>(ResultTy);
  if (!ResultStructTy || !isVectorizedStructTy(ResultStructTy))
    return nullptr;

  // Resolve the name of the associated libcall; without one there is nothing
  // to look up.
  const char *ScalarName = getTLI()->getLibcallName(LC);
  if (!ScalarName)
    return nullptr;

  // Search for a corresponding vector variant: try the unmasked mapping
  // first and fall back to the masked one.
  const ElementCount VF = getVectorizedTypeVF(ResultTy);
  if (VecDesc const *Unmasked =
          VecLibInfo->getVectorMappingInfo(ScalarName, VF, /*Masked=*/false))
    return Unmasked;
  return VecLibInfo->getVectorMappingInfo(ScalarName, VF, /*Masked=*/true);
}

/// \name Scalar TTI Implementations
/// @{
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,12 @@ InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
return TTIImpl->getOperandsScalarizationOverhead(Tys, CostKind);
}

// Forward the call-scalarization overhead query to the target-specific
// implementation.
InstructionCost
TargetTransformInfo::getCallScalarizationOverhead(CallInst *CI,
ElementCount VF) const {
return TTIImpl->getCallScalarizationOverhead(CI, VF);
}

bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
return TTIImpl->supportsEfficientVectorElementLoadStore();
}
Expand Down
58 changes: 55 additions & 3 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,13 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
cl::init(15), cl::Hidden);
static cl::opt<unsigned>
SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15),
cl::Hidden);

// Per-lane penalty used when costing a scalarized call:
// getCallScalarizationCost() multiplies the fixed VF by this value.
// Hidden option; defaults to 10.
static cl::opt<unsigned>
CallScalarizationCostMultiplier("call-scalarization-cost-multiplier",
cl::init(10), cl::Hidden);

static cl::opt<unsigned>
NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
Expand Down Expand Up @@ -594,6 +599,12 @@ static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
return InstructionCost::getInvalid();
}

/// Compute the extra cost charged for scalarizing a call at vectorization
/// factor \p VF.
///
/// \returns the fixed lane count of \p VF multiplied by
/// CallScalarizationCostMultiplier, or an invalid cost when \p VF is
/// scalable.
static InstructionCost getCallScalarizationCost(ElementCount VF) {
  if (!VF.isScalable()) {
    // One multiplier's worth of cost for every lane of the fixed-width VF.
    const auto NumLanes = VF.getFixedValue();
    return NumLanes * CallScalarizationCostMultiplier;
  }
  // Scalable VFs get an invalid cost rather than a number.
  return InstructionCost::getInvalid();
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) const {
Expand All @@ -606,6 +617,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (VTy->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();

InstructionCost BaseCost = 0;
switch (ICA.getID()) {
case Intrinsic::experimental_vector_histogram_add: {
InstructionCost HistCost = getHistogramCost(ST, ICA);
Expand Down Expand Up @@ -1004,10 +1016,44 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
case Intrinsic::asin:
case Intrinsic::acos:
case Intrinsic::atan:
case Intrinsic::atan2:
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::tan:
case Intrinsic::sinh:
case Intrinsic::cosh:
case Intrinsic::tanh:
case Intrinsic::pow:
case Intrinsic::exp:
case Intrinsic::exp10:
case Intrinsic::exp2:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2: {
if (auto *FixedVTy = dyn_cast<FixedVectorType>(RetTy))
BaseCost = getCallScalarizationCost(FixedVTy->getElementCount());
break;
Comment on lines +1036 to +1038
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we more directly just consider the cost of scalarizing the operands and vectorizing the result types or is that already included somewhere else?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could add it to the cost of scalarisation of the return type, but I didn't do this for two reasons:

  1. Is the return type guaranteed to represent the VF of the loop in all cases? For math calls this is probably true, as we'll always return a value.
  2. We'd also have to pass in an extra Instruction pointer to provide context so that we only increase the cost for calls. It felt a bit awkward expanding the interface for this one case.

}
case Intrinsic::sincos:
case Intrinsic::sincospi: {
Comment on lines +1040 to +1041
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this include Intrinsic::modf too? (which is also supported)

Type *FirstRetTy = getContainedTypes(RetTy).front();
if (auto *FixedVTy = dyn_cast<FixedVectorType>(FirstRetTy)) {
EVT ScalarVT = getTLI()->getValueType(DL, FirstRetTy).getScalarType();
RTLIB::Libcall LC = ICA.getID() == Intrinsic::sincos
? RTLIB::getSINCOS(ScalarVT)
: RTLIB::getSINCOSPI(ScalarVT);
if (!getMultipleResultIntrinsicVectorLibCallDesc(ICA, LC))
BaseCost = getCallScalarizationCost(FixedVTy->getElementCount());
}
break;
}
default:
break;
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
return BaseCost + BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// The function will remove redundant reinterprets casting in the presence
Expand Down Expand Up @@ -4045,6 +4091,12 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
}

/// AArch64 override of the call-scalarization overhead hook. The result
/// depends only on \p VF (see getCallScalarizationCost()); \p CI is not
/// consulted here.
InstructionCost
AArch64TTIImpl::getCallScalarizationOverhead(CallInst *CI,
ElementCount VF) const {
return getCallScalarizationCost(VF);
}

std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,9 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
ArrayRef<Value *> VL = {}) const override;

/// Return the additional overhead of scalarizing the call \p CI when
/// vectorizing with factor \p VF.
InstructionCost getCallScalarizationOverhead(CallInst *CI,
ElementCount VF) const override;

/// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5775,6 +5775,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
ScalarizationCost += TTI.getCallScalarizationOverhead(CI, VF);
ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
} else {
// There is no point attempting to calculate the scalar cost for a
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3159,6 +3159,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
/*Extract=*/false, Ctx.CostKind);
}
}
ScalarizationCost +=
Ctx.TTI.getCallScalarizationOverhead(cast<CallInst>(UI), VF);
// Skip operands that do not require extraction/scalarization and do not
// incur any overhead.
SmallPtrSet<const VPValue *, 4> UniqueOperands;
Expand Down
38 changes: 26 additions & 12 deletions llvm/test/Analysis/CostModel/AArch64/sincos.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos"
; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -call-scalarization-cost-multiplier=1 -passes="print<cost-model>" \
; RUN: -cost-kind=throughput 2>&1 -disable-output | FileCheck --check-prefix=CHECK-LOW-SCALARIZATION-COST %s
; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB

define void @sincos() {
Expand All @@ -8,31 +10,43 @@ define void @sincos() {
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
;
; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
;
; CHECK: Cost Model: Found an estimated cost of 116 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
; CHECK: Cost Model: Found an estimated cost of 92 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
; CHECK: Cost Model: Found an estimated cost of 44 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
; CHECK: Cost Model: Found an estimated cost of 184 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
;
; CHECK-LOW-SCALARIZATION-COST-LABEL: 'sincos'
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 44 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 26 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 11 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
;
; CHECK-VECLIB-LABEL: 'sincos'
; CHECK-VECLIB: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
;
; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 116 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
;
; CHECK-VECLIB: Cost Model: Found an estimated cost of 20 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 184 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
Expand Down
Loading