llvm · david-arm · Sep 15, 2025 · fhahn · Sep 15, 2025 · david-arm
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -968,6 +968,9 @@ class TargetTransformInfo {
   LLVM_ABI InstructionCost getOperandsScalarizationOverhead(
       ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const;
 
+  LLVM_ABI InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                                        ElementCount VF) const;
+
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
   /// the scalarization cost of a load/store.

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -464,6 +464,11 @@ class TargetTransformInfoImplBase {
     return 0;
   }
 
+  virtual InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                                       ElementCount VF) const {
+    return 0;
+  }
+
   virtual bool supportsEfficientVectorElementLoadStore() const { return false; }
 
   virtual bool supportsTailCalls() const { return true; }

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -304,30 +304,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
       RTLIB::Libcall LC,
       std::optional<unsigned> CallRetElementIndex = {}) const {
-    Type *RetTy = ICA.getReturnType();
-    // Vector variants of the intrinsic can be mapped to a vector library call.
-    auto const *LibInfo = ICA.getLibInfo();
-    if (!LibInfo || !isa<StructType>(RetTy) ||
-        !isVectorizedStructTy(cast<StructType>(RetTy)))
-      return std::nullopt;
-
-    // Find associated libcall.
-    const char *LCName = getTLI()->getLibcallName(LC);
-    if (!LCName)
-      return std::nullopt;
-
-    // Search for a corresponding vector variant.
-    LLVMContext &Ctx = RetTy->getContext();
-    ElementCount VF = getVectorizedTypeVF(RetTy);
-    VecDesc const *VD = nullptr;
-    for (bool Masked : {false, true}) {
-      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
-        break;
-    }
+    VecDesc const *VD = getMultipleResultIntrinsicVectorLibCallDesc(ICA, LC);
     if (!VD)
       return std::nullopt;
 
     // Cost the call + mask.
+    Type *RetTy = ICA.getReturnType();
+    ElementCount VF = getVectorizedTypeVF(RetTy);
+    LLVMContext &Ctx = RetTy->getContext();
     auto Cost =
         thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
     if (VD->isMasked()) {
@@ -371,6 +355,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   using TargetTransformInfoImplBase::DL;
 
 public:
+  VecDesc const *getMultipleResultIntrinsicVectorLibCallDesc(
+      const IntrinsicCostAttributes &ICA, RTLIB::Libcall LC) const {
+    Type *RetTy = ICA.getReturnType();
+    // Vector variants of the intrinsic can be mapped to a vector library call.
+    auto const *LibInfo = ICA.getLibInfo();
+    if (!LibInfo || !isa<StructType>(RetTy) ||
+        !isVectorizedStructTy(cast<StructType>(RetTy)))
+      return nullptr;
+
+    // Find associated libcall.
+    const char *LCName = getTLI()->getLibcallName(LC);
+    if (!LCName)
+      return nullptr;
+
+    // Search for a corresponding vector variant.
+    ElementCount VF = getVectorizedTypeVF(RetTy);
+    VecDesc const *VD = nullptr;
+    for (bool Masked : {false, true}) {
+      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+        break;
+    }
+    return VD;
+  }
+
   /// \name Scalar TTI Implementations
   /// @{
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -641,6 +641,12 @@ InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
   return TTIImpl->getOperandsScalarizationOverhead(Tys, CostKind);
 }
 
+InstructionCost
+TargetTransformInfo::getCallScalarizationOverhead(CallInst *CI,
+                                                  ElementCount VF) const {
+  return TTIImpl->getCallScalarizationOverhead(CI, VF);
+}
+
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
   return TTIImpl->supportsEfficientVectorElementLoadStore();
 }

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -46,8 +46,13 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                             cl::init(10), cl::Hidden);
 
-static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
-                                                  cl::init(15), cl::Hidden);
+static cl::opt<unsigned>
+    SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15),
+                             cl::Hidden);
+
+static cl::opt<unsigned>
+    CallScalarizationCostMultiplier("call-scalarization-cost-multiplier",
+                                    cl::init(10), cl::Hidden);
 
 static cl::opt<unsigned>
     NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
@@ -594,6 +599,12 @@ static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
   return InstructionCost::getInvalid();
 }
 
+static InstructionCost getCallScalarizationCost(ElementCount VF) {
+  if (VF.isScalable())
+    return InstructionCost::getInvalid();
+  return VF.getFixedValue() * CallScalarizationCostMultiplier;
+}
+
 InstructionCost
 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) const {
@@ -606,6 +617,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (VTy->getElementCount() == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
 
+  InstructionCost BaseCost = 0;
   switch (ICA.getID()) {
   case Intrinsic::experimental_vector_histogram_add: {
     InstructionCost HistCost = getHistogramCost(ST, ICA);
@@ -1004,10 +1016,44 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     break;
   }
+  case Intrinsic::asin:
+  case Intrinsic::acos:
+  case Intrinsic::atan:
+  case Intrinsic::atan2:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::tan:
+  case Intrinsic::sinh:
+  case Intrinsic::cosh:
+  case Intrinsic::tanh:
+  case Intrinsic::pow:
+  case Intrinsic::exp:
+  case Intrinsic::exp10:
+  case Intrinsic::exp2:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2: {
+    if (auto *FixedVTy = dyn_cast<FixedVectorType>(RetTy))
+      BaseCost = getCallScalarizationCost(FixedVTy->getElementCount());
+    break;
+  }
+  case Intrinsic::sincos:
+  case Intrinsic::sincospi: {
+    Type *FirstRetTy = getContainedTypes(RetTy).front();
+    if (auto *FixedVTy = dyn_cast<FixedVectorType>(FirstRetTy)) {
+      EVT ScalarVT = getTLI()->getValueType(DL, FirstRetTy).getScalarType();
+      RTLIB::Libcall LC = ICA.getID() == Intrinsic::sincos
+                              ? RTLIB::getSINCOS(ScalarVT)
+                              : RTLIB::getSINCOSPI(ScalarVT);
+      if (!getMultipleResultIntrinsicVectorLibCallDesc(ICA, LC))
+        BaseCost = getCallScalarizationCost(FixedVTy->getElementCount());
+    }
+    break;
+  }
   default:
     break;
   }
-  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+  return BaseCost + BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
 /// The function will remove redundant reinterprets casting in the presence
@@ -4045,6 +4091,12 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
   return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
 }
 
+InstructionCost
+AArch64TTIImpl::getCallScalarizationOverhead(CallInst *CI,
+                                             ElementCount VF) const {
+  return getCallScalarizationCost(VF);
+}
+
 std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
     Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
     TTI::OperandValueInfo Op2Info, bool IncludeTrunc,

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -479,6 +479,9 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
       ArrayRef<Value *> VL = {}) const override;
 
+  InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                               ElementCount VF) const override;
+
   /// Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5775,6 +5775,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
         // Compute costs of unpacking argument values for the scalar calls and
         // packing the return values to a vector.
         InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
+        ScalarizationCost += TTI.getCallScalarizationOverhead(CI, VF);
         ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
       } else {
         // There is no point attempting to calculate the scalar cost for a

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3159,6 +3159,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
               /*Extract=*/false, Ctx.CostKind);
         }
       }
+      ScalarizationCost +=
+          Ctx.TTI.getCallScalarizationOverhead(cast<CallInst>(UI), VF);
       // Skip operands that do not require extraction/scalarization and do not
       // incur any overhead.
       SmallPtrSet<const VPValue *, 4> UniqueOperands;

diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos"
 ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -call-scalarization-cost-multiplier=1 -passes="print<cost-model>" \
+; RUN:   -cost-kind=throughput 2>&1 -disable-output | FileCheck --check-prefix=CHECK-LOW-SCALARIZATION-COST %s
 ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB
 
 define void @sincos() {
@@ -8,31 +10,43 @@ define void @sincos() {
 ; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
 ; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
 ; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
-;
-; CHECK:  Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
-; CHECK:  Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
-; CHECK:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
-; CHECK:  Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
-; CHECK:  Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
-;
+; CHECK:  Cost Model: Found an estimated cost of 116 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK:  Cost Model: Found an estimated cost of 92 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK:  Cost Model: Found an estimated cost of 44 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK:  Cost Model: Found an estimated cost of 20 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK:  Cost Model: Found an estimated cost of 184 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
 ; CHECK:  Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
 ; CHECK:  Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
 ; CHECK:  Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
 ; CHECK:  Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
 ; CHECK:  Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
 ;
+; CHECK-LOW-SCALARIZATION-COST-LABEL: 'sincos'
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 44 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 26 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 11 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
+; CHECK-LOW-SCALARIZATION-COST:  Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
+;
 ; CHECK-VECLIB-LABEL: 'sincos'
 ; CHECK-VECLIB:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
 ; CHECK-VECLIB:  Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
 ; CHECK-VECLIB:  Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
 ; CHECK-VECLIB:  Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
-;
-; CHECK-VECLIB:  Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 116 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
 ; CHECK-VECLIB:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
 ; CHECK-VECLIB:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
-; CHECK-VECLIB:  Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
-; CHECK-VECLIB:  Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
-;
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 20 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK-VECLIB:  Cost Model: Found an estimated cost of 184 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
 ; CHECK-VECLIB:  Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
 ; CHECK-VECLIB:  Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
 ; CHECK-VECLIB:  Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)