diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a6f4e51e258ab..e1adf36940ac6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -968,6 +968,9 @@ class TargetTransformInfo {
   LLVM_ABI InstructionCost getOperandsScalarizationOverhead(
       ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const;
 
+  LLVM_ABI InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                                        ElementCount VF) const;
+
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
   /// the scalarization cost of a load/store.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 566e1cf51631a..f7c5080d49266 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -464,6 +464,11 @@ class TargetTransformInfoImplBase {
     return 0;
   }
 
+  virtual InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                                       ElementCount VF) const {
+    return 0;
+  }
+
   virtual bool supportsEfficientVectorElementLoadStore() const { return false; }
 
   virtual bool supportsTailCalls() const { return true; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index dce423fc1b18b..3dd9fa5f97995 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -304,30 +304,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
       RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) const {
-    Type *RetTy = ICA.getReturnType();
-    // Vector variants of the intrinsic can be mapped to a vector library call.
-    auto const *LibInfo = ICA.getLibInfo();
-    if (!LibInfo || !isa<StructType>(RetTy) ||
-        !isVectorizedStructTy(cast<StructType>(RetTy)))
-      return std::nullopt;
-
-    // Find associated libcall.
-    const char *LCName = getTLI()->getLibcallName(LC);
-    if (!LCName)
-      return std::nullopt;
-
-    // Search for a corresponding vector variant.
-    LLVMContext &Ctx = RetTy->getContext();
-    ElementCount VF = getVectorizedTypeVF(RetTy);
-    VecDesc const *VD = nullptr;
-    for (bool Masked : {false, true}) {
-      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
-        break;
-    }
+    VecDesc const *VD = getMultipleResultIntrinsicVectorLibCallDesc(ICA, LC);
     if (!VD)
       return std::nullopt;
 
     // Cost the call + mask.
+    Type *RetTy = ICA.getReturnType();
+    ElementCount VF = getVectorizedTypeVF(RetTy);
+    LLVMContext &Ctx = RetTy->getContext();
     auto Cost =
         thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
     if (VD->isMasked()) {
@@ -371,6 +355,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   using TargetTransformInfoImplBase::DL;
 
 public:
+  VecDesc const *getMultipleResultIntrinsicVectorLibCallDesc(
+      const IntrinsicCostAttributes &ICA, RTLIB::Libcall LC) const {
+    Type *RetTy = ICA.getReturnType();
+    // Vector variants of the intrinsic can be mapped to a vector library call.
+    auto const *LibInfo = ICA.getLibInfo();
+    if (!LibInfo || !isa<StructType>(RetTy) ||
+        !isVectorizedStructTy(cast<StructType>(RetTy)))
+      return nullptr;
+
+    // Find associated libcall.
+    const char *LCName = getTLI()->getLibcallName(LC);
+    if (!LCName)
+      return nullptr;
+
+    // Search for a corresponding vector variant.
+    ElementCount VF = getVectorizedTypeVF(RetTy);
+    VecDesc const *VD = nullptr;
+    for (bool Masked : {false, true}) {
+      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+        break;
+    }
+    return VD;
+  }
+
   /// \name Scalar TTI Implementations
   /// @{
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 09b50c5270e57..045616c8839e8 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -641,6 +641,12 @@ InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
   return TTIImpl->getOperandsScalarizationOverhead(Tys, CostKind);
 }
 
+InstructionCost
+TargetTransformInfo::getCallScalarizationOverhead(CallInst *CI,
+                                                  ElementCount VF) const {
+  return TTIImpl->getCallScalarizationOverhead(CI, VF);
+}
+
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
   return TTIImpl->supportsEfficientVectorElementLoadStore();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 92321a76dbd80..befaa1b68d4b7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -46,8 +46,13 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", cl::init(10),
                                             cl::Hidden);
 
-static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
-                                                  cl::init(15), cl::Hidden);
+static cl::opt<unsigned>
+    SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15),
+                             cl::Hidden);
+
+static cl::opt<unsigned>
+    CallScalarizationCostMultiplier("call-scalarization-cost-multiplier",
+                                    cl::init(10), cl::Hidden);
 
 static cl::opt<unsigned> NeonNonConstStrideOverhead("neon-nonconst-stride-overhead",
                                                     cl::init(10),
@@ -594,6 +599,12 @@ static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
   return InstructionCost::getInvalid();
 }
 
+static InstructionCost getCallScalarizationCost(ElementCount VF) {
+  if (VF.isScalable())
+    return InstructionCost::getInvalid();
+  return VF.getFixedValue() * CallScalarizationCostMultiplier;
+}
+
 InstructionCost
 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) const {
@@ -606,6 +617,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (VTy->getElementCount() == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
 
+  InstructionCost BaseCost = 0;
   switch (ICA.getID()) {
   case Intrinsic::experimental_vector_histogram_add: {
     InstructionCost HistCost = getHistogramCost(ST, ICA);
@@ -1004,10 +1016,44 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     break;
   }
+  case Intrinsic::asin:
+  case Intrinsic::acos:
+  case Intrinsic::atan:
+  case Intrinsic::atan2:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::tan:
+  case Intrinsic::sinh:
+  case Intrinsic::cosh:
+  case Intrinsic::tanh:
+  case Intrinsic::pow:
+  case Intrinsic::exp:
+  case Intrinsic::exp10:
+  case Intrinsic::exp2:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2: {
+    if (auto *FixedVTy = dyn_cast<FixedVectorType>(RetTy))
+      BaseCost = getCallScalarizationCost(FixedVTy->getElementCount());
+    break;
+  }
+  case Intrinsic::sincos:
+  case Intrinsic::sincospi: {
+    Type *FirstRetTy = getContainedTypes(RetTy).front();
+    if (auto *FixedVTy = dyn_cast<FixedVectorType>(FirstRetTy)) {
+      EVT ScalarVT = getTLI()->getValueType(DL, FirstRetTy).getScalarType();
+      RTLIB::Libcall LC = ICA.getID() == Intrinsic::sincos
+                              ? RTLIB::getSINCOS(ScalarVT)
+                              : RTLIB::getSINCOSPI(ScalarVT);
+      if (!getMultipleResultIntrinsicVectorLibCallDesc(ICA, LC))
+        BaseCost = getCallScalarizationCost(FixedVTy->getElementCount());
+    }
+    break;
+  }
   default:
     break;
   }
-  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+  return BaseCost + BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
 /// The function will remove redundant reinterprets casting in the presence
@@ -4045,6 +4091,12 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
   return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
 }
 
+InstructionCost
+AArch64TTIImpl::getCallScalarizationOverhead(CallInst *CI,
+                                             ElementCount VF) const {
+  return getCallScalarizationCost(VF);
+}
+
 std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
     Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
     TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index fe2e849258e3f..aadd3c28d7b65 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -479,6 +479,9 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                            TTI::TargetCostKind CostKind,
                                            bool ForPoisonSrc = true,
                                            ArrayRef<Value *> VL = {}) const override;
+  InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                               ElementCount VF) const override;
+
   /// Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c04b5cb10eac2..7d4e98b3be746 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5775,6 +5775,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
       InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
+      ScalarizationCost += TTI.getCallScalarizationOverhead(CI, VF);
       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
     } else {
       // There is no point attempting to calculate the scalar cost for a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf51489543098..93069536416bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3159,6 +3159,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                          /*Extract=*/false, Ctx.CostKind);
       }
     }
+    ScalarizationCost +=
+        Ctx.TTI.getCallScalarizationOverhead(cast<CallInst>(UI), VF);
 
     // Skip operands that do not require extraction/scalarization and do not
     // incur any overhead.
     SmallPtrSet<const VPValue *, 4> UniqueOperands;
diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
index 32408acb582d0..f11c4c84eeb45 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos"
 ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -call-scalarization-cost-multiplier=1 -passes="print<cost-model>" \
+; RUN:   -cost-kind=throughput 2>&1 -disable-output | FileCheck --check-prefix=CHECK-LOW-SCALARIZATION-COST %s
 ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB
 
 define void @sincos() {
@@ -8,31 +10,43 @@ define void @sincos() {
 ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
 ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
 ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
-;
-; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
-; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
-; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
-;
+; CHECK: Cost Model: Found an estimated cost of 116 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK: Cost Model: Found an estimated cost of 92 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK: Cost Model: Found an estimated cost of 44 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK: Cost Model: Found an estimated cost of 184 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
 ;
+; CHECK-LOW-SCALARIZATION-COST-LABEL: 'sincos'
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 44 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 26 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 11 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
+;
 ; CHECK-VECLIB-LABEL: 'sincos'
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
-;
-; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 116 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
-; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
-; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
-;
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 20 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 184 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
 ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
index 544ef5c82c7ac..1dda7c2826b67 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
@@ -1,21 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|modf|extractvalue|store)" --version 5
 ; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve < %s -S -o - -debug-only=loop-vectorize 2>%t.1 | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve -vector-library=ArmPL < %s -S -o - -debug-only=loop-vectorize 2>%t.2 | FileCheck %s --check-prefix=CHECK-ARMPL
+; RUN: opt -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve < %s -S -o - -debug-only=loop-vectorize 2>%t.2 | FileCheck %s --check-prefix=CHECK-ARMPL
 ; RUN: FileCheck --input-file=%t.1 --check-prefix=CHECK-COST %s
 ; RUN: FileCheck --input-file=%t.2 --check-prefix=CHECK-COST-ARMPL %s
 ; REQUIRES: asserts
 
 ; CHECK-COST-LABEL: sincos_f32
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
-; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 
 ; CHECK-COST-ARMPL-LABEL: sincos_f32
 ; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
-; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
@@ -27,11 +27,16 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
 ; CHECK: [[ENTRY:.*:]]
 ; CHECK: [[VECTOR_PH:.*:]]
 ; CHECK: [[VECTOR_BODY:.*:]]
-; CHECK: [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
-; CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0
-; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
-; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4
-; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4
+; CHECK: [[TMP5:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[TMP3:%.*]])
+; CHECK: [[TMP6:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[TMP4:%.*]])
+; CHECK: [[TMP7:%.*]] = extractvalue { float, float } [[TMP5]], 0
+; CHECK: [[TMP8:%.*]] = extractvalue { float, float } [[TMP6]], 0
+; CHECK: [[TMP9:%.*]] = extractvalue { float, float } [[TMP5]], 1
+; CHECK: [[TMP10:%.*]] = extractvalue { float, float } [[TMP6]], 1
+; CHECK: store float [[TMP7]], ptr [[TMP11:%.*]], align 4
+; CHECK: store float [[TMP8]], ptr [[TMP12:%.*]], align 4
+; CHECK: store float [[TMP9]], ptr [[TMP13:%.*]], align 4
+; CHECK: store float [[TMP10]], ptr [[TMP14:%.*]], align 4
 ; CHECK: [[MIDDLE_BLOCK:.*:]]
 ; CHECK: [[SCALAR_PH:.*:]]
 ; CHECK: [[FOR_BODY:.*:]]
@@ -91,7 +96,7 @@ exit:
 
 ; CHECK-COST-LABEL: sincos_f64
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
@@ -107,11 +112,16 @@ define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
 ; CHECK: [[ENTRY:.*:]]
 ; CHECK: [[VECTOR_PH:.*:]]
 ; CHECK: [[VECTOR_BODY:.*:]]
-; CHECK: [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
-; CHECK: [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0
-; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1
-; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8
-; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8
+; CHECK: [[TMP5:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[TMP3:%.*]])
+; CHECK: [[TMP6:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[TMP4:%.*]])
+; CHECK: [[TMP7:%.*]] = extractvalue { double, double } [[TMP5]], 0
+; CHECK: [[TMP8:%.*]] = extractvalue { double, double } [[TMP6]], 0
+; CHECK: [[TMP9:%.*]] = extractvalue { double, double } [[TMP5]], 1
+; CHECK: [[TMP10:%.*]] = extractvalue { double, double } [[TMP6]], 1
+; CHECK: store double [[TMP7]], ptr [[TMP11:%.*]], align 8
+; CHECK: store double [[TMP8]], ptr [[TMP12:%.*]], align 8
+; CHECK: store double [[TMP9]], ptr [[TMP13:%.*]], align 8
+; CHECK: store double [[TMP10]], ptr [[TMP14:%.*]], align 8
 ; CHECK: [[MIDDLE_BLOCK:.*:]]
 ; CHECK: [[SCALAR_PH:.*:]]
 ; CHECK: [[FOR_BODY:.*:]]
@@ -171,15 +181,15 @@ exit:
 
 ; CHECK-COST-LABEL: predicated_sincos
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
-; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 
 ; CHECK-COST-ARMPL-LABEL: predicated_sincos
 ; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
-; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
@@ -416,15 +426,15 @@ exit:
 
 ; CHECK-COST-LABEL: sincospi_f32
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincospi.f32(float %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
-; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 
 ; CHECK-COST-ARMPL-LABEL: sincospi_f32
 ; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincospi.f32(float %in_val)
-; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
@@ -436,11 +446,16 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
 ; CHECK: [[ENTRY:.*:]]
 ; CHECK: [[VECTOR_PH:.*:]]
 ; CHECK: [[VECTOR_BODY:.*:]]
-; CHECK: [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
-; CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0
-; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
-; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4
-; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4
+; CHECK: [[TMP5:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[TMP3:%.*]])
+; CHECK: [[TMP6:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[TMP4:%.*]])
+; CHECK: [[TMP7:%.*]] = extractvalue { float, float } [[TMP5]], 0
+; CHECK: [[TMP8:%.*]] = extractvalue { float, float } [[TMP6]], 0
+; CHECK: [[TMP9:%.*]] = extractvalue { float, float } [[TMP5]], 1
+; CHECK: [[TMP10:%.*]] = extractvalue { float, float } [[TMP6]], 1
+; CHECK: store float [[TMP7]], ptr [[TMP11:%.*]], align 4
+; CHECK: store float [[TMP8]], ptr [[TMP12:%.*]], align 4
+; CHECK: store float [[TMP9]], ptr [[TMP13:%.*]], align 4
+; CHECK: store float [[TMP10]], ptr [[TMP14:%.*]], align 4
 ; CHECK: [[MIDDLE_BLOCK:.*:]]
 ; CHECK: [[SCALAR_PH:.*:]]
 ; CHECK: [[FOR_BODY:.*:]]
@@ -500,7 +515,7 @@ exit:
 
 ; CHECK-COST-LABEL: sincospi_f64
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { double, double } @llvm.sincospi.f64(double %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
@@ -516,11 +531,16 @@ define void @sincospi_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
 ; CHECK: [[ENTRY:.*:]]
 ; CHECK: [[VECTOR_PH:.*:]]
 ; CHECK: [[VECTOR_BODY:.*:]]
-; CHECK: [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
-; CHECK: [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0
-; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1
-; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8
-; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8
+; CHECK: [[TMP5:%.*]] = tail call { double, double } @llvm.sincospi.f64(double [[TMP3:%.*]])
+; CHECK: [[TMP6:%.*]] = tail call { double, double } @llvm.sincospi.f64(double [[TMP4:%.*]])
+; CHECK: [[TMP7:%.*]] = extractvalue { double, double } [[TMP5]], 0
+; CHECK: [[TMP8:%.*]] = extractvalue { double, double } [[TMP6]], 0
+; CHECK: [[TMP9:%.*]] = extractvalue { double, double } [[TMP5]], 1
+; CHECK: [[TMP10:%.*]] = extractvalue { double, double } [[TMP6]], 1
+; CHECK: store double [[TMP7]], ptr [[TMP11:%.*]], align 8
+; CHECK: store double [[TMP8]], ptr [[TMP12:%.*]], align 8
+; CHECK: store double [[TMP9]], ptr [[TMP13:%.*]], align 8
+; CHECK: store double [[TMP10]], ptr [[TMP14:%.*]], align 8
 ; CHECK: [[MIDDLE_BLOCK:.*:]]
 ; CHECK: [[SCALAR_PH:.*:]]
 ; CHECK: [[FOR_BODY:.*:]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index dabff1beefb38..cca5b387cb7c9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=1 -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
-; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-NORMAL-COST
 ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
 ; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-target-instruction-cost=1 -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
-; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-COST1
 ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
 
 define void @vec_load(i64 %N, ptr nocapture %a, ptr nocapture readonly %b) {
@@ -105,9 +105,11 @@ for.end:
 ; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin
 ; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_no_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
-; CHECK: @vec_sin_no_mapping
-; CHECK: call fast <2 x float> @llvm.sin.v2f32
-; CHECK-NOT: @llvm.sin.v2f32
+; CHECK-COST1-NOT: @llvm.sin.v2f32
-; CHECK-NOT: @llvm.sin.v2f32
+; CHECK-COST1-NOT: = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 202 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
@@ -64,15 +64,15 @@ exit:
 
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1
 ;
-; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 202 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
@@ -118,15 +118,15 @@ exit:
 
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1
 ;
-; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 202 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
index 9edd6ce53ec5d..35d1461f31642 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -46,41 +46,18 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ;
 ; NARROW-LABEL: @test_widen(
 ; NARROW-NEXT:  entry:
-; NARROW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NARROW:       vector.ph:
-; NARROW-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NARROW:       vector.body:
-; NARROW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NARROW-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
-; NARROW-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
-; NARROW-NEXT:    [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
-; NARROW-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
-; NARROW-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; NARROW-NEXT:    [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
-; NARROW-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
-; NARROW-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
-; NARROW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NARROW-NEXT:    store <2 x float> [[TMP7]], ptr [[TMP8]], align 4
-; NARROW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; NARROW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; NARROW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; NARROW:       middle.block:
-; NARROW-NEXT:    br label [[SCALAR_PH]]
-; NARROW:       scalar.ph:
-; NARROW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; NARROW-NEXT:    br label [[FOR_BODY:%.*]]
 ; NARROW:       for.body:
-; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NARROW-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NARROW-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDVARS_IV]]
 ; NARROW-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
 ; NARROW-NEXT:    [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
-; NARROW-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1]]
-; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1:[0-9]+]]
+; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDVARS_IV]]
 ; NARROW-NEXT:    store float [[CALL]], ptr [[ARRAYIDX]], align 4
 ; NARROW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NARROW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
-; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
 ; NARROW:       for.cond.cleanup:
 ; NARROW-NEXT:    ret void
 ;