diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a6f4e51e258ab..e1adf36940ac6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -968,6 +968,9 @@ class TargetTransformInfo {
   LLVM_ABI InstructionCost getOperandsScalarizationOverhead(
       ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const;
 
+  LLVM_ABI InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                                        ElementCount VF) const;
+
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
   /// the scalarization cost of a load/store.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 566e1cf51631a..f7c5080d49266 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -464,6 +464,11 @@ class TargetTransformInfoImplBase {
     return 0;
   }
 
+  virtual InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                                       ElementCount VF) const {
+    return 0;
+  }
+
   virtual bool supportsEfficientVectorElementLoadStore() const { return false; }
 
   virtual bool supportsTailCalls() const { return true; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index dce423fc1b18b..3dd9fa5f97995 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -304,30 +304,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
       RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) const {
-    Type *RetTy = ICA.getReturnType();
-    // Vector variants of the intrinsic can be mapped to a vector library call.
-    auto const *LibInfo = ICA.getLibInfo();
-    if (!LibInfo || !isa<StructType>(RetTy) ||
-        !isVectorizedStructTy(cast<StructType>(RetTy)))
-      return std::nullopt;
-
-    // Find associated libcall.
-    const char *LCName = getTLI()->getLibcallName(LC);
-    if (!LCName)
-      return std::nullopt;
-
-    // Search for a corresponding vector variant.
-    LLVMContext &Ctx = RetTy->getContext();
-    ElementCount VF = getVectorizedTypeVF(RetTy);
-    VecDesc const *VD = nullptr;
-    for (bool Masked : {false, true}) {
-      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
-        break;
-    }
+    VecDesc const *VD = getMultipleResultIntrinsicVectorLibCallDesc(ICA, LC);
     if (!VD)
       return std::nullopt;
 
     // Cost the call + mask.
+    Type *RetTy = ICA.getReturnType();
+    ElementCount VF = getVectorizedTypeVF(RetTy);
+    LLVMContext &Ctx = RetTy->getContext();
     auto Cost =
         thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
     if (VD->isMasked()) {
@@ -371,6 +355,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   using TargetTransformInfoImplBase::DL;
 
 public:
+  VecDesc const *getMultipleResultIntrinsicVectorLibCallDesc(
+      const IntrinsicCostAttributes &ICA, RTLIB::Libcall LC) const {
+    Type *RetTy = ICA.getReturnType();
+    // Vector variants of the intrinsic can be mapped to a vector library call.
+    auto const *LibInfo = ICA.getLibInfo();
+    if (!LibInfo || !isa<StructType>(RetTy) ||
+        !isVectorizedStructTy(cast<StructType>(RetTy)))
+      return nullptr;
+
+    // Find associated libcall.
+    const char *LCName = getTLI()->getLibcallName(LC);
+    if (!LCName)
+      return nullptr;
+
+    // Search for a corresponding vector variant.
+    ElementCount VF = getVectorizedTypeVF(RetTy);
+    VecDesc const *VD = nullptr;
+    for (bool Masked : {false, true}) {
+      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+        break;
+    }
+    return VD;
+  }
+
   /// \name Scalar TTI Implementations
   /// @{
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 09b50c5270e57..045616c8839e8 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -641,6 +641,12 @@ InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
   return TTIImpl->getOperandsScalarizationOverhead(Tys, CostKind);
 }
 
+InstructionCost
+TargetTransformInfo::getCallScalarizationOverhead(CallInst *CI,
+                                                  ElementCount VF) const {
+  return TTIImpl->getCallScalarizationOverhead(CI, VF);
+}
+
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
   return TTIImpl->supportsEfficientVectorElementLoadStore();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 92321a76dbd80..befaa1b68d4b7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -46,8 +46,13 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", cl::init(10),
                                             cl::Hidden);
 
-static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
-                                                  cl::init(15), cl::Hidden);
+static cl::opt<unsigned>
+    SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15),
+                             cl::Hidden);
+
+static cl::opt<unsigned>
+    CallScalarizationCostMultiplier("call-scalarization-cost-multiplier",
+                                    cl::init(10), cl::Hidden);
 
 static cl::opt<unsigned> NeonNonConstStrideOverhead("neon-nonconst-stride-overhead",
                                                     cl::init(10),
@@ -594,6 +599,12 @@ static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
   return InstructionCost::getInvalid();
 }
 
+static InstructionCost getCallScalarizationCost(ElementCount VF) {
+  if (VF.isScalable())
+    return InstructionCost::getInvalid();
+  return VF.getFixedValue() * CallScalarizationCostMultiplier;
+}
+
 InstructionCost
 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) const {
@@ -606,6 +617,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (VTy->getElementCount() == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
 
+  InstructionCost BaseCost = 0;
   switch (ICA.getID()) {
   case Intrinsic::experimental_vector_histogram_add: {
     InstructionCost HistCost = getHistogramCost(ST, ICA);
@@ -1004,10 +1016,44 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     break;
   }
+  case Intrinsic::asin:
+  case Intrinsic::acos:
+  case Intrinsic::atan:
+  case Intrinsic::atan2:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::tan:
+  case Intrinsic::sinh:
+  case Intrinsic::cosh:
+  case Intrinsic::tanh:
+  case Intrinsic::pow:
+  case Intrinsic::exp:
+  case Intrinsic::exp10:
+  case Intrinsic::exp2:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2: {
+    if (auto *FixedVTy = dyn_cast<FixedVectorType>(RetTy))
+      BaseCost = getCallScalarizationCost(FixedVTy->getElementCount());
+    break;
+  }
+  case Intrinsic::sincos:
+  case Intrinsic::sincospi: {
+    Type *FirstRetTy = getContainedTypes(RetTy).front();
+    if (auto *FixedVTy = dyn_cast<FixedVectorType>(FirstRetTy)) {
+      EVT ScalarVT = getTLI()->getValueType(DL, FirstRetTy).getScalarType();
+      RTLIB::Libcall LC = ICA.getID() == Intrinsic::sincos
+                              ? RTLIB::getSINCOS(ScalarVT)
+                              : RTLIB::getSINCOSPI(ScalarVT);
+      if (!getMultipleResultIntrinsicVectorLibCallDesc(ICA, LC))
+        BaseCost = getCallScalarizationCost(FixedVTy->getElementCount());
+    }
+    break;
+  }
   default:
     break;
   }
-  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+  return BaseCost + BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
 /// The function will remove redundant reinterprets casting in the presence
@@ -4045,6 +4091,12 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
   return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
 }
 
+InstructionCost
+AArch64TTIImpl::getCallScalarizationOverhead(CallInst *CI,
+                                             ElementCount VF) const {
+  return getCallScalarizationCost(VF);
+}
+
 std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
     Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
     TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index fe2e849258e3f..aadd3c28d7b65 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -479,6 +479,9 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                            TTI::TargetCostKind CostKind,
                                            bool ForPoisonSrc = true,
                                            ArrayRef<Value *> VL = {}) const override;
+  InstructionCost getCallScalarizationOverhead(CallInst *CI,
+                                               ElementCount VF) const override;
+
   /// Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c04b5cb10eac2..7d4e98b3be746 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5775,6 +5775,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
       InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
+      ScalarizationCost += TTI.getCallScalarizationOverhead(CI, VF);
       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
     } else {
       // There is no point attempting to calculate the scalar cost for a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bf51489543098..93069536416bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3159,6 +3159,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                          /*Extract=*/false, Ctx.CostKind);
       }
     }
+    ScalarizationCost +=
+        Ctx.TTI.getCallScalarizationOverhead(cast<CallInst>(UI), VF);
 
     // Skip operands that do not require extraction/scalarization and do not
     // incur any overhead.
     SmallPtrSet<const VPValue *, 4> UniqueOperands;
diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
index 32408acb582d0..f11c4c84eeb45 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos"
 ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -call-scalarization-cost-multiplier=1 -passes="print<cost-model>" \
+; RUN:   -cost-kind=throughput 2>&1 -disable-output | FileCheck --check-prefix=CHECK-LOW-SCALARIZATION-COST %s
 ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB
 
 define void @sincos() {
@@ -8,31 +10,43 @@ define void @sincos() {
 ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
 ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
 ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
-;
-; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
-; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
-; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
-; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
-;
+; CHECK: Cost Model: Found an estimated cost of 116 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK: Cost Model: Found an estimated cost of 92 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK: Cost Model: Found an estimated cost of 44 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK: Cost Model: Found an estimated cost of 184 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
 ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
 ;
+; CHECK-LOW-SCALARIZATION-COST-LABEL: 'sincos'
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 44 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 26 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 11 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
+; CHECK-LOW-SCALARIZATION-COST: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
+;
 ; CHECK-VECLIB-LABEL: 'sincos'
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
-;
-; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 116 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
-; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
-; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
-;
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 20 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 184 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
 ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
 ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
index 544ef5c82c7ac..1dda7c2826b67 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
@@ -1,21 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|modf|extractvalue|store)" --version 5
 ; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve < %s -S -o - -debug-only=loop-vectorize 2>%t.1 | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve -vector-library=ArmPL < %s -S -o - -debug-only=loop-vectorize 2>%t.2 | FileCheck %s --check-prefix=CHECK-ARMPL
+; RUN: opt -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve < %s -S -o - -debug-only=loop-vectorize 2>%t.2 | FileCheck %s --check-prefix=CHECK-ARMPL
 ; RUN: FileCheck --input-file=%t.1 --check-prefix=CHECK-COST %s
 ; RUN: FileCheck --input-file=%t.2 --check-prefix=CHECK-COST-ARMPL %s
 ; REQUIRES: asserts
 
 ; CHECK-COST-LABEL: sincos_f32
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
-; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 
 ; CHECK-COST-ARMPL-LABEL: sincos_f32
 ; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
-; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
@@ -27,11 +27,16 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
 ; CHECK: [[ENTRY:.*:]]
 ; CHECK: [[VECTOR_PH:.*:]]
 ; CHECK: [[VECTOR_BODY:.*:]]
-; CHECK: [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
-; CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0
-; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
-; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4
-; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4
+; CHECK: [[TMP5:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[TMP3:%.*]])
+; CHECK: [[TMP6:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[TMP4:%.*]])
+; CHECK: [[TMP7:%.*]] = extractvalue { float, float } [[TMP5]], 0
+; CHECK: [[TMP8:%.*]] = extractvalue { float, float } [[TMP6]], 0
+; CHECK: [[TMP9:%.*]] = extractvalue { float, float } [[TMP5]], 1
+; CHECK: [[TMP10:%.*]] = extractvalue { float, float } [[TMP6]], 1
+; CHECK: store float [[TMP7]], ptr [[TMP11:%.*]], align 4
+; CHECK: store float [[TMP8]], ptr [[TMP12:%.*]], align 4
+; CHECK: store float [[TMP9]], ptr [[TMP13:%.*]], align 4
+; CHECK: store float [[TMP10]], ptr [[TMP14:%.*]], align 4
 ; CHECK: [[MIDDLE_BLOCK:.*:]]
 ; CHECK: [[SCALAR_PH:.*:]]
 ; CHECK: [[FOR_BODY:.*:]]
@@ -91,7 +96,7 @@ exit:
 
 ; CHECK-COST-LABEL: sincos_f64
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
@@ -107,11 +112,16 @@ define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
 ; CHECK: [[ENTRY:.*:]]
 ; CHECK: [[VECTOR_PH:.*:]]
 ; CHECK: [[VECTOR_BODY:.*:]]
-; CHECK: [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
-; CHECK: [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0
-; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1
-; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8
-; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8
+; CHECK: [[TMP5:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[TMP3:%.*]])
+; CHECK: [[TMP6:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[TMP4:%.*]])
+; CHECK: [[TMP7:%.*]] = extractvalue { double, double } [[TMP5]], 0
+; CHECK: [[TMP8:%.*]] = extractvalue { double, double } [[TMP6]], 0
+; CHECK: [[TMP9:%.*]] = extractvalue { double, double } [[TMP5]], 1
+; CHECK: [[TMP10:%.*]] = extractvalue { double, double } [[TMP6]], 1
+; CHECK: store double [[TMP7]], ptr [[TMP11:%.*]], align 8
+; CHECK: store double [[TMP8]], ptr [[TMP12:%.*]], align 8
+; CHECK: store double [[TMP9]], ptr [[TMP13:%.*]], align 8
+; CHECK: store double [[TMP10]], ptr [[TMP14:%.*]], align 8
 ; CHECK: [[MIDDLE_BLOCK:.*:]]
 ; CHECK: [[SCALAR_PH:.*:]]
 ; CHECK: [[FOR_BODY:.*:]]
@@ -171,15 +181,15 @@ exit:
 
 ; CHECK-COST-LABEL: predicated_sincos
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
-; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 
 ; CHECK-COST-ARMPL-LABEL: predicated_sincos
 ; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
-; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
@@ -416,15 +426,15 @@ exit:
 
 ; CHECK-COST-LABEL: sincospi_f32
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincospi.f32(float %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
-; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 
 ; CHECK-COST-ARMPL-LABEL: sincospi_f32
 ; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincospi.f32(float %in_val)
-; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
@@ -436,11 +446,16 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
 ; CHECK: [[ENTRY:.*:]]
 ; CHECK: [[VECTOR_PH:.*:]]
 ; CHECK: [[VECTOR_BODY:.*:]]
-; CHECK: [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
-; CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0
-; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
-; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4
-; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4
+; CHECK: [[TMP5:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[TMP3:%.*]])
+; CHECK: [[TMP6:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[TMP4:%.*]])
+; CHECK: [[TMP7:%.*]] = extractvalue { float, float } [[TMP5]], 0
+; CHECK: [[TMP8:%.*]] = extractvalue { float, float } [[TMP6]], 0
+; CHECK: [[TMP9:%.*]] = extractvalue { float, float } [[TMP5]], 1
+; CHECK: [[TMP10:%.*]] = extractvalue { float, float } [[TMP6]], 1
+; CHECK: store float [[TMP7]], ptr [[TMP11:%.*]], align 4
+; CHECK: store float [[TMP8]], ptr [[TMP12:%.*]], align 4
+; CHECK: store float [[TMP9]], ptr [[TMP13:%.*]], align 4
+; CHECK: store float [[TMP10]], ptr [[TMP14:%.*]], align 4
 ; CHECK: [[MIDDLE_BLOCK:.*:]]
 ; CHECK: [[SCALAR_PH:.*:]]
 ; CHECK: [[FOR_BODY:.*:]]
@@ -500,7 +515,7 @@ exit:
 
 ; CHECK-COST-LABEL: sincospi_f64
 ; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { double, double } @llvm.sincospi.f64(double %in_val)
-; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
 ; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincospi(ir<%in_val>)
@@ -516,11 +531,16 @@ define void @sincospi_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
 ; CHECK: [[ENTRY:.*:]]
 ; CHECK: [[VECTOR_PH:.*:]]
 ; CHECK: [[VECTOR_BODY:.*:]]
-; CHECK: [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
-; CHECK: [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0
-; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1
-; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8
-; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8
+; CHECK: [[TMP5:%.*]] = tail call { double, double } @llvm.sincospi.f64(double [[TMP3:%.*]])
+; CHECK: [[TMP6:%.*]] = tail call { double, double } @llvm.sincospi.f64(double [[TMP4:%.*]])
+; CHECK: [[TMP7:%.*]] = extractvalue { double, double } [[TMP5]], 0
+; CHECK: [[TMP8:%.*]] = extractvalue { double, double } [[TMP6]], 0
+; CHECK: [[TMP9:%.*]] = extractvalue { double, double } [[TMP5]], 1
+; CHECK: [[TMP10:%.*]] = extractvalue { double, double } [[TMP6]], 1
+; CHECK: store double [[TMP7]], ptr [[TMP11:%.*]], align 8
+; CHECK: store double [[TMP8]], ptr [[TMP12:%.*]], align 8
+; CHECK: store double [[TMP9]], ptr [[TMP13:%.*]], align 8
+; CHECK: store double [[TMP10]], ptr [[TMP14:%.*]], align 8
 ; CHECK: [[MIDDLE_BLOCK:.*:]]
 ; CHECK: [[SCALAR_PH:.*:]]
 ; CHECK: [[FOR_BODY:.*:]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index dabff1beefb38..cca5b387cb7c9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=1 -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
-; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-NORMAL-COST
 ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
 ; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-target-instruction-cost=1 -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
-; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-COST1
 ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
 
 define void @vec_load(i64 %N, ptr nocapture %a, ptr nocapture readonly %b) {
@@ -105,9 +105,11 @@ for.end:
 ; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin
 ; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_no_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
-; CHECK: @vec_sin_no_mapping
-; CHECK: call fast <2 x float> @llvm.sin.v2f32
-; CHECK-NOT: @llvm.sin.v2f32
+; CHECK-COST1-NOT: @llvm.sin.v2f32
-; CHECK-NOT: @llvm.sin.v2f32
+; CHECK-COST1-NOT: = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 202 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
@@ -64,15 +64,15 @@ exit:
 
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1
 ;
-; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 202 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
@@ -118,15 +118,15 @@ exit:
 
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_a = extractvalue { half, half } %call, 0
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %extract_b = extractvalue { half, half } %call, 1
 ;
-; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 46 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 98 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
-; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 202 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
 ; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
index 9edd6ce53ec5d..35d1461f31642 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -46,41 +46,18 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ;
 ; NARROW-LABEL: @test_widen(
 ; NARROW-NEXT:  entry:
-; NARROW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NARROW:       vector.ph:
-; NARROW-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NARROW:       vector.body:
-; NARROW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NARROW-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
-; NARROW-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
-; NARROW-NEXT:    [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
-; NARROW-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
-; NARROW-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; NARROW-NEXT:    [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
-; NARROW-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
-; NARROW-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
-; NARROW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NARROW-NEXT:    store <2 x float> [[TMP7]], ptr [[TMP8]], align 4
-; NARROW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; NARROW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; NARROW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; NARROW:       middle.block:
-; NARROW-NEXT:    br label [[SCALAR_PH]]
-; NARROW:       scalar.ph:
-; NARROW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; NARROW-NEXT:    br label [[FOR_BODY:%.*]]
 ; NARROW:       for.body:
-; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NARROW-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NARROW-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDVARS_IV]]
 ; NARROW-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
 ; NARROW-NEXT:    [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
-; NARROW-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1]]
-; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1:[0-9]+]]
+; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDVARS_IV]]
 ; NARROW-NEXT:    store float [[CALL]], ptr [[ARRAYIDX]], align 4
 ; NARROW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NARROW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
-; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
 ; NARROW:       for.cond.cleanup:
 ; NARROW-NEXT:    ret void
 ;