diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index a9b103b545762..b4b3736c36d57 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -865,40 +865,6 @@ class TargetTransformInfo { ///< a vector of the same type as the input vectors. }; - /// Kind of the reduction data. - enum ReductionKind { - RK_None, /// Not a reduction. - RK_Arithmetic, /// Binary reduction data. - RK_MinMax, /// Min/max reduction data. - RK_UnsignedMinMax, /// Unsigned min/max reduction data. - }; - - /// Contains opcode + LHS/RHS parts of the reduction operations. - struct ReductionData { - ReductionData() = delete; - ReductionData(ReductionKind Kind, unsigned Opcode, Value *LHS, Value *RHS) - : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) { - assert(Kind != RK_None && "expected binary or min/max reduction only."); - } - unsigned Opcode = 0; - Value *LHS = nullptr; - Value *RHS = nullptr; - ReductionKind Kind = RK_None; - bool hasSameData(ReductionData &RD) const { - return Kind == RD.Kind && Opcode == RD.Opcode; - } - }; - - static ReductionKind matchPairwiseReduction( - const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty); - - static ReductionKind matchVectorSplittingReduction( - const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty); - - static ReductionKind matchVectorReduction(const ExtractElementInst *ReduxRoot, - unsigned &Opcode, VectorType *&Ty, - bool &IsPairwise); - /// Additional information about an operand's possible values. enum OperandValueKind { OK_AnyValue, // Operand can have any value. @@ -1180,25 +1146,16 @@ class TargetTransformInfo { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, bool UseMaskForCond = false, bool UseMaskForGaps = false) const; - /// Calculate the cost of performing a vector reduction. + /// Calculate the cost of vector reduction intrinsics. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar - /// value using the operation denoted by \p Opcode. The form of the reduction - /// can either be a pairwise reduction or a reduction that splits the vector - /// at every reduction level. - /// - /// Pairwise: - /// (v0, v1, v2, v3) - /// ((v0+v1), (v2+v3), undef, undef) - /// Split: - /// (v0, v1, v2, v3) - /// ((v0+v2), (v1+v3), undef, undef) + /// value using the operation denoted by \p Opcode. InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, + unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; InstructionCost getMinMaxReductionCost( - VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, + VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; /// Calculate the cost of an extended reduction pattern, similar to @@ -1661,11 +1618,9 @@ class TargetTransformInfo::Concept { bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0; virtual InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwiseForm, TTI::TargetCostKind CostKind) = 0; virtual InstructionCost - getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwiseForm, bool IsUnsigned, + getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) = 0; virtual InstructionCost getExtendedAddReductionCost( bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, @@ -2164,17 +2119,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { } InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwiseForm, TTI::TargetCostKind CostKind) override { - return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm, - CostKind); + return Impl.getArithmeticReductionCost(Opcode, Ty, CostKind); } InstructionCost - getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwiseForm, bool IsUnsigned, + getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) override { - return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned, - CostKind); + return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); } InstructionCost getExtendedAddReductionCost( bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index eff4a7cdb85b3..6c7e59d088534 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -621,12 +621,12 @@ class TargetTransformInfoImplBase { return 0; } - InstructionCost getArithmeticReductionCost(unsigned, VectorType *, bool, + InstructionCost getArithmeticReductionCost(unsigned, VectorType *, TTI::TargetCostKind) const { return 1; } - InstructionCost getMinMaxReductionCost(VectorType *, VectorType *, bool, bool, + InstructionCost getMinMaxReductionCost(VectorType *, VectorType *, bool, TTI::TargetCostKind) const { return 1; } @@ -1112,26 +1112,6 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { if (CI) Idx = CI->getZExtValue(); - // Try to match a reduction (a series of shufflevector and vector ops - // followed by an extractelement). - unsigned RdxOpcode; - VectorType *RdxType; - bool IsPairwise; - switch (TTI::matchVectorReduction(EEI, RdxOpcode, RdxType, IsPairwise)) { - case TTI::RK_Arithmetic: - return TargetTTI->getArithmeticReductionCost(RdxOpcode, RdxType, - IsPairwise, CostKind); - case TTI::RK_MinMax: - return TargetTTI->getMinMaxReductionCost( - RdxType, cast(CmpInst::makeCmpResultType(RdxType)), - IsPairwise, /*IsUnsigned=*/false, CostKind); - case TTI::RK_UnsignedMinMax: - return TargetTTI->getMinMaxReductionCost( - RdxType, cast(CmpInst::makeCmpResultType(RdxType)), - IsPairwise, /*IsUnsigned=*/true, CostKind); - case TTI::RK_None: - break; - } return TargetTTI->getVectorInstrCost(Opcode, U->getOperand(0)->getType(), Idx); } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 35c439c12b201..66e0f5f2e7a97 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1654,33 +1654,26 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } case Intrinsic::vector_reduce_add: return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, - /*IsPairwiseForm=*/false, CostKind); case Intrinsic::vector_reduce_mul: return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, - /*IsPairwiseForm=*/false, CostKind); case Intrinsic::vector_reduce_and: return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, - /*IsPairwiseForm=*/false, CostKind); case Intrinsic::vector_reduce_or: return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, - /*IsPairwiseForm=*/false, CostKind); case Intrinsic::vector_reduce_xor: return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, - /*IsPairwiseForm=*/false, CostKind); case Intrinsic::vector_reduce_fadd: // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, - /*IsPairwiseForm=*/false, CostKind); case Intrinsic::vector_reduce_fmul: // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, - /*IsPairwiseForm=*/false, CostKind); case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: @@ -1688,13 +1681,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { case Intrinsic::vector_reduce_fmin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), - /*IsPairwiseForm=*/false, /*IsUnsigned=*/false, CostKind); case Intrinsic::vector_reduce_umax: case Intrinsic::vector_reduce_umin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), - /*IsPairwiseForm=*/false, /*IsUnsigned=*/true, CostKind); case Intrinsic::abs: case Intrinsic::smax: @@ -1998,9 +1989,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return 0; } - /// Try to calculate arithmetic and shuffle op costs for reduction operations. + /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. /// We're assuming that reduction operation are performing the following way: - /// 1. Non-pairwise reduction + /// /// %val1 = shufflevector %val, %undef, /// /// \----------------v-------------/ \----------v------------/ @@ -2016,25 +2007,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { /// n/4 elements 3*n/4 elements /// %red2 = op %red1, val2 - working with the vector of /// length n/2, the resulting vector has length n/4 etc. - /// 2. Pairwise reduction: - /// Everything is the same except for an additional shuffle operation which - /// is used to produce operands for pairwise kind of reductions. - /// %val1 = shufflevector %val, %undef, - /// - /// \-------------v----------/ \----------v------------/ - /// n/2 elements n/2 elements - /// %val2 = shufflevector %val, %undef, - /// - /// \-------------v----------/ \----------v------------/ - /// n/2 elements n/2 elements - /// %red1 = op %val1, val2 - /// Again, the operation is performed on vector, but the resulting - /// vector %red1 is vector. /// /// The cost model should take into account that the actual length of the /// vector is reduced on each iteration. InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwise, TTI::TargetCostKind CostKind) { Type *ScalarTy = Ty->getElementType(); unsigned NumVecElts = cast(Ty)->getNumElements(); @@ -2065,9 +2041,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { while (NumVecElts > MVTLen) { NumVecElts /= 2; VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); - // Assume the pairwise shuffles add a cost. - ShuffleCost += (IsPairwise + 1) * - thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, + ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); Ty = SubTy; @@ -2081,13 +2055,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // reduction operations are performed on the vectors with the same // architecture-dependent length. - // Non pairwise reductions need one shuffle per reduction level. Pairwise - // reductions need two shuffles on every level, but the last one. On that - // level one of the shuffles is <0, u, u, ...> which is identity. - unsigned NumShuffles = NumReduxLevels; - if (IsPairwise && NumReduxLevels >= 1) - NumShuffles += NumReduxLevels - 1; - ShuffleCost += NumShuffles * thisT()->getShuffleCost( + // By default reductions need one shuffle per reduction level. + ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty); return ShuffleCost + ArithCost + @@ -2097,7 +2066,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { /// Try to calculate op costs for min/max reduction operations. /// \param CondTy Conditional type for the Select instruction. InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwise, bool IsUnsigned, + bool IsUnsigned, TTI::TargetCostKind CostKind) { Type *ScalarTy = Ty->getElementType(); Type *ScalarCondTy = CondTy->getElementType(); @@ -2123,9 +2092,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); - // Assume the pairwise shuffles add a cost. - ShuffleCost += (IsPairwise + 1) * - thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, + ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); MinMaxCost += thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, @@ -2142,14 +2109,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // operations performed on the current platform. That's why several final // reduction opertions are perfomed on the vectors with the same // architecture-dependent length. - - // Non pairwise reductions need one shuffle per reduction level. Pairwise - // reductions need two shuffles on every level, but the last one. On that - // level one of the shuffles is <0, u, u, ...> which is identity. - unsigned NumShuffles = NumReduxLevels; - if (IsPairwise && NumReduxLevels >= 1) - NumShuffles += NumReduxLevels - 1; - ShuffleCost += NumShuffles * thisT()->getShuffleCost( + ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); MinMaxCost += NumReduxLevels * @@ -2169,8 +2129,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Without any native support, this is equivalent to the cost of // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) VectorType *ExtTy = VectorType::get(ResTy, Ty); - InstructionCost RedCost = thisT()->getArithmeticReductionCost( - Instruction::Add, ExtTy, false, CostKind); + InstructionCost RedCost = + thisT()->getArithmeticReductionCost(Instruction::Add, ExtTy, CostKind); InstructionCost MulCost = 0; InstructionCost ExtCost = thisT()->getCastInstrCost( IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3b89a9807863f..7e68b8f754ba5 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -894,19 +894,18 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const { } InstructionCost TargetTransformInfo::getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, - TTI::TargetCostKind CostKind) const { + unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const { InstructionCost Cost = - TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm, CostKind); + TTIImpl->getArithmeticReductionCost(Opcode, Ty, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } InstructionCost TargetTransformInfo::getMinMaxReductionCost( - VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, + VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) const { - InstructionCost Cost = TTIImpl->getMinMaxReductionCost( - Ty, CondTy, IsPairwiseForm, IsUnsigned, CostKind); + InstructionCost Cost = + TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } @@ -1057,291 +1056,6 @@ TargetTransformInfo::getInstructionLatency(const Instruction *I) const { return TTIImpl->getInstructionLatency(I); } -static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft, - unsigned Level) { - // We don't need a shuffle if we just want to have element 0 in position 0 of - // the vector. - if (!SI && Level == 0 && IsLeft) - return true; - else if (!SI) - return false; - - SmallVector Mask( - cast(SI->getType())->getNumElements(), -1); - - // Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether - // we look at the left or right side. - for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2) - Mask[i] = val; - - ArrayRef ActualMask = SI->getShuffleMask(); - return Mask == ActualMask; -} - -static Optional getReductionData(Instruction *I) { - Value *L, *R; - if (m_BinOp(m_Value(L), m_Value(R)).match(I)) - return TTI::ReductionData(TTI::RK_Arithmetic, I->getOpcode(), L, R); - if (auto *SI = dyn_cast(I)) { - if (m_SMin(m_Value(L), m_Value(R)).match(SI) || - m_SMax(m_Value(L), m_Value(R)).match(SI) || - m_OrdFMin(m_Value(L), m_Value(R)).match(SI) || - m_OrdFMax(m_Value(L), m_Value(R)).match(SI) || - m_UnordFMin(m_Value(L), m_Value(R)).match(SI) || - m_UnordFMax(m_Value(L), m_Value(R)).match(SI)) { - auto *CI = cast(SI->getCondition()); - return TTI::ReductionData(TTI::RK_MinMax, CI->getOpcode(), L, R); - } - if (m_UMin(m_Value(L), m_Value(R)).match(SI) || - m_UMax(m_Value(L), m_Value(R)).match(SI)) { - auto *CI = cast(SI->getCondition()); - return TTI::ReductionData(TTI::RK_UnsignedMinMax, CI->getOpcode(), L, R); - } - } - return llvm::None; -} - -static TTI::ReductionKind matchPairwiseReductionAtLevel(Instruction *I, - unsigned Level, - unsigned NumLevels) { - // Match one level of pairwise operations. - // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, - // <4 x i32> - // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, - // <4 x i32> - // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 - if (!I) - return TTI::RK_None; - - assert(I->getType()->isVectorTy() && "Expecting a vector type"); - - Optional RD = getReductionData(I); - if (!RD) - return TTI::RK_None; - - ShuffleVectorInst *LS = dyn_cast(RD->LHS); - if (!LS && Level) - return TTI::RK_None; - ShuffleVectorInst *RS = dyn_cast(RD->RHS); - if (!RS && Level) - return TTI::RK_None; - - // On level 0 we can omit one shufflevector instruction. - if (!Level && !RS && !LS) - return TTI::RK_None; - - // Shuffle inputs must match. - Value *NextLevelOpL = LS ? LS->getOperand(0) : nullptr; - Value *NextLevelOpR = RS ? RS->getOperand(0) : nullptr; - Value *NextLevelOp = nullptr; - if (NextLevelOpR && NextLevelOpL) { - // If we have two shuffles their operands must match. - if (NextLevelOpL != NextLevelOpR) - return TTI::RK_None; - - NextLevelOp = NextLevelOpL; - } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) { - // On the first level we can omit the shufflevector <0, undef,...>. So the - // input to the other shufflevector <1, undef> must match with one of the - // inputs to the current binary operation. - // Example: - // %NextLevelOpL = shufflevector %R, <1, undef ...> - // %BinOp = fadd %NextLevelOpL, %R - if (NextLevelOpL && NextLevelOpL != RD->RHS) - return TTI::RK_None; - else if (NextLevelOpR && NextLevelOpR != RD->LHS) - return TTI::RK_None; - - NextLevelOp = NextLevelOpL ? RD->RHS : RD->LHS; - } else - return TTI::RK_None; - - // Check that the next levels binary operation exists and matches with the - // current one. - if (Level + 1 != NumLevels) { - if (!isa(NextLevelOp)) - return TTI::RK_None; - Optional NextLevelRD = - getReductionData(cast(NextLevelOp)); - if (!NextLevelRD || !RD->hasSameData(*NextLevelRD)) - return TTI::RK_None; - } - - // Shuffle mask for pairwise operation must match. - if (matchPairwiseShuffleMask(LS, /*IsLeft=*/true, Level)) { - if (!matchPairwiseShuffleMask(RS, /*IsLeft=*/false, Level)) - return TTI::RK_None; - } else if (matchPairwiseShuffleMask(RS, /*IsLeft=*/true, Level)) { - if (!matchPairwiseShuffleMask(LS, /*IsLeft=*/false, Level)) - return TTI::RK_None; - } else { - return TTI::RK_None; - } - - if (++Level == NumLevels) - return RD->Kind; - - // Match next level. - return matchPairwiseReductionAtLevel(dyn_cast(NextLevelOp), Level, - NumLevels); -} - -TTI::ReductionKind TTI::matchPairwiseReduction( - const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty) { - if (!EnableReduxCost) - return TTI::RK_None; - - // Need to extract the first element. - ConstantInt *CI = dyn_cast(ReduxRoot->getOperand(1)); - unsigned Idx = ~0u; - if (CI) - Idx = CI->getZExtValue(); - if (Idx != 0) - return TTI::RK_None; - - auto *RdxStart = dyn_cast(ReduxRoot->getOperand(0)); - if (!RdxStart) - return TTI::RK_None; - Optional RD = getReductionData(RdxStart); - if (!RD) - return TTI::RK_None; - - auto *VecTy = cast(RdxStart->getType()); - unsigned NumVecElems = VecTy->getNumElements(); - if (!isPowerOf2_32(NumVecElems)) - return TTI::RK_None; - - // We look for a sequence of shuffle,shuffle,add triples like the following - // that builds a pairwise reduction tree. - // - // (X0, X1, X2, X3) - // (X0 + X1, X2 + X3, undef, undef) - // ((X0 + X1) + (X2 + X3), undef, undef, undef) - // - // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, - // <4 x i32> - // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, - // <4 x i32> - // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 - // %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, - // <4 x i32> - // %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, - // <4 x i32> - // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 - // %r = extractelement <4 x float> %bin.rdx8, i32 0 - if (matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)) == - TTI::RK_None) - return TTI::RK_None; - - Opcode = RD->Opcode; - Ty = VecTy; - - return RD->Kind; -} - -static std::pair -getShuffleAndOtherOprd(Value *L, Value *R) { - ShuffleVectorInst *S = nullptr; - - if ((S = dyn_cast(L))) - return std::make_pair(R, S); - - S = dyn_cast(R); - return std::make_pair(L, S); -} - -TTI::ReductionKind TTI::matchVectorSplittingReduction( - const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty) { - - if (!EnableReduxCost) - return TTI::RK_None; - - // Need to extract the first element. - ConstantInt *CI = dyn_cast(ReduxRoot->getOperand(1)); - unsigned Idx = ~0u; - if (CI) - Idx = CI->getZExtValue(); - if (Idx != 0) - return TTI::RK_None; - - auto *RdxStart = dyn_cast(ReduxRoot->getOperand(0)); - if (!RdxStart) - return TTI::RK_None; - Optional RD = getReductionData(RdxStart); - if (!RD) - return TTI::RK_None; - - auto *VecTy = cast(ReduxRoot->getOperand(0)->getType()); - unsigned NumVecElems = VecTy->getNumElements(); - if (!isPowerOf2_32(NumVecElems)) - return TTI::RK_None; - - // We look for a sequence of shuffles and adds like the following matching one - // fadd, shuffle vector pair at a time. - // - // %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, - // <4 x i32> - // %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf - // %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, - // <4 x i32> - // %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 - // %r = extractelement <4 x float> %bin.rdx8, i32 0 - - unsigned MaskStart = 1; - Instruction *RdxOp = RdxStart; - SmallVector ShuffleMask(NumVecElems, 0); - unsigned NumVecElemsRemain = NumVecElems; - while (NumVecElemsRemain - 1) { - // Check for the right reduction operation. - if (!RdxOp) - return TTI::RK_None; - Optional RDLevel = getReductionData(RdxOp); - if (!RDLevel || !RDLevel->hasSameData(*RD)) - return TTI::RK_None; - - Value *NextRdxOp; - ShuffleVectorInst *Shuffle; - std::tie(NextRdxOp, Shuffle) = - getShuffleAndOtherOprd(RDLevel->LHS, RDLevel->RHS); - - // Check the current reduction operation and the shuffle use the same value. - if (Shuffle == nullptr) - return TTI::RK_None; - if (Shuffle->getOperand(0) != NextRdxOp) - return TTI::RK_None; - - // Check that shuffle masks matches. - for (unsigned j = 0; j != MaskStart; ++j) - ShuffleMask[j] = MaskStart + j; - // Fill the rest of the mask with -1 for undef. - std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1); - - ArrayRef Mask = Shuffle->getShuffleMask(); - if (ShuffleMask != Mask) - return TTI::RK_None; - - RdxOp = dyn_cast(NextRdxOp); - NumVecElemsRemain /= 2; - MaskStart *= 2; - } - - Opcode = RD->Opcode; - Ty = VecTy; - return RD->Kind; -} - -TTI::ReductionKind -TTI::matchVectorReduction(const ExtractElementInst *Root, unsigned &Opcode, - VectorType *&Ty, bool &IsPairwise) { - TTI::ReductionKind RdxKind = matchVectorSplittingReduction(Root, Opcode, Ty); - if (RdxKind != TTI::ReductionKind::RK_None) { - IsPairwise = false; - return RdxKind; - } - IsPairwise = true; - return matchPairwiseReduction(Root, Opcode, Ty); -} - InstructionCost TargetTransformInfo::getInstructionThroughput(const Instruction *I) const { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 6005852f1710e..af34a5ea94867 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1759,11 +1759,10 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( InstructionCost AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwise, bool IsUnsigned, + bool IsUnsigned, TTI::TargetCostKind CostKind) { if (!isa(Ty)) - return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, - CostKind); + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); assert((isa(Ty) && isa(CondTy)) && "Both vector needs to be scalable"); @@ -1785,10 +1784,7 @@ AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, } InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( - unsigned Opcode, VectorType *ValTy, bool IsPairwise, - TTI::TargetCostKind CostKind) { - assert(!IsPairwise && "Cannot be pair wise to continue"); - + unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); InstructionCost LegalizationCost = 0; if (LT.first > 1) { @@ -1814,15 +1810,9 @@ InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( InstructionCost AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, TTI::TargetCostKind CostKind) { - if (isa(ValTy)) - return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm, - CostKind); - if (IsPairwiseForm) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); + return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -1894,8 +1884,7 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, } break; } - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind); } InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 83956fbbe05bd..1de650bd32200 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -158,12 +158,11 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned Index); InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwise, bool IsUnsigned, + bool IsUnsigned, TTI::TargetCostKind CostKind); InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, TTI::TargetCostKind CostKind); InstructionCost getSpliceCost(VectorType *Tp, int Index); @@ -306,7 +305,7 @@ class AArch64TTIImpl : public BasicTTIImplBase { ElementCount VF) const; InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, + unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 67886b9681aeb..f5a34ec944215 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -843,16 +843,13 @@ InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode, InstructionCost GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwise, TTI::TargetCostKind CostKind) { EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support // 16-bit types only). - if (IsPairwise || - !ST->hasVOP3PInsts() || - OrigTy.getScalarSizeInBits() != 16) - return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind); + if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) + return BaseT::getArithmeticReductionCost(Opcode, Ty, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getFullRateInstrCost(); @@ -860,17 +857,14 @@ GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, InstructionCost GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwise, bool IsUnsigned, + bool IsUnsigned, TTI::TargetCostKind CostKind) { EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support // 16-bit types only). - if (IsPairwise || - !ST->hasVOP3PInsts() || - OrigTy.getScalarSizeInBits() != 16) - return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, - CostKind); + if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getHalfRateInstrCost(CostKind); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index e4e816a7a74ef..c6cf31b35ceec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -212,13 +212,13 @@ class GCNTTIImpl final : public BasicTTIImplBase { int getInlinerVectorBonusPercent() { return 0; } InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, bool IsPairwise, + unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); InstructionCost getMinMaxReductionCost( - VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, + VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); }; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index f8951434ed8ce..762ed8e6666e6 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1594,13 +1594,11 @@ InstructionCost ARMTTIImpl::getGatherScatterOpCost( InstructionCost ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, TTI::TargetCostKind CostKind) { EVT ValVT = TLI->getValueType(DL, ValTy); int ISD = TLI->InstructionOpcodeToISD(Opcode); if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1612,8 +1610,7 @@ ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind); } InstructionCost diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index ae79832ed3167..989f01006cfea 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -257,7 +257,6 @@ class ARMTTIImpl : public BasicTTIImplBase { const Instruction *I = nullptr); InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, TTI::TargetCostKind CostKind); InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index ff8d2d57e8f18..1d802e04198a3 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3737,12 +3737,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, InstructionCost X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwise, TTI::TargetCostKind CostKind) { - // Just use the default implementation for pair reductions. - if (IsPairwise) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind); - // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. @@ -3813,7 +3808,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, TargetTransformInfo::CastContextHint::None, CostKind) + - getArithmeticReductionCost(Opcode, WideVecTy, IsPairwise, CostKind); + getArithmeticReductionCost(Opcode, WideVecTy, CostKind); } InstructionCost ArithmeticCost = 0; @@ -3909,8 +3904,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) return ArithmeticCost + Entry->Cost; - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind); } unsigned NumVecElts = ValVTy->getNumElements(); @@ -3919,8 +3913,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, // Special case power of 2 reductions where the scalar type isn't changed // by type legalization. if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind); InstructionCost ReductionCost = 0; @@ -4118,13 +4111,8 @@ InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, InstructionCost X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, - bool IsPairwise, bool IsUnsigned, + bool IsUnsigned, TTI::TargetCostKind CostKind) { - // Just use the default implementation for pair reductions. - if (IsPairwise) - return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned, - CostKind); - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -4240,8 +4228,7 @@ X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, // by type legalization. if (!isPowerOf2_32(ValVTy->getNumElements()) || ScalarSize != MTy.getScalarSizeInBits()) - return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned, - CostKind); + return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind); // Now handle reduction with the legal type, taking into account size changes // at each level. diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 6d95119992b62..ae6c4ad21140b 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -181,13 +181,13 @@ class X86TTIImpl : public BasicTTIImplBase { TTI::TargetCostKind CostKind); InstructionCost getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, + unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned); InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwiseForm, bool IsUnsigned, + bool IsUnsigned, TTI::TargetCostKind CostKind); InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e4fb3055f0410..13243f77dd45f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7176,8 +7176,8 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost( const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[cast(ReductionPhi)]; - InstructionCost BaseCost = TTI.getArithmeticReductionCost( - RdxDesc.getOpcode(), VectorTy, false, CostKind); + InstructionCost BaseCost = + TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy, CostKind); // Get the operand that was not the reduction chain and match it to one of the // patterns, returning the better cost if it is found. diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 01f6873388cd1..2ec32156a15f2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7903,17 +7903,15 @@ class HorizontalReduction { case RecurKind::FAdd: case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); - VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, - /*IsPairwiseForm=*/false); + VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy); ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); break; } case RecurKind::FMax: case RecurKind::FMin: { auto *VecCondTy = cast(CmpInst::makeCmpResultType(VectorTy)); - VectorCost = - TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*pairwise=*/false, /*unsigned=*/false); + VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, + /*unsigned=*/false); ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, @@ -7927,9 +7925,7 @@ class HorizontalReduction { auto *VecCondTy = cast(CmpInst::makeCmpResultType(VectorTy)); bool IsUnsigned = RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; - VectorCost = - TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*IsPairwiseForm=*/false, IsUnsigned); + VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned); ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll index 1812a074f86bb..f52a6a384a487 100644 --- a/llvm/test/Analysis/CostModel/X86/reduction.ll +++ b/llvm/test/Analysis/CostModel/X86/reduction.ll @@ -7,9 +7,7 @@ ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM -; Check that we recognize the tree starting at the extractelement as a -; reduction. -; NOTE: We're only really interested in the extractelement cost, which represents the entire reduction. +; These are old tests for matching reduction costs from extract elements - something that has now been removed. define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; SSE2-LABEL: 'reduction_cost_float' @@ -17,7 +15,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'reduction_cost_float' @@ -25,7 +23,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'reduction_cost_float' @@ -33,7 +31,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX-LABEL: 'reduction_cost_float' @@ -41,7 +39,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'reduction_cost_float' @@ -49,7 +47,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -69,7 +67,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'reduction_cost_int' @@ -79,7 +77,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX2-LABEL: 'reduction_cost_int' @@ -89,7 +87,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'reduction_cost_int' @@ -99,7 +97,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, @@ -127,7 +125,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -138,7 +136,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -149,7 +147,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -160,7 +158,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -171,7 +169,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -199,7 +197,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -210,7 +208,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -221,7 +219,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -232,7 +230,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -243,7 +241,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -270,7 +268,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -280,7 +278,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -290,7 +288,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -300,7 +298,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -310,7 +308,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -332,31 +330,31 @@ define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1 ; SSE2-LABEL: 'no_pairwise_reduction2double' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'no_pairwise_reduction2double' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'no_pairwise_reduction2double' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX-LABEL: 'no_pairwise_reduction2double' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SLM-LABEL: 'no_pairwise_reduction2double' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> @@ -372,7 +370,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4float' @@ -380,7 +378,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction4float' @@ -388,7 +386,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX-LABEL: 'no_pairwise_reduction4float' @@ -396,7 +394,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'no_pairwise_reduction4float' @@ -404,7 +402,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -422,7 +420,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4double' @@ -430,7 +428,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'no_pairwise_reduction4double' @@ -438,7 +436,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX1-LABEL: 'no_pairwise_reduction4double' @@ -446,7 +444,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX2-LABEL: 'no_pairwise_reduction4double' @@ -454,7 +452,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SLM-LABEL: 'no_pairwise_reduction4double' @@ -462,7 +460,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> @@ -482,7 +480,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8float' @@ -492,7 +490,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction8float' @@ -502,7 +500,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX1-LABEL: 'no_pairwise_reduction8float' @@ -512,7 +510,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX2-LABEL: 'no_pairwise_reduction8float' @@ -522,7 +520,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'no_pairwise_reduction8float' @@ -532,7 +530,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> @@ -547,22 +545,16 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { } define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { -; SSE-LABEL: 'no_pairwise_reduction2i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r -; -; AVX-LABEL: 'no_pairwise_reduction2i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; CHECK-LABEL: 'no_pairwise_reduction2i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'no_pairwise_reduction2i64' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> @@ -578,7 +570,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'no_pairwise_reduction4i32' @@ -586,7 +578,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> @@ -604,7 +596,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX1-LABEL: 'no_pairwise_reduction4i64' @@ -612,7 +604,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX2-LABEL: 'no_pairwise_reduction4i64' @@ -620,7 +612,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'no_pairwise_reduction4i64' @@ -628,7 +620,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> @@ -648,7 +640,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8i16' @@ -658,7 +650,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSE42-LABEL: 'no_pairwise_reduction8i16' @@ -668,7 +660,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; AVX-LABEL: 'no_pairwise_reduction8i16' @@ -678,7 +670,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SLM-LABEL: 'no_pairwise_reduction8i16' @@ -688,7 +680,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> @@ -710,7 +702,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'no_pairwise_reduction8i32' @@ -720,7 +712,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX2-LABEL: 'no_pairwise_reduction8i32' @@ -730,7 +722,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'no_pairwise_reduction8i32' @@ -740,7 +732,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> @@ -759,35 +751,35 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction2double' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction2double' ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX-LABEL: 'pairwise_reduction2double' ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SLM-LABEL: 'pairwise_reduction2double' ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> @@ -806,7 +798,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction4float' @@ -816,7 +808,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction4float' @@ -826,7 +818,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX-LABEL: 'pairwise_reduction4float' @@ -836,7 +828,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'pairwise_reduction4float' @@ -846,7 +838,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -868,7 +860,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction4double' @@ -878,7 +870,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction4double' @@ -888,7 +880,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX1-LABEL: 'pairwise_reduction4double' @@ -898,7 +890,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX2-LABEL: 'pairwise_reduction4double' @@ -908,7 +900,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SLM-LABEL: 'pairwise_reduction4double' @@ -918,7 +910,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> @@ -943,7 +935,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction8float' @@ -956,7 +948,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction8float' @@ -969,7 +961,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX1-LABEL: 'pairwise_reduction8float' @@ -982,7 +974,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX2-LABEL: 'pairwise_reduction8float' @@ -995,7 +987,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'pairwise_reduction8float' @@ -1008,7 +1000,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> @@ -1030,14 +1022,14 @@ define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction2i64' ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> @@ -1056,7 +1048,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction4i32' @@ -1066,7 +1058,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> @@ -1088,7 +1080,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX1-LABEL: 'pairwise_reduction4i64' @@ -1098,7 +1090,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX2-LABEL: 'pairwise_reduction4i64' @@ -1108,7 +1100,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction4i64' @@ -1118,7 +1110,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> @@ -1143,7 +1135,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSSE3-LABEL: 'pairwise_reduction8i16' @@ -1156,7 +1148,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSE42-LABEL: 'pairwise_reduction8i16' @@ -1169,7 +1161,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; AVX-LABEL: 'pairwise_reduction8i16' @@ -1182,7 +1174,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SLM-LABEL: 'pairwise_reduction8i16' @@ -1195,7 +1187,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> @@ -1223,7 +1215,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'pairwise_reduction8i32' @@ -1236,7 +1228,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX2-LABEL: 'pairwise_reduction8i32' @@ -1249,7 +1241,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction8i32' @@ -1262,7 +1254,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32>