From 29a470e3804ca216d4e76c88a38086eb61c200f9 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 14 Apr 2021 07:49:32 -0700 Subject: [PATCH] [COST]Improve cost model for shuffles in SLP. Introduced masks where they are not added and improved target dependent cost models to avoid returning of the incorrect cost results after adding masks. Differential Revision: https://reviews.llvm.org/D100486 --- llvm/include/llvm/Analysis/VectorUtils.h | 2 +- llvm/lib/Analysis/VectorUtils.cpp | 4 +- .../SelectionDAG/LegalizeVectorTypes.cpp | 2 +- .../lib/Target/X86/X86TargetTransformInfo.cpp | 51 ++++++ .../Transforms/Vectorize/SLPVectorizer.cpp | 35 ++-- llvm/test/Analysis/CostModel/X86/reduction.ll | 168 +++++++++--------- .../CostModel/X86/shuffle-single-src.ll | 100 +++++------ .../Transforms/SLPVectorizer/X86/PR39774.ll | 2 +- 8 files changed, 213 insertions(+), 151 deletions(-) diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 868781197eda0..d6cafd5d04b73 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -413,7 +413,7 @@ bool widenShuffleMaskElts(int Scale, ArrayRef Mask, void processShuffleMasks( ArrayRef Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref NoInputAction, - function_ref, unsigned)> SingleInputAction, + function_ref, unsigned, unsigned)> SingleInputAction, function_ref, unsigned, unsigned)> ManyInputsAction); /// Compute a map of integer instructions to their minimum legal type diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 6dace5abe29cd..5f8fa13a7c411 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -499,7 +499,7 @@ bool llvm::widenShuffleMaskElts(int Scale, ArrayRef Mask, void llvm::processShuffleMasks( ArrayRef Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref NoInputAction, - function_ref, unsigned)> SingleInputAction, + function_ref, unsigned, unsigned)> SingleInputAction, function_ref, unsigned, unsigned)> ManyInputsAction) { SmallVector>> Res(NumOfDestRegs); // Try to perform better estimation of the permutation. @@ -543,7 +543,7 @@ void llvm::processShuffleMasks( auto *It = find_if(Dest, [](ArrayRef Mask) { return !Mask.empty(); }); unsigned SrcReg = std::distance(Dest.begin(), It); - SingleInputAction(*It, SrcReg); + SingleInputAction(*It, SrcReg, I); break; } default: { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index a7e54817fb60d..e9cadc763533b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2523,7 +2523,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, /*NumOfUsedRegs=*/1, [&Output, &DAG = DAG, NewVT]() { Output = DAG.getUNDEF(NewVT); }, [&Output, &DAG = DAG, NewVT, &DL, &Inputs, - &BuildVector](ArrayRef Mask, unsigned Idx) { + &BuildVector](ArrayRef Mask, unsigned Idx, unsigned /*Unused*/) { if (Inputs[Idx]->getOpcode() == ISD::BUILD_VECTOR) Output = BuildVector(Inputs[Idx], Inputs[Idx], Mask); else diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index b018843f939e1..7ad530fa028af 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1224,6 +1224,57 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), LegalVT.getVectorNumElements()); + if (!Mask.empty() && NumOfDests.isValid()) { + // Try to perform better estimation of the permutation. + // 1. Split the source/destination vectors into real registers. + // 2. Do the mask analysis to identify which real registers are + // permuted. If more than 1 source registers are used for the + // destination register building, the cost for this destination register + // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one + // source register is used, build mask and calculate the cost as a cost + // of PermuteSingleSrc. + // Also, for the single register permute we try to identify if the + // destination register is just a copy of the source register or the + // copy of the previous destination register (the cost is + // TTI::TCC_Basic). If the source register is just reused, the cost for + // this operation is 0. + unsigned E = *NumOfDests.getValue(); + unsigned PrevSrcReg = 0; + ArrayRef PrevRegMask; + InstructionCost Cost = 0; + processShuffleMasks( + Mask, NumOfSrcs, E, E, []() {}, + [this, SingleOpTy, &PrevSrcReg, &PrevRegMask, + &Cost](ArrayRef RegMask, unsigned SrcReg, unsigned DestReg) { + if (!ShuffleVectorInst::isIdentityMask(RegMask)) { + // Check if the previous register can be just copied to the next + // one. + if (PrevRegMask.empty() || PrevSrcReg != SrcReg || + PrevRegMask != RegMask) + Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, + RegMask, 0, nullptr); + else + // Just a copy of previous destination register. + Cost += TTI::TCC_Basic; + return; + } + if (SrcReg != DestReg && + any_of(RegMask, [](int I) { return I != UndefMaskElem; })) { + // Just a copy of the source register. + Cost += TTI::TCC_Basic; + } + PrevSrcReg = SrcReg; + PrevRegMask = RegMask; + }, + [this, SingleOpTy, &Cost](ArrayRef RegMask, + unsigned /*Unused*/, + unsigned /*Unused*/) { + Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, + 0, nullptr); + }); + return Cost; + } + InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, None, 0, nullptr); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 00661d1e8923f..a60cea3cb4cee 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5059,6 +5059,7 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a shuffle // to extract the values into a vector register. + SmallVector RegMask(EltsPerVector, UndefMaskElem); for (auto *V : VL) { ++Idx; @@ -5068,6 +5069,7 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { + RegMask.assign(EltsPerVector, UndefMaskElem); AllConsecutive = true; continue; } @@ -5079,6 +5081,7 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, unsigned PrevIdx = *getExtractIndex(cast(VL[Idx - 1])); AllConsecutive &= PrevIdx + 1 == CurrentIdx && CurrentIdx % EltsPerVector == Idx % EltsPerVector; + RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } if (AllConsecutive) @@ -5093,7 +5096,7 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, // cost to extract the a vector with EltsPerVector elements. Cost += TTI.getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector)); + FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); } return Cost; } @@ -5880,16 +5883,21 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI::CastContextHint::None, CostKind); } - SmallVector Mask; - buildShuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); - }, - Mask); - CommonCost = - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask); + if (E->ReuseShuffleIndices.empty()) { + CommonCost = + TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy); + } else { + SmallVector Mask; + buildShuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); + } LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return CommonCost + VecCost - ScalarCost; } @@ -6278,7 +6286,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { Cost += SpillCost + ExtractCost; if (FirstUsers.size() == 1) { int Limit = ShuffleMask.front().size() * 2; - if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && + if (!all_of(ShuffleMask.front(), + [Limit](int Idx) { return Idx < Limit; }) || !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, @@ -6327,6 +6336,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { << "SLP: Current total cost = " << Cost << "\n"); Cost -= InsertCost; for (int I = 2, E = FirstUsers.size(); I < E; ++I) { + if (ShuffleMask[I].empty()) + continue; // Other elements - permutation of 2 vectors (the initial one and the // next Ith incoming vector). unsigned VF = ShuffleMask[I].size(); diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll index 979dd281addc0..63faab7c25ff0 100644 --- a/llvm/test/Analysis/CostModel/X86/reduction.ll +++ b/llvm/test/Analysis/CostModel/X86/reduction.ll @@ -61,11 +61,11 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; SSE-LABEL: 'reduction_cost_int' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -91,11 +91,11 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'reduction_cost_int' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -416,25 +416,25 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-LABEL: 'no_pairwise_reduction4double' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4double' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'no_pairwise_reduction4double' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r @@ -456,9 +456,9 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SLM-LABEL: 'no_pairwise_reduction4double' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r @@ -474,31 +474,31 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-LABEL: 'no_pairwise_reduction8float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction8float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r @@ -524,11 +524,11 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'no_pairwise_reduction8float' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r @@ -592,9 +592,9 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-LABEL: 'no_pairwise_reduction4i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r @@ -616,9 +616,9 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'no_pairwise_reduction4i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r @@ -696,11 +696,11 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-LABEL: 'no_pairwise_reduction8i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -726,11 +726,11 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'no_pairwise_reduction8i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -854,31 +854,31 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-LABEL: 'pairwise_reduction4double' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction4double' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction4double' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r @@ -904,11 +904,11 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SLM-LABEL: 'pairwise_reduction4double' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r @@ -926,40 +926,40 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-LABEL: 'pairwise_reduction8float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction8float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction8float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r @@ -991,14 +991,14 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'pairwise_reduction8float' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r @@ -1074,11 +1074,11 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-LABEL: 'pairwise_reduction4i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r @@ -1104,11 +1104,11 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction4i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r @@ -1206,14 +1206,14 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-LABEL: 'pairwise_reduction8i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -1245,14 +1245,14 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction8i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll index bb9d80a30f4d0..5dc0591c1e646 100644 --- a/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll @@ -21,29 +21,29 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> ; SSE-LABEL: 'test_vXf64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXf64' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf64' @@ -64,25 +64,25 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) ; SSE-LABEL: 'test_vXi64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXi64' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi64' @@ -101,29 +101,29 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; SSE-LABEL: 'test_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXf32' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf32' @@ -144,17 +144,17 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; SSE-LABEL: 'test_vXi32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXi32' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi32' @@ -162,15 +162,15 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi32' @@ -194,27 +194,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXi16' @@ -222,8 +222,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi16' @@ -231,8 +231,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi16' @@ -240,8 +240,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512F-LABEL: 'test_vXi16' @@ -250,7 +250,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'test_vXi16' @@ -286,8 +286,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi8' @@ -295,8 +295,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi8' @@ -304,8 +304,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXi8' @@ -314,7 +314,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi8' @@ -323,7 +323,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi8' @@ -332,7 +332,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512F-LABEL: 'test_vXi8' diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll index 7668747a75ace..44f1fb1dda352 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 | FileCheck %s --check-prefix=CHECK +; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 | FileCheck %s --check-prefix=CHECK ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 -slp-min-tree-size=6 | FileCheck %s --check-prefix=FORCE_REDUCTION define void @Test(i32) {