diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d022623e40d7e..792a86ccb9ba21 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1542,6 +1542,14 @@ class BoUpSLP {
   getGatherCost(FixedVectorType *Ty,
                 const DenseSet<unsigned> &ShuffledIndices) const;
 
+  /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
+  /// tree entries.
+  /// \returns ShuffleKind, if gathered values can be represented as shuffles of
+  /// previous tree entries. \p Mask is filled with the shuffle mask.
+  Optional<TargetTransformInfo::ShuffleKind>
+  isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+                        SmallVectorImpl<const TreeEntry *> &Entries);
+
   /// \returns the scalarization cost for this list of values. Assuming that
   /// this subtree gets vectorized, we may need to extract the values from the
   /// roots. This method calculates the cost of extracting the values.
@@ -2540,6 +2548,17 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
   buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
 }
 
+static int findLaneForValue(ArrayRef<Value *> Scalars,
+                            ArrayRef<int> ReuseShuffleIndices, Value *V) {
+  unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
+  assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
+  if (!ReuseShuffleIndices.empty()) {
+    FoundLane = std::distance(ReuseShuffleIndices.begin(),
+                              find(ReuseShuffleIndices, FoundLane));
+  }
+  return FoundLane;
+}
+
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ExtraValueToDebugLocsMap &ExternallyUsedValues,
                         ArrayRef<Value *> UserIgnoreLst) {
@@ -2560,12 +2579,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
     // For each lane:
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
-      int FoundLane = Lane;
-      if (!Entry->ReuseShuffleIndices.empty()) {
-        FoundLane =
-            std::distance(Entry->ReuseShuffleIndices.begin(),
-                          llvm::find(Entry->ReuseShuffleIndices, FoundLane));
-      }
+      int FoundLane =
+          findLaneForValue(Entry->Scalars, Entry->ReuseShuffleIndices, Scalar);
 
       // Check if the scalar is externally used as an extra arg.
       auto ExtI = ExternallyUsedValues.find(Scalar);
@@ -3560,7 +3575,27 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
         return ReuseShuffleCost + Cost;
       }
     }
-    return ReuseShuffleCost + getGatherCost(VL);
+    InstructionCost GatherCost = 0;
+    SmallVector<int> Mask;
+    SmallVector<const TreeEntry *> Entries;
+    Optional<TargetTransformInfo::ShuffleKind> Shuffle =
+        isGatherShuffledEntry(E, Mask, Entries);
+    if (Shuffle.hasValue()) {
+      if (ShuffleVectorInst::isIdentityMask(Mask)) {
+        LLVM_DEBUG(
+            dbgs()
+            << "SLP: perfect diamond match for gather bundle that starts with "
+            << *VL.front() << ".\n");
+      } else {
+        LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
+                          << " entries for bundle that starts with "
+                          << *VL.front() << ".\n");
+        GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
+      }
+    } else {
+      GatherCost = getGatherCost(VL);
+    }
+    return ReuseShuffleCost + GatherCost;
   }
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize) &&
@@ -4216,6 +4251,80 @@ InstructionCost BoUpSLP::getTreeCost() {
   return Cost;
 }
 
+Optional<TargetTransformInfo::ShuffleKind>
+BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+                               SmallVectorImpl<const TreeEntry *> &Entries) {
+  auto *VLIt = find_if(VectorizableTree,
+                       [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+                         return EntryPtr.get() == TE;
+                       });
+  assert(VLIt != VectorizableTree.end() &&
+         "Gathered values should be in the tree.");
+  Mask.assign(TE->Scalars.size(), UndefMaskElem);
+  Entries.clear();
+  DenseMap<const TreeEntry *, int> Used;
+  int NumShuffles = 0;
+  for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
+    Value *V = TE->Scalars[I];
+    const TreeEntry *VTE = getTreeEntry(V);
+    if (!VTE) {
+      // Check if it is used in one of the gathered entries.
+      const auto *It =
+          find_if(make_range(VectorizableTree.begin(), VLIt),
+                  [V](const std::unique_ptr<TreeEntry> &EntryPtr) {
+                    return EntryPtr->State == TreeEntry::NeedToGather &&
+                           is_contained(EntryPtr->Scalars, V);
+                  });
+      if (It != VLIt)
+        VTE = It->get();
+    }
+    if (VTE) {
+      auto Res = Used.try_emplace(VTE, NumShuffles);
+      if (Res.second) {
+        Entries.push_back(VTE);
+        ++NumShuffles;
+        if (NumShuffles > 2)
+          return None;
+        if (NumShuffles == 2) {
+          unsigned FirstSz = Entries.front()->Scalars.size();
+          if (!Entries.front()->ReuseShuffleIndices.empty())
+            FirstSz = Entries.front()->ReuseShuffleIndices.size();
+          unsigned SecondSz = Entries.back()->Scalars.size();
+          if (!Entries.back()->ReuseShuffleIndices.empty())
+            SecondSz = Entries.back()->ReuseShuffleIndices.size();
+          if (FirstSz != SecondSz)
+            return None;
+        }
+      }
+      int FoundLane =
+          findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);
+      // Extra check required by isSingleSourceMaskImpl function (called by
+      // ShuffleVectorInst::isSingleSourceMask).
+      if (FoundLane >= E * 2)
+        return None;
+      unsigned Sz = VTE->Scalars.size();
+      if (!VTE->ReuseShuffleIndices.empty())
+        Sz = VTE->ReuseShuffleIndices.size();
+      Mask[I] = Res.first->second * Sz + FoundLane;
+      continue;
+    }
+    return None;
+  }
+  if (NumShuffles == 1) {
+    if (ShuffleVectorInst::isReverseMask(Mask))
+      return TargetTransformInfo::SK_Reverse;
+    return TargetTransformInfo::SK_PermuteSingleSrc;
+  }
+  if (NumShuffles == 2) {
+    if (ShuffleVectorInst::isSelectMask(Mask))
+      return TargetTransformInfo::SK_Select;
+    if (ShuffleVectorInst::isTransposeMask(Mask))
+      return TargetTransformInfo::SK_Transpose;
+    return TargetTransformInfo::SK_PermuteTwoSrc;
+  }
+  return None;
+}
+
 InstructionCost
 BoUpSLP::getGatherCost(FixedVectorType *Ty,
                        const DenseSet<unsigned> &ShuffledIndices) const {
@@ -4346,13 +4455,8 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
       // Add to our 'need-to-extract' list.
      if (TreeEntry *Entry = getTreeEntry(Val)) {
        // Find which lane we need to extract.
-        unsigned FoundLane = std::distance(Entry->Scalars.begin(),
-                                           find(Entry->Scalars, Val));
-        assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane");
-        if (!Entry->ReuseShuffleIndices.empty()) {
-          FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
-                                    find(Entry->ReuseShuffleIndices, FoundLane));
-        }
+        int FoundLane =
+            findLaneForValue(Entry->Scalars, Entry->ReuseShuffleIndices, Val);
         ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
       }
     }
@@ -4499,7 +4603,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   if (E->State == TreeEntry::NeedToGather) {
     setInsertPointAfterBundle(E);
-    Value *Vec = gather(E->Scalars);
+    Value *Vec;
+    SmallVector<int> Mask;
+    SmallVector<const TreeEntry *> Entries;
+    Optional<TargetTransformInfo::ShuffleKind> Shuffle =
+        isGatherShuffledEntry(E, Mask, Entries);
+    if (Shuffle.hasValue()) {
+      assert((Entries.size() == 1 || Entries.size() == 2) &&
+             "Expected shuffle of 1 or 2 entries.");
+      Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
+                                        Entries.back()->VectorizedValue, Mask);
+    } else {
+      Vec = gather(E->Scalars);
+    }
     if (NeedToShuffleReuses) {
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
       Vec = ShuffleBuilder.finalize(Vec);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index 57db62ace2063f..31c63d31f4df18 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"
 ; REMARK-LABEL: Function: gather_multiple_use
 ; REMARK:       Args:
 ; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
-; REMARK-NEXT:    - Cost: '-7'
+; REMARK-NEXT:    - Cost: '-16'
 ;
 ; REMARK-NOT: Function: gather_load
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
index 1a4cbb16b9a915..c57f83ecb18690 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
@@ -4,124 +4,52 @@ define i32 @bar() local_unnamed_addr {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD103:%.*]] = add nsw i32 undef, undef
-; CHECK-NEXT:    [[SUB104:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[ADD105:%.*]] = add nsw i32 undef, undef
-; CHECK-NEXT:    [[SUB106:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15
-; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
-; CHECK-NEXT:    [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535
-; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]]
-; CHECK-NEXT:    [[SHR_I64:%.*]] = lshr i32 [[ADD105]], 15
-; CHECK-NEXT:    [[AND_I65:%.*]] = and i32 [[SHR_I64]], 65537
-; CHECK-NEXT:    [[MUL_I66:%.*]] = mul nuw i32 [[AND_I65]], 65535
-; CHECK-NEXT:    [[ADD_I67:%.*]] = add i32 [[MUL_I66]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I68:%.*]] = xor i32 [[ADD_I67]], [[MUL_I66]]
-; CHECK-NEXT:    [[SHR_I69:%.*]] = lshr i32 [[SUB104]], 15
-; CHECK-NEXT:    [[AND_I70:%.*]] = and i32 [[SHR_I69]], 65537
-; CHECK-NEXT:    [[MUL_I71:%.*]] = mul nuw i32 [[AND_I70]], 65535
-; CHECK-NEXT:    [[ADD_I72:%.*]] = add i32 [[MUL_I71]], [[SUB104]]
-; CHECK-NEXT:    [[XOR_I73:%.*]] = xor i32 [[ADD_I72]], [[MUL_I71]]
-; CHECK-NEXT:    [[SHR_I74:%.*]] = lshr i32 [[SUB106]], 15
-; CHECK-NEXT:    [[AND_I75:%.*]] = and i32 [[SHR_I74]], 65537
-; CHECK-NEXT:    [[MUL_I76:%.*]] = mul nuw i32 [[AND_I75]], 65535
-; CHECK-NEXT:    [[ADD_I77:%.*]] = add i32 [[MUL_I76]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I78:%.*]] = xor i32 [[ADD_I77]], [[MUL_I76]]
-; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I68]], [[XOR_I]]
-; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I73]]
-; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I78]]
 ; CHECK-NEXT:    [[ADD78_1:%.*]] = add nsw i32 undef, undef
 ; CHECK-NEXT:    [[SUB86_1:%.*]] = sub nsw i32 undef, undef
 ; CHECK-NEXT:    [[ADD94_1:%.*]] = add nsw i32 undef, undef
 ; CHECK-NEXT:    [[SUB102_1:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]]
-; CHECK-NEXT:    [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]]
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]]
-; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15
-; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
-; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535
-; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
-; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]]
-; CHECK-NEXT:    [[SHR_I64_1:%.*]] = lshr i32 [[ADD105_1]], 15
-; CHECK-NEXT:    [[AND_I65_1:%.*]] = and i32 [[SHR_I64_1]], 65537
-; CHECK-NEXT:    [[MUL_I66_1:%.*]] = mul nuw i32 [[AND_I65_1]], 65535
-; CHECK-NEXT:    [[ADD_I67_1:%.*]] = add i32 [[MUL_I66_1]], [[ADD105_1]]
-; CHECK-NEXT:    [[XOR_I68_1:%.*]] = xor i32 [[ADD_I67_1]], [[MUL_I66_1]]
-; CHECK-NEXT:    [[SHR_I69_1:%.*]] = lshr i32 [[SUB104_1]], 15
-; CHECK-NEXT:    [[AND_I70_1:%.*]] = and i32 [[SHR_I69_1]], 65537
-; CHECK-NEXT:    [[MUL_I71_1:%.*]] = mul nuw i32 [[AND_I70_1]], 65535
-; CHECK-NEXT:    [[ADD_I72_1:%.*]] = add i32 [[MUL_I71_1]], [[SUB104_1]]
-; CHECK-NEXT:    [[XOR_I73_1:%.*]] = xor i32 [[ADD_I72_1]], [[MUL_I71_1]]
-; CHECK-NEXT:    [[SHR_I74_1:%.*]] = lshr i32 [[SUB106_1]], 15
-; CHECK-NEXT:    [[AND_I75_1:%.*]] = and i32 [[SHR_I74_1]], 65537
-; CHECK-NEXT:    [[MUL_I76_1:%.*]] = mul nuw i32 [[AND_I75_1]], 65535
-; CHECK-NEXT:    [[ADD_I77_1:%.*]] = add i32 [[MUL_I76_1]], [[SUB106_1]]
-; CHECK-NEXT:    [[XOR_I78_1:%.*]] = xor i32 [[ADD_I77_1]], [[MUL_I76_1]]
-; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I68_1]], [[ADD113]]
-; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
-; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I73_1]]
-; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I78_1]]
 ; CHECK-NEXT:    [[ADD78_2:%.*]] = add nsw i32 undef, undef
-; CHECK-NEXT:    [[ADD103_2:%.*]] = add nsw i32 undef, [[ADD78_2]]
-; CHECK-NEXT:    [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], undef
-; CHECK-NEXT:    [[ADD105_2:%.*]] = add nsw i32 undef, undef
-; CHECK-NEXT:    [[SUB106_2:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15
-; CHECK-NEXT:    [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
-; CHECK-NEXT:    [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535
-; CHECK-NEXT:    [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]]
-; CHECK-NEXT:    [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]]
-; CHECK-NEXT:    [[SHR_I64_2:%.*]] = lshr i32 [[ADD105_2]], 15
-; CHECK-NEXT:    [[AND_I65_2:%.*]] = and i32 [[SHR_I64_2]], 65537
-; CHECK-NEXT:    [[MUL_I66_2:%.*]] = mul nuw i32 [[AND_I65_2]], 65535
-; CHECK-NEXT:    [[ADD_I67_2:%.*]] = add i32 [[MUL_I66_2]], [[ADD105_2]]
-; CHECK-NEXT:    [[XOR_I68_2:%.*]] = xor i32 [[ADD_I67_2]], [[MUL_I66_2]]
-; CHECK-NEXT:    [[SHR_I69_2:%.*]] = lshr i32 [[SUB104_2]], 15
-; CHECK-NEXT:    [[AND_I70_2:%.*]] = and i32 [[SHR_I69_2]], 65537
-; CHECK-NEXT:    [[MUL_I71_2:%.*]] = mul nuw i32 [[AND_I70_2]], 65535
-; CHECK-NEXT:    [[ADD_I72_2:%.*]] = add i32 [[MUL_I71_2]], [[SUB104_2]]
-; CHECK-NEXT:    [[XOR_I73_2:%.*]] = xor i32 [[ADD_I72_2]], [[MUL_I71_2]]
-; CHECK-NEXT:    [[SHR_I74_2:%.*]] = lshr i32 [[SUB106_2]], 15
-; CHECK-NEXT:    [[AND_I75_2:%.*]] = and i32 [[SHR_I74_2]], 65537
-; CHECK-NEXT:    [[MUL_I76_2:%.*]] = mul nuw i32 [[AND_I75_2]], 65535
-; CHECK-NEXT:    [[ADD_I77_2:%.*]] = add i32 [[MUL_I76_2]], [[SUB106_2]]
-; CHECK-NEXT:    [[XOR_I78_2:%.*]] = xor i32 [[ADD_I77_2]], [[MUL_I76_2]]
-; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I68_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]]
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I73_2]]
-; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I78_2]]
 ; CHECK-NEXT:    [[SUB102_3:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[ADD103_3:%.*]] = add nsw i32 undef, undef
-; CHECK-NEXT:    [[SUB104_3:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], undef
-; CHECK-NEXT:    [[SUB106_3:%.*]] = sub nsw i32 undef, [[SUB102_3]]
-; CHECK-NEXT:    [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15
-; CHECK-NEXT:    [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537
-; CHECK-NEXT:    [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535
-; CHECK-NEXT:    [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]]
-; CHECK-NEXT:    [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]]
-; CHECK-NEXT:    [[SHR_I64_3:%.*]] = lshr i32 [[ADD105_3]], 15
-; CHECK-NEXT:    [[AND_I65_3:%.*]] = and i32 [[SHR_I64_3]], 65537
-; CHECK-NEXT:    [[MUL_I66_3:%.*]] = mul nuw i32 [[AND_I65_3]], 65535
-; CHECK-NEXT:    [[ADD_I67_3:%.*]] = add i32 [[MUL_I66_3]], [[ADD105_3]]
-; CHECK-NEXT:    [[XOR_I68_3:%.*]] = xor i32 [[ADD_I67_3]], [[MUL_I66_3]]
-; CHECK-NEXT:    [[SHR_I69_3:%.*]] = lshr i32 [[SUB104_3]], 15
-; CHECK-NEXT:    [[AND_I70_3:%.*]] = and i32 [[SHR_I69_3]], 65537
-; CHECK-NEXT:    [[MUL_I71_3:%.*]] = mul nuw i32 [[AND_I70_3]], 65535
-; CHECK-NEXT:    [[ADD_I72_3:%.*]] = add i32 [[MUL_I71_3]], [[SUB104_3]]
-; CHECK-NEXT:    [[XOR_I73_3:%.*]] = xor i32 [[ADD_I72_3]], [[MUL_I71_3]]
-; CHECK-NEXT:    [[SHR_I74_3:%.*]] = lshr i32 [[SUB106_3]], 15
-; CHECK-NEXT:    [[AND_I75_3:%.*]] = and i32 [[SHR_I74_3]], 65537
-; CHECK-NEXT:    [[MUL_I76_3:%.*]] = mul nuw i32 [[AND_I75_3]], 65535
-; CHECK-NEXT:    [[ADD_I77_3:%.*]] = add i32 [[MUL_I76_3]], [[SUB106_3]]
-; CHECK-NEXT:    [[XOR_I78_3:%.*]] = xor i32 [[ADD_I77_3]], [[MUL_I76_3]]
-; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I68_3]], [[ADD113_2]]
-; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]]
-; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I73_3]]
-; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I78_3]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_3]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 undef, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[SUB102_1]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 undef, i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 undef, i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 undef, i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 undef, i32 6
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 7
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD78_1]], i32 8
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB86_1]], i32 9
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 undef, i32 10
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[ADD78_2]], i32 11
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 undef, i32 12
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 undef, i32 13
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 undef, i32 14
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 undef, i32 15
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[SUB86_1]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 undef, i32 3
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 undef, i32 4
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 undef, i32 5
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 undef, i32 6
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[ADD78_1]], i32 7
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[ADD94_1]], i32 8
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[SUB102_1]], i32 9
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[ADD78_2]], i32 10
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 undef, i32 11
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 undef, i32 12
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 undef, i32 13
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 undef, i32 14
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[SUB102_3]], i32 15
+; CHECK-NEXT:    [[TMP30:%.*]] = add nsw <16 x i32> [[TMP15]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = sub nsw <16 x i32> [[TMP15]], [[TMP29]]
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i32> [[TMP30]], <16 x i32> [[TMP31]], <16 x i32>
+; CHECK-NEXT:    [[TMP33:%.*]] = lshr <16 x i32> [[TMP32]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP34:%.*]] = and <16 x i32> [[TMP33]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP35:%.*]] = mul nuw <16 x i32> [[TMP34]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP36:%.*]] = add <16 x i32> [[TMP35]], [[TMP32]]
+; CHECK-NEXT:    [[TMP37:%.*]] = xor <16 x i32> [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP37]])
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP38]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
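
Note on the core idea, since it is spread across three hunks above: isGatherShuffledEntry tries to express a would-be gather as a shuffle of at most two previously vectorized tree entries. Each scalar found at lane L of source K contributes mask element K * Width + L (the sources are conceptually concatenated, exactly like the two operands of a shufflevector), and an identity mask over a single source is the "perfect diamond match" that the cost model treats as free. Below is a standalone C++17 sketch of that mask construction; buildGatherShuffle and the toy Source alias are hypothetical names for illustration, not the LLVM API:

    // Standalone sketch (hypothetical names, not LLVM code) of the
    // mask-building scheme used by isGatherShuffledEntry.
    #include <cstdio>
    #include <optional>
    #include <string>
    #include <vector>

    constexpr int UndefMaskElem = -1;
    using Source = std::vector<std::string>; // stands in for TreeEntry::Scalars

    // Builds the combined shuffle mask, or returns std::nullopt when some
    // scalar is found in no source (the real code then falls back to a plain
    // gather). The real patch also bails out on more than two sources or on
    // two sources of different widths; equal widths are assumed here.
    std::optional<std::vector<int>>
    buildGatherShuffle(const Source &Gather, const std::vector<Source> &Sources) {
      std::vector<int> Mask(Gather.size(), UndefMaskElem);
      for (size_t I = 0; I < Gather.size(); ++I) {
        bool Found = false;
        for (size_t K = 0; K < Sources.size() && !Found; ++K) {
          const Source &S = Sources[K];
          for (size_t L = 0; L < S.size(); ++L) {
            if (S[L] != Gather[I])
              continue;
            // Lane L of source K; sources are concatenated, so the second
            // source's lanes start at the common source width.
            Mask[I] = static_cast<int>(K * S.size() + L);
            Found = true;
            break;
          }
        }
        if (!Found)
          return std::nullopt;
      }
      return Mask;
    }

    int main() {
      std::vector<Source> Srcs = {{"a", "b", "c", "d"}, {"e", "f", "g", "h"}};
      // {a,b,g,h} draws lanes 0/1 from the first source and 2/3 from the
      // second: the mask 0 1 6 7 is a two-source select (SK_Select above).
      if (auto M = buildGatherShuffle({"a", "b", "g", "h"}, Srcs)) {
        for (int E : *M)
          std::printf("%d ", E);
        std::printf("\n"); // prints: 0 1 6 7
      }
      return 0;
    }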
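
On the emission side (the vectorizeTree hunk), the insertelement chain produced by gather() is replaced with a single shufflevector over the matched entries' VectorizedValue; when only one entry matches, front() and back() are the same entry, so the same vector is passed as both operands. Hand-written IR to illustrate the two flavors (illustrative only, not taken from the tests above):

    ; %v holds {a,b,c,d} from an earlier entry; gathering {d,c,b,a} becomes a
    ; single shuffle whose mask isGatherShuffledEntry classifies as SK_Reverse.
    %rev = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>

    ; Lanes drawn from two same-width entries %v and %w form a two-source mask;
    ; this one picks each lane from one of the two inputs, i.e. SK_Select.
    %mix = shufflevector <4 x i32> %v, <4 x i32> %w, <4 x i32> <i32 0, i32 5, i32 2, i32 7>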