diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index df06e9cbfada3f..8d022623e40d7e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1542,14 +1542,6 @@ class BoUpSLP { getGatherCost(FixedVectorType *Ty, const DenseSet &ShuffledIndices) const; - /// Checks if the gathered \p VL can be represented as shuffle(s) of previous - /// tree entries. - /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Mask is filled with the shuffle mask. - Optional - isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, - SmallVectorImpl &Entries); - /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. @@ -2548,17 +2540,6 @@ void BoUpSLP::buildTree(ArrayRef Roots, buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); } -static int findLaneForValue(ArrayRef Scalars, - ArrayRef ReuseShuffleIndices, Value *V) { - unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); - assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); - if (!ReuseShuffleIndices.empty()) { - FoundLane = std::distance(ReuseShuffleIndices.begin(), - find(ReuseShuffleIndices, FoundLane)); - } - return FoundLane; -} - void BoUpSLP::buildTree(ArrayRef Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst) { @@ -2579,8 +2560,12 @@ void BoUpSLP::buildTree(ArrayRef Roots, // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - int FoundLane = - findLaneForValue(Entry->Scalars, Entry->ReuseShuffleIndices, Scalar); + int FoundLane = Lane; + if (!Entry->ReuseShuffleIndices.empty()) { + FoundLane = + std::distance(Entry->ReuseShuffleIndices.begin(), + llvm::find(Entry->ReuseShuffleIndices, FoundLane)); + } // Check if the scalar is externally used as an extra arg. auto ExtI = ExternallyUsedValues.find(Scalar); @@ -3575,27 +3560,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + Cost; } } - InstructionCost GatherCost = 0; - SmallVector Mask; - SmallVector Entries; - Optional Shuffle = - isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { - if (ShuffleVectorInst::isIdentityMask(Mask)) { - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); - } else { - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n"); - GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask); - } - } else { - GatherCost = getGatherCost(VL); - } - return ReuseShuffleCost + GatherCost; + return ReuseShuffleCost + getGatherCost(VL); } assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && @@ -4251,76 +4216,6 @@ InstructionCost BoUpSLP::getTreeCost() { return Cost; } -Optional -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, - SmallVectorImpl &Entries) { - auto *VLIt = find_if(VectorizableTree, - [TE](const std::unique_ptr &EntryPtr) { - return EntryPtr.get() == TE; - }); - assert(VLIt != VectorizableTree.end() && - "Gathered values should be in the tree."); - Mask.assign(TE->Scalars.size(), UndefMaskElem); - Entries.clear(); - DenseMap Used; - int NumShuffles = 0; - for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { - Value *V = TE->Scalars[I]; - const TreeEntry *VTE = getTreeEntry(V); - if (!VTE) { - // Check if it is used in one of the gathered entries. - const auto *It = - find_if(make_range(VectorizableTree.begin(), VLIt), - [V](const std::unique_ptr &EntryPtr) { - return EntryPtr->State == TreeEntry::NeedToGather && - is_contained(EntryPtr->Scalars, V); - }); - if (It != VLIt) - VTE = It->get(); - } - if (VTE) { - auto Res = Used.try_emplace(VTE, NumShuffles); - if (Res.second) { - Entries.push_back(VTE); - ++NumShuffles; - if (NumShuffles > 2) - return None; - if (NumShuffles == 2) { - unsigned FirstSz = Entries.front()->Scalars.size(); - if (!Entries.front()->ReuseShuffleIndices.empty()) - FirstSz = Entries.front()->ReuseShuffleIndices.size(); - unsigned SecondSz = Entries.back()->Scalars.size(); - if (!Entries.back()->ReuseShuffleIndices.empty()) - SecondSz = Entries.back()->ReuseShuffleIndices.size(); - if (FirstSz != SecondSz) - return None; - } - } - int FoundLane = - findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V); - unsigned Sz = VTE->Scalars.size(); - if (!VTE->ReuseShuffleIndices.empty()) - Sz = VTE->ReuseShuffleIndices.size(); - Mask[I] = Res.first->second * Sz + FoundLane; - continue; - } - return None; - } - if (NumShuffles == 1) { - if (ShuffleVectorInst::isReverseMask(Mask)) - return TargetTransformInfo::SK_Reverse; - return TargetTransformInfo::SK_PermuteSingleSrc; - } - if (NumShuffles == 2) { - if (ShuffleVectorInst::isSelectMask(Mask)) - return TargetTransformInfo::SK_Select; - if (ShuffleVectorInst::isTransposeMask(Mask)) - return TargetTransformInfo::SK_Transpose; - return TargetTransformInfo::SK_PermuteTwoSrc; - } - return None; -} - InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, const DenseSet &ShuffledIndices) const { @@ -4451,8 +4346,13 @@ Value *BoUpSLP::gather(ArrayRef VL) { // Add to our 'need-to-extract' list. if (TreeEntry *Entry = getTreeEntry(Val)) { // Find which lane we need to extract. - int FoundLane = - findLaneForValue(Entry->Scalars, Entry->ReuseShuffleIndices, Val); + unsigned FoundLane = std::distance(Entry->Scalars.begin(), + find(Entry->Scalars, Val)); + assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane"); + if (!Entry->ReuseShuffleIndices.empty()) { + FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), + find(Entry->ReuseShuffleIndices, FoundLane)); + } ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); } } @@ -4599,19 +4499,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); - Value *Vec; - SmallVector Mask; - SmallVector Entries; - Optional Shuffle = - isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - } else { - Vec = gather(E->Scalars); - } + Value *Vec = gather(E->Scalars); if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll index 31c63d31f4df18..57db62ace2063f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu" ; REMARK-LABEL: Function: gather_multiple_use ; REMARK: Args: ; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; REMARK-NEXT: - Cost: '-16' +; REMARK-NEXT: - Cost: '-7' ; ; REMARK-NOT: Function: gather_load diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll index c57f83ecb18690..1a4cbb16b9a915 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -4,52 +4,124 @@ define i32 @bar() local_unnamed_addr { ; CHECK-LABEL: @bar( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD103:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[SUB104:%.*]] = sub nsw i32 undef, undef +; CHECK-NEXT: [[ADD105:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[SUB106:%.*]] = sub nsw i32 undef, undef +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15 +; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 +; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535 +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] +; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]] +; CHECK-NEXT: [[SHR_I64:%.*]] = lshr i32 [[ADD105]], 15 +; CHECK-NEXT: [[AND_I65:%.*]] = and i32 [[SHR_I64]], 65537 +; CHECK-NEXT: [[MUL_I66:%.*]] = mul nuw i32 [[AND_I65]], 65535 +; CHECK-NEXT: [[ADD_I67:%.*]] = add i32 [[MUL_I66]], [[ADD105]] +; CHECK-NEXT: [[XOR_I68:%.*]] = xor i32 [[ADD_I67]], [[MUL_I66]] +; CHECK-NEXT: [[SHR_I69:%.*]] = lshr i32 [[SUB104]], 15 +; CHECK-NEXT: [[AND_I70:%.*]] = and i32 [[SHR_I69]], 65537 +; CHECK-NEXT: [[MUL_I71:%.*]] = mul nuw i32 [[AND_I70]], 65535 +; CHECK-NEXT: [[ADD_I72:%.*]] = add i32 [[MUL_I71]], [[SUB104]] +; CHECK-NEXT: [[XOR_I73:%.*]] = xor i32 [[ADD_I72]], [[MUL_I71]] +; CHECK-NEXT: [[SHR_I74:%.*]] = lshr i32 [[SUB106]], 15 +; CHECK-NEXT: [[AND_I75:%.*]] = and i32 [[SHR_I74]], 65537 +; CHECK-NEXT: [[MUL_I76:%.*]] = mul nuw i32 [[AND_I75]], 65535 +; CHECK-NEXT: [[ADD_I77:%.*]] = add i32 [[MUL_I76]], [[SUB106]] +; CHECK-NEXT: [[XOR_I78:%.*]] = xor i32 [[ADD_I77]], [[MUL_I76]] +; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I68]], [[XOR_I]] +; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I73]] +; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I78]] ; CHECK-NEXT: [[ADD78_1:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB86_1:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef +; CHECK-NEXT: [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]] +; CHECK-NEXT: [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]] +; CHECK-NEXT: [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]] +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15 +; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 +; CHECK-NEXT: [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535 +; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] +; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]] +; CHECK-NEXT: [[SHR_I64_1:%.*]] = lshr i32 [[ADD105_1]], 15 +; CHECK-NEXT: [[AND_I65_1:%.*]] = and i32 [[SHR_I64_1]], 65537 +; CHECK-NEXT: [[MUL_I66_1:%.*]] = mul nuw i32 [[AND_I65_1]], 65535 +; CHECK-NEXT: [[ADD_I67_1:%.*]] = add i32 [[MUL_I66_1]], [[ADD105_1]] +; CHECK-NEXT: [[XOR_I68_1:%.*]] = xor i32 [[ADD_I67_1]], [[MUL_I66_1]] +; CHECK-NEXT: [[SHR_I69_1:%.*]] = lshr i32 [[SUB104_1]], 15 +; CHECK-NEXT: [[AND_I70_1:%.*]] = and i32 [[SHR_I69_1]], 65537 +; CHECK-NEXT: [[MUL_I71_1:%.*]] = mul nuw i32 [[AND_I70_1]], 65535 +; CHECK-NEXT: [[ADD_I72_1:%.*]] = add i32 [[MUL_I71_1]], [[SUB104_1]] +; CHECK-NEXT: [[XOR_I73_1:%.*]] = xor i32 [[ADD_I72_1]], [[MUL_I71_1]] +; CHECK-NEXT: [[SHR_I74_1:%.*]] = lshr i32 [[SUB106_1]], 15 +; CHECK-NEXT: [[AND_I75_1:%.*]] = and i32 [[SHR_I74_1]], 65537 +; CHECK-NEXT: [[MUL_I76_1:%.*]] = mul nuw i32 [[AND_I75_1]], 65535 +; CHECK-NEXT: [[ADD_I77_1:%.*]] = add i32 [[MUL_I76_1]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I78_1:%.*]] = xor i32 [[ADD_I77_1]], [[MUL_I76_1]] +; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I68_1]], [[ADD113]] +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I73_1]] +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I78_1]] ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[ADD103_2:%.*]] = add nsw i32 undef, [[ADD78_2]] +; CHECK-NEXT: [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], undef +; CHECK-NEXT: [[ADD105_2:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[SUB106_2:%.*]] = sub nsw i32 undef, undef +; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15 +; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 +; CHECK-NEXT: [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535 +; CHECK-NEXT: [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] +; CHECK-NEXT: [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]] +; CHECK-NEXT: [[SHR_I64_2:%.*]] = lshr i32 [[ADD105_2]], 15 +; CHECK-NEXT: [[AND_I65_2:%.*]] = and i32 [[SHR_I64_2]], 65537 +; CHECK-NEXT: [[MUL_I66_2:%.*]] = mul nuw i32 [[AND_I65_2]], 65535 +; CHECK-NEXT: [[ADD_I67_2:%.*]] = add i32 [[MUL_I66_2]], [[ADD105_2]] +; CHECK-NEXT: [[XOR_I68_2:%.*]] = xor i32 [[ADD_I67_2]], [[MUL_I66_2]] +; CHECK-NEXT: [[SHR_I69_2:%.*]] = lshr i32 [[SUB104_2]], 15 +; CHECK-NEXT: [[AND_I70_2:%.*]] = and i32 [[SHR_I69_2]], 65537 +; CHECK-NEXT: [[MUL_I71_2:%.*]] = mul nuw i32 [[AND_I70_2]], 65535 +; CHECK-NEXT: [[ADD_I72_2:%.*]] = add i32 [[MUL_I71_2]], [[SUB104_2]] +; CHECK-NEXT: [[XOR_I73_2:%.*]] = xor i32 [[ADD_I72_2]], [[MUL_I71_2]] +; CHECK-NEXT: [[SHR_I74_2:%.*]] = lshr i32 [[SUB106_2]], 15 +; CHECK-NEXT: [[AND_I75_2:%.*]] = and i32 [[SHR_I74_2]], 65537 +; CHECK-NEXT: [[MUL_I76_2:%.*]] = mul nuw i32 [[AND_I75_2]], 65535 +; CHECK-NEXT: [[ADD_I77_2:%.*]] = add i32 [[MUL_I76_2]], [[SUB106_2]] +; CHECK-NEXT: [[XOR_I78_2:%.*]] = xor i32 [[ADD_I77_2]], [[MUL_I76_2]] +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I68_2]], [[ADD113_1]] +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]] +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I73_2]] +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I78_2]] ; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_3]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[SUB102_1]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 undef, i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 undef, i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 undef, i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 undef, i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 7 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD78_1]], i32 8 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB86_1]], i32 9 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 undef, i32 10 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[ADD78_2]], i32 11 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 undef, i32 12 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 undef, i32 13 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 undef, i32 14 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 undef, i32 15 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> , i32 [[SUB86_1]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 undef, i32 3 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 undef, i32 4 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 undef, i32 5 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 undef, i32 6 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[ADD78_1]], i32 7 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[ADD94_1]], i32 8 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[SUB102_1]], i32 9 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[ADD78_2]], i32 10 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 undef, i32 11 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 undef, i32 12 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 undef, i32 13 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 undef, i32 14 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[SUB102_3]], i32 15 -; CHECK-NEXT: [[TMP30:%.*]] = add nsw <16 x i32> [[TMP15]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = sub nsw <16 x i32> [[TMP15]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i32> [[TMP30]], <16 x i32> [[TMP31]], <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = lshr <16 x i32> [[TMP32]], -; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i32> [[TMP33]], -; CHECK-NEXT: [[TMP35:%.*]] = mul nuw <16 x i32> [[TMP34]], -; CHECK-NEXT: [[TMP36:%.*]] = add <16 x i32> [[TMP35]], [[TMP32]] -; CHECK-NEXT: [[TMP37:%.*]] = xor <16 x i32> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP37]]) -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP38]], 16 +; CHECK-NEXT: [[ADD103_3:%.*]] = add nsw i32 undef, undef +; CHECK-NEXT: [[SUB104_3:%.*]] = sub nsw i32 undef, undef +; CHECK-NEXT: [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], undef +; CHECK-NEXT: [[SUB106_3:%.*]] = sub nsw i32 undef, [[SUB102_3]] +; CHECK-NEXT: [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15 +; CHECK-NEXT: [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537 +; CHECK-NEXT: [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535 +; CHECK-NEXT: [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]] +; CHECK-NEXT: [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]] +; CHECK-NEXT: [[SHR_I64_3:%.*]] = lshr i32 [[ADD105_3]], 15 +; CHECK-NEXT: [[AND_I65_3:%.*]] = and i32 [[SHR_I64_3]], 65537 +; CHECK-NEXT: [[MUL_I66_3:%.*]] = mul nuw i32 [[AND_I65_3]], 65535 +; CHECK-NEXT: [[ADD_I67_3:%.*]] = add i32 [[MUL_I66_3]], [[ADD105_3]] +; CHECK-NEXT: [[XOR_I68_3:%.*]] = xor i32 [[ADD_I67_3]], [[MUL_I66_3]] +; CHECK-NEXT: [[SHR_I69_3:%.*]] = lshr i32 [[SUB104_3]], 15 +; CHECK-NEXT: [[AND_I70_3:%.*]] = and i32 [[SHR_I69_3]], 65537 +; CHECK-NEXT: [[MUL_I71_3:%.*]] = mul nuw i32 [[AND_I70_3]], 65535 +; CHECK-NEXT: [[ADD_I72_3:%.*]] = add i32 [[MUL_I71_3]], [[SUB104_3]] +; CHECK-NEXT: [[XOR_I73_3:%.*]] = xor i32 [[ADD_I72_3]], [[MUL_I71_3]] +; CHECK-NEXT: [[SHR_I74_3:%.*]] = lshr i32 [[SUB106_3]], 15 +; CHECK-NEXT: [[AND_I75_3:%.*]] = and i32 [[SHR_I74_3]], 65537 +; CHECK-NEXT: [[MUL_I76_3:%.*]] = mul nuw i32 [[AND_I75_3]], 65535 +; CHECK-NEXT: [[ADD_I77_3:%.*]] = add i32 [[MUL_I76_3]], [[SUB106_3]] +; CHECK-NEXT: [[XOR_I78_3:%.*]] = xor i32 [[ADD_I77_3]], [[MUL_I76_3]] +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I68_3]], [[ADD113_2]] +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]] +; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I73_3]] +; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I78_3]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]]