From b69c75d53f8d687a7c6d6d0b8e1239ddaaf43804 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 5 Jul 2022 09:29:11 +0200 Subject: [PATCH] Revert "[VectorCombine] Improve shuffle select shuffle-of-shuffles" This reverts commit 19a1e20b8a0f69da2a871eae6cbd03d1314ee02d. Clang crashes while linking bullet from llvm-test-suite in ReleaseLTO-g cmake configuration. --- .../Transforms/Vectorize/VectorCombine.cpp | 164 +++-------- .../VectorCombine/AArch64/select-shuffle.ll | 260 ++++++++---------- 2 files changed, 142 insertions(+), 282 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 319d40f04634e..90598937affcb 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1249,20 +1249,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || VT != Op0->getType()) return false; - auto *SVI0A = dyn_cast(Op0->getOperand(0)); - auto *SVI0B = dyn_cast(Op0->getOperand(1)); - auto *SVI1A = dyn_cast(Op1->getOperand(0)); - auto *SVI1B = dyn_cast(Op1->getOperand(1)); - SmallPtrSet InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); + auto *SVI0A = dyn_cast(Op0->getOperand(0)); + auto *SVI0B = dyn_cast(Op0->getOperand(1)); + auto *SVI1A = dyn_cast(Op1->getOperand(0)); + auto *SVI1B = dyn_cast(Op1->getOperand(1)); auto checkSVNonOpUses = [&](Instruction *I) { if (!I || I->getOperand(0)->getType() != VT) return true; - return any_of(I->users(), [&](User *U) { - return U != Op0 && U != Op1 && - !(isa(U) && - (InputShuffles.contains(cast(U)) || - isInstructionTriviallyDead(cast(U)))); - }); + return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; }); }; if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) @@ -1289,25 +1283,13 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (FromReduction && Shuffles.size() > 1) return false; - // Add any shuffle uses for the shuffles we have found, to include them in our - // cost calculations. - if (!FromReduction) { - for (ShuffleVectorInst *SV : Shuffles) { - for (auto U : SV->users()) { - ShuffleVectorInst *SSV = dyn_cast(U); - if (SSV && isa(SSV->getOperand(1))) - Shuffles.push_back(SSV); - } - } - } - // For each of the output shuffles, we try to sort all the first vector // elements to the beginning, followed by the second array elements at the // end. If the binops are legalized to smaller vectors, this may reduce total // number of binops. We compute the ReconstructMask mask needed to convert // back to the original lane order. - SmallVector> V1, V2; - SmallVector> OrigReconstructMasks; + SmallVector V1, V2; + SmallVector> ReconstructMasks; int MaxV1Elt = 0, MaxV2Elt = 0; unsigned NumElts = VT->getNumElements(); for (ShuffleVectorInst *SVN : Shuffles) { @@ -1318,13 +1300,6 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // case we need to commute the mask). Value *SVOp0 = SVN->getOperand(0); Value *SVOp1 = SVN->getOperand(1); - if (isa(SVOp1)) { - auto *SSV = cast(SVOp0); - SVOp0 = SSV->getOperand(0); - SVOp1 = SSV->getOperand(1); - for (unsigned I = 0, E = Mask.size(); I != E; I++) - Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]); - } if (SVOp0 == Op1 && SVOp1 == Op0) { std::swap(SVOp0, SVOp1); ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); @@ -1341,25 +1316,21 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { ReconstructMask.push_back(-1); } else if (Mask[I] < static_cast(NumElts)) { MaxV1Elt = std::max(MaxV1Elt, Mask[I]); - auto It = find_if(V1, [&](const std::pair &A) { - return Mask[I] == A.first; - }); + auto It = find(V1, Mask[I]); if (It != V1.end()) ReconstructMask.push_back(It - V1.begin()); else { ReconstructMask.push_back(V1.size()); - V1.emplace_back(Mask[I], V1.size()); + V1.push_back(Mask[I]); } } else { MaxV2Elt = std::max(MaxV2Elt, Mask[I] - NumElts); - auto It = find_if(V2, [&](const std::pair &A) { - return Mask[I] - static_cast(NumElts) == A.first; - }); + auto It = find(V2, Mask[I] - NumElts); if (It != V2.end()) ReconstructMask.push_back(NumElts + It - V2.begin()); else { ReconstructMask.push_back(NumElts + V2.size()); - V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size()); + V2.push_back(Mask[I] - NumElts); } } } @@ -1368,7 +1339,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // result. In-order can help simplify the shuffle away. if (FromReduction) sort(ReconstructMask); - OrigReconstructMasks.push_back(std::move(ReconstructMask)); + ReconstructMasks.push_back(ReconstructMask); } // If the Maximum element used from V1 and V2 are not larger than the new @@ -1380,68 +1351,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { MaxV2Elt == static_cast(V2.size()) - 1)) return false; - // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a - // shuffle of another shuffle, or not a shuffle (that is treated like a - // identity shuffle). - auto GetBaseMaskValue = [&](Instruction *I, int M) { - auto *SV = dyn_cast(I); - if (!SV) - return M; - if (isa(SV->getOperand(1))) - if (auto *SSV = dyn_cast(SV->getOperand(0))) - if (InputShuffles.contains(SSV)) - return SSV->getMaskValue(SV->getMaskValue(M)); - return SV->getMaskValue(M); - }; - - // Attempt to sort the inputs my ascending mask values to make simpler input - // shuffles and push complex shuffles down to the uses. We sort on the first - // of the two input shuffle orders, to try and get at least one input into a - // nice order. - auto SortBase = [&](Instruction *A, std::pair X, - std::pair Y) { - int MXA = GetBaseMaskValue(A, X.first); - int MYA = GetBaseMaskValue(A, Y.first); - return MXA < MYA; - }; - stable_sort(V1, [&](std::pair A, std::pair B) { - return SortBase(SVI0A, A, B); - }); - stable_sort(V2, [&](std::pair A, std::pair B) { - return SortBase(SVI1A, A, B); - }); - // Calculate our ReconstructMasks from the OrigReconstructMasks and the - // modified order of the input shuffles. - SmallVector> ReconstructMasks; - for (auto Mask : OrigReconstructMasks) { - SmallVector ReconstructMask; - for (int M : Mask) { - auto FindIndex = [](const SmallVector> &V, int M) { - auto It = find_if(V, [M](auto A) { return A.second == M; }); - assert(It != V.end() && "Expected all entries in Mask"); - return std::distance(V.begin(), It); - }; - if (M < 0) - ReconstructMask.push_back(-1); - else if (M < static_cast(NumElts)) { - ReconstructMask.push_back(FindIndex(V1, M)); - } else { - ReconstructMask.push_back(NumElts + FindIndex(V2, M)); - } - } - ReconstructMasks.push_back(std::move(ReconstructMask)); - } - // Calculate the masks needed for the new input shuffles, which get padded // with undef SmallVector V1A, V1B, V2A, V2B; for (unsigned I = 0; I < V1.size(); I++) { - V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); - V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); + V1A.push_back(SVI0A->getMaskValue(V1[I])); + V1B.push_back(SVI0B->getMaskValue(V1[I])); } for (unsigned I = 0; I < V2.size(); I++) { - V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); - V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); + V2A.push_back(SVI1A->getMaskValue(V2[I])); + V2B.push_back(SVI1B->getMaskValue(V2[I])); } while (V1A.size() < NumElts) { V1A.push_back(UndefMaskElem); @@ -1452,14 +1371,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { V2B.push_back(UndefMaskElem); } - auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { - auto *SV = dyn_cast(I); - if (!SV) - return C; - return C + TTI.getShuffleCost(isa(SV->getOperand(1)) - ? TTI::SK_PermuteSingleSrc - : TTI::SK_PermuteTwoSrc, - VT, SV->getShuffleMask()); + auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) { + return C + + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask()); }; auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef Mask) { return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); @@ -1472,6 +1386,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), InstructionCost(0), AddShuffleCost); + // This set helps us only cost each unique shuffle once. + SmallPtrSet InputShuffles( + {SVI0A, SVI0B, SVI1A, SVI1B}); CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), InstructionCost(0), AddShuffleCost); @@ -1491,35 +1408,22 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), InstructionCost(0), AddShuffleMaskCost); - LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); - LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore - << " vs CostAfter: " << CostAfter << "\n"); if (CostBefore <= CostAfter) return false; // The cost model has passed, create the new instructions. - auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { - auto *SV = dyn_cast(I); - if (!SV) - return I; - if (isa(SV->getOperand(1))) - if (auto *SSV = dyn_cast(SV->getOperand(0))) - if (InputShuffles.contains(SSV)) - return SSV->getOperand(Op); - return SV->getOperand(Op); - }; - Builder.SetInsertPoint(SVI0A->getNextNode()); - Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), - GetShuffleOperand(SVI0A, 1), V1A); - Builder.SetInsertPoint(SVI0B->getNextNode()); - Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), - GetShuffleOperand(SVI0B, 1), V1B); - Builder.SetInsertPoint(SVI1A->getNextNode()); - Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), - GetShuffleOperand(SVI1A, 1), V2A); - Builder.SetInsertPoint(SVI1B->getNextNode()); - Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), - GetShuffleOperand(SVI1B, 1), V2B); + Builder.SetInsertPoint(SVI0A); + Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0), + SVI0A->getOperand(1), V1A); + Builder.SetInsertPoint(SVI0B); + Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0), + SVI0B->getOperand(1), V1B); + Builder.SetInsertPoint(SVI1A); + Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0), + SVI1A->getOperand(1), V2A); + Builder.SetInsertPoint(SVI1B); + Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0), + SVI1B->getOperand(1), V2B); Builder.SetInsertPoint(Op0); Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), NSV0A, NSV0B); diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll index ffda0373e3ed5..d6eab5a3c9555 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll @@ -22,12 +22,12 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) { define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test1_reduce( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]]) ; CHECK-NEXT: ret i32 [[R]] @@ -130,13 +130,13 @@ define <16 x i32> @test2_1_ins(<16 x i32> %x1, <16 x i32> %x2) { define <16 x i32> @test2_2(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test2_2( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[S3]] ; %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> @@ -170,11 +170,13 @@ define <16 x i32> @test2_12(<16 x i32> %x, <16 x i32> %y) { define <16 x i32> @test3_1(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test3_1( -; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]] -; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[S3]] ; %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> @@ -187,13 +189,13 @@ define <16 x i32> @test3_1(<16 x i32> %x, <16 x i32> %y) { define <16 x i32> @test3_2(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test3_2( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[S3]] ; %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> @@ -228,12 +230,14 @@ define <16 x i32> @test3_12(<16 x i32> %x, <16 x i32> %y) { define <16 x i32> @test23(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test23( -; CHECK-NEXT: [[S10:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[S20:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> -; CHECK-NEXT: [[A0:%.*]] = add nsw <16 x i32> [[S10]], [[S20]] -; CHECK-NEXT: [[B0:%.*]] = sub nsw <16 x i32> [[S10]], [[S20]] -; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[A0]], <16 x i32> [[B0]], <16 x i32> -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[B0]], <16 x i32> [[A0]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]] ; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]] ; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> @@ -257,12 +261,12 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32> %y) { define <16 x i32> @testgood(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @testgood( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[S3]] ; @@ -274,48 +278,6 @@ define <16 x i32> @testgood(<16 x i32> %x, <16 x i32> %y) { ret <16 x i32> %s3 } -define <16 x i32> @test_shufshufin(<16 x i32> %x, <16 x i32> %y) { -; CHECK-LABEL: @test_shufshufin( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[S3]] -; - %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> - %s2 = shufflevector <16 x i32> %s1, <16 x i32> poison, <16 x i32> - %a = add nsw <16 x i32> %s1, %s2 - %b = sub nsw <16 x i32> %s1, %s2 - %s3 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %s3 -} - -define <16 x i32> @testshufshufout(<16 x i32> %x, <16 x i32> %y) { -; CHECK-LABEL: @testshufshufout( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[R:%.*]] = add nsw <16 x i32> [[S3]], [[S4]] -; CHECK-NEXT: ret <16 x i32> [[R]] -; - %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> - %s2 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> - %a = add nsw <16 x i32> %s1, %s2 - %b = sub nsw <16 x i32> %s1, %s2 - %s3 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - %s4 = shufflevector <16 x i32> %s3, <16 x i32> poison, <16 x i32> - %r = add nsw <16 x i32> %s3, %s4 - ret <16 x i32> %r -} - declare void @use(<16 x i32>) define <16 x i32> @test_extrashuffleuse(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: @test_extrashuffleuse( @@ -411,15 +373,15 @@ define void @test_31(ptr %src, ptr %dst) { define <16 x i32> @test_1651256324(<16 x i32> %l0, <16 x i32> %l1, <16 x i32> %l6, <16 x i32> %l7) { ; CHECK-LABEL: @test_1651256324( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[L0:%.*]], <16 x i32> [[L6:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[L1:%.*]], <16 x i32> [[L1]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[L1]], <16 x i32> [[L1]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[L0:%.*]], <16 x i32> [[L6:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[L1:%.*]], <16 x i32> [[L1]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[L1]], <16 x i32> [[L1]], <16 x i32> ; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[L7:%.*]], <16 x i32> [[L7]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[L6]], <16 x i32> [[L7]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[L6]], <16 x i32> [[L7]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i32> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = sub <16 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[T0:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[T1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[T0:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[T1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = xor <16 x i32> [[T0]], [[T1]] ; CHECK-NEXT: ret <16 x i32> [[R]] ; @@ -543,39 +505,36 @@ define dso_local i32 @full(i8* nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP48]], [[TMP56]] ; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], ; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP40]] -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> [[TMP59]], <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> [[TMP59]], <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = add nsw <16 x i32> [[TMP61]], [[TMP63]] -; CHECK-NEXT: [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP60]], [[TMP62]] -; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP66]], [[TMP68]] -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP76:%.*]] = add nsw <16 x i32> [[TMP73]], [[TMP75]] -; CHECK-NEXT: [[TMP77:%.*]] = sub nsw <16 x i32> [[TMP72]], [[TMP74]] -; CHECK-NEXT: [[TMP78:%.*]] = shufflevector <16 x i32> [[TMP76]], <16 x i32> [[TMP77]], <16 x i32> -; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <16 x i32> [[TMP76]], <16 x i32> [[TMP77]], <16 x i32> -; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <16 x i32> [[TMP76]], <16 x i32> [[TMP77]], <16 x i32> -; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <16 x i32> [[TMP76]], <16 x i32> [[TMP77]], <16 x i32> -; CHECK-NEXT: [[TMP82:%.*]] = add nsw <16 x i32> [[TMP79]], [[TMP81]] -; CHECK-NEXT: [[TMP83:%.*]] = sub nsw <16 x i32> [[TMP78]], [[TMP80]] -; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <16 x i32> [[TMP82]], <16 x i32> [[TMP83]], <16 x i32> -; CHECK-NEXT: [[TMP85:%.*]] = lshr <16 x i32> [[TMP84]], -; CHECK-NEXT: [[TMP86:%.*]] = and <16 x i32> [[TMP85]], -; CHECK-NEXT: [[TMP87:%.*]] = mul nuw <16 x i32> [[TMP86]], -; CHECK-NEXT: [[TMP88:%.*]] = add <16 x i32> [[TMP87]], [[TMP84]] -; CHECK-NEXT: [[TMP89:%.*]] = xor <16 x i32> [[TMP88]], [[TMP87]] -; CHECK-NEXT: [[TMP90:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP89]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP90]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP90]], 16 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = add nsw <16 x i32> [[TMP59]], [[REORDER]] +; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[REORDER]] +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP62]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP70]] +; CHECK-NEXT: [[TMP73:%.*]] = sub nsw <16 x i32> [[TMP69]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> +; CHECK-NEXT: [[TMP78:%.*]] = add nsw <16 x i32> [[TMP74]], [[TMP76]] +; CHECK-NEXT: [[TMP79:%.*]] = sub nsw <16 x i32> [[TMP75]], [[TMP77]] +; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <16 x i32> [[TMP78]], <16 x i32> [[TMP79]], <16 x i32> +; CHECK-NEXT: [[TMP81:%.*]] = lshr <16 x i32> [[TMP80]], +; CHECK-NEXT: [[TMP82:%.*]] = and <16 x i32> [[TMP81]], +; CHECK-NEXT: [[TMP83:%.*]] = mul nuw <16 x i32> [[TMP82]], +; CHECK-NEXT: [[TMP84:%.*]] = add <16 x i32> [[TMP83]], [[TMP80]] +; CHECK-NEXT: [[TMP85:%.*]] = xor <16 x i32> [[TMP84]], [[TMP83]] +; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP85]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP86]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP86]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] @@ -757,39 +716,36 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi ; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], ; CHECK-NEXT: [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]] -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = add nsw <16 x i32> [[TMP53]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP54]] -; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = add nsw <16 x i32> [[TMP65]], [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP66]] -; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP70]], [[TMP72]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]] -; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16 +; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[TMP51]], [[REORDER]] +; CHECK-NEXT: [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP51]], [[REORDER]] +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP56]] +; CHECK-NEXT: [[TMP59:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP57]] +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP62]] +; CHECK-NEXT: [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP61]], [[TMP63]] +; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP66]], [[TMP68]] +; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP69]] +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = lshr <16 x i32> [[TMP72]], +; CHECK-NEXT: [[TMP74:%.*]] = and <16 x i32> [[TMP73]], +; CHECK-NEXT: [[TMP75:%.*]] = mul nuw <16 x i32> [[TMP74]], +; CHECK-NEXT: [[TMP76:%.*]] = add <16 x i32> [[TMP75]], [[TMP72]] +; CHECK-NEXT: [[TMP77:%.*]] = xor <16 x i32> [[TMP76]], [[TMP75]] +; CHECK-NEXT: [[TMP78:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP77]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP78]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP78]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]]