From b69c75d53f8d687a7c6d6d0b8e1239ddaaf43804 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 5 Jul 2022 09:29:11 +0200
Subject: [PATCH] Revert "[VectorCombine] Improve shuffle select
 shuffle-of-shuffles"

This reverts commit 19a1e20b8a0f69da2a871eae6cbd03d1314ee02d.

Clang crashes while linking bullet from llvm-test-suite in the
ReleaseLTO-g CMake configuration.
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 164 +++--------
 .../VectorCombine/AArch64/select-shuffle.ll   | 260 ++++++++----------
 2 files changed, 142 insertions(+), 282 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 319d40f04634e..90598937affcb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1249,20 +1249,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
       VT != Op0->getType())
     return false;
-  auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
-  auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
-  auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
-  auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
-  SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
+  auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0));
+  auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1));
+  auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0));
+  auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1));
   auto checkSVNonOpUses = [&](Instruction *I) {
     if (!I || I->getOperand(0)->getType() != VT)
       return true;
-    return any_of(I->users(), [&](User *U) {
-      return U != Op0 && U != Op1 &&
-             !(isa<ShuffleVectorInst>(U) &&
-               (InputShuffles.contains(cast<Instruction>(U)) ||
-                isInstructionTriviallyDead(cast<Instruction>(U))));
-    });
+    return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; });
   };
   if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
       checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
@@ -1289,25 +1283,13 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   if (FromReduction && Shuffles.size() > 1)
     return false;
 
-  // Add any shuffle uses for the shuffles we have found, to include them in our
-  // cost calculations.
-  if (!FromReduction) {
-    for (ShuffleVectorInst *SV : Shuffles) {
-      for (auto U : SV->users()) {
-        ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
-        if (SSV && isa<UndefValue>(SSV->getOperand(1)))
-          Shuffles.push_back(SSV);
-      }
-    }
-  }
-
   // For each of the output shuffles, we try to sort all the first vector
   // elements to the beginning, followed by the second array elements at the
   // end. If the binops are legalized to smaller vectors, this may reduce total
   // number of binops. We compute the ReconstructMask mask needed to convert
   // back to the original lane order.
-  SmallVector<std::pair<int, int>> V1, V2;
-  SmallVector<SmallVector<int>> OrigReconstructMasks;
+  SmallVector<int> V1, V2;
+  SmallVector<SmallVector<int>> ReconstructMasks;
   int MaxV1Elt = 0, MaxV2Elt = 0;
   unsigned NumElts = VT->getNumElements();
   for (ShuffleVectorInst *SVN : Shuffles) {
@@ -1318,13 +1300,6 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
     // case we need to commute the mask).
     Value *SVOp0 = SVN->getOperand(0);
     Value *SVOp1 = SVN->getOperand(1);
-    if (isa<UndefValue>(SVOp1)) {
-      auto *SSV = cast<ShuffleVectorInst>(SVOp0);
-      SVOp0 = SSV->getOperand(0);
-      SVOp1 = SSV->getOperand(1);
-      for (unsigned I = 0, E = Mask.size(); I != E; I++)
-        Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]);
-    }
     if (SVOp0 == Op1 && SVOp1 == Op0) {
       std::swap(SVOp0, SVOp1);
       ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
@@ -1341,25 +1316,21 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
         ReconstructMask.push_back(-1);
       } else if (Mask[I] < static_cast<int>(NumElts)) {
         MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
-        auto It = find_if(V1, [&](const std::pair<int, int> &A) {
-          return Mask[I] == A.first;
-        });
+        auto It = find(V1, Mask[I]);
         if (It != V1.end())
           ReconstructMask.push_back(It - V1.begin());
         else {
           ReconstructMask.push_back(V1.size());
-          V1.emplace_back(Mask[I], V1.size());
+          V1.push_back(Mask[I]);
         }
       } else {
         MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
-        auto It = find_if(V2, [&](const std::pair<int, int> &A) {
-          return Mask[I] - static_cast<int>(NumElts) == A.first;
-        });
+        auto It = find(V2, Mask[I] - NumElts);
         if (It != V2.end())
           ReconstructMask.push_back(NumElts + It - V2.begin());
         else {
           ReconstructMask.push_back(NumElts + V2.size());
-          V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
+          V2.push_back(Mask[I] - NumElts);
         }
       }
     }
@@ -1368,7 +1339,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
     // result. In-order can help simplify the shuffle away.
     if (FromReduction)
       sort(ReconstructMask);
-    OrigReconstructMasks.push_back(std::move(ReconstructMask));
+    ReconstructMasks.push_back(ReconstructMask);
   }
 
   // If the Maximum element used from V1 and V2 are not larger than the new
@@ -1380,68 +1351,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
        MaxV2Elt == static_cast<int>(V2.size()) - 1))
     return false;
 
-  // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
-  // shuffle of another shuffle, or not a shuffle (that is treated like a
-  // identity shuffle).
-  auto GetBaseMaskValue = [&](Instruction *I, int M) {
-    auto *SV = dyn_cast<ShuffleVectorInst>(I);
-    if (!SV)
-      return M;
-    if (isa<UndefValue>(SV->getOperand(1)))
-      if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
-        if (InputShuffles.contains(SSV))
-          return SSV->getMaskValue(SV->getMaskValue(M));
-    return SV->getMaskValue(M);
-  };
-
-  // Attempt to sort the inputs my ascending mask values to make simpler input
-  // shuffles and push complex shuffles down to the uses. We sort on the first
-  // of the two input shuffle orders, to try and get at least one input into a
-  // nice order.
-  auto SortBase = [&](Instruction *A, std::pair<int, int> X,
-                      std::pair<int, int> Y) {
-    int MXA = GetBaseMaskValue(A, X.first);
-    int MYA = GetBaseMaskValue(A, Y.first);
-    return MXA < MYA;
-  };
-  stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
-    return SortBase(SVI0A, A, B);
-  });
-  stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
-    return SortBase(SVI1A, A, B);
-  });
-  // Calculate our ReconstructMasks from the OrigReconstructMasks and the
-  // modified order of the input shuffles.
-  SmallVector<SmallVector<int>> ReconstructMasks;
-  for (auto Mask : OrigReconstructMasks) {
-    SmallVector<int> ReconstructMask;
-    for (int M : Mask) {
-      auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
-        auto It = find_if(V, [M](auto A) { return A.second == M; });
-        assert(It != V.end() && "Expected all entries in Mask");
-        return std::distance(V.begin(), It);
-      };
-      if (M < 0)
-        ReconstructMask.push_back(-1);
-      else if (M < static_cast<int>(NumElts)) {
-        ReconstructMask.push_back(FindIndex(V1, M));
-      } else {
-        ReconstructMask.push_back(NumElts + FindIndex(V2, M));
-      }
-    }
-    ReconstructMasks.push_back(std::move(ReconstructMask));
-  }
-
   // Calculate the masks needed for the new input shuffles, which get padded
   // with undef
   SmallVector<int> V1A, V1B, V2A, V2B;
   for (unsigned I = 0; I < V1.size(); I++) {
-    V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
-    V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
+    V1A.push_back(SVI0A->getMaskValue(V1[I]));
+    V1B.push_back(SVI0B->getMaskValue(V1[I]));
   }
   for (unsigned I = 0; I < V2.size(); I++) {
-    V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
-    V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
+    V2A.push_back(SVI1A->getMaskValue(V2[I]));
+    V2B.push_back(SVI1B->getMaskValue(V2[I]));
   }
   while (V1A.size() < NumElts) {
     V1A.push_back(UndefMaskElem);
@@ -1452,14 +1371,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
     V2B.push_back(UndefMaskElem);
   }
 
-  auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
-    auto *SV = dyn_cast<ShuffleVectorInst>(I);
-    if (!SV)
-      return C;
-    return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
-                                      ? TTI::SK_PermuteSingleSrc
-                                      : TTI::SK_PermuteTwoSrc,
-                                  VT, SV->getShuffleMask());
+  auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) {
+    return C +
+           TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask());
   };
   auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
     return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
@@ -1472,6 +1386,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
       TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
   CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
                                 InstructionCost(0), AddShuffleCost);
+  // This set helps us only cost each unique shuffle once.
+  SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles(
+      {SVI0A, SVI0B, SVI1A, SVI1B});
   CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
                                 InstructionCost(0), AddShuffleCost);
 
@@ -1491,35 +1408,22 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
       std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
                       InstructionCost(0), AddShuffleMaskCost);
 
-  LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
-  LLVM_DEBUG(dbgs() << "  CostBefore: " << CostBefore
-                    << " vs CostAfter: " << CostAfter << "\n");
   if (CostBefore <= CostAfter)
     return false;
 
   // The cost model has passed, create the new instructions.
-  auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
-    auto *SV = dyn_cast<ShuffleVectorInst>(I);
-    if (!SV)
-      return I;
-    if (isa<UndefValue>(SV->getOperand(1)))
-      if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
-        if (InputShuffles.contains(SSV))
-          return SSV->getOperand(Op);
-    return SV->getOperand(Op);
-  };
-  Builder.SetInsertPoint(SVI0A->getNextNode());
-  Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
-                                             GetShuffleOperand(SVI0A, 1), V1A);
-  Builder.SetInsertPoint(SVI0B->getNextNode());
-  Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
-                                             GetShuffleOperand(SVI0B, 1), V1B);
-  Builder.SetInsertPoint(SVI1A->getNextNode());
-  Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
-                                             GetShuffleOperand(SVI1A, 1), V2A);
-  Builder.SetInsertPoint(SVI1B->getNextNode());
-  Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
-                                             GetShuffleOperand(SVI1B, 1), V2B);
+  Builder.SetInsertPoint(SVI0A);
+  Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0),
+                                             SVI0A->getOperand(1), V1A);
+  Builder.SetInsertPoint(SVI0B);
+  Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0),
+                                             SVI0B->getOperand(1), V1B);
+  Builder.SetInsertPoint(SVI1A);
+  Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0),
+                                             SVI1A->getOperand(1), V2A);
+  Builder.SetInsertPoint(SVI1B);
+  Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0),
+                                             SVI1B->getOperand(1), V2B);
   Builder.SetInsertPoint(Op0);
   Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
                                     NSV0A, NSV0B);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
index ffda0373e3ed5..d6eab5a3c9555 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
@@ -22,12 +22,12 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) {
 
 define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test1_reduce(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 6, i32 9, i32 12, i32 15, i32 21, i32 24, i32 27, i32 30, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 23, i32 26, i32 29, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 4, i32 7, i32 10, i32 13, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 31, i32 28, i32 25, i32 22, i32 5, i32 8, i32 11, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 23, i32 26, i32 29, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 21, i32 6, i32 24, i32 9, i32 27, i32 12, i32 30, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 31, i32 28, i32 25, i32 22, i32 5, i32 8, i32 11, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 4, i32 19, i32 7, i32 18, i32 10, i32 17, i32 13, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]])
 ; CHECK-NEXT:    ret i32 [[R]]
@@ -130,13 +130,13 @@ define <16 x i32> @test2_1_ins(<16 x i32> %x1, <16 x i32> %x2) {
 
 define <16 x i32> @test2_2(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test2_2(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 1, i32 12, i32 15, i32 18, i32 19, i32 22, i32 25, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 10, i32 11, i32 13, i32 14, i32 20, i32 21, i32 23, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 29, i32 26, i32 27, i32 30, i32 7, i32 4, i32 5, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 17, i32 28, i32 16, i32 31, i32 3, i32 6, i32 2, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 19, i32 18, i32 23, i32 22, i32 5, i32 4, i32 21, i32 7, i32 6, i32 20, i32 1, i32 0, i32 17, i32 3, i32 2, i32 16>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 21, i32 20, i32 24, i32 23, i32 11, i32 10, i32 14, i32 13, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 15, i32 12, i32 25, i32 22, i32 19, i32 18, i32 1, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 6, i32 3, i32 9, i32 2, i32 28, i32 17, i32 31, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 30, i32 27, i32 8, i32 5, i32 4, i32 7, i32 26, i32 29, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 20, i32 2, i32 3, i32 21, i32 4, i32 5, i32 22, i32 6, i32 7, i32 23>
 ; CHECK-NEXT:    ret <16 x i32> [[S3]]
 ;
   %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -170,11 +170,13 @@ define <16 x i32> @test2_12(<16 x i32> %x, <16 x i32> %y) {
 
 define <16 x i32> @test3_1(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test3_1(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 18, i32 3, i32 16, i32 1, i32 5, i32 9, i32 13, i32 20, i32 24, i32 28, i32 7, i32 11, i32 15, i32 22, i32 26, i32 30>
-; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 17, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 18, i32 3, i32 7, i32 11, i32 15, i32 22, i32 26, i32 30, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 16, i32 1, i32 5, i32 9, i32 13, i32 20, i32 24, i32 28, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <16 x i32> [[S3]]
 ;
   %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
@@ -187,13 +189,13 @@ define <16 x i32> @test3_1(<16 x i32> %x, <16 x i32> %y) {
 
 define <16 x i32> @test3_2(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test3_2(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 17, i32 21, i32 25, i32 29, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 19, i32 23, i32 27, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 17, i32 21, i32 1, i32 5, i32 4, i32 7, i32 6, i32 0, i32 3, i32 2, i32 20, i32 23, i32 22, i32 16, i32 19, i32 18>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 6, i32 23, i32 19, i32 31, i32 27, i32 2, i32 14, i32 10, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 4, i32 21, i32 17, i32 29, i32 25, i32 0, i32 12, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 20, i32 5, i32 1, i32 13, i32 9, i32 16, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 22, i32 7, i32 3, i32 15, i32 11, i32 18, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    ret <16 x i32> [[S3]]
 ;
   %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
@@ -228,12 +230,14 @@ define <16 x i32> @test3_12(<16 x i32> %x, <16 x i32> %y) {
 
 define <16 x i32> @test23(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test23(
-; CHECK-NEXT:    [[S10:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[S20:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 29, i32 26, i32 7, i32 4, i32 3, i32 6, i32 5, i32 2, i32 9, i32 8, i32 17, i32 28, i32 27, i32 16, i32 31, i32 30>
-; CHECK-NEXT:    [[A0:%.*]] = add nsw <16 x i32> [[S10]], [[S20]]
-; CHECK-NEXT:    [[B0:%.*]] = sub nsw <16 x i32> [[S10]], [[S20]]
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[A0]], <16 x i32> [[B0]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[B0]], <16 x i32> [[A0]], <16 x i32> <i32 18, i32 3, i32 16, i32 1, i32 5, i32 9, i32 13, i32 20, i32 24, i32 28, i32 7, i32 11, i32 15, i32 22, i32 26, i32 30>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 18, i32 0, i32 20, i32 24, i32 12, i32 22, i32 10, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 19, i32 1, i32 21, i32 25, i32 13, i32 23, i32 11, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 7, i32 29, i32 3, i32 9, i32 27, i32 5, i32 17, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 4, i32 26, i32 6, i32 8, i32 16, i32 2, i32 28, i32 30, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 1, i32 17, i32 0, i32 16, i32 21, i32 22, i32 23, i32 5, i32 6, i32 7, i32 18, i32 19, i32 20, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 18, i32 19, i32 20, i32 2, i32 3, i32 4, i32 21, i32 22, i32 23, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
 ; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -257,12 +261,12 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32> %y) {
 
 define <16 x i32> @testgood(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @testgood(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
 ; CHECK-NEXT:    ret <16 x i32> [[S3]]
 ;
@@ -274,48 +278,6 @@ define <16 x i32> @testgood(<16 x i32> %x, <16 x i32> %y) {
   ret <16 x i32> %s3
 }
 
-define <16 x i32> @test_shufshufin(<16 x i32> %x, <16 x i32> %y) {
-; CHECK-LABEL: @test_shufshufin(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-; CHECK-NEXT:    ret <16 x i32> [[S3]]
-;
-  %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-  %s2 = shufflevector <16 x i32> %s1, <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %a = add nsw <16 x i32> %s1, %s2
-  %b = sub nsw <16 x i32> %s1, %s2
-  %s3 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-  ret <16 x i32> %s3
-}
-
-define <16 x i32> @testshufshufout(<16 x i32> %x, <16 x i32> %y) {
-; CHECK-LABEL: @testshufshufout(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 18, i32 2, i32 19, i32 3, i32 20, i32 4, i32 21, i32 5, i32 22, i32 6, i32 23, i32 7>
-; CHECK-NEXT:    [[R:%.*]] = add nsw <16 x i32> [[S3]], [[S4]]
-; CHECK-NEXT:    ret <16 x i32> [[R]]
-;
-  %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-  %s2 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-  %a = add nsw <16 x i32> %s1, %s2
-  %b = sub nsw <16 x i32> %s1, %s2
-  %s3 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-  %s4 = shufflevector <16 x i32> %s3, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-  %r = add nsw <16 x i32> %s3, %s4
-  ret <16 x i32> %r
-}
-
 declare void @use(<16 x i32>)
 define <16 x i32> @test_extrashuffleuse(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test_extrashuffleuse(
@@ -411,15 +373,15 @@ define void @test_31(ptr %src, ptr %dst) {
 
 define <16 x i32> @test_1651256324(<16 x i32> %l0, <16 x i32> %l1, <16 x i32> %l6, <16 x i32> %l7) {
 ; CHECK-LABEL: @test_1651256324(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[L0:%.*]], <16 x i32> [[L6:%.*]], <16 x i32> <i32 0, i32 1, i32 3, i32 10, i32 10, i32 15, i32 20, i32 20, i32 30, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[L1:%.*]], <16 x i32> [[L1]], <16 x i32> <i32 0, i32 1, i32 4, i32 5, i32 7, i32 10, i32 11, i32 11, i32 12, i32 22, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[L1]], <16 x i32> [[L1]], <16 x i32> <i32 0, i32 1, i32 4, i32 5, i32 7, i32 10, i32 11, i32 11, i32 12, i32 22, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[L0:%.*]], <16 x i32> [[L6:%.*]], <16 x i32> <i32 30, i32 10, i32 1, i32 20, i32 10, i32 0, i32 20, i32 3, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[L1:%.*]], <16 x i32> [[L1]], <16 x i32> <i32 24, i32 1, i32 10, i32 0, i32 5, i32 7, i32 11, i32 11, i32 4, i32 12, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[L1]], <16 x i32> [[L1]], <16 x i32> <i32 24, i32 1, i32 10, i32 0, i32 5, i32 7, i32 11, i32 11, i32 4, i32 12, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[L7:%.*]], <16 x i32> [[L7]], <16 x i32> <i32 23, i32 20, i32 29, i32 25, i32 14, i32 21, i32 11, i32 9, i32 2, i32 7, i32 5, i32 15, i32 24, i32 30, i32 26, i32 5>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[L6]], <16 x i32> [[L7]], <16 x i32> <i32 29, i32 13, i32 30, i32 24, i32 0, i32 15, i32 15, i32 8, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[L6]], <16 x i32> [[L7]], <16 x i32> <i32 4, i32 24, i32 13, i32 15, i32 0, i32 29, i32 8, i32 30, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i32> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub <16 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[T0:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 17, i32 7, i32 23, i32 1, i32 2, i32 1, i32 8, i32 10, i32 6, i32 6, i32 18, i32 24, i32 17, i32 9, i32 21, i32 16>
-; CHECK-NEXT:    [[T1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 10, i32 1, i32 5, i32 24, i32 19, i32 0, i32 3, i32 0, i32 17, i32 22, i32 0, i32 4, i32 1, i32 20, i32 16, i32 6>
+; CHECK-NEXT:    [[T0:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 18, i32 7, i32 22, i32 1, i32 8, i32 1, i32 9, i32 0, i32 6, i32 6, i32 23, i32 16, i32 18, i32 10, i32 24, i32 21>
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 16, i32 17, i32 3, i32 4, i32 3, i32 18, i32 19, i32 3, i32 5, i32 1, i32 20, i32 21, i32 6>
 ; CHECK-NEXT:    [[R:%.*]] = xor <16 x i32> [[T0]], [[T1]]
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
 ;
@@ -543,39 +505,36 @@ define dso_local i32 @full(i8* nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP48]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP40]]
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> [[TMP59]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> [[TMP59]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP64:%.*]] = add nsw <16 x i32> [[TMP61]], [[TMP63]]
-; CHECK-NEXT:    [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP60]], [[TMP62]]
-; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP70:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP69]]
-; CHECK-NEXT:    [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP66]], [[TMP68]]
-; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP76:%.*]] = add nsw <16 x i32> [[TMP73]], [[TMP75]]
-; CHECK-NEXT:    [[TMP77:%.*]] = sub nsw <16 x i32> [[TMP72]], [[TMP74]]
-; CHECK-NEXT:    [[TMP78:%.*]] = shufflevector <16 x i32> [[TMP76]], <16 x i32> [[TMP77]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP79:%.*]] = shufflevector <16 x i32> [[TMP76]], <16 x i32> [[TMP77]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP80:%.*]] = shufflevector <16 x i32> [[TMP76]], <16 x i32> [[TMP77]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP81:%.*]] = shufflevector <16 x i32> [[TMP76]], <16 x i32> [[TMP77]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP82:%.*]] = add nsw <16 x i32> [[TMP79]], [[TMP81]]
-; CHECK-NEXT:    [[TMP83:%.*]] = sub nsw <16 x i32> [[TMP78]], [[TMP80]]
-; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <16 x i32> [[TMP82]], <16 x i32> [[TMP83]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP85:%.*]] = lshr <16 x i32> [[TMP84]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP86:%.*]] = and <16 x i32> [[TMP85]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP87:%.*]] = mul nuw <16 x i32> [[TMP86]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP88:%.*]] = add <16 x i32> [[TMP87]], [[TMP84]]
-; CHECK-NEXT:    [[TMP89:%.*]] = xor <16 x i32> [[TMP88]], [[TMP87]]
-; CHECK-NEXT:    [[TMP90:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP89]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP90]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP90]], 16
+; CHECK-NEXT:    [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT:    [[TMP60:%.*]] = add nsw <16 x i32> [[TMP59]], [[REORDER]]
+; CHECK-NEXT:    [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[REORDER]]
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 7, i32 3, i32 15, i32 11, i32 18, i32 22, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 5, i32 13, i32 9, i32 16, i32 20, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 5, i32 1, i32 13, i32 9, i32 16, i32 20, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 18, i32 22, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 1, i32 4, i32 16, i32 20, i32 3, i32 7, i32 19, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 5, i32 17, i32 21, i32 2, i32 6, i32 18, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 5, i32 17, i32 21, i32 2, i32 6, i32 18, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 1, i32 4, i32 16, i32 20, i32 3, i32 7, i32 19, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP72:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP70]]
+; CHECK-NEXT:    [[TMP73:%.*]] = sub nsw <16 x i32> [[TMP69]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> <i32 0, i32 16, i32 17, i32 1, i32 18, i32 2, i32 19, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> <i32 4, i32 20, i32 21, i32 5, i32 22, i32 6, i32 23, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP77:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP78:%.*]] = add nsw <16 x i32> [[TMP74]], [[TMP76]]
+; CHECK-NEXT:    [[TMP79:%.*]] = sub nsw <16 x i32> [[TMP75]], [[TMP77]]
+; CHECK-NEXT:    [[TMP80:%.*]] = shufflevector <16 x i32> [[TMP78]], <16 x i32> [[TMP79]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP81:%.*]] = lshr <16 x i32> [[TMP80]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP82:%.*]] = and <16 x i32> [[TMP81]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP83:%.*]] = mul nuw <16 x i32> [[TMP82]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP84:%.*]] = add <16 x i32> [[TMP83]], [[TMP80]]
+; CHECK-NEXT:    [[TMP85:%.*]] = xor <16 x i32> [[TMP84]], [[TMP83]]
+; CHECK-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP85]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP86]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP86]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
@@ -757,39 +716,36 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi
 ; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]]
-; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP56:%.*]] = add nsw <16 x i32> [[TMP53]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP54]]
-; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]]
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP68:%.*]] = add nsw <16 x i32> [[TMP65]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP66]]
-; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP74:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP73]]
-; CHECK-NEXT:    [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP70]], [[TMP72]]
-; CHECK-NEXT:    [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP78:%.*]] = and <16 x i32> [[TMP77]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]]
-; CHECK-NEXT:    [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]]
-; CHECK-NEXT:    [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP82]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP82]], 16
+; CHECK-NEXT:    [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT:    [[TMP52:%.*]] = add nsw <16 x i32> [[TMP51]], [[REORDER]]
+; CHECK-NEXT:    [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP51]], [[REORDER]]
+; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 7, i32 3, i32 15, i32 11, i32 18, i32 22, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 1, i32 5, i32 13, i32 9, i32 16, i32 20, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 5, i32 1, i32 13, i32 9, i32 16, i32 20, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 18, i32 22, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP58:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP57]]
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> <i32 1, i32 4, i32 16, i32 20, i32 3, i32 7, i32 19, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> <i32 0, i32 5, i32 17, i32 21, i32 2, i32 6, i32 18, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> <i32 0, i32 5, i32 17, i32 21, i32 2, i32 6, i32 18, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> <i32 1, i32 4, i32 16, i32 20, i32 3, i32 7, i32 19, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP64:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT:    [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP61]], [[TMP63]]
+; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 0, i32 16, i32 17, i32 1, i32 18, i32 2, i32 19, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 4, i32 20, i32 21, i32 5, i32 22, i32 6, i32 23, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP70:%.*]] = add nsw <16 x i32> [[TMP66]], [[TMP68]]
+; CHECK-NEXT:    [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP69]]
+; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP73:%.*]] = lshr <16 x i32> [[TMP72]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP74:%.*]] = and <16 x i32> [[TMP73]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP75:%.*]] = mul nuw <16 x i32> [[TMP74]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP76:%.*]] = add <16 x i32> [[TMP75]], [[TMP72]]
+; CHECK-NEXT:    [[TMP77:%.*]] = xor <16 x i32> [[TMP76]], [[TMP75]]
+; CHECK-NEXT:    [[TMP78:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP77]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP78]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP78]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]