Skip to content

Commit

Permalink
[X86] Teach getUndefRegClearance that we use undef for inputs to PUNP…
Browse files Browse the repository at this point in the history
…CK in some cases.

This enables the register to be changed from XMM/YMM/ZMM0 to
instead match the other source. This prevents a false
dependency.

I added all the integer unpck instructions, but the tests
only show changes for BW and WD.

Unfortunately, we can have undef on operand 1 or 2 of the AVX
instructions. This breaks the interface with hasUndefRegUpdate
which used to tell which operand to check.

Now we scan the input operands looking for an undef register and
then ask hasUndefRegUpdate if it's an instruction we care about
and which operands of that instruction we care about.

I also had to make some changes to the load folding code to
always pass operand 1 to hasUndefRegUpdate. I've updated
hasUndefRegUpdate to return false when ForLoadFold is set for
instructions that are not explicitly blocked for load folding in
isel patterns.

Differential Revision: https://reviews.llvm.org/D79615
  • Loading branch information
topperc committed May 9, 2020
1 parent 56bf0b5 commit c7be6a8
Show file tree
Hide file tree
Showing 50 changed files with 1,136 additions and 1,082 deletions.
98 changes: 76 additions & 22 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
Expand Up @@ -4834,15 +4834,29 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// operand into the unused high bits of the destination operand.
// Also returns true for instructions that have two inputs where one may
// be undef and we want it to use the same register as the other input.
static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
bool ForLoadFold = false) {
// Set the OpNum parameter to the first source operand.
OpNum = 1;
switch (Opcode) {
case X86::PACKSSWBrr:
case X86::PACKUSWBrr:
case X86::PACKSSDWrr:
case X86::PACKUSDWrr:
case X86::PUNPCKHBWrr:
case X86::PUNPCKLBWrr:
case X86::PUNPCKHWDrr:
case X86::PUNPCKLWDrr:
case X86::PUNPCKHDQrr:
case X86::PUNPCKLDQrr:
case X86::PUNPCKHQDQrr:
case X86::PUNPCKLQDQrr:
// These instructions are sometimes used with an undef first or second
// source. Return true here so BreakFalseDeps will assign this source to the
// same register as the first source to avoid a false dependency.
// Operand 1 of these instructions is tied so they're separate from their
// VEX counterparts.
return OpNum == 2 && !ForLoadFold;

case X86::VPACKSSWBrr:
case X86::VPACKUSWBrr:
case X86::VPACKSSDWrr:
Expand All @@ -4851,12 +4865,51 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VPACKUSWBZ128rr:
case X86::VPACKSSDWZ128rr:
case X86::VPACKUSDWZ128rr:
// These instructions are sometimes used with an undef second source to
// truncate 128-bit vectors to 64-bit with undefined high bits. Return
// true here so BreakFalseDeps will assign this source to the same register
// as the first source to avoid a false dependency.
OpNum = 2;
return true;
case X86::VPUNPCKHBWrr:
case X86::VPUNPCKLBWrr:
case X86::VPUNPCKHBWYrr:
case X86::VPUNPCKLBWYrr:
case X86::VPUNPCKHBWZ128rr:
case X86::VPUNPCKLBWZ128rr:
case X86::VPUNPCKHBWZ256rr:
case X86::VPUNPCKLBWZ256rr:
case X86::VPUNPCKHBWZrr:
case X86::VPUNPCKLBWZrr:
case X86::VPUNPCKHWDrr:
case X86::VPUNPCKLWDrr:
case X86::VPUNPCKHWDYrr:
case X86::VPUNPCKLWDYrr:
case X86::VPUNPCKHWDZ128rr:
case X86::VPUNPCKLWDZ128rr:
case X86::VPUNPCKHWDZ256rr:
case X86::VPUNPCKLWDZ256rr:
case X86::VPUNPCKHWDZrr:
case X86::VPUNPCKLWDZrr:
case X86::VPUNPCKHDQrr:
case X86::VPUNPCKLDQrr:
case X86::VPUNPCKHDQYrr:
case X86::VPUNPCKLDQYrr:
case X86::VPUNPCKHDQZ128rr:
case X86::VPUNPCKLDQZ128rr:
case X86::VPUNPCKHDQZ256rr:
case X86::VPUNPCKLDQZ256rr:
case X86::VPUNPCKHDQZrr:
case X86::VPUNPCKLDQZrr:
case X86::VPUNPCKHQDQrr:
case X86::VPUNPCKLQDQrr:
case X86::VPUNPCKHQDQYrr:
case X86::VPUNPCKLQDQYrr:
case X86::VPUNPCKHQDQZ128rr:
case X86::VPUNPCKLQDQZ128rr:
case X86::VPUNPCKHQDQZ256rr:
case X86::VPUNPCKLQDQZ256rr:
case X86::VPUNPCKHQDQZrr:
case X86::VPUNPCKLQDQZrr:
// These instructions are sometimes used with an undef first or second
// source. Return true here so BreakFalseDeps will assign this source to the
// same register as the first source to avoid a false dependency.
return (OpNum == 1 || OpNum == 2) && !ForLoadFold;

case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
case X86::VCVTSI2SSrr_Int:
Expand Down Expand Up @@ -4914,7 +4967,7 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VCVTUSI642SDZrm_Int:
// Load folding won't effect the undef register update since the input is
// a GPR.
return !ForLoadFold;
return OpNum == 1 && !ForLoadFold;
case X86::VCVTSD2SSrr:
case X86::VCVTSD2SSrm:
case X86::VCVTSD2SSrr_Int:
Expand Down Expand Up @@ -5013,15 +5066,13 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VSQRTSDZrb_Int:
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
return true;
return OpNum == 1;
case X86::VMOVSSZrrk:
case X86::VMOVSDZrrk:
OpNum = 3;
return true;
return OpNum == 3 && !ForLoadFold;
case X86::VMOVSSZrrkz:
case X86::VMOVSDZrrkz:
OpNum = 2;
return true;
return OpNum == 2 && !ForLoadFold;
}

return false;
Expand All @@ -5044,13 +5095,17 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
unsigned
X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const {
if (!hasUndefRegUpdate(MI.getOpcode(), OpNum))
return 0;

const MachineOperand &MO = MI.getOperand(OpNum);
if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) {
return UndefRegClearance;
for (unsigned i = MI.getNumExplicitDefs(), e = MI.getNumExplicitOperands();
i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isUndef() &&
Register::isPhysicalRegister(MO.getReg()) &&
hasUndefRegUpdate(MI.getOpcode(), i)) {
OpNum = i;
return UndefRegClearance;
}
}

return 0;
}

Expand Down Expand Up @@ -5294,8 +5349,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(

static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
MachineInstr &MI) {
unsigned Ignored;
if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) ||
if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/true) ||
!MI.getOperand(1).isReg())
return false;

Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/avx2-arith.ll
Expand Up @@ -171,12 +171,12 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: mul_v32i8:
; X32: # %bb.0:
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; X32-NEXT: vpand %ymm3, %ymm2, %ymm2
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; X32-NEXT: vpand %ymm3, %ymm0, %ymm0
Expand All @@ -185,12 +185,12 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
;
; X64-LABEL: mul_v32i8:
; X64: # %bb.0:
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; X64-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; X64-NEXT: vpand %ymm3, %ymm0, %ymm0
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/avx2-vector-shifts.ll
Expand Up @@ -575,7 +575,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
; X32: # %bb.0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpsraw $4, %ymm3, %ymm4
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
Expand All @@ -586,7 +586,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X32-NEXT: vpsrlw $8, %ymm2, %ymm2
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpsraw $4, %ymm0, %ymm3
; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
Expand All @@ -603,7 +603,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X64-LABEL: ashr_32i8:
; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpsraw $4, %ymm3, %ymm4
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
Expand All @@ -614,7 +614,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT: vpsrlw $8, %ymm2, %ymm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpsraw $4, %ymm0, %ymm3
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/cast-vsel.ll
Expand Up @@ -295,9 +295,9 @@ define void @example25() nounwind {
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, dj+4112(%rax)
; SSE2-NEXT: movdqa %xmm1, dj+4096(%rax)
Expand Down Expand Up @@ -326,7 +326,7 @@ define void @example25() nounwind {
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, dj+4112(%rax)
; SSE41-NEXT: movdqa %xmm1, dj+4096(%rax)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/combine-mul.ll
Expand Up @@ -294,7 +294,7 @@ define <16 x i8> @combine_mul_to_abs_v16i8(<16 x i8> %x) {
; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: pmullw %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm2
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/combine-shl.ll
Expand Up @@ -240,15 +240,15 @@ define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $20, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $20, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $20, %xmm1
; SSE41-NEXT: pslld $20, %xmm0
; SSE41-NEXT: retq
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
Expand Up @@ -264,14 +264,14 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst)
; X86-NEXT: movdqa %xmm5, %xmm2
; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT: movdqa %xmm2, (%ecx)
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: movdqa %xmm1, %xmm2
; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-NEXT: pmullw %xmm3, %xmm2
; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-NEXT: pand %xmm3, %xmm2
; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: pmullw %xmm5, %xmm1
; X86-NEXT: pand %xmm3, %xmm1
; X86-NEXT: packuswb %xmm2, %xmm1
Expand Down Expand Up @@ -377,14 +377,14 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst)
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movdqa %xmm2, (%rax)
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: movdqa %xmm1, %xmm2
; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-NEXT: pmullw %xmm3, %xmm2
; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X64-NEXT: pand %xmm3, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: pmullw %xmm6, %xmm1
; X64-NEXT: pand %xmm3, %xmm1
; X64-NEXT: packuswb %xmm2, %xmm1
Expand Down

0 comments on commit c7be6a8

Please sign in to comment.