diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 15fedf6b63bc7..240dd2a1c0e07 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -49888,6 +49888,29 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, return Res; } + // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and + // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern: + // psrad(pshufd(psllq(X,63),1,1,3,3),31) -> + // pshufd(psrad(pslld(X,31),31),0,0,2,2). + if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 && + N0.getOpcode() == X86ISD::PSHUFD && + N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) && + N0->hasOneUse()) { + SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0)); + if (BC.getOpcode() == X86ISD::VSHLI && + BC.getScalarValueSizeInBits() == 64 && + BC.getConstantOperandVal(1) == 63) { + SDLoc DL(N); + SDValue Src = BC.getOperand(0); + Src = DAG.getBitcast(VT, Src); + Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src, + getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG)); + Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1); + Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1); + return Src; + } + } + auto TryConstantFold = [&](SDValue V) { APInt UndefElts; SmallVector<APInt> EltBits; diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll index 2930efe7334b0..c49bb9c0f8f86 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -587,9 +587,8 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) { ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE41-NEXT: orps %xmm2, %xmm0 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE41-NEXT: pslld $31, %xmm1 ; 
SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -612,12 +611,11 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) { ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE2-NEXT: andps %xmm3, %xmm0 ; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,3,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: psllq $63, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: retq %cmp1 = icmp eq <4 x i64> %x, @@ -729,9 +727,8 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) { ; SSE41-NEXT: orps %xmm2, %xmm0 ; SSE41-NEXT: xorps %xmm3, %xmm0 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -756,12 +753,11 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) { ; SSE2-NEXT: andps %xmm4, %xmm0 ; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: xorps %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,3,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: psllq $63, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: 
pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i64> %x, diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll index 529396ca46170..614d86bd4c794 100644 --- a/llvm/test/CodeGen/X86/promote-cmp.ll +++ b/llvm/test/CodeGen/X86/promote-cmp.ll @@ -35,9 +35,8 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,1,3,3] -; SSE2-NEXT: psllq $63, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 3e44e2cdb2b18..890514fbdc022 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -5510,8 +5510,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] ; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE2-NEXT: psllq $63, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pslld $31, %xmm8 ; SSE2-NEXT: psrad $31, %xmm8 ; SSE2-NEXT: movdqa %xmm8, %xmm10 ; SSE2-NEXT: pandn %xmm7, %xmm10 @@ -5520,8 +5520,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; SSE2-NEXT: por %xmm10, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE2-NEXT: psllq $63, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-NEXT: pslld $31, %xmm7 ; SSE2-NEXT: psrad $31, %xmm7 ; SSE2-NEXT: movdqa %xmm7, %xmm10 ; SSE2-NEXT: 
pandn %xmm6, %xmm10 @@ -5529,8 +5529,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; SSE2-NEXT: por %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE2-NEXT: psllq $63, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSE2-NEXT: pslld $31, %xmm6 ; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm10 ; SSE2-NEXT: pandn %xmm5, %xmm10 @@ -5538,8 +5538,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; SSE2-NEXT: por %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm9 ; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index 23d6f256e1ab6..65b324c793428 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -1775,13 +1775,11 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) { ; SSE2-NEXT: shrb $3, %al ; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: pinsrw $6, %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE2-NEXT: psllq $63, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: retq ; @@ -1805,13 +1803,11 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) 
{ ; SSSE3-NEXT: shrb $3, %al ; SSSE3-NEXT: movzbl %al, %eax ; SSSE3-NEXT: pinsrw $6, %eax, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSSE3-NEXT: psllq $63, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: retq ; @@ -1831,17 +1827,15 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) { ; SSE41-NEXT: shrb $2, %cl ; SSE41-NEXT: andb $1, %cl ; SSE41-NEXT: movzbl %cl, %ecx -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 ; SSE41-NEXT: shrb $3, %al ; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] ; SSE41-NEXT: pinsrb $12, %eax, %xmm1 -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE41-NEXT: psllq $63, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: retq ; @@ -1939,13 +1933,11 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) { ; X86-SSE2-NEXT: shrb $3, %al ; X86-SSE2-NEXT: movzbl %al, %eax ; X86-SSE2-NEXT: pinsrw $6, %eax, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; X86-SSE2-NEXT: psllq $63, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; X86-SSE2-NEXT: pslld $31, %xmm0 ; X86-SSE2-NEXT: psrad $31, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; X86-SSE2-NEXT: psllq $63, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; X86-SSE2-NEXT: pslld $31, %xmm1 ; X86-SSE2-NEXT: psrad $31, %xmm1 ; X86-SSE2-NEXT: retl ; @@ -1966,17 +1958,15 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) { ; X86-SSE41-NEXT: shrb $2, %cl ; X86-SSE41-NEXT: andb $1, %cl ; X86-SSE41-NEXT: movzbl %cl, %ecx -; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; X86-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 ; X86-SSE41-NEXT: shrb $3, %al ; X86-SSE41-NEXT: movzbl %al, %eax +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] ; X86-SSE41-NEXT: pinsrb $12, %eax, %xmm1 -; X86-SSE41-NEXT: psllq $63, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pslld $31, %xmm0 ; X86-SSE41-NEXT: psrad $31, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; X86-SSE41-NEXT: psllq $63, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: pslld $31, %xmm1 ; X86-SSE41-NEXT: psrad $31, %xmm1 ; X86-SSE41-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll index 3fd74a253b0f2..0c57f497aa8aa 100644 --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -483,8 +483,8 @@ define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) { define <2 x i64> @shrunkblend_2uses(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; SSE2-LABEL: shrunkblend_2uses: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm5 @@ -522,8 +522,8 @@ define <2 x i64> @shrunkblend_2uses(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b, define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; SSE2-LABEL: shrunkblend_nonvselectuse: ; SSE2: # %bb.0: -; 
SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/vsplit-and.ll b/llvm/test/CodeGen/X86/vsplit-and.ll index 04869aa0b9a92..85def820ecb92 100644 --- a/llvm/test/CodeGen/X86/vsplit-and.ll +++ b/llvm/test/CodeGen/X86/vsplit-and.ll @@ -23,31 +23,30 @@ define void @t0(ptr %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind readonly { define void @t2(ptr %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly { ; CHECK-LABEL: t2: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %r9, %xmm1 -; CHECK-NEXT: movq %r8, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: movq %rdx, %xmm1 +; CHECK-NEXT: movq %r9, %xmm0 +; CHECK-NEXT: movq %r8, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movq %rsi, %xmm2 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; CHECK-NEXT: movq %rcx, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; CHECK-NEXT: movq %rcx, %xmm0 ; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqq %xmm4, %xmm1 +; CHECK-NEXT: pcmpeqq %xmm4, %xmm0 ; CHECK-NEXT: pcmpeqq %xmm4, %xmm2 -; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] +; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: pcmpeqq %xmm4, %xmm3 -; CHECK-NEXT: pcmpeqq %xmm4, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; CHECK-NEXT: orps %xmm2, %xmm0 -; CHECK-NEXT: xorps %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: psllq $63, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-NEXT: psrad $31, %xmm1 -; CHECK-NEXT: pmovsxdq %xmm0, %xmm0 -; CHECK-NEXT: movdqa 
%xmm0, (%rdi) -; CHECK-NEXT: movq %xmm1, 16(%rdi) +; CHECK-NEXT: pcmpeqq %xmm4, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; CHECK-NEXT: orps %xmm2, %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; CHECK-NEXT: pslld $31, %xmm0 +; CHECK-NEXT: psrad $31, %xmm0 +; CHECK-NEXT: pmovsxdq %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%rdi) +; CHECK-NEXT: movq %xmm0, 16(%rdi) ; CHECK-NEXT: retq %cmp1 = icmp ne <3 x i64> %src1, zeroinitializer %cmp2 = icmp ne <3 x i64> %src2, zeroinitializer