diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 24bc264df129fa..7c134a8c7cb929 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13723,9 +13723,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     // onward this has a single fast instruction with no scary immediates.
     // We have to map the mask as it is actually a v4i32 shuffle instruction.
     V1 = DAG.getBitcast(MVT::v4i32, V1);
-    int WidenedMask[4] = {
-        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
-        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+    int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
+                          Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
+                          Mask[1] < 0 ? -1 : (Mask[1] * 2),
+                          Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
     return DAG.getBitcast(
         MVT::v2i64,
         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
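An illustrative standalone sketch of the widening rule above, for sanity-checking it outside of SelectionDAG (assumes C++20 for the constexpr array comparisons; the helper name `widenV2I64Mask` and the `kUndef` constant are invented here — in-tree the sentinel is `SM_SentinelUndef`):

```cpp
#include <array>

// Sentinel for an undef shuffle lane (mirrors llvm::SM_SentinelUndef).
constexpr int kUndef = -1;

// Widen a v2i64 shuffle mask to the equivalent v4i32 mask, preserving
// undef lanes instead of clamping them to element 0 (the old behavior).
constexpr std::array<int, 4> widenV2I64Mask(int M0, int M1) {
  return {M0 < 0 ? kUndef : M0 * 2, M0 < 0 ? kUndef : M0 * 2 + 1,
          M1 < 0 ? kUndef : M1 * 2, M1 < 0 ? kUndef : M1 * 2 + 1};
}

// <1, 0> (swap the two i64 lanes) widens to the PSHUFD mask [2,3,0,1].
static_assert(widenV2I64Mask(1, 0) == std::array<int, 4>{2, 3, 0, 1});
// <1, undef> now stays undef in the widened mask: [2,3,u,u]. The old
// std::max clamp forced it to [2,3,0,1], hiding the freedom that lets
// later lowering pick immediates such as [2,3,2,3] — which is exactly
// what changes in the regenerated test checks below.
static_assert(widenV2I64Mask(1, kUndef) ==
              std::array<int, 4>{2, 3, kUndef, kUndef});
```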
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index d2638a1681e85d..1411318d8176d0 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -160,9 +160,9 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
@@ -172,10 +172,10 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind {
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; AVX1-NEXT: vmovdqa (%rsi), %xmm6
 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
@@ -454,24 +454,24 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4
 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
@@ -482,10 +482,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
@@ -493,10 +493,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
@@ -504,10 +504,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll
index 653a88edd26a04..b3fa8ac9aeeda6 100644
--- a/llvm/test/CodeGen/X86/avx-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx-cvt.ll
@@ -33,7 +33,7 @@ define <8 x float> @sitofp02(<8 x i16> %a) {
 ; AVX-LABEL: sitofp02:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
index 1fd3e15c3e0196..b504646336def0 100644
--- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -5,7 +5,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
 ; KNL-LABEL: hadd_16:
 ; KNL: # %bb.0:
-; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -14,7 +14,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
 ;
 ; SKX-LABEL: hadd_16:
 ; SKX: # %bb.0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -32,7 +32,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
 define i32 @hsub_16(<16 x i32> %x225) {
 ; KNL-LABEL: hsub_16:
 ; KNL: # %bb.0:
-; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -41,7 +41,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
 ;
 ; SKX-LABEL: hsub_16:
 ; SKX: # %bb.0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index f115f9a6ef382b..650bbe23b86e51 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -6550,7 +6550,7 @@ define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; X86-NEXT: vmovd %xmm0, %eax
 ; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6563,7 +6563,7 @@ define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vmovq %xmm0, %rax
 ; X64-NEXT: vzeroupper
@@ -6602,7 +6602,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -6636,7 +6636,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -6668,7 +6668,7 @@ define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; X86-NEXT: vmovd %xmm0, %eax
 ; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6681,7 +6681,7 @@ define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vmovq %xmm0, %rax
 ; X64-NEXT: vzeroupper
@@ -6706,7 +6706,7 @@ define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
 ; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; X86-NEXT: vmovd %xmm0, %eax
 ; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6719,7 +6719,7 @@ define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
 ; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vmovq %xmm0, %rax
 ; X64-NEXT: vzeroupper
@@ -6747,7 +6747,7 @@ define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; X86-NEXT: vmovd %xmm0, %eax
 ; X86-NEXT: vpextrd $1, %xmm0, %edx
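A scalar model (illustrative only, not LLVM API) of why these reduction check lines can change freely between [2,3,0,1] and [2,3,2,3]: the tail of each v2i64 reduction only reads lane 0 of the final vpaddq/vpor/vpand result, and the widened v4i32 mask's upper two lanes come from what is, assumedly, an undef index in the IR shufflevector, so any immediate is a legal choice for them:

```cpp
#include <cstdint>

// Models the tail of the v2i64 reductions above:
//   vpshufd $imm, %xmm0, %xmm1 ; vpaddq %xmm0, %xmm1, %xmm0 ; vmovq
// With imm = 0x4E ([2,3,0,1]) lane 0 of the shuffle is the old lane 1;
// with imm = 0xEE ([2,3,2,3]) it is also the old lane 1. The two
// immediates only differ in lanes the reduction never reads.
uint64_t reduceAddV2I64(uint64_t Lane0, uint64_t Lane1) {
  uint64_t Shuf0 = Lane1; // lane 0 of the shuffle under either immediate
  return Lane0 + Shuf0;   // the qword extracted by vmovq (or vmovd+vpextrd)
}
```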
@@ -6762,7 +6762,7 @@ define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vmovq %xmm0, %rax
 ; X64-NEXT: vzeroupper
@@ -6807,7 +6807,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -6844,7 +6844,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -6882,7 +6882,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; X86-NEXT: vmovd %xmm0, %eax
 ; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6898,7 +6898,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vmovq %xmm0, %rax
 ; X64-NEXT: vzeroupper
@@ -6928,7 +6928,7 @@ define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; X86-NEXT: vmovd %xmm0, %eax
 ; X86-NEXT: vpextrd $1, %xmm0, %edx
@@ -6943,7 +6943,7 @@ define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vmovq %xmm0, %rax
 ; X64-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 863ab4dee1238d..2195526f94c391 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -114,21 +114,21 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pxor %xmm0, %xmm0
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert0_i64_zero:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll
index 2e044548404ebd..6e9e4fd00636d9 100644
--- a/llvm/test/CodeGen/X86/cast-vsel.ll
+++ b/llvm/test/CodeGen/X86/cast-vsel.ll
@@ -31,7 +31,7 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
 ; SSE41-NEXT: packssdw %xmm1, %xmm0
 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5
 ; SSE41-NEXT: pmovsxwd %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
 ; SSE41-NEXT: retq
 ;
@@ -39,9 +39,9 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5
 ; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm5, %xmm1
 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
@@ -87,7 +87,7 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
 ; SSE41-NEXT: packssdw %xmm1, %xmm0
 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SSE41-NEXT: retq
 ;
@@ -459,7 +459,7 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
 ; SSE41-NEXT: packssdw %xmm3, %xmm0
 ; SSE41-NEXT: movdqa %xmm2, %xmm3
 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
 ; SSE41-NEXT: pmovsxwd %xmm3, %xmm3
 ; SSE41-NEXT: movdqa %xmm3, dj+4096(%rax)
@@ -480,9 +480,9 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
 ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0
 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB6_1: # %vector.body
diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index b96f44ec3073fa..16a993316d7e5a 100644
--- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -696,7 +696,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
 ; SSE2-LABEL: _clearupper16xi8b:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm1, %r10
 ; SSE2-NEXT: movq %r10, %r8
 ; SSE2-NEXT: shrq $56, %r8
@@ -878,7 +878,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
 ; SSE2-LABEL: _clearupper32xi8b:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm2, %r10
 ; SSE2-NEXT: movq %r10, %r8
 ; SSE2-NEXT: shrq $56, %r8
diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
index b18c7246104d4c..17d01e1d3362ca 100644
--- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
@@ -93,7 +93,7 @@ define i32 @movmskps_sext_v4i64(<4 x i32> %a0) {
 ; AVX1-LABEL: movmskps_sext_v4i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovmskpd %ymm0, %eax
@@ -116,7 +116,7 @@ define i32 @movmskps_sext_v8i32(<8 x i16> %a0) {
 ; AVX1-LABEL: movmskps_sext_v8i32:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovmskps %ymm0, %eax
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 1a52ebfc6cd909..ce411b5e8f06b6 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2959,7 +2959,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ;
 ; SSE41-LABEL: pr38658:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
 ; SSE41-NEXT: psrlw $8, %xmm2
@@ -2984,7 +2984,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ;
 ; AVX1-LABEL: pr38658:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
@@ -3058,7 +3058,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ;
 ; XOP-LABEL: pr38658:
 ; XOP: # %bb.0:
-; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; XOP-NEXT: vpmovsxbw %xmm1, %xmm1
 ; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index a6950873daf550..383d1866aa1d9e 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -311,7 +311,7 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -344,7 +344,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
@@ -377,7 +377,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
 ; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0
 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index e2f3f2dc7523fe..28a73cdb6a41ef 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -152,7 +152,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: psrad %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm0, %xmm5
 ; SSE-NEXT: psrad %xmm4, %xmm5
@@ -272,7 +272,7 @@ define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: psrld %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm0, %xmm5
 ; SSE-NEXT: psrld %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 36fbdaf5370254..2e886defafd4cf 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -400,7 +400,7 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: psrld %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm0, %xmm5
 ; SSE-NEXT: psrld %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index e6d7aac9261622..c44342d00357a7 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -233,7 +233,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psrld %xmm4, %xmm2
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -249,7 +249,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: psrld %xmm4, %xmm5
@@ -307,7 +307,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psrld %xmm4, %xmm2
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -324,7 +324,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: psrld %xmm4, %xmm5
@@ -384,7 +384,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psrld %xmm4, %xmm2
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -401,7 +401,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: psrld %xmm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index b21ed8ec60cef8..cd0b21d02969c4 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -213,7 +213,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
 ; SSE-NEXT: movdqa %xmm3, %xmm4
 ; SSE-NEXT: psrld %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm3, %xmm6
 ; SSE-NEXT: psrld %xmm5, %xmm6
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 753aee85f319ce..59101503b5a93e 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -565,9 +565,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
 ; X86-NEXT: cltd
 ; X86-NEXT: idivl %esi
 ; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X86-NEXT: movd %xmm3, %eax
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; X86-NEXT: movd %xmm3, %esi
 ; X86-NEXT: cltd
 ; X86-NEXT: idivl %esi
@@ -608,9 +608,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
 ; X64-NEXT: cltd
 ; X64-NEXT: idivl %ecx
 ; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X64-NEXT: movd %xmm3, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; X64-NEXT: movd %xmm3, %ecx
 ; X64-NEXT: cltd
 ; X64-NEXT: idivl %ecx
@@ -657,11 +657,11 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
 ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
 ; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: movd %xmm1, (%esp)
 ; X86-NEXT: calll __divdi3
 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -707,9 +707,9 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
 ; X64-NEXT: cqto
 ; X64-NEXT: idivq %rcx
 ; X64-NEXT: movq %rax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X64-NEXT: movq %xmm3, %rax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; X64-NEXT: movq %xmm3, %rcx
 ; X64-NEXT: cqto
 ; X64-NEXT: idivq %rcx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 43fc23c836a9b8..d787f91ababba0 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -565,9 +565,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
 ; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: divl %esi
 ; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X86-NEXT: movd %xmm3, %eax
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; X86-NEXT: movd %xmm3, %esi
 ; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: divl %esi
@@ -608,9 +608,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst
 ; X64-NEXT: xorl %edx, %edx
 ; X64-NEXT: divl %ecx
 ; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X64-NEXT: movd %xmm3, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; X64-NEXT: movd %xmm3, %ecx
 ; X64-NEXT: xorl %edx, %edx
 ; X64-NEXT: divl %ecx
@@ -657,11 +657,11 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
 ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
 ; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-NEXT: movd %xmm1, (%esp)
 ; X86-NEXT: calll __udivdi3
 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -707,9 +707,9 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
 ; X64-NEXT: xorl %edx, %edx
 ; X64-NEXT: divq %rcx
 ; X64-NEXT: movq %rax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X64-NEXT: movq %xmm3, %rax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; X64-NEXT: movq %xmm3, %rcx
 ; X64-NEXT: xorl %edx, %edx
 ; X64-NEXT: divq %rcx
diff --git a/llvm/test/CodeGen/X86/extract-store.ll b/llvm/test/CodeGen/X86/extract-store.ll
index c29fac61af3fe9..d50c2ed9207533 100644
--- a/llvm/test/CodeGen/X86/extract-store.ll
+++ b/llvm/test/CodeGen/X86/extract-store.ll
@@ -314,7 +314,7 @@ define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind {
 ;
 ; SSE2-X64-LABEL: extract_i64_1:
 ; SSE2-X64: # %bb.0:
-; SSE2-X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-X64-NEXT: movq %xmm0, (%rdi)
 ; SSE2-X64-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll
index cf06f8dcb13e15..f2e01e93361ef6 100644
--- a/llvm/test/CodeGen/X86/extractelement-index.ll
+++ b/llvm/test/CodeGen/X86/extractelement-index.ll
@@ -351,7 +351,7 @@ define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind {
 define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
 ; SSE2-LABEL: extractelement_v2i64_1:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, %rax
 ; SSE2-NEXT: retq
 ;
@@ -371,7 +371,7 @@ define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
 define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
 ; SSE2-LABEL: extractelement_v4i64_1:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, %rax
 ; SSE2-NEXT: retq
 ;
@@ -392,7 +392,7 @@ define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
 define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
 ; SSE2-LABEL: extractelement_v4i64_3:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, %rax
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 94628c70d989ec..b694859b757c01 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -10,13 +10,13 @@ define i32 @t(<2 x i64>* %val) nounwind {
 ; X32-SSE2-LABEL: t:
 ; X32-SSE2: # %bb.0:
 ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: pshufd $78, (%eax), %xmm0 # xmm0 = mem[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
 ; X32-SSE2-NEXT: movd %xmm0, %eax
 ; X32-SSE2-NEXT: retl
 ;
 ; X64-SSSE3-LABEL: t:
 ; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: pshufd $78, (%rdi), %xmm0 # xmm0 = mem[2,3,0,1]
+; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
 ; X64-SSSE3-NEXT: movd %xmm0, %eax
 ; X64-SSSE3-NEXT: retq
 ;
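The old CHECK line just above carried a literal immediate ("pshufd $78"). As a reference for mapping between the printed lane comments and the imm8, here is a hypothetical helper (illustrative only, not an LLVM API): PSHUFD packs four 2-bit source-lane indices, destination lane 0 in the low bits.

```cpp
#include <cstdint>

// Encode a 4-lane shuffle as a PSHUFD imm8: two bits per destination
// lane, destination lane 0 in bits [1:0]. Illustrative helper only.
constexpr uint8_t pshufdImm(unsigned M0, unsigned M1, unsigned M2,
                            unsigned M3) {
  return static_cast<uint8_t>((M0 & 3) | ((M1 & 3) << 2) |
                              ((M2 & 3) << 4) | ((M3 & 3) << 6));
}

static_assert(pshufdImm(2, 3, 0, 1) == 0x4E); // the "$78" in the old check
static_assert(pshufdImm(2, 3, 2, 3) == 0xEE); // the mask the tests now expect
```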
@@ -60,13 +60,13 @@ define void @t3(<2 x double>* %a0) {
 ;
 ; X64-SSSE3-LABEL: t3:
 ; X64-SSSE3: # %bb.0: # %bb
-; X64-SSSE3-NEXT: movsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero
+; X64-SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-SSSE3-NEXT: movsd %xmm0, (%rax)
 ; X64-SSSE3-NEXT: retq
 ;
 ; X64-AVX-LABEL: t3:
 ; X64-AVX: # %bb.0: # %bb
-; X64-AVX-NEXT: vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-AVX-NEXT: vmovsd %xmm0, (%rax)
 ; X64-AVX-NEXT: retq
 bb:
@@ -139,7 +139,7 @@ define float @t6(<8 x float> *%a0) {
 ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-SSE2-NEXT: xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss {{\.LCPI.*}}, %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE2-NEXT: andps %xmm1, %xmm2
 ; X32-SSE2-NEXT: andnps %xmm0, %xmm1
 ; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -151,10 +151,10 @@ define float @t6(<8 x float> *%a0) {
 ;
 ; X64-SSSE3-LABEL: t6:
 ; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movshdup (%rdi), %xmm1 # xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
 ; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
-; X64-SSSE3-NEXT: movss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT: andps %xmm0, %xmm2
 ; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
 ; X64-SSSE3-NEXT: orps %xmm2, %xmm0
@@ -162,10 +162,10 @@ define float @t6(<8 x float> *%a0) {
 ;
 ; X64-AVX-LABEL: t6:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
-; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX-NEXT: retq
 %vecload = load <8 x float>, <8 x float>* %a0, align 32
@@ -184,7 +184,7 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
 ; X32-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X32-SSE2-NEXT: xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT: cmpltss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss (%eax), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE2-NEXT: andps %xmm1, %xmm2
 ; X32-SSE2-NEXT: andnps %xmm0, %xmm1
 ; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -193,10 +193,10 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
 ;
 ; X64-SSSE3-LABEL: PR43971:
 ; X64-SSSE3: # %bb.0: # %entry
-; X64-SSSE3-NEXT: movss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT: xorps %xmm1, %xmm1
 ; X64-SSSE3-NEXT: cmpltss %xmm0, %xmm1
-; X64-SSSE3-NEXT: movss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT: andps %xmm1, %xmm2
 ; X64-SSSE3-NEXT: andnps %xmm0, %xmm1
 ; X64-SSSE3-NEXT: orps %xmm2, %xmm1
@@ -205,10 +205,10 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
 ;
 ; X64-AVX-LABEL: PR43971:
 ; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: vmovss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX-NEXT: vmovss %xmm0, (%rsi)
 ; X64-AVX-NEXT: retq
@@ -231,7 +231,7 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
 ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-SSE2-NEXT: xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss {{\.LCPI.*}}, %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE2-NEXT: andps %xmm1, %xmm2
 ; X32-SSE2-NEXT: andnps %xmm0, %xmm1
 ; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -242,10 +242,10 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
 ;
 ; X64-SSSE3-LABEL: PR43971_1:
 ; X64-SSSE3: # %bb.0: # %entry
-; X64-SSSE3-NEXT: movshdup (%rdi), %xmm1 # xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
 ; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
-; X64-SSSE3-NEXT: movss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT: andps %xmm0, %xmm2
 ; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
 ; X64-SSSE3-NEXT: orps %xmm2, %xmm0
@@ -253,10 +253,10 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
 ;
 ; X64-AVX-LABEL: PR43971_1:
 ; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
-; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX-NEXT: retq
 entry:
diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index 1cd85e6e582c40..00f84a6e4b1594 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -17,7 +17,7 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
 ; LIN-SSE2-NEXT: movd %xmm0, %eax
 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; LIN-SSE2-NEXT: movd %xmm1, %ecx
-; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; LIN-SSE2-NEXT: movd %xmm1, %edx
 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; LIN-SSE2-NEXT: movd %xmm0, %esi
@@ -56,7 +56,7 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
 ; WIN-SSE2-NEXT: movd %xmm0, %r8d
 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; WIN-SSE2-NEXT: movd %xmm1, %r9d
-; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; WIN-SSE2-NEXT: movd %xmm1, %r10d
 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; WIN-SSE2-NEXT: movd %xmm0, %edx
@@ -141,7 +141,7 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; LIN-SSE2-NEXT: movd %xmm0, %eax
 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; LIN-SSE2-NEXT: movd %xmm1, %edx
-; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; LIN-SSE2-NEXT: movd %xmm1, %esi
 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; LIN-SSE2-NEXT: movd %xmm0, %edi
@@ -184,7 +184,7 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; WIN-SSE2-NEXT: movd %xmm0, %eax
 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; WIN-SSE2-NEXT: movd %xmm1, %ecx
-; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; WIN-SSE2-NEXT: movd %xmm1, %r8d
 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; WIN-SSE2-NEXT: movd %xmm0, %edx
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll
index c4d470a6cd69ef..e36c0479448ea7 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/CodeGen/X86/haddsub-2.ll
@@ -127,7 +127,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
 ; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm2, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
@@ -136,7 +136,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: addl %eax, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edi
@@ -181,7 +181,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
 define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-LABEL: phadd_d_test2:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm2, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
@@ -192,7 +192,7 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: addl %eax, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: addl %eax, %esi
 ; SSE3-NEXT: movd %esi, %xmm0
@@ -243,7 +243,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
 ; SSE3-NEXT: subl %ecx, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
@@ -252,7 +252,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: subl %esi, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edi
@@ -297,7 +297,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
 define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-LABEL: phsub_d_test2:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm2, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
@@ -306,7 +306,7 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: subl %edx, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
@@ -513,7 +513,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm4, %r8d
 ; SSE3-NEXT: addl %ecx, %r8d
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm4, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %r9d
@@ -522,7 +522,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: addl %edx, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edi
@@ -531,7 +531,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %r10d
 ; SSE3-NEXT: addl %eax, %r10d
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %ecx
@@ -540,7 +540,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: addl %eax, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %r11d
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
@@ -819,7 +819,7 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 ; SSE-NEXT: movd %xmm2, %ecx
 ; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE-NEXT: movd %xmm2, %ecx
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-NEXT: movd %xmm0, %edx
@@ -830,7 +830,7 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE-NEXT: subl %esi, %edx
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE-NEXT: movd %xmm0, %esi
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: movd %xmm0, %edi
 ; SSE-NEXT: subl %edi, %esi
 ; SSE-NEXT: movd %esi, %xmm0
@@ -1133,7 +1133,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm4, %r8d
 ; SSE3-NEXT: addl %ecx, %r8d
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm4, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %r9d
@@ -1142,7 +1142,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: addl %edx, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edi
@@ -1151,7 +1151,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %r10d
 ; SSE3-NEXT: addl %eax, %r10d
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %ecx
@@ -1160,7 +1160,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: addl %eax, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %r11d
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 6663459f49d56b..740e4b291f5f9a 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -554,7 +554,7 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-NEXT: movdqa %xmm3, %xmm5
 ; SSE2-NEXT: psrld %xmm2, %xmm5
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: psrld %xmm2, %xmm4
@@ -640,7 +640,7 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; SSE2-NEXT: movdqa %xmm3, %xmm5
 ; SSE2-NEXT: psrld %xmm2, %xmm5
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: psrld %xmm2, %xmm4
@@ -677,7 +677,7 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; SSE2-NEXT: movdqa %xmm3, %xmm5
 ; SSE2-NEXT: psrld %xmm2, %xmm5
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: psrld %xmm2, %xmm4
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
index 64d8de9aead789..dab7785c85cc16 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
@@ -11,7 +11,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ; SSE2-LABEL: PR37890_v4i32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -20,7 +20,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ;
 ; SSSE3-SLOW-LABEL: PR37890_v4i32:
 ; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -36,7 +36,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ;
 ; AVX1-SLOW-LABEL: PR37890_v4i32:
 ; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -52,7 +52,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ;
 ; AVX2-LABEL: PR37890_v4i32:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -70,7 +70,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 define i16 @PR37890_v8i16(<8 x i16> %a) {
 ; SSE2-LABEL: PR37890_v8i16:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddw %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddw %xmm1, %xmm0
@@ -83,7 +83,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
 ;
 ; SSSE3-SLOW-LABEL: PR37890_v8i16:
 ; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
@@ -105,7 +105,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
 ;
 ; AVX1-SLOW-LABEL: PR37890_v8i16:
 ; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -126,7 +126,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
 ;
 ; AVX2-LABEL: PR37890_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -151,7 +151,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; SSE2-LABEL: PR37890_v8i32:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -161,7 +161,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; SSSE3-SLOW-LABEL: PR37890_v8i32:
 ; SSSE3-SLOW: # %bb.0:
 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -180,7 +180,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; AVX1-SLOW: # %bb.0:
 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -202,7 +202,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
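A scalar model (illustrative only) of the PR37890 reduction ladders above: each pshufd+padd step folds the upper half into the lower half, and only lane 0 of the final vector is read, which is why the first shuffle's upper lanes are free to be [2,3] (the new checks) or [0,1] (the old ones).

```cpp
#include <cstdint>

// Models the PR37890_v4i32 ladder:
//   pshufd [2,3,2,3] + paddd  -- fold the upper half into the lower half
//   pshufd [1,1,2,3] + paddd  -- fold lane 1 into lane 0
//   movd                      -- read lane 0 only
uint32_t reduceAddV4I32(const uint32_t V[4]) {
  uint32_t A0 = V[0] + V[2]; // lane 0 after the first pshufd+paddd
  uint32_t A1 = V[1] + V[3]; // lane 1 after the first pshufd+paddd
  return A0 + A1;            // lane 0 after the second step
}
```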
SSE2-NEXT: paddw %xmm1, %xmm0 @@ -239,7 +239,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) { ; SSSE3-SLOW-LABEL: PR37890_v16i16: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 @@ -264,7 +264,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -291,7 +291,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -322,7 +322,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) { ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -334,7 +334,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) { ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 @@ -346,7 +346,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) { ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm1 ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 ; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-FAST-NEXT: movd %xmm0, %eax @@ -359,7 +359,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) { ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -385,7 +385,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll index dc8f60248c672c..a17b1db2c1783e 100644 --- 
a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -16,7 +16,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE42-LABEL: test_reduce_v2i64: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; X86-SSE42-NEXT: movd %xmm2, %eax @@ -49,7 +49,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X86-AVX-LABEL: test_reduce_v2i64: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovd %xmm0, %eax @@ -58,7 +58,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v2i64: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -80,7 +80,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; X64-SSE42-NEXT: movq %xmm2, %rax @@ -88,7 +88,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -96,7 +96,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -104,7 +104,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX512-LABEL: test_reduce_v2i64: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: retq @@ -118,7 +118,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -135,7 +135,7 @@ define i32 
@test_reduce_v4i32(<4 x i32> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v4i32: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 @@ -144,7 +144,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X86-AVX-LABEL: test_reduce_v4i32: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -153,7 +153,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v4i32: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 @@ -170,7 +170,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v4i32: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 @@ -179,7 +179,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-AVX-LABEL: test_reduce_v4i32: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -198,7 +198,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -229,7 +229,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -273,7 +273,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -328,7 +328,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 @@ -419,7 +419,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm4, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4 ; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm4[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 @@ -444,7 +444,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -457,7 +457,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -470,7 +470,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -496,7 +496,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE2-NEXT: pand %xmm4, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4 ; X64-SSE2-NEXT: por %xmm0, %xmm4 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pxor %xmm0, %xmm2 @@ -519,7 +519,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -531,7 +531,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -543,7 +543,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -554,7 +554,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper @@ -577,7 +577,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X86-SSE2-NEXT: pand %xmm1, %xmm2 @@ -595,7 +595,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE42-LABEL: test_reduce_v8i32: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 @@ -606,7 +606,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -618,7 +618,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -633,7 +633,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm2 ; X64-SSE2-NEXT: por %xmm0, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X64-SSE2-NEXT: pand %xmm1, %xmm2 @@ -651,7 +651,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-SSE42-LABEL: test_reduce_v8i32: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 @@ -662,7 +662,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -674,7 +674,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -686,7 +686,7 @@ define i32 @test_reduce_v8i32(<8 x 
i32> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -710,7 +710,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -758,7 +758,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-LABEL: test_reduce_v16i16: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -838,7 +838,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 ; X86-SSE2-NEXT: pand %xmm1, %xmm2 @@ -916,7 +916,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm2 ; X64-SSE2-NEXT: por %xmm0, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 ; X64-SSE2-NEXT: pand %xmm1, %xmm2 @@ -1072,7 +1072,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm1, %xmm5 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: pxor %xmm0, %xmm4 @@ -1104,7 +1104,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE42-NEXT: movapd %xmm2, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 @@ -1122,7 +1122,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -1137,7 +1137,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1193,7 +1193,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm5 ; X64-SSE2-NEXT: pandn %xmm2, %xmm1 ; X64-SSE2-NEXT: por %xmm5, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 ; X64-SSE2-NEXT: pxor %xmm0, %xmm4 @@ -1223,7 +1223,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: movapd %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 @@ -1240,7 +1240,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -1254,7 +1254,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -1267,7 +1267,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper @@ -1303,7 +1303,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm4, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -1323,7 +1323,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1 ; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm1 ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 @@ -1337,7 +1337,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; 
X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1350,7 +1350,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1375,7 +1375,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE2-NEXT: pand %xmm0, %xmm1 ; X64-SSE2-NEXT: pandn %xmm4, %xmm0 ; X64-SSE2-NEXT: por %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 @@ -1395,7 +1395,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1 ; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm1 ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 @@ -1409,7 +1409,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1422,7 +1422,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1436,7 +1436,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1465,7 +1465,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 @@ -1521,7 +1521,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; 
X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 @@ -1622,7 +1622,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm4, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -1716,7 +1716,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm0, %xmm1 ; X64-SSE2-NEXT: pandn %xmm4, %xmm0 ; X64-SSE2-NEXT: por %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 @@ -1837,7 +1837,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1869,7 +1869,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1914,7 +1914,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1946,7 +1946,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1991,7 +1991,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -2047,7 +2047,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 @@ -2119,7 +2119,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -2175,7 +2175,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll index 987da0f68082c9..17c3a9fd4a0108 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -16,7 +16,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE42-LABEL: test_reduce_v2i64: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -50,7 +50,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X86-AVX-LABEL: test_reduce_v2i64: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovd %xmm0, %eax @@ -59,7 +59,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v2i64: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -81,7 +81,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -90,7 +90,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -98,7 +98,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -106,7 +106,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX512-LABEL: 
test_reduce_v2i64: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: retq @@ -120,7 +120,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -137,7 +137,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v4i32: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 @@ -146,7 +146,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X86-AVX-LABEL: test_reduce_v4i32: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -155,7 +155,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v4i32: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 @@ -172,7 +172,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v4i32: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 @@ -181,7 +181,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-AVX-LABEL: test_reduce_v4i32: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -200,7 +200,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -231,7 +231,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -275,7 +275,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -330,7 +330,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 @@ -421,7 +421,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm4, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4 ; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 @@ -447,7 +447,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -460,7 +460,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -473,7 +473,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -499,7 +499,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE2-NEXT: pand %xmm4, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4 ; X64-SSE2-NEXT: por %xmm0, %xmm4 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pxor %xmm0, %xmm2 @@ -523,7 +523,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -535,7 +535,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -547,7 +547,7 @@ define i64 @test_reduce_v4i64(<4 x 
i64> %a0) { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -558,7 +558,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper @@ -581,7 +581,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; X86-SSE2-NEXT: pand %xmm1, %xmm2 @@ -599,7 +599,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE42-LABEL: test_reduce_v8i32: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 @@ -610,7 +610,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -622,7 +622,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -637,7 +637,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm2 ; X64-SSE2-NEXT: por %xmm0, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; X64-SSE2-NEXT: pand %xmm1, %xmm2 @@ -655,7 +655,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-SSE42-LABEL: test_reduce_v8i32: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 @@ -666,7 +666,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -678,7 +678,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -690,7 +690,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -714,7 +714,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -762,7 +762,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-LABEL: test_reduce_v16i16: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -842,7 +842,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; X86-SSE2-NEXT: pand %xmm1, %xmm2 @@ -920,7 +920,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm2 ; X64-SSE2-NEXT: por %xmm0, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; X64-SSE2-NEXT: pand %xmm1, %xmm2 @@ -1076,7 +1076,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm5, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 @@ -1108,7 +1108,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE42-NEXT: movapd %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 @@ -1126,7 +1126,7 @@ define i64 @test_reduce_v8i64(<8 
x i64> %a0) { ; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -1141,7 +1141,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1197,7 +1197,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm3 ; X64-SSE2-NEXT: pandn %xmm5, %xmm1 ; X64-SSE2-NEXT: por %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 ; X64-SSE2-NEXT: pxor %xmm0, %xmm4 @@ -1227,7 +1227,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: movapd %xmm3, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 @@ -1244,7 +1244,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -1258,7 +1258,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -1271,7 +1271,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper @@ -1307,7 +1307,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm1, %xmm4 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm4, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 @@ -1327,7 +1327,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) 
{
 ; X86-SSE42-NEXT: pminsd %xmm3, %xmm1
 ; X86-SSE42-NEXT: pminsd %xmm2, %xmm1
 ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
@@ -1341,7 +1341,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1354,7 +1354,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1379,7 +1379,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE2-NEXT: pand %xmm1, %xmm4
 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1
 ; X64-SSE2-NEXT: por %xmm4, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm1
@@ -1399,7 +1399,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE42-NEXT: pminsd %xmm3, %xmm1
 ; X64-SSE42-NEXT: pminsd %xmm2, %xmm1
 ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
@@ -1413,7 +1413,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1426,7 +1426,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1440,7 +1440,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -1469,7 +1469,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-NEXT: pminsw %xmm3, %xmm1
 ; X86-SSE2-NEXT: pminsw %xmm2, %xmm1
 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
@@ -1525,7 +1525,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-NEXT: pminsw %xmm3, %xmm1
 ; X64-SSE2-NEXT: pminsw %xmm2, %xmm1
 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
@@ -1626,7 +1626,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE2-NEXT: pand %xmm1, %xmm4
 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1
 ; X86-SSE2-NEXT: por %xmm4, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm1
@@ -1720,7 +1720,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-SSE2-NEXT: pand %xmm1, %xmm4
 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1
 ; X64-SSE2-NEXT: por %xmm4, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm1
@@ -1841,7 +1841,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1873,7 +1873,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1918,7 +1918,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1950,7 +1950,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1995,7 +1995,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2051,7 +2051,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2123,7 +2123,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2179,7 +2179,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index 6e59cd046cb0a8..c69551aa3d8838 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -16,7 +16,7 @@
 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v2i64:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v2i64:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
 ; X86-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -52,7 +52,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X86-AVX1-LABEL: test_reduce_v2i64:
 ; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
 ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
@@ -65,7 +65,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X86-AVX2-LABEL: test_reduce_v2i64:
 ; X86-AVX2: ## %bb.0:
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -77,7 +77,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v2i64:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -99,7 +99,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v2i64:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; X64-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -110,7 +110,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX1-LABEL: test_reduce_v2i64:
 ; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -121,7 +121,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX2-LABEL: test_reduce_v2i64:
 ; X64-AVX2: ## %bb.0:
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -132,7 +132,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v2i64:
 ; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: retq
@@ -146,7 +146,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v4i32:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -169,7 +169,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X86-SSE42-LABEL: test_reduce_v4i32:
 ; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
@@ -178,7 +178,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X86-AVX-LABEL: test_reduce_v4i32:
 ; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -187,7 +187,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v4i32:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -210,7 +210,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-SSE42-LABEL: test_reduce_v4i32:
 ; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
@@ -219,7 +219,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-AVX-LABEL: test_reduce_v4i32:
 ; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -238,7 +238,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -275,7 +275,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -344,7 +344,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -385,7 +385,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -485,7 +485,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -514,7 +514,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -533,7 +533,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4
 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -551,7 +551,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -579,7 +579,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-SSE2-NEXT: pand %xmm4, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -606,7 +606,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -623,7 +623,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -640,7 +640,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -653,7 +653,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: vzeroupper
@@ -680,7 +680,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -703,7 +703,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v8i32:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
@@ -714,7 +714,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-AVX1: ## %bb.0:
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -726,7 +726,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-AVX2: ## %bb.0:
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -745,7 +745,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-SSE2-NEXT: pand %xmm4, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -768,7 +768,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v8i32:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
@@ -779,7 +779,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX1: ## %bb.0:
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -791,7 +791,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX2: ## %bb.0:
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -803,7 +803,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -830,7 +830,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -885,7 +885,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -965,7 +965,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i8:
 ; X86-SSE2: ## %bb.0:
 ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -1026,7 +1026,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v32i8:
 ; X64-SSE2: ## %bb.0:
 ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -1169,7 +1169,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT: pand %xmm1, %xmm5
 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1
 ; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1210,7 +1210,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm5, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm1, %xmm5
@@ -1238,7 +1238,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1
 ; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1260,7 +1260,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1318,7 +1318,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-SSE2-NEXT: pand %xmm1, %xmm5
 ; X64-SSE2-NEXT: pandn %xmm2, %xmm1
 ; X64-SSE2-NEXT: por %xmm5, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1357,7 +1357,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-SSE42-NEXT: xorpd %xmm5, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm5, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm1, %xmm5
@@ -1383,7 +1383,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1
 ; X64-AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1404,7 +1404,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1419,7 +1419,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: vzeroupper
@@ -1465,7 +1465,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE2-NEXT: pand %xmm1, %xmm5
 ; X86-SSE2-NEXT: pandn %xmm2, %xmm1
 ; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1490,7 +1490,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1
 ; X86-SSE42-NEXT: pmaxud %xmm2, %xmm1
 ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
@@ -1504,7 +1504,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1517,7 +1517,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1552,7 +1552,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE2-NEXT: pand %xmm1, %xmm5
 ; X64-SSE2-NEXT: pandn %xmm2, %xmm1
 ; X64-SSE2-NEXT: por %xmm5, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1577,7 +1577,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1
 ; X64-SSE42-NEXT: pmaxud %xmm2, %xmm1
 ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
@@ -1591,7 +1591,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1604,7 +1604,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1618,7 +1618,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
@@ -1652,7 +1652,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0
 ; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1717,7 +1717,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0
 ; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1810,7 +1810,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1
 ; X86-SSE2-NEXT: pmaxub %xmm2, %xmm1
 ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
@@ -1879,7 +1879,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1
 ; X64-SSE2-NEXT: pmaxub %xmm2, %xmm1
 ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
@@ -1987,7 +1987,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2025,7 +2025,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2097,7 +2097,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2135,7 +2135,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2207,7 +2207,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -2249,7 +2249,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -2332,7 +2332,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
@@ -2374,7 +2374,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index 413b5f2ac4aa53..5f33520200d241 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -16,7 +16,7 @@
 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v2i64:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v2i64:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm3
 ; X86-SSE42-NEXT: pxor %xmm0, %xmm3
@@ -53,7 +53,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X86-AVX1-LABEL: test_reduce_v2i64:
 ; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
 ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
@@ -66,7 +66,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X86-AVX2-LABEL: test_reduce_v2i64:
 ; X86-AVX2: ## %bb.0:
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -78,7 +78,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v2i64:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -100,7 +100,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v2i64:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm3
 ; X64-SSE42-NEXT: pxor %xmm0, %xmm3
@@ -112,7 +112,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX1-LABEL: test_reduce_v2i64:
 ; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -123,7 +123,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX2-LABEL: test_reduce_v2i64:
 ; X64-AVX2: ## %bb.0:
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -134,7 +134,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v2i64:
 ; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: retq
@@ -148,7 +148,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v4i32:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -171,7 +171,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X86-SSE42-LABEL: test_reduce_v4i32:
 ; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pminud %xmm0, %xmm1
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE42-NEXT: pminud %xmm1, %xmm0
@@ -180,7 +180,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X86-AVX-LABEL: test_reduce_v4i32:
 ; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -189,7 +189,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v4i32:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -212,7 +212,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-SSE42-LABEL: test_reduce_v4i32:
 ; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pminud %xmm0, %xmm1
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE42-NEXT: pminud %xmm1, %xmm0
@@ -221,7 +221,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-AVX-LABEL: test_reduce_v4i32:
 ; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -240,7 +240,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -271,7 +271,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -315,7 +315,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminub %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -350,7 +350,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminub %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -421,7 +421,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -451,7 +451,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -471,7 +471,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4
 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
 ; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
@@ -489,7 +489,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -517,7 +517,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-SSE2-NEXT: pand %xmm4, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -545,7 +545,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm3, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm2, %xmm3
@@ -563,7 +563,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4
 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
 ; X64-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
@@ -580,7 +580,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -593,7 +593,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: vzeroupper
@@ -620,7 +620,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -643,7 +643,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v8i32:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: pminud %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pminud %xmm0, %xmm1
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE42-NEXT: pminud %xmm1, %xmm0
@@ -654,7 +654,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-AVX1: ## %bb.0:
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -666,7 +666,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-AVX2: ## %bb.0:
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -685,7 +685,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-SSE2-NEXT: pand %xmm4, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -708,7 +708,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v8i32:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: pminud %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pminud %xmm0, %xmm1
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE42-NEXT: pminud %xmm1, %xmm0
@@ -719,7 +719,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX1: ## %bb.0:
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -731,7 +731,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX2: ## %bb.0:
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -743,7 +743,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -770,7 +770,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -816,7 +816,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -885,7 +885,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i8:
 ; X86-SSE2: ## %bb.0:
 ; X86-SSE2-NEXT: pminub %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminub %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -937,7 +937,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v32i8:
 ; X64-SSE2: ## %bb.0:
 ; X64-SSE2-NEXT: pminub %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminub %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -1069,7 +1069,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT: pand %xmm0, %xmm1
 ; X86-SSE2-NEXT: pandn %xmm5, %xmm0
 ; X86-SSE2-NEXT: por %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm1, %xmm4
@@ -1111,7 +1111,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE42-NEXT: xorpd %xmm4, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm4, %xmm0
 ; X86-SSE42-NEXT: pxor %xmm1, %xmm4
@@ -1140,7 +1140,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4
 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1162,7 +1162,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1220,7 +1220,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-SSE2-NEXT: pand %xmm1, %xmm3
 ; X64-SSE2-NEXT: pandn %xmm5, %xmm1
 ; X64-SSE2-NEXT: por %xmm3, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1260,7 +1260,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-SSE42-NEXT: xorpd %xmm4, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm4, %xmm0
 ; X64-SSE42-NEXT: pxor %xmm1, %xmm4
@@ -1287,7 +1287,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4
 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
 ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1308,7 +1308,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1323,7 +1323,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: vzeroupper
@@ -1369,7 +1369,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
 ; X86-SSE2-NEXT: pandn %xmm6, %xmm1
 ; X86-SSE2-NEXT: por %xmm3, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1394,7 +1394,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE42-NEXT: pminud %xmm3, %xmm1
 ; X86-SSE42-NEXT: pminud %xmm2, %xmm1
 ; X86-SSE42-NEXT: pminud %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: pminud %xmm1, %xmm0
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE42-NEXT: pminud %xmm0, %xmm1
@@ -1408,7 +1408,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1421,7 +1421,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1456,7 +1456,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE2-NEXT: pand %xmm1, %xmm3
 ; X64-SSE2-NEXT: pandn %xmm6, %xmm1
 ; X64-SSE2-NEXT: por %xmm3, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -1481,7 +1481,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE42-NEXT: pminud %xmm3, %xmm1
 ; X64-SSE42-NEXT: pminud %xmm2, %xmm1
 ; X64-SSE42-NEXT: pminud %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE42-NEXT: pminud %xmm1, %xmm0
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-SSE42-NEXT: pminud %xmm0, %xmm1
@@ -1495,7 +1495,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1508,7 +1508,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1522,7 +1522,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
@@ -1556,7 +1556,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0
 ; X86-SSE2-NEXT: pminsw %xmm2, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1612,7 +1612,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0
 ; X64-SSE2-NEXT: pminsw %xmm2, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -1694,7 +1694,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE2-NEXT: pminub %xmm3, %xmm1
 ; X86-SSE2-NEXT: pminub %xmm2, %xmm1
 ; X86-SSE2-NEXT: pminub %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: pminub %xmm1, %xmm0
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT: pminub %xmm0, %xmm1
@@ -1754,7 +1754,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-SSE2-NEXT: pminub %xmm3, %xmm1
 ; X64-SSE2-NEXT: pminub %xmm2, %xmm1
 ; X64-SSE2-NEXT: pminub %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: pminub %xmm1, %xmm0
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT: pminub %xmm0, %xmm1
@@ -1851,7 +1851,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -1883,7 +1883,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -1928,7 +1928,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -1960,7 +1960,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
@@ -2005,7 +2005,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminub %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -2041,7 +2041,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminub %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -2093,7 +2093,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminub %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminub %xmm1, %xmm0
@@ -2129,7 +2129,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminub %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminub %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll
index 8e75c4a575ee48..d128d75e64573c 100644
--- a/llvm/test/CodeGen/X86/i128-add.ll
+++ b/llvm/test/CodeGen/X86/i128-add.ll
@@ -77,7 +77,7 @@ define <1 x i128> @add_v1i128(<1 x i128> %x, <1 x i128> %y) nounwind {
 ; X64-NEXT: movq %rax, %xmm0
 ; X64-NEXT: movq %rsi, %xmm1
 ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-NEXT: movq %xmm0, %rdx
 ; X64-NEXT: addq $1, %rax
 ; X64-NEXT: adcq $0, %rdx
diff --git a/llvm/test/CodeGen/X86/inline-asm-x-i128.ll b/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
index 552ce96a53a517..7aee1d175494e2 100644
--- a/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
@@ -16,7 +16,7 @@ define { i64, i64 } @foo(i64 %0, i64 %1) {
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
 ; CHECK-NEXT: #NO_APP
 ; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: movq %xmm0, %rdx
 ; CHECK-NEXT: retq
 %3 = zext i64 %1 to i128
diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll
index a1606e93e2e23f..3b6912a9d94610 100644
--- a/llvm/test/CodeGen/X86/known-bits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-bits-vector.ll
@@ -116,14 +116,14 @@ define <4 x i32> @knownbits_mask_shuffle_shuffle_undef_sext(<8 x i16> %a0) nounw
 ; X32-LABEL: knownbits_mask_shuffle_shuffle_undef_sext:
 ; X32: # %bb.0:
 ; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X32-NEXT: vpmovsxwd %xmm0, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: knownbits_mask_shuffle_shuffle_undef_sext:
 ; X64: # %bb.0:
 ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-NEXT: vpmovsxwd %xmm0, %xmm0
 ; X64-NEXT: retq
 %1 = and <8 x i16> %a0,
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index b18b8079fd236b..6c4d0a919ef0f6 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -252,7 +252,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
 ; X86-LABEL: signbits_sext_shuffle_sitofp:
 ; X86: # %bb.0:
 ; X86-NEXT: vpmovsxdq %xmm0, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X86-NEXT: vpmovsxdq %xmm0, %xmm0
 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
@@ -264,7 +264,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
 ; X64-AVX1-LABEL: signbits_sext_shuffle_sitofp:
 ; X64-AVX1: # %bb.0:
 ; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
@@ -478,7 +478,7 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
 ; X64-AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; X64-AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
 ; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
 ; X64-AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm4
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 7bde3facc735e2..93097e2b98fb7c 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -25,7 +25,7 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read
 ; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB0_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -48,7 +48,7 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read
 ; AVX-NEXT: cmpq %rcx, %rax
 ; AVX-NEXT: jne .LBB0_1
 ; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -103,7 +103,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
 ; SSE2-NEXT: jne .LBB1_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
 ; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -128,7 +128,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT: # %bb.2: # %middle.block
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -153,7 +153,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
 ; AVX256-NEXT: # %bb.2: # %middle.block
 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX256-NEXT: vpaddd
%xmm1, %xmm0, %xmm0 @@ -218,7 +218,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -251,7 +251,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -278,7 +278,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -305,7 +305,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -386,7 +386,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -433,7 +433,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -466,7 +466,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -498,7 +498,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: 
 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -527,7 +527,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -593,7 +593,7 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly
 ; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB4_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -616,7 +616,7 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly
 ; AVX-NEXT: cmpq %rcx, %rax
 ; AVX-NEXT: jne .LBB4_1
 ; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -675,7 +675,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; SSE2-NEXT: jne .LBB5_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
 ; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -701,7 +701,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT: # %bb.2: # %middle.block
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -727,7 +727,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; AVX256-NEXT: # %bb.2: # %middle.block
 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -798,7 +798,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; SSE2-NEXT: paddd %xmm0, %xmm2
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -833,7 +833,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -861,7 +861,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -889,7 +889,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -982,7 +982,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT: paddd %xmm8, %xmm1
 ; SSE2-NEXT: paddd %xmm3, %xmm1
 ; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -1033,7 +1033,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1068,7 +1068,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1102,7 +1102,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1132,7 +1132,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1195,7 +1195,7 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read
 ; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB8_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -1218,7 +1218,7 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read
 ; AVX-NEXT: cmpq %rcx, %rax
 ; AVX-NEXT: jne .LBB8_1
 ; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1279,7 +1279,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
 ; SSE2-NEXT: jne .LBB9_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
 ; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -1310,7 +1310,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT: # %bb.2: # %middle.block
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1336,7 +1336,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
 ; AVX256-NEXT: # %bb.2: # %middle.block
 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1414,7 +1414,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; SSE2-NEXT: paddd %xmm3, %xmm0
 ; SSE2-NEXT: paddd %xmm2, %xmm1
 ; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -1459,7 +1459,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1491,7 +1491,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1519,7 +1519,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1627,7 +1627,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT: paddd %xmm5, %xmm9
 ; SSE2-NEXT: paddd %xmm10, %xmm9
 ; SSE2-NEXT: paddd %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm9, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -1700,7 +1700,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1744,7 +1744,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1778,7 +1778,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2074,7 +2074,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
 ;
 ; AVX1-LABEL: pmaddwd_negative2:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
@@ -2647,7 +2647,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; SSE2-NEXT: movdqu (%rcx), %xmm2
 ; SSE2-NEXT: pmaddwd %xmm0, %xmm2
 ; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm2, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -2661,7 +2661,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2708,7 +2708,7 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; SSE2-NEXT: pmaddwd %xmm0, %xmm1
 ; SSE2-NEXT: paddd %xmm3, %xmm1
 ; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -2730,7 +2730,7 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2798,13 +2798,13 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
 ; SSE2-NEXT: jne .LBB33_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
 ; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm2, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm3, %xmm2
 ; SSE2-NEXT: movd %xmm2, %ecx
 ; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -2839,14 +2839,14 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
 ; AVX1-NEXT: # %bb.2: # %middle.block
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovd %xmm1, %ecx
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2874,14 +2874,14 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) {
 ; AVX256-NEXT: # %bb.2: # %middle.block
 ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
 ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX256-NEXT: vmovd %xmm1, %ecx
 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2953,7 +2953,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
 ; SSE2-NEXT: jne .LBB34_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
 ; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -2980,7 +2980,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
 ; AVX1-NEXT: # %bb.2: # %middle.block
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -3007,7 +3007,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
 ; AVX256-NEXT: # %bb.2: # %middle.block
 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 24ae0c77af2fe1..c93543ddda7aed 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -2478,7 +2478,7 @@ define void @compressstore_v2i64_v2i1(i64* %base, <2 x i64> %V, <2 x i1> %mask)
 ; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB7_4
 ; SSE2-NEXT: LBB7_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, (%rdi)
 ; SSE2-NEXT: retq
 ;
@@ -2574,7 +2574,7 @@ define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask)
 ; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB8_4
 ; SSE2-NEXT: LBB8_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, (%rdi)
 ; SSE2-NEXT: addq $8, %rdi
 ; SSE2-NEXT: testb $4, %al
@@ -2585,7 +2585,7 @@ define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB8_8
 ; SSE2-NEXT: LBB8_7: ## %cond.store7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, (%rdi)
 ; SSE2-NEXT: retq
 ;
@@ -2762,7 +2762,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
 ; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB9_4
 ; SSE2-NEXT: LBB9_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, (%rdi)
 ; SSE2-NEXT: addq $8, %rdi
 ; SSE2-NEXT: testb $4, %al
@@ -2773,7 +2773,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB9_8
 ; SSE2-NEXT: LBB9_7: ## %cond.store7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, (%rdi)
 ; SSE2-NEXT: addq $8, %rdi
 ; SSE2-NEXT: testb $16, %al
@@ -2784,7 +2784,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
 ; SSE2-NEXT: testb $32, %al
 ; SSE2-NEXT: je LBB9_12
 ; SSE2-NEXT: LBB9_11: ## %cond.store13
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, (%rdi)
 ; SSE2-NEXT: addq $8, %rdi
 ; SSE2-NEXT: testb $64, %al
@@ -2795,7 +2795,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
 ; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je LBB9_16
 ; SSE2-NEXT: LBB9_15: ## %cond.store19
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, (%rdi)
 ; SSE2-NEXT: retq
 ;
@@ -3068,7 +3068,7 @@ define void @compressstore_v4i32_v4i32(i32* %base, <4 x i32> %V, <4 x i32> %trig
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB10_6
 ; SSE2-NEXT: LBB10_5: ## %cond.store4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testb $8, %al
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index dbd95213a60dec..c3020b5f467c6a 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -34,23 +34,23 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
 ; SSE-NEXT: retq
 ; SSE-NEXT: .LBB0_1: # %cond.load
 ; SSE-NEXT: movq %xmm0, %rcx
-; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5,6,7]
 ; SSE-NEXT: testb $2, %al
 ; SSE-NEXT: je .LBB0_4
 ; SSE-NEXT: .LBB0_3: # %cond.load1
 ; SSE-NEXT: pextrq $1, %xmm0, %rcx
-; SSE-NEXT: insertps $16, (%rcx), %xmm3 # xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
 ; SSE-NEXT: testb $4, %al
 ; SSE-NEXT: je .LBB0_6
 ; SSE-NEXT: .LBB0_5: # %cond.load4
 ; SSE-NEXT: movq %xmm1, %rcx
-; SSE-NEXT: insertps $32, (%rcx), %xmm3 # xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
 ; SSE-NEXT: testb $8, %al
 ; SSE-NEXT: je .LBB0_8
 ; SSE-NEXT: .LBB0_7: # %cond.load7
 ; SSE-NEXT: pextrq $1, %xmm1, %rax
-; SSE-NEXT: insertps $48, (%rax), %xmm3 # xmm3 = xmm3[0,1,2],mem[0]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
 ; SSE-NEXT: movaps %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -63,14 +63,14 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
 ; AVX1-NEXT: je .LBB0_2
 ; AVX1-NEXT: # %bb.1: # %cond.load
 ; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT: .LBB0_2: # %else
 ; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB0_4
 ; AVX1-NEXT: # %bb.3: # %cond.load1
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; AVX1-NEXT: .LBB0_4: # %else2
 ; AVX1-NEXT: testb $4, %al
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -84,12 +84,12 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
 ; AVX1-NEXT: retq
 ; AVX1-NEXT: .LBB0_5: # %cond.load4
 ; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: testb $8, %al
 ; AVX1-NEXT: je .LBB0_8
 ; AVX1-NEXT: .LBB0_7: # %cond.load7
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: vmovaps %xmm2, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -103,14 +103,14 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
 ; AVX2-NEXT: je .LBB0_2
 ; AVX2-NEXT: # %bb.1: # %cond.load
 ; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT: .LBB0_2: # %else
 ; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB0_4
 ; AVX2-NEXT: # %bb.3: # %cond.load1
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; AVX2-NEXT: .LBB0_4: # %else2
 ; AVX2-NEXT: testb $4, %al
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -124,12 +124,12 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
 ; AVX2-NEXT: retq
 ; AVX2-NEXT: .LBB0_5: # %cond.load4
 ; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: testb $8, %al
 ; AVX2-NEXT: je .LBB0_8
 ; AVX2-NEXT: .LBB0_7: # %cond.load7
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -159,7 +159,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; SSE-NEXT: pmovsxdq %xmm0, %xmm4
 ; SSE-NEXT: psllq $2, %xmm4
 ; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE-NEXT: pmovsxdq %xmm0, %xmm0
 ; SSE-NEXT: pxor %xmm5, %xmm5
 ; SSE-NEXT: pcmpeqd %xmm1, %xmm5
@@ -168,7 +168,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; SSE-NEXT: je .LBB1_2
 ; SSE-NEXT: # %bb.1: # %cond.load
 ; SSE-NEXT: movq %xmm4, %rcx
-; SSE-NEXT: movd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
 ; SSE-NEXT: .LBB1_2: # %else
 ; SSE-NEXT: psllq $2, %xmm0
@@ -176,7 +176,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; SSE-NEXT: je .LBB1_4
 ; SSE-NEXT: # %bb.3: # %cond.load1
 ; SSE-NEXT: pextrq $1, %xmm4, %rcx
-; SSE-NEXT: insertps $16, (%rcx), %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; SSE-NEXT: .LBB1_4: # %else2
 ; SSE-NEXT: paddq %xmm0, %xmm3
 ; SSE-NEXT: testb $4, %al
@@ -189,12 +189,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; SSE-NEXT: retq
 ; SSE-NEXT: .LBB1_5: # %cond.load4
 ; SSE-NEXT: movq %xmm3, %rcx
-; SSE-NEXT: insertps $32, (%rcx), %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; SSE-NEXT: testb $8, %al
 ; SSE-NEXT: je .LBB1_8
 ; SSE-NEXT: .LBB1_7: # %cond.load7
 ; SSE-NEXT: pextrq $1, %xmm3, %rax
-; SSE-NEXT: insertps $48, (%rax), %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; SSE-NEXT: movaps %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -202,7 +202,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vmovq %rdi, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm4, %xmm4
 ; AVX1-NEXT: vpsllq $2, %xmm4, %xmm4
 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm4
@@ -217,14 +217,14 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; AVX1-NEXT: je .LBB1_2
 ; AVX1-NEXT: # %bb.1: # %cond.load
 ; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT: .LBB1_2: # %else
 ; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB1_4
 ; AVX1-NEXT: # %bb.3: # %cond.load1
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; AVX1-NEXT: .LBB1_4: # %else2
 ; AVX1-NEXT: testb $4, %al
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -238,12 +238,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; AVX1-NEXT: retq
 ; AVX1-NEXT: .LBB1_5: # %cond.load4
 ; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: testb $8, %al
 ; AVX1-NEXT: je .LBB1_8
 ; AVX1-NEXT: .LBB1_7: # %cond.load7
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: vmovaps %xmm2, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -262,14 +262,14 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; AVX2-NEXT: je .LBB1_2
 ; AVX2-NEXT: # %bb.1: # %cond.load
 ; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT: .LBB1_2: # %else
 ; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB1_4
 ; AVX2-NEXT: # %bb.3: # %cond.load1
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; AVX2-NEXT: .LBB1_4: # %else2
 ; AVX2-NEXT: testb $4, %al
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -283,12 +283,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; AVX2-NEXT: retq
 ; AVX2-NEXT: .LBB1_5: # %cond.load4
 ; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: testb $8, %al
 ; AVX2-NEXT: je .LBB1_8
 ; AVX2-NEXT: .LBB1_7: # %cond.load7
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -328,7 +328,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; SSE-NEXT: je .LBB2_2
 ; SSE-NEXT: # %bb.1: # %cond.load
 ; SSE-NEXT: movq %xmm0, %rcx
-; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5,6,7]
 ; SSE-NEXT: .LBB2_2: # %else
 ; SSE-NEXT: psllq $2, %xmm1
@@ -336,7 +336,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; SSE-NEXT: je .LBB2_4
 ; SSE-NEXT: # %bb.3: # %cond.load1
 ; SSE-NEXT: pextrq $1, %xmm0, %rcx
-; SSE-NEXT: insertps $16, (%rcx), %xmm3 # xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
 ; SSE-NEXT: .LBB2_4: # %else2
 ; SSE-NEXT: paddq %xmm1, %xmm4
 ; SSE-NEXT: testb $4, %al
@@ -349,12 +349,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; SSE-NEXT: retq
 ; SSE-NEXT: .LBB2_5: # %cond.load4
 ; SSE-NEXT: movq %xmm4, %rcx
-; SSE-NEXT: insertps $32, (%rcx), %xmm3 # xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
 ; SSE-NEXT: testb $8, %al
 ; SSE-NEXT: je .LBB2_8
 ; SSE-NEXT: .LBB2_7: # %cond.load7
 ; SSE-NEXT: pextrq $1, %xmm4, %rax
-; SSE-NEXT: insertps $48, (%rax), %xmm3 # xmm3 = xmm3[0,1,2],mem[0]
+; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
 ; SSE-NEXT: movaps %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -375,14 +375,14 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; AVX1-NEXT: je .LBB2_2
 ; AVX1-NEXT: # %bb.1: # %cond.load
 ; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT: .LBB2_2: # %else
 ; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB2_4
 ; AVX1-NEXT: # %bb.3: # %cond.load1
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; AVX1-NEXT: .LBB2_4: # %else2
 ; AVX1-NEXT: testb $4, %al
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -396,12 +396,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; AVX1-NEXT: retq
 ; AVX1-NEXT: .LBB2_5: # %cond.load4
 ; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: testb $8, %al
 ; AVX1-NEXT: je .LBB2_8
 ; AVX1-NEXT: .LBB2_7: # %cond.load7
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: vmovaps %xmm2, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -419,14 +419,14 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; AVX2-NEXT: je .LBB2_2
 ; AVX2-NEXT: # %bb.1: # %cond.load
 ; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT: .LBB2_2: # %else
 ; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB2_4
 ; AVX2-NEXT: # %bb.3: # %cond.load1
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; AVX2-NEXT: .LBB2_4: # %else2
 ; AVX2-NEXT: testb $4, %al
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -440,12 +440,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; AVX2-NEXT: retq
 ; AVX2-NEXT: .LBB2_5: # %cond.load4
 ; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: testb $8, %al
 ; AVX2-NEXT: je .LBB2_8
 ; AVX2-NEXT: .LBB2_7: # %cond.load7
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: vmovaps %xmm2, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -480,7 +480,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; SSE: # %bb.0:
 ; SSE-NEXT: movq %rdi, %xmm6
 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
 ; SSE-NEXT: pmovsxdq %xmm0, %xmm0
 ; SSE-NEXT: paddq %xmm8, %xmm0
 ; SSE-NEXT: pxor %xmm6, %xmm6
@@ -513,7 +513,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; SSE-NEXT: pextrq $1, %xmm4, %rcx
 ; SSE-NEXT: pinsrb $3, (%rcx), %xmm5
 ; SSE-NEXT: .LBB3_8: # %else8
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE-NEXT: paddq %xmm8, %xmm0
 ; SSE-NEXT: testb $16, %al
 ; SSE-NEXT: je .LBB3_10
@@ -542,7 +542,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; SSE-NEXT: pextrq $1, %xmm1, %rcx
 ; SSE-NEXT: pinsrb $7, (%rcx), %xmm5
 ; SSE-NEXT: .LBB3_16: # %else20
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSE-NEXT: paddq %xmm8, %xmm0
 ; SSE-NEXT: testl $256, %eax # imm = 0x100
 ; SSE-NEXT: je .LBB3_18
@@ -571,7 +571,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; SSE-NEXT: pextrq $1, %xmm1, %rcx
 ; SSE-NEXT: pinsrb $11, (%rcx), %xmm5
 ; SSE-NEXT: .LBB3_24: # %else32
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; SSE-NEXT: paddq %xmm8, %xmm0
 ; SSE-NEXT: testl $4096, %eax # imm = 0x1000
 ; SSE-NEXT: je .LBB3_26
@@ -611,7 +611,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: vmovq %rdi, %xmm4
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm6, %xmm6
 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -626,7 +626,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: vmovq %xmm0, %rcx
 ; AVX1-NEXT: vpinsrb $0, (%rcx), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_2: # %else
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
 ; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB3_4
@@ -657,7 +657,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: vmovq %xmm0, %rcx
 ; AVX1-NEXT: vpinsrb $4, (%rcx), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_10: # %else11
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm6
 ; AVX1-NEXT: testb $32, %al
 ; AVX1-NEXT: je .LBB3_12
@@ -689,7 +689,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: vmovq %xmm0, %rcx
 ; AVX1-NEXT: vpinsrb $8, (%rcx), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_18: # %else23
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
 ; AVX1-NEXT: testl $512, %eax # imm = 0x200
 ; AVX1-NEXT: je .LBB3_20
@@ -1040,7 +1040,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; SSE-NEXT: je .LBB4_1
 ; SSE-NEXT: # %bb.2: # %cond.load
 ; SSE-NEXT: movq %xmm5, %rcx
-; SSE-NEXT: movd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: testb $2, %al
 ; SSE-NEXT: jne .LBB4_4
 ; SSE-NEXT: jmp .LBB4_5
@@ -1105,7 +1105,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; SSE-NEXT: je .LBB4_19
 ; SSE-NEXT: # %bb.20: # %cond.load23
 ; SSE-NEXT: movq %xmm4, %rcx
-; SSE-NEXT: movd (%rcx), %xmm5 # xmm5 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
 ; SSE-NEXT: testb $2, %al
 ; SSE-NEXT: jne .LBB4_22
 ; SSE-NEXT: jmp .LBB4_23
@@ -1174,7 +1174,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; SSE-NEXT: je .LBB4_37
 ; SSE-NEXT: # %bb.38: # %cond.load72
 ; SSE-NEXT: movq %xmm4, %rcx
-; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE-NEXT: testb $2, %al
 ; SSE-NEXT: jne .LBB4_40
 ; SSE-NEXT: jmp .LBB4_41
@@ -1260,7 +1260,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX1-NEXT: je .LBB4_2
 ; AVX1-NEXT: # %bb.1: # %cond.load
 ; AVX1-NEXT: vmovq %xmm3, %rdx
-; AVX1-NEXT: vmovd (%rdx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX1-NEXT: .LBB4_2: # %else
 ; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB4_4
@@ -1334,7 +1334,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX1-NEXT: je .LBB4_18
 ; AVX1-NEXT: # %bb.17: # %cond.load23
 ; AVX1-NEXT: vmovq %xmm7, %rcx
-; AVX1-NEXT: vmovd (%rcx), %xmm4 # xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; AVX1-NEXT: .LBB4_18: # %else27
 ; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB4_20
@@ -1405,7 +1405,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX1-NEXT: je .LBB4_34
 ; AVX1-NEXT: # %bb.33: # %cond.load72
 ; AVX1-NEXT: vmovq %xmm7, %rcx
-; AVX1-NEXT: vmovd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX1-NEXT: .LBB4_34: # %else76
 ; AVX1-NEXT: testb $2, %al
 ; AVX1-NEXT: je .LBB4_36
@@ -1491,7 +1491,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX2-NEXT: je .LBB4_2
 ; AVX2-NEXT: # %bb.1: # %cond.load
 ; AVX2-NEXT: vmovq %xmm3, %rcx
-; AVX2-NEXT: vmovd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX2-NEXT: .LBB4_2: # %else
 ; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB4_4
@@ -1534,7 +1534,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX2-NEXT: je .LBB4_18
 ; AVX2-NEXT: # %bb.17: # %cond.load23
 ; AVX2-NEXT: vmovq %xmm3, %rcx
-; AVX2-NEXT: vmovd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX2-NEXT: .LBB4_18: # %else27
 ; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB4_20
@@ -1678,7 +1678,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX2-NEXT: jmp .LBB4_32
 ; AVX2-NEXT: .LBB4_33: # %cond.load72
 ; AVX2-NEXT: vmovq %xmm3, %rcx
-; AVX2-NEXT: vmovd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX2-NEXT: testb $2, %al
 ; AVX2-NEXT: je .LBB4_36
 ; AVX2-NEXT: .LBB4_35: # %cond.load78
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index a14d537bc25da6..e8dc7412eef8b8 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -153,7 +153,7 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <
 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
@@ -233,7 +233,7 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad
 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
@@ -458,12 +458,12 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <
 ;
 ; AVX1-LABEL: load_v8f64_v8i16:
 ; AVX1: ## %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
@@ -479,7 +479,7 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <
 ;
 ; AVX2-LABEL: load_v8f64_v8i16:
 ; AVX2: ## %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
 ; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
@@ -1778,12 +1778,12 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6
 ;
 ; AVX1-LABEL: load_v8i64_v8i16:
 ; AVX1: ## %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
@@ -1799,7 +1799,7 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6
 ;
 ; AVX2-LABEL: load_v8i64_v8i16:
 ; AVX2: ## %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
 ; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 29678e8495c229..389281726d2739 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -838,7 +838,7 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %
 ; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB7_4
 ; SSE2-NEXT: LBB7_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, 8(%rdi)
 ; SSE2-NEXT: retq
 ;
@@ -922,7 +922,7 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
 ; SSE2-NEXT: testb $2, %al
 ; SSE2-NEXT: je LBB8_4
 ; SSE2-NEXT: LBB8_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, 8(%rdi)
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB8_6
@@ -931,7 +931,7 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB8_8
 ; SSE2-NEXT: LBB8_7: ## %cond.store5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE2-NEXT: movq %xmm0, 24(%rdi)
 ; SSE2-NEXT: retq
 ;
@@ -1158,7 +1158,7 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB11_6
 ; SSE2-NEXT: LBB11_5: ## %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB11_8
@@ -1280,7 +1280,7 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB12_6
 ; SSE2-NEXT: LBB12_5: ## %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB12_8
@@ -1299,7 +1299,7 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %
 ; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je LBB12_14
 ; SSE2-NEXT: LBB12_13: ## %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 24(%rdi)
 ; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je LBB12_16
@@ -4674,7 +4674,7 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub
 ; AVX1: ## %bb.0:
 ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi)
@@ -4853,7 +4853,7 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
 ; SSE2-NEXT: testb $4, %cl
 ; SSE2-NEXT: je LBB25_6
 ; SSE2-NEXT: LBB25_5: ## %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 341a349911180d..640e145c202310 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -60,7 +60,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je .LBB0_6
 ; SSE2-NEXT: .LBB0_5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: jne .LBB0_7
@@ -75,7 +75,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
 ; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je .LBB0_14
 ; SSE2-NEXT: .LBB0_13: # %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 24(%rdi)
 ; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je .LBB0_16
@@ -1030,7 +1030,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je .LBB3_6
 ; SSE2-NEXT: .LBB3_5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je .LBB3_8
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index c120684c426103..17c113f098eb4b 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -141,7 +141,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je .LBB0_6
 ; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movd %xmm4, 8(%rdi)
 ; SSE2-NEXT: .LBB0_6: # %else4
 ; SSE2-NEXT: por %xmm3, %xmm2
@@ -176,7 +176,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
 ; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je .LBB0_14
 ; SSE2-NEXT: .LBB0_13: # %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 24(%rdi)
 ; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je .LBB0_16
@@ -1579,7 +1579,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je .LBB3_6
 ; SSE2-NEXT: .LBB3_5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je .LBB3_8
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 0160733732d709..2ca9ebb0d5c6d6 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -87,7 +87,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je .LBB0_6
 ; SSE2-NEXT: # %bb.5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, 8(%rdi)
 ; SSE2-NEXT: .LBB0_6: # %else4
 ; SSE2-NEXT: por %xmm0, %xmm3
@@ -122,7 +122,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask
 ; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je .LBB0_14
 ; SSE2-NEXT: .LBB0_13: # %cond.store11
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 24(%rdi)
 ; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je .LBB0_16
@@ -1351,7 +1351,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je .LBB3_6
 ; SSE2-NEXT: .LBB3_5: # %cond.store3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je .LBB3_8
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index 8b3756a1fa2415..6dcc47b9a65c4e 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -309,7 +309,7 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm2, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movd %xmm2, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -320,7 +320,7 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
@@ -348,12 +348,12 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
 ; X64-SSE2-NEXT: movq %xmm0, %rax
 ; X64-SSE2-NEXT: movntiq %rax, (%rsi)
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movq %xmm0, %rax
 ; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
 ; X64-SSE2-NEXT: movq %xmm1, %rax
 ; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movq %xmm0, %rax
 ; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
 ; X64-SSE2-NEXT: retq
@@ -422,7 +422,7 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm2, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movd %xmm2, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -433,7 +433,7 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: movd %xmm0, %ecx
 ; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
@@ -461,12 +461,12 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
 ; X64-SSE2-NEXT: movq %xmm0, %rax
 ; X64-SSE2-NEXT: movntiq %rax, (%rsi)
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movq %xmm0, %rax
 ; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
 ; X64-SSE2-NEXT: movq %xmm1, %rax
 ; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movq %xmm0, %rax
 ; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
 ; X64-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 2980a1665db145..666df4a1b960d8 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -197,7 +197,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -263,7 +263,7 @@ define i32
@_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -327,7 +327,7 @@ define i32 @sad_16i8_256() "min-legal-vector-width"="256" { ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -388,7 +388,7 @@ define i32 @sad_16i8_512() "min-legal-vector-width"="512" { ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -955,10 +955,10 @@ define void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal ; CHECK-LABEL: zext_v16i8_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero @@ -977,10 +977,10 @@ define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal ; CHECK-LABEL: sext_v16i8_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3 ; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0 ; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll index c83c675e78890a..4e83a7ce7231b2 100644 --- a/llvm/test/CodeGen/X86/nontemporal-2.ll +++ b/llvm/test/CodeGen/X86/nontemporal-2.ll @@ -595,14 +595,14 @@ define void @test_extract_f64(<2 x double> %arg, double* %dst) { define void @test_extract_i64(<2 x i64> 
%arg, i64* %dst) { ; SSE2-LABEL: test_extract_i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movntiq %rax, (%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_extract_i64: ; SSE4A: # %bb.0: -; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE4A-NEXT: movq %xmm0, %rax ; SSE4A-NEXT: movntiq %rax, (%rdi) ; SSE4A-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 2f74c830221b08..d24fd3f024d49f 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -9,7 +9,7 @@ define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind { ; SSE2-LABEL: v3i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movq %xmm2, 16(%rdi) ; SSE2-NEXT: movdqa %xmm0, (%rdi) @@ -285,7 +285,7 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind { ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7] ; SSE2-NEXT: movw %ax, 12(%rdi) ; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; @@ -391,7 +391,7 @@ define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind { ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: movq %xmm2, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; @@ -539,7 +539,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind { ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] @@ -637,7 +637,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; @@ -1202,7 +1202,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE2-NEXT: movups 32(%rdi), %xmm10 ; SSE2-NEXT: movups 48(%rdi), %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[3,3] @@ -1215,7 +1215,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[1,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,2,3] ; SSE2-NEXT: shufps {{.*#+}} 
xmm12 = xmm12[0,3],xmm4[0,2] ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3] @@ -1243,12 +1243,12 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5 ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[1] ; SSE42-NEXT: movdqa %xmm9, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1] ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,2,2] @@ -1476,7 +1476,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] @@ -1489,7 +1489,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5],xmm4[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll index dd258c5f424a43..7f3eb0898cc864 100644 --- a/llvm/test/CodeGen/X86/phaddsub-extract.ll +++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll @@ -46,7 +46,7 @@ define i32 @extract_extract01_v4i32_add_i32(<4 x i32> %x) { define i32 @extract_extract23_v4i32_add_i32(<4 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -114,7 +114,7 @@ define i32 @extract_extract01_v4i32_add_i32_commute(<4 x i32> %x) { define i32 @extract_extract23_v4i32_add_i32_commute(<4 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -326,7 +326,7 @@ define i32 @extract_extract01_v4i32_sub_i32(<4 x i32> %x) { define i32 @extract_extract23_v4i32_sub_i32(<4 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4i32_sub_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %eax ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx @@ -382,7 +382,7 @@ define i32 @extract_extract01_v4i32_sub_i32_commute(<4 x i32> %x) { define i32 @extract_extract23_v4i32_sub_i32_commute(<4 x i32> %x) { ; SSE3-LABEL: extract_extract23_v4i32_sub_i32_commute: ; SSE3: # %bb.0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-NEXT: movd %xmm1, %ecx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-NEXT: movd %xmm0, %eax @@ -557,7 +557,7 @@ define i32 @extract_extract01_v8i32_add_i32(<8 x i32> %x) { define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -594,7 +594,7 @@ define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) { define i32 @extract_extract67_v8i32_add_i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -684,7 +684,7 @@ define i32 @extract_extract01_v8i32_add_i32_commute(<8 x i32> %x) { define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -721,7 +721,7 @@ define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) { define i32 @extract_extract67_v8i32_add_i32_commute(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -1119,7 +1119,7 @@ define i32 @extract_extract01_v8i32_sub_i32(<8 x i32> %x) { define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8i32_sub_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %eax ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx @@ -1156,7 +1156,7 @@ define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) { define i32 @extract_extract67_v8i32_sub_i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract67_v8i32_sub_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx @@ -1672,7 +1672,7 @@ define i32 @extract_extract01_v4i32_add_i32_uses3(<4 x i32> %x, i32* %p1, i32* % define i32 
@partial_reduction_add_v8i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: partial_reduction_add_v8i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -1681,7 +1681,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { ; ; SSE3-FAST-LABEL: partial_reduction_add_v8i32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 ; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 ; SSE3-FAST-NEXT: movd %xmm1, %eax @@ -1689,7 +1689,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_add_v8i32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1715,7 +1715,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { ; SSE3-SLOW-LABEL: partial_reduction_add_v16i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -1724,7 +1724,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { ; ; SSE3-FAST-LABEL: partial_reduction_add_v16i32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 ; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 ; SSE3-FAST-NEXT: movd %xmm1, %eax @@ -1732,7 +1732,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_add_v16i32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1758,7 +1758,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: partial_reduction_sub_v8i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 @@ -1767,7 +1767,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { ; ; SSE3-FAST-LABEL: partial_reduction_sub_v8i32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: psubd %xmm1, %xmm0 ; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 ; SSE3-FAST-NEXT: movd %xmm0, %eax @@ -1775,7 +1775,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_sub_v8i32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: 
vpsubd %xmm1, %xmm0, %xmm0 @@ -1785,7 +1785,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { ; ; AVX-FAST-LABEL: partial_reduction_sub_v8i32: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vmovd %xmm0, %eax @@ -1802,7 +1802,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; SSE3-SLOW-LABEL: partial_reduction_sub_v16i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 @@ -1811,7 +1811,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; SSE3-FAST-LABEL: partial_reduction_sub_v16i32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: psubd %xmm1, %xmm0 ; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 ; SSE3-FAST-NEXT: movd %xmm0, %eax @@ -1819,7 +1819,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_sub_v16i32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1829,7 +1829,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; AVX1-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax @@ -1838,7 +1838,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; AVX2-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1848,7 +1848,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; AVX512-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1868,7 +1868,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { define i16 @hadd16_8(<8 x i16> %x223) { ; SSE3-SLOW-LABEL: hadd16_8: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddw %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddw %xmm1, %xmm0 @@ -1890,7 +1890,7 @@ define i16 @hadd16_8(<8 x i16> %x223) { ; ; AVX-SLOW-LABEL: hadd16_8: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -1921,7 +1921,7 @@ define i16 @hadd16_8(<8 x i16> %x223) { define i32 @hadd32_4(<4 x i32> %x225) { ; SSE3-SLOW-LABEL: hadd32_4: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -1937,7 +1937,7 @@ define i32 @hadd32_4(<4 x i32> %x225) { ; ; AVX-SLOW-LABEL: hadd32_4: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1961,7 +1961,7 @@ define i32 @hadd32_4(<4 x i32> %x225) { define i32 @hadd32_8(<8 x i32> %x225) { ; SSE3-SLOW-LABEL: hadd32_8: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -1970,7 +1970,7 @@ define i32 @hadd32_8(<8 x i32> %x225) { ; ; SSE3-FAST-LABEL: hadd32_8: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 ; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 ; SSE3-FAST-NEXT: movd %xmm1, %eax @@ -1978,7 +1978,7 @@ define i32 @hadd32_8(<8 x i32> %x225) { ; ; AVX-SLOW-LABEL: hadd32_8: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2004,7 +2004,7 @@ define i32 @hadd32_8(<8 x i32> %x225) { define i32 @hadd32_16(<16 x i32> %x225) { ; SSE3-SLOW-LABEL: hadd32_16: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -2013,7 +2013,7 @@ define i32 @hadd32_16(<16 x i32> %x225) { ; ; SSE3-FAST-LABEL: hadd32_16: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 ; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 ; SSE3-FAST-NEXT: movd %xmm1, %eax @@ -2021,7 +2021,7 @@ define i32 @hadd32_16(<16 x i32> %x225) { ; ; AVX-SLOW-LABEL: hadd32_16: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2119,7 +2119,7 @@ define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 { define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize { ; SSE3-LABEL: hadd32_8_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-NEXT: paddd %xmm0, %xmm1 ; SSE3-NEXT: phaddd %xmm1, %xmm1 ; SSE3-NEXT: movd %xmm1, %eax @@ -2143,7 +2143,7 @@ define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize { define i32 
@hadd32_16_optsize(<16 x i32> %x225) optsize { ; SSE3-LABEL: hadd32_16_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-NEXT: paddd %xmm0, %xmm1 ; SSE3-NEXT: phaddd %xmm1, %xmm1 ; SSE3-NEXT: movd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 30f87e3d9b27bc..4285e7b603f808 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -1083,7 +1083,7 @@ define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) { ; ; SSE41-LABEL: mul_v4i64_zero_upper_left: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -1252,14 +1252,14 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm14 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] @@ -1306,7 +1306,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 @@ -1324,7 +1324,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; ; AVX2-LABEL: mul_v8i64_sext: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 14fc182a334a8f..31e113f9a003f6 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -434,7 +434,7 @@ define <8 x i32> @mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmulhuw %xmm1, %xmm0 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -467,7 +467,7 @@ define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmulhw %xmm1, %xmm0 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -498,7 +498,7 @@ define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmulhw %xmm1, %xmm0 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -538,9 +538,9 @@ define <16 x i32> @mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmulhuw %xmm3, %xmm1 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm5, %xmm1 @@ -589,9 +589,9 @@ define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmulhw %xmm3, %xmm1 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm5, %xmm1 @@ -639,9 +639,9 @@ define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) { ; SSE41-NEXT: pmovsxwd %xmm0, %xmm4 ; SSE41-NEXT: pmulhw %xmm3, %xmm1 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm5, %xmm1 @@ -702,16 +702,16 @@ define <32 x i32> @mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhuw %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; SSE41-NEXT: pmulhuw %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; SSE41-NEXT: pmulhuw %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; SSE41-NEXT: pmulhuw %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] 
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -800,16 +800,16 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; SSE41-NEXT: pmulhw %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; SSE41-NEXT: pmulhw %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; SSE41-NEXT: pmulhw %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -901,16 +901,16 @@ define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm4, %xmm4 ; SSE41-NEXT: pmulhw %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm5, %xmm5 ; SSE41-NEXT: pmulhw %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm6, %xmm6 ; SSE41-NEXT: pmulhw %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm7, %xmm7 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 @@ -1026,28 +1026,28 @@ define <64 x i32> @mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm11 = 
xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1204,28 +1204,28 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1386,28 +1386,28 @@ define <64 x 
i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm8, %xmm8 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm9, %xmm9 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm10, %xmm10 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm11, %xmm11 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm12, %xmm12 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm13, %xmm13 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm14, %xmm14 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm15, %xmm15 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 @@ -1541,7 +1541,7 @@ define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -1552,7 +1552,7 @@ define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX2-NEXT: retq ; @@ -1648,7 +1648,7 @@ define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -1659,7 +1659,7 @@ define <8 x i64> 
@mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX2-NEXT: retq ; @@ -1775,7 +1775,7 @@ define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 @@ -1786,7 +1786,7 @@ define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll index 73acb76ce55fce..b8ecadba81c0ee 100644 --- a/llvm/test/CodeGen/X86/pr15267.ll +++ b/llvm/test/CodeGen/X86/pr15267.ll @@ -75,7 +75,7 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind { ; CHECK-NEXT: negl %eax ; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr39733.ll b/llvm/test/CodeGen/X86/pr39733.ll index 75f9dc51b85eb5..31bd5b71d0a6e7 100644 --- a/llvm/test/CodeGen/X86/pr39733.ll +++ b/llvm/test/CodeGen/X86/pr39733.ll @@ -21,7 +21,7 @@ define void @test55() { ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 ; CHECK-NEXT: # implicit-def: $ymm2 ; CHECK-NEXT: vmovaps %xmm1, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rsp) diff --git a/llvm/test/CodeGen/X86/pr42452.ll b/llvm/test/CodeGen/X86/pr42452.ll index f2f0cd2d3ce6ba..d3a1dad42bd39e 100644 --- a/llvm/test/CodeGen/X86/pr42452.ll +++ b/llvm/test/CodeGen/X86/pr42452.ll @@ -8,7 +8,7 @@ define void @foo(i1 %c, <2 x i64> %x) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $dil killed $dil killed $edi ; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm0, %rcx ; CHECK-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill diff --git a/llvm/test/CodeGen/X86/pr42905.ll b/llvm/test/CodeGen/X86/pr42905.ll index 310a173f824e95..6ebe5be45a4f8b 100644 --- a/llvm/test/CodeGen/X86/pr42905.ll +++ b/llvm/test/CodeGen/X86/pr42905.ll @@ -7,7 +7,7 @@ define <4 x double> @autogen_SD30452(i1 %L230) { ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [151829,151829] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: cvtsi2sd %rax, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd
{{.*#+}} xmm2 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm2, %rax ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2sd %rax, %xmm2 diff --git a/llvm/test/CodeGen/X86/pr44976.ll b/llvm/test/CodeGen/X86/pr44976.ll index 871937d295508b..72ced2b1fa0f4d 100644 --- a/llvm/test/CodeGen/X86/pr44976.ll +++ b/llvm/test/CodeGen/X86/pr44976.ll @@ -57,7 +57,7 @@ define <3 x i32> @f_29(<12 x i16> %a, <12 x i16> %b) { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,1] ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,2,3] ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; CHECK-NEXT: paddd %xmm3, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll index 681e06ada65d21..36b2de07bcc835 100644 --- a/llvm/test/CodeGen/X86/pr45378.ll +++ b/llvm/test/CodeGen/X86/pr45378.ll @@ -77,7 +77,7 @@ define i1 @parseHeaders2_scalar_and(i64 * %ptr) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: testq %rcx, %rax ; SSE2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/pr46189.ll b/llvm/test/CodeGen/X86/pr46189.ll index 558483754c6809..97190c10ef7c95 100644 --- a/llvm/test/CodeGen/X86/pr46189.ll +++ b/llvm/test/CodeGen/X86/pr46189.ll @@ -21,7 +21,7 @@ define { i64, i64 } @PR46189(double %0, double %1, double %2, double %3, double ; SSE-NEXT: cvttpd2dq %xmm3, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movq %xmm0, %rdx ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr46455.ll b/llvm/test/CodeGen/X86/pr46455.ll index e5ed94aa54934e..c232d548faefd1 100644 --- a/llvm/test/CodeGen/X86/pr46455.ll +++ b/llvm/test/CodeGen/X86/pr46455.ll @@ -10,7 +10,7 @@ define void @EntryModule(i8** %buffer_table) { ; CHECK-NEXT: vcmpneqps (%rax), %ymm0, %ymm0 ; CHECK-NEXT: vpsrld $31, %xmm0, %xmm1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; CHECK-NEXT: vpsubd %xmm0, %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 0706254f4e5c05..bdbc5638168643 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -135,7 +135,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX256VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2 ; AVX256VL-NEXT: vpmovsxbd %xmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 28ab2e1968dbe3..d785b10f9c3270 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ 
-522,7 +522,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test13: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm5, %xmm3 @@ -697,7 +697,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero @@ -735,9 +735,9 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; AVX1-NEXT: vpmaxud %xmm6, %xmm7, %xmm6 @@ -772,7 +772,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; ; AVX2-LABEL: test14: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpmaxud %ymm4, %ymm1, %ymm4 @@ -873,7 +873,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test15: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm4, %xmm5 @@ -1005,7 +1005,7 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test16: ; SSE41: # %bb.0: # 
%vector.ph -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmaxud %xmm1, %xmm4 @@ -1871,10 +1871,10 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmaxud %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index f55a58048e227a..a197e795754adc 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -27,7 +27,7 @@ define i32 @sad_16i8() nounwind { ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -54,7 +54,7 @@ define i32 @sad_16i8() nounwind { ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -79,7 +79,7 @@ define i32 @sad_16i8() nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -104,7 +104,7 @@ define i32 @sad_16i8() nounwind { ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -172,7 +172,7 @@ define i32 @sad_32i8() nounwind { ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -207,7 +207,7 @@ define i32 @sad_32i8() nounwind { ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; 
AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -234,7 +234,7 @@ define i32 @sad_32i8() nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -261,7 +261,7 @@ define i32 @sad_32i8() nounwind { ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -346,7 +346,7 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -397,7 +397,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -431,7 +431,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -463,7 +463,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -492,7 +492,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -628,7 +628,7 @@ define i32 @sad_4i8() nounwind { ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -649,7 +649,7 @@ define i32 @sad_4i8() nounwind { ; AVX-NEXT: addq $4, %rax ; AVX-NEXT: jne .LBB4_1 ; AVX-NEXT: # %bb.2: # %middle.block -; 
AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -760,7 +760,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu (%rdx), %xmm1 ; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq @@ -769,7 +769,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n ; AVX: # %bb.0: ; AVX-NEXT: vmovdqu (%rdi), %xmm0 ; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -803,7 +803,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq @@ -815,7 +815,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq @@ -826,7 +826,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper @@ -838,7 +838,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper @@ -883,7 +883,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; SSE2-NEXT: paddq %xmm0, %xmm2 ; SSE2-NEXT: paddq %xmm1, %xmm2 ; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq @@ -901,7 +901,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq @@ -915,7 +915,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* 
n ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper @@ -931,7 +931,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper @@ -945,7 +945,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper @@ -987,7 +987,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x ; SSE2-NEXT: psadbw %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -1002,7 +1002,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1051,7 +1051,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* % ; SSE2-NEXT: movdqu (%rcx), %xmm2 ; SSE2-NEXT: psadbw %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -1065,7 +1065,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* % ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index 02400d28c0e2cd..3b3a1b57ecd02e 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -443,13 +443,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pcmpgtd %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movdqa %xmm1, %xmm4 ; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; X64-NEXT: 
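The sad.ll reductions above only ever read lane 0 of the final vector: the [2,3,2,3] shuffle folds the high 64 bits into the low 64 bits before the first paddd/paddq, and lanes 2 and 3 of the shuffled value are never consumed afterwards, so [2,3,0,1] and [2,3,2,3] yield the same reduction. A minimal scalar model in C++ (the helper name hadd4 is illustrative, not from the patch):

#include <array>
#include <cstdint>

// Models the reduction ladder used in sad.ll: pshufd [2,3,2,3] + paddd
// folds the high half into the low half, pshufd [1,1,2,3] + paddd folds
// lane 1 into lane 0, and movd reads lane 0.
uint32_t hadd4(std::array<uint32_t, 4> v) {
  std::array<uint32_t, 4> s = {v[2], v[3], v[2], v[3]}; // pshufd [2,3,2,3]
  std::array<uint32_t, 4> t{};
  for (int i = 0; i < 4; ++i) t[i] = v[i] + s[i];       // paddd
  uint32_t u0 = t[1];            // pshufd [1,1,2,3] puts t[1] in lane 0
  return t[0] + u0;              // final paddd, then movd reads lane 0
}

Because only t[0] and t[1] feed the result, replacing the old [2,3,0,1] upper lanes with [2,3,2,3] cannot change what these tests compute.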
movq %xmm4, %rcx ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: psllq $31, %xmm0 ; X64-NEXT: movq %xmm0, %rax @@ -457,9 +457,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx @@ -478,9 +478,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: idivq %rdi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X64-NEXT: movq %xmm2, %rsi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rsi diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index e5348b9febb4d4..512488e8f8725d 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -651,14 +651,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx ; X64-NEXT: movq %rbx, %r13 ; X64-NEXT: sarq $63, %r13 ; X64-NEXT: shldq $31, %rbx, %r13 -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rbp @@ -709,8 +709,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -720,8 +720,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movq %rbx, %r12 ; X64-NEXT: sarq $63, %r12 ; X64-NEXT: shldq $31, %rbx, %r12 -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -773,14 +773,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x 
i32> %y) nounwind { ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx ; X64-NEXT: movq %rbx, %r13 ; X64-NEXT: sarq $63, %r13 ; X64-NEXT: shldq $31, %rbx, %r13 -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rbp diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index 74a6ae58f9f9f3..8a788f41d5cc23 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -75,16 +75,16 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) { define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rcx ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: movq %xmm1, %r8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rsi ; SSE2-NEXT: xorq %rcx, %rsi ; SSE2-NEXT: orq %rdi, %rsi @@ -155,16 +155,16 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rcx ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: movq %xmm1, %r8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rsi ; SSE2-NEXT: xorq %rcx, %rsi ; SSE2-NEXT: orq %rdi, %rsi @@ -235,28 +235,28 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { ; SSE2-LABEL: ne_i512: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rdx -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rsi ; SSE2-NEXT: movq %xmm0, %r11 ; SSE2-NEXT: movq %xmm2, %r8 ; SSE2-NEXT: movq %xmm1, %r9 ; SSE2-NEXT: movq 
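The sdiv_fix_sat.ll changes just above spell the same mask change as a raw immediate: PSHUFD (and VPERMILPS) pack four 2-bit source indices with element 0 in the low bits, so [2,3,0,1] encodes as 0x4E = 78 and [2,3,2,3] as 0xEE = 238, which is also why the last encoding byte flips from 0x4e to 0xee in the sse41.ll hunks further down. A sketch of the packing, assuming nothing beyond the documented immediate layout:

#include <cstdint>

// Pack four 2-bit dword indices into a PSHUFD-style immediate,
// element 0 in the low two bits.
constexpr uint8_t pshufdImm(int e0, int e1, int e2, int e3) {
  return uint8_t(e0 | (e1 << 2) | (e2 << 4) | (e3 << 6));
}

static_assert(pshufdImm(2, 3, 0, 1) == 78, "old mask, 0x4E");
static_assert(pshufdImm(2, 3, 2, 3) == 238, "new mask, 0xEE");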
%xmm3, %r10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: xorq %rdx, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx ; SSE2-NEXT: orq %rcx, %rdx @@ -426,28 +426,28 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) { ; SSE2-LABEL: eq_i512: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rdx -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rsi ; SSE2-NEXT: movq %xmm0, %r11 ; SSE2-NEXT: movq %xmm2, %r8 ; SSE2-NEXT: movq %xmm1, %r9 ; SSE2-NEXT: movq %xmm3, %r10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: xorq %rdx, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx ; SSE2-NEXT: orq %rcx, %rdx diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 6a5fab8469fa7b..17cfc53266909c 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -2094,9 +2094,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %esi ; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] ; X86-SSE-NEXT: movd %xmm7, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,0,1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] ; X86-SSE-NEXT: movd %xmm7, %esi ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %esi @@ -2137,9 +2137,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %esi ; X86-SSE-NEXT: movd %edx, %xmm4 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-SSE-NEXT: movd %xmm2, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X86-SSE-NEXT: movd %xmm1, %esi ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %esi @@ -2336,9 +2336,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: 
divl %ecx ; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] ; X64-SSE-NEXT: movd %xmm7, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] ; X64-SSE-NEXT: movd %xmm7, %ecx ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %ecx @@ -2379,9 +2379,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %ecx ; X64-SSE-NEXT: movd %edx, %xmm4 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X64-SSE-NEXT: movd %xmm2, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-SSE-NEXT: movd %xmm1, %ecx ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %ecx diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll index 0b79b62f84a115..53deafc9a4b435 100644 --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -224,7 +224,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLM32: # %bb.0: ; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SLM32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLM32-NEXT: movdqa %xmm1, %xmm4 ; SLM32-NEXT: movdqa %xmm3, %xmm5 @@ -244,7 +244,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLM64: # %bb.0: ; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SLM64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLM64-NEXT: movdqa %xmm1, %xmm4 ; SLM64-NEXT: movdqa %xmm3, %xmm5 @@ -270,7 +270,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLOW32-NEXT: movdqa %xmm1, %xmm4 ; SLOW32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLOW32-NEXT: movdqa %xmm3, %xmm0 ; SLOW32-NEXT: pmulhw %xmm2, %xmm0 @@ -291,7 +291,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLOW64-NEXT: movdqa %xmm1, %xmm4 ; SLOW64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm3 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLOW64-NEXT: movdqa %xmm3, %xmm0 ; SLOW64-NEXT: pmulhw %xmm2, %xmm0 @@ -306,7 +306,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SSE4-32: # %bb.0: ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -322,7 +322,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SSE4-64: # %bb.0: ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -336,7 +336,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; ; AVX2-32-LABEL: test_mul_v16i32_v16i8: ; AVX2-32: # %bb.0: -; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] @@ -346,7 +346,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; ; AVX2-64-LABEL: test_mul_v16i32_v16i8: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] @@ -494,7 +494,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; ; SSE4-32-LABEL: test_mul_v8i32_v8i16: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd 
{{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -504,7 +504,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; ; SSE4-64-LABEL: test_mul_v8i32_v8i16: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -609,9 +609,9 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; ; SSE4-32-LABEL: test_mul_v16i32_v16i16: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -625,9 +625,9 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; ; SSE4-64-LABEL: test_mul_v16i32_v16i16: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -880,7 +880,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] ; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero @@ -896,7 +896,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] ; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero @@ -910,7 +910,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLOW32: # %bb.0: ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -926,7 +926,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLOW64: # %bb.0: ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -942,7 +942,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SSE4-32: # %bb.0: ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -958,7 +958,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SSE4-64: # %bb.0: ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -972,7 +972,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize: ; AVX2-32: # %bb.0: -; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] @@ -982,7 +982,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; ; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] @@ -1077,7 +1077,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; SLM32-LABEL: test_mul_v8i32_v8i16_minsize: ; SLM32: # %bb.0: ; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLM32-NEXT: pmulld %xmm2, %xmm0 @@ -1087,7 +1087,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; SLM64-LABEL: test_mul_v8i32_v8i16_minsize: ; SLM64: # %bb.0: ; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLM64-NEXT: pmulld %xmm2, %xmm0 @@ -1096,7 +1096,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; ; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize: ; SLOW32: # %bb.0: -; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -1106,7 +1106,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; ; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize: ; SLOW64: # %bb.0: -; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -1116,7 +1116,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; ; SSE4-32-LABEL: 
test_mul_v8i32_v8i16_minsize: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -1126,7 +1126,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; ; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -1155,9 +1155,9 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; SLM32-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM32: # %bb.0: -; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1171,9 +1171,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SLM64-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM64: # %bb.0: -; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1187,9 +1187,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize: ; SLOW32: # %bb.0: -; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -1203,9 +1203,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize: ; SLOW64: # %bb.0: -; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm3 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -1219,9 +1219,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -1235,9 +1235,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index 284f51d7422e6a..e0c1b762c15072 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -206,10 +206,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; X64-NEXT: cmovll %ecx, %edx ; X64-NEXT: movd %edx, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm3, %edx ; X64-NEXT: movslq %edx, %rdx -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm3, %esi ; X64-NEXT: movslq %esi, %rsi ; X64-NEXT: imulq %rdx, %rsi @@ -476,9 +476,9 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF ; X64-NEXT: imull %edx, %ecx ; X64-NEXT: cmovol %edi, %ecx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm2, %edx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm2, %esi ; X64-NEXT: movl %esi, %edi ; X64-NEXT: imull %edx, %edi diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll index 7f0e2cfa44b02e..0abac92099055f 100644 --- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll +++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll @@ -5,7 +5,7 @@ define <4 x i64> @autogen_SD88863() { ; 
CHECK-LABEL: autogen_SD88863: ; CHECK: # %bb.0: # %BB -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] diff --git a/llvm/test/CodeGen/X86/split-vector-rem.ll b/llvm/test/CodeGen/X86/split-vector-rem.ll index ef03075ac65d67..959c9bd0ff7694 100644 --- a/llvm/test/CodeGen/X86/split-vector-rem.ll +++ b/llvm/test/CodeGen/X86/split-vector-rem.ll @@ -12,9 +12,9 @@ define <8 x i32> @foo(<8 x i32> %t, <8 x i32> %u) { ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx ; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; CHECK-NEXT: movd %xmm5, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; CHECK-NEXT: movd %xmm5, %ecx ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx @@ -41,9 +41,9 @@ define <8 x i32> @foo(<8 x i32> %t, <8 x i32> %u) { ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; CHECK-NEXT: movd %xmm4, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; CHECK-NEXT: movd %xmm4, %ecx ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx @@ -79,9 +79,9 @@ define <8 x i32> @bar(<8 x i32> %t, <8 x i32> %u) { ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; CHECK-NEXT: movd %xmm5, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; CHECK-NEXT: movd %xmm5, %ecx ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx @@ -108,9 +108,9 @@ define <8 x i32> @bar(<8 x i32> %t, <8 x i32> %u) { ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; CHECK-NEXT: movd %xmm4, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; CHECK-NEXT: movd %xmm4, %ecx ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 85400656e2e547..206814cfcf1c77 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -2791,8 +2791,8 @@ define void @test_mm_storeh_pi(x86_mmx *%a0, <4 x float> %a1) nounwind { ; ; X64-SSE2-LABEL: test_mm_storeh_pi: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pshufd $78, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x4e] -; X64-SSE2-NEXT: # xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: punpckhqdq %xmm0, %xmm0 # encoding: [0x66,0x0f,0x6d,0xc0] +; X64-SSE2-NEXT: # xmm0 = xmm0[1,1] ; X64-SSE2-NEXT: movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0] ; X64-SSE2-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07] ; X64-SSE2-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 460987cd74dfee..e2973ebbab89ea 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -692,24 +692,24 @@ entry: define <4 x i32> 
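In the test_mm_storeh_pi hunk above, the source register is shuffled with itself and only the resulting low 64 bits are stored, so a [2,3,2,3] dword shuffle is just a broadcast of the high qword; punpckhqdq %xmm0, %xmm0 (qword mask [1,1]) produces the same low qword, and the backend now emits it instead. A small model of the equivalence, with a hypothetical V128 struct standing in for an XMM register:

#include <cstdint>

struct V128 { uint64_t lo, hi; }; // low/high 64-bit halves of an XMM

V128 pshufd_2323(V128 v) { return {v.hi, v.hi}; }        // dwords [2,3,2,3]
V128 punpckhqdq(V128 a, V128 b) { return {a.hi, b.hi}; } // qwords a[1],b[1]

// For any v: punpckhqdq(v, v).lo == pshufd_2323(v).lo == v.hi, and
// test_mm_storeh_pi only stores that low qword with movq.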
@insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: insertps_from_shufflevector_i32_2:
; SSE: ## %bb.0: ## %entry
-; SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
-; SSE-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
+; SSE-NEXT: ## xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pblendw $12, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x0c]
; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_from_shufflevector_i32_2:
; AVX1: ## %bb.0: ## %entry
-; AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_from_shufflevector_i32_2:
; AVX512: ## %bb.0: ## %entry
-; AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
@@ -1875,8 +1875,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
; X86-SSE-LABEL: insertps_pr20411:
; X86-SSE: ## %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
-; X86-SSE-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X86-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
+; X86-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X86-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
; X86-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X86-SSE-NEXT: movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08]
@@ -1885,8 +1885,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
; X86-AVX1-LABEL: insertps_pr20411:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X86-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X86-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00]
@@ -1895,8 +1895,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
; X86-AVX512-LABEL: insertps_pr20411:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X86-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
@@ -1904,8 +1904,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
;
; X64-SSE-LABEL: insertps_pr20411:
; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
-; X64-SSE-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X64-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
+; X64-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X64-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
; X64-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-SSE-NEXT: movdqu %xmm1, (%rdi) ## encoding: [0xf3,0x0f,0x7f,0x0f]
@@ -1913,8 +1913,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
;
; X64-AVX1-LABEL: insertps_pr20411:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X64-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X64-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07]
@@ -1922,8 +1922,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
;
; X64-AVX512-LABEL: insertps_pr20411:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
-; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1]
+; X64-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X64-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll
index 05b2b6608addda..4c7acf60d30803 100644
--- a/llvm/test/CodeGen/X86/trunc-subvector.ll
+++ b/llvm/test/CodeGen/X86/trunc-subvector.ll
@@ -73,7 +73,7 @@ define <2 x i32> @test4(<8 x i32> %v) {
define <2 x i32> @test5(<8 x i32> %v) {
; SSE2-LABEL: test5:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
@@ -175,7 +175,7 @@ define <2 x i32> @test9(<8 x i32> %v) {
define <2 x i32> @test10(<8 x i32> %v) {
; SSE2-LABEL: test10:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
index 7540b394babdfd..e42e527553f14b 100644
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -248,9 +248,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; X64-NEXT: movq %xmm4, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
; X64-NEXT: movq %xmm4, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
@@ -264,9 +264,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm1, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 493515c418caa2..f6c8baf2e92387 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -335,9 +335,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm7
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X64-NEXT: movq %xmm2, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
; X64-NEXT: movq %xmm2, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
@@ -369,9 +369,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
; X64-NEXT: movq %rax, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm1, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rcx
diff --git a/llvm/test/CodeGen/X86/uint_to_fp-3.ll b/llvm/test/CodeGen/X86/uint_to_fp-3.ll
index 9efd9a5bef5f24..ca46b48b7731f3 100644
--- a/llvm/test/CodeGen/X86/uint_to_fp-3.ll
+++ b/llvm/test/CodeGen/X86/uint_to_fp-3.ll
@@ -40,7 +40,7 @@ define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) {
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
@@ -55,7 +55,7 @@ define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) {
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; X64-SSE-NEXT: movaps %xmm2, %xmm0
; X64-SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 18b769f32ec94d..ce744f93cdfe8a 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -151,9 +151,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movl $-1, %eax
; X64-NEXT: cmoval %eax, %ecx
; X64-NEXT: movd %ecx, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm3, %ecx
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm3, %edx
; X64-NEXT: imulq %rcx, %rdx
; X64-NEXT: movq %rdx, %rcx
@@ -361,9 +361,9 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: cmovol %ecx, %eax
; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT: movd %xmm3, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT: movd %xmm3, %edx
; X64-NEXT: mull %edx
; X64-NEXT: cmovol %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 9fdb08ba6d4e83..923aaa34f04d0f 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -2411,7 +2411,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -2516,7 +2516,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,1]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 8bc971e79f5073..0a057852613aef 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -16,7 +16,7 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3: # %bb.0:
; SSE3-NEXT: movq %xmm1, %rax
; SSE3-NEXT: andl $1, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -29,7 +29,7 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %xmm1, %rax
; SSSE3-NEXT: andl $1, %eax
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -69,7 +69,7 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm1, %esi
@@ -379,7 +379,7 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
; SSE3: # %bb.0:
; SSE3-NEXT: movq %xmm1, %rax
; SSE3-NEXT: andl $1, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -391,7 +391,7 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %xmm1, %rax
; SSSE3-NEXT: andl $1, %eax
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -430,7 +430,7 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm1, %esi
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
index 04692995b7e95e..94f7d7eeaf39d0 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -135,7 +135,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fstps (%esp)
@@ -154,7 +154,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE-64: # %bb.0:
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: cvtsi2ss %rax, %xmm1
-; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: xorps %xmm0, %xmm0
; SSE-64-NEXT: cvtsi2ss %rax, %xmm0
@@ -172,7 +172,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE41-32-NEXT: andl $-8, %esp
; SSE41-32-NEXT: subl $24, %esp
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fstps (%esp)
@@ -191,7 +191,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: xorps %xmm0, %xmm0
; SSE41-64-NEXT: cvtsi2ss %rax, %xmm0
@@ -209,7 +209,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $24, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fstps (%esp)
@@ -236,7 +236,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm1
-; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX512DQ-32-NEXT: vzeroupper
@@ -271,7 +271,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
@@ -313,7 +313,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE-64-NEXT: # %bb.1:
; SSE-64-NEXT: addss %xmm0, %xmm0
; SSE-64-NEXT: .LBB3_2:
-; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-64-NEXT: movq %xmm1, %rax
; SSE-64-NEXT: movq %rax, %rcx
; SSE-64-NEXT: shrq %rcx
@@ -340,7 +340,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE41-32-NEXT: .cfi_def_cfa_register %ebp
; SSE41-32-NEXT: andl $-8, %esp
; SSE41-32-NEXT: subl $24, %esp
-; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
@@ -382,7 +382,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; SSE41-64-NEXT: # %bb.1:
; SSE41-64-NEXT: addss %xmm0, %xmm0
; SSE41-64-NEXT: .LBB3_2:
-; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-64-NEXT: movq %xmm1, %rax
; SSE41-64-NEXT: movq %rax, %rcx
; SSE41-64-NEXT: shrq %rcx
@@ -410,7 +410,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $24, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
@@ -471,7 +471,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm1
-; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX512DQ-32-NEXT: vzeroupper
@@ -1146,7 +1146,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $32, %esp
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fstpl {{[0-9]+}}(%esp)
@@ -1164,7 +1164,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE-64: # %bb.0:
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: cvtsi2sd %rax, %xmm1
-; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-64-NEXT: movq %xmm0, %rax
; SSE-64-NEXT: xorps %xmm0, %xmm0
; SSE-64-NEXT: cvtsi2sd %rax, %xmm0
@@ -1182,7 +1182,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE41-32-NEXT: andl $-8, %esp
; SSE41-32-NEXT: subl $32, %esp
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE41-32-NEXT: fstpl {{[0-9]+}}(%esp)
@@ -1200,7 +1200,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: cvtsi2sd %rax, %xmm1
-; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-64-NEXT: movq %xmm0, %rax
; SSE41-64-NEXT: xorps %xmm0, %xmm0
; SSE41-64-NEXT: cvtsi2sd %rax, %xmm0
@@ -1218,7 +1218,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $32, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index d97787e36cd853..a5519e68f73a3f 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -281,7 +281,7 @@ define <8 x float> @sitofp_v8i16_v8f32(<8 x i16> %x) #0 {
; AVX1-LABEL: sitofp_v8i16_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll
index ed703f17084785..c979050fc18098 100644
--- a/llvm/test/CodeGen/X86/vec_cast2.ll
+++ b/llvm/test/CodeGen/X86/vec_cast2.ll
@@ -18,7 +18,7 @@ define <8 x float> @cvt_v8i16_v8f32(<8 x i16> %src) {
; CHECK-LABEL: cvt_v8i16_v8f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 6cb352c3f348f1..a413752993b5ae 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -95,7 +95,7 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
@@ -305,14 +305,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
@@ -421,7 +421,7 @@ define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
; SSE: # %bb.0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
@@ -440,7 +440,7 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -448,7 +448,7 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -468,7 +468,7 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -476,7 +476,7 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -504,7 +504,7 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -512,7 +512,7 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -533,7 +533,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -541,7 +541,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1015,7 +1015,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
; SSE41-NEXT: por %xmm3, %xmm2
; SSE41-NEXT: subpd %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: subpd %xmm3, %xmm1
@@ -1074,7 +1074,7 @@ define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1083,7 +1083,7 @@ define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1103,7 +1103,7 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1112,7 +1112,7 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1140,7 +1140,7 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1149,7 +1149,7 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1170,7 +1170,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1179,7 +1179,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -1209,7 +1209,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -1274,7 +1274,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; SSE2-LABEL: sitofp_2i64_to_4f32_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -1345,7 +1345,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -1464,7 +1464,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX1-LABEL: sitofp_8i16_to_4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -1568,7 +1568,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -1576,7 +1576,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -1719,7 +1719,7 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm2, %xmm0
@@ -1728,7 +1728,7 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; AVX1-LABEL: sitofp_8i16_to_8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -1868,7 +1868,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB41_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB41_4
@@ -1969,7 +1969,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE2-LABEL: uitofp_2i64_to_2f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB42_1
@@ -2102,7 +2102,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB43_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB43_4
@@ -2462,7 +2462,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm2
; SSE2-NEXT: .LBB49_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB49_4
@@ -2494,7 +2494,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB49_9:
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB49_10
@@ -2769,7 +2769,7 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm2, %xmm0
@@ -2906,7 +2906,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
@@ -3103,7 +3103,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
@@ -3111,7 +3111,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtsi2sd %rax, %xmm2
@@ -3209,7 +3209,7 @@ define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
@@ -3229,7 +3229,7 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -3237,7 +3237,7 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd (%rdi), %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3259,7 +3259,7 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -3267,7 +3267,7 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd (%rdi), %xmm1
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3770,7 +3770,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: subpd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: subpd %xmm2, %xmm1
@@ -3831,7 +3831,7 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -3839,7 +3839,7 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3861,7 +3861,7 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE2-NEXT: retq
;
@@ -3869,7 +3869,7 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE41-NEXT: retq
;
@@ -3894,7 +3894,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -3902,7 +3902,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -4073,7 +4073,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
@@ -4081,7 +4081,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -4090,7 +4090,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: xorps %xmm4, %xmm4
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
@@ -4098,7 +4098,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
@@ -4378,7 +4378,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB83_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB83_4
@@ -4410,7 +4410,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB83_9:
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB83_10
@@ -4729,7 +4729,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: addss %xmm3, %xmm3
; SSE2-NEXT: .LBB87_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_4
@@ -4760,7 +4760,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB87_9:
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_10
@@ -4791,7 +4791,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm5
; SSE2-NEXT: addss %xmm5, %xmm5
; SSE2-NEXT: .LBB87_15:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_16
@@ -4826,7 +4826,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: .LBB87_21:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_22
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index dd3a733ab2178d..4f071c064e5c83 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -107,7 +107,7 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -119,7 +119,7 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, 8(%rdi)
; SSSE3-NEXT: retq
;
@@ -512,13 +512,13 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -644,7 +644,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
@@ -667,9 +667,9 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
@@ -683,7 +683,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
; AVX2-NEXT: retq
@@ -769,7 +769,7 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -889,7 +889,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm1, %edx
; SSE2-NEXT: movw %dx, 6(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -924,7 +924,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
; SSSE3-NEXT: movd %xmm1, %ecx
; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, %edx
; SSSE3-NEXT: movw %dx, 6(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 5fde07d1269df8..b5fefe296d77e5 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -1193,13 +1193,13 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpacksswb %xmm1, %xmm5, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
+; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
-; AVX1-NEXT: vpacksswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -1376,11 +1376,11 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmovsxbw %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm4
@@ -1407,7 +1407,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
@@ -1436,9 +1436,9 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm5
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -1450,9 +1450,9 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -1478,7 +1478,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
; AVX2-NEXT: retq
@@ -1730,11 +1730,11 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: pmovsxbw %xmm3, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmovsxbw %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
@@ -1754,12 +1754,12 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: pcmpeqb %xmm1, %xmm4
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: pmovsxbw %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; SSE41-NEXT: movdqa %xmm2, %xmm7
; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm7, %xmm0
; SSE41-NEXT: pmullw %xmm10, %xmm1
@@ -1783,7 +1783,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm8
; SSE41-NEXT: psrad $31, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm5
; SSE41-NEXT: psrad $31, %xmm5
@@ -1795,7 +1795,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm7
; SSE41-NEXT: psrad $31, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
@@ -1841,9 +1841,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm5, %xmm4
; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
@@ -1865,9 +1865,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm7
; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -1882,14 +1882,14 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vmovdqa %xmm9, 16(%rdi)
@@ -1929,9 +1929,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
; AVX2-NEXT: retq
@@ -2411,8 +2411,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pand %xmm10, %xmm9
; SSE41-NEXT: packuswb %xmm11, %xmm9
; SSE41-NEXT: pmovsxbw %xmm3, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,3,2,3]
; SSE41-NEXT: pmullw %xmm12, %xmm8
; SSE41-NEXT: pxor %xmm7, %xmm7
; SSE41-NEXT: pcmpgtb %xmm9, %xmm7
@@ -2436,8 +2436,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pand %xmm10, %xmm12
; SSE41-NEXT: packuswb %xmm3, %xmm12
; SSE41-NEXT: pmovsxbw %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: pmullw %xmm7, %xmm3
; SSE41-NEXT: pxor %xmm7, %xmm7
; SSE41-NEXT: pcmpgtb %xmm12, %xmm7
@@ -2461,8 +2461,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pand %xmm10, %xmm11
; SSE41-NEXT: packuswb %xmm7, %xmm11
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: pcmpgtb %xmm11, %xmm6
@@ -2487,8 +2487,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: packuswb %xmm6, %xmm5
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: pmullw %xmm7, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm4, %xmm0
@@ -2523,7 +2523,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2538,7 +2538,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 208(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2553,7 +2553,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 144(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2568,7 +2568,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, 80(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -2606,9 +2606,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm6, %xmm7
; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6
; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
@@ -2628,9 +2628,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm7
; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
@@ -2652,9 +2652,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm11, %xmm7
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6
; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
@@ -2672,9 +2672,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm5
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm6
; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm2
@@ -2699,37 +2699,37 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX1-NEXT: vmovdqa %xmm1, 64(%rdi)
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 240(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 208(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 176(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 144(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 96(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 112(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -2794,15 +2794,15 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
; AVX2-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm7, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm8
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5
@@ -2932,7 +2932,7 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -2971,9 +2971,9 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE2-LABEL: smulo_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %r8
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: movq %xmm0, %rsi
@@ -2996,9 +2996,9 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
;
; SSSE3-LABEL: smulo_v2i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %r8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT: movq %xmm2, %rcx
; SSSE3-NEXT: movq %xmm1, %rdx
; SSSE3-NEXT: movq %xmm0, %rsi
@@ -3158,7 +3158,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: movw %cx, 6(%rdi)
; SSE2-NEXT: movd %xmm2, %edx
@@ -3213,7 +3213,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSSE3-NEXT: movd %xmm0, %ecx
; SSSE3-NEXT: movw %cx, 6(%rdi)
; SSSE3-NEXT: movd %xmm2, %edx
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 8ab9367c32f861..eb12f0dbcbff26 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -109,7 +109,7 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movq %xmm3, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: retq
;
@@ -122,7 +122,7 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
;
SSSE3-NEXT: movq %xmm3, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSSE3-NEXT: movd %xmm1, 8(%rdi) ; SSSE3-NEXT: retq ; @@ -517,13 +517,13 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 @@ -649,7 +649,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 @@ -672,9 +672,9 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovdqa %xmm3, (%rdi) @@ -688,7 +688,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %xmm3, (%rdi) ; AVX2-NEXT: retq @@ -774,7 +774,7 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) @@ -899,7 +899,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; SSE2-NEXT: movd %xmm1, %ecx ; SSE2-NEXT: movw %cx, 9(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm1, %edx ; SSE2-NEXT: movw %dx, 6(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] @@ -934,7 +934,7 @@ 
define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; SSSE3-NEXT: movd %xmm1, %ecx ; SSSE3-NEXT: movw %cx, 9(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-NEXT: movd %xmm1, %edx ; SSSE3-NEXT: movw %dx, 6(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 864f0b59f453fd..9ff793b6b677f9 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -120,7 +120,7 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE2-NEXT: movq %xmm1, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: retq ; @@ -132,7 +132,7 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun ; SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 ; SSSE3-NEXT: movq %xmm1, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSSE3-NEXT: movd %xmm1, 8(%rdi) ; SSSE3-NEXT: retq ; @@ -601,13 +601,13 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6 -; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7 +; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 @@ -727,7 +727,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 @@ -750,9 +750,9 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) @@ -766,7 +766,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; 
AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %xmm2, (%rdi) ; AVX2-NEXT: retq @@ -850,7 +850,7 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) @@ -956,7 +956,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; SSE2-NEXT: movd %xmm1, %ecx ; SSE2-NEXT: movw %cx, 9(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm1, %edx ; SSE2-NEXT: movw %dx, 6(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] @@ -988,7 +988,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; SSSE3-NEXT: movd %xmm1, %ecx ; SSSE3-NEXT: movw %cx, 9(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-NEXT: movd %xmm1, %edx ; SSSE3-NEXT: movw %dx, 6(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index e08bbc363721c1..87fe4922dfcb05 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1008,16 +1008,16 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5 ; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm7 +; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6 -; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpacksswb %xmm5, %xmm11, %xmm1 +; AVX1-NEXT: vpacksswb %xmm11, %xmm11, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 @@ -1217,7 +1217,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 @@ -1254,9 +1254,9 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, 
%xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) @@ -1278,7 +1278,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %xmm2, (%rdi) ; AVX2-NEXT: retq @@ -1560,7 +1560,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 @@ -1572,7 +1572,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 @@ -1647,14 +1647,14 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) @@ -1689,9 +1689,9 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi) ; AVX2-NEXT: retq @@ -2230,7 +2230,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, 
<64 x i8>* %p2) nou ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -2245,7 +2245,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 208(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -2260,7 +2260,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 144(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -2275,7 +2275,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 80(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -2390,37 +2390,37 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; AVX1-NEXT: vmovdqa %xmm4, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4 ; AVX1-NEXT: vmovdqa %xmm4, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa %xmm4, 224(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa %xmm4, 240(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa %xmm3, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa %xmm3, 160(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa %xmm3, 176(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa %xmm2, 144(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa %xmm2, 96(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa %xmm2, 112(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa 
%xmm1, 80(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] @@ -2475,15 +2475,15 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6 -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 @@ -2608,7 +2608,7 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) @@ -2646,9 +2646,9 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { ; SSE2-LABEL: umulo_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %r8 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %r10 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movq %xmm1, %rdx @@ -2672,9 +2672,9 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; SSSE3-LABEL: umulo_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSSE3-NEXT: movq %xmm2, %r8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSSE3-NEXT: movq %xmm2, %r10 ; SSSE3-NEXT: movq %xmm0, %rax ; SSSE3-NEXT: movq %xmm1, %rdx @@ -2829,7 +2829,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: movw %cx, 6(%rdi) ; SSE2-NEXT: movd %xmm1, %edx @@ -2873,7 +2873,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; SSSE3-NEXT: por %xmm3, %xmm0 ; SSSE3-NEXT: movd %xmm2, %eax ; SSSE3-NEXT: movw %ax, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSSE3-NEXT: 
movd %xmm2, %ecx ; SSSE3-NEXT: movw %cx, 6(%rdi) ; SSSE3-NEXT: movd %xmm1, %edx diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index 5302b3f8913dfc..155c5591ce11a1 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -126,7 +126,7 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -140,7 +140,7 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun ; SSSE3-NEXT: pxor %xmm0, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSSE3-NEXT: movd %xmm0, 8(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq @@ -644,13 +644,13 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6 -; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7 +; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 @@ -771,7 +771,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 @@ -793,9 +793,9 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) @@ -809,7 +809,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %xmm2, (%rdi) ; AVX2-NEXT: retq @@ -895,7 
+895,7 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) @@ -1003,7 +1003,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; SSE2-NEXT: movd %xmm1, %ecx ; SSE2-NEXT: movw %cx, 9(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm1, %edx ; SSE2-NEXT: movw %dx, 6(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] @@ -1035,7 +1035,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; SSSE3-NEXT: movd %xmm1, %ecx ; SSSE3-NEXT: movw %cx, 9(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-NEXT: movd %xmm1, %edx ; SSSE3-NEXT: movw %dx, 6(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 22e97e74075334..62bc377b9cec31 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -6296,7 +6296,7 @@ define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: cvtsi2sd %rax, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0 @@ -6342,7 +6342,7 @@ define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0 @@ -6375,7 +6375,7 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 { ; CHECK-NEXT: movd %xmm1, %eax ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsi2sd %eax, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2sd %eax, %xmm0 @@ -6414,7 +6414,7 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2ss %eax, %xmm2 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %eax, %xmm0 @@ -6535,7 +6535,7 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i32(<4 x i32> %x) #0 { ; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; 
CHECK-NEXT: cvtdq2pd %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -6575,14 +6575,14 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: cvtsi2sd %rax, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movq %xmm1, %rax ; CHECK-NEXT: cvtsi2sd %rax, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0 @@ -6642,7 +6642,7 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %xmm1, %rax ; CHECK-NEXT: cvtsi2ss %rax, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; CHECK-NEXT: movq %xmm1, %rax ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsi2ss %rax, %xmm1 @@ -6650,7 +6650,7 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0 @@ -6970,7 +6970,7 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 { ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: addss %xmm0, %xmm0 ; CHECK-NEXT: .LBB174_2: # %entry -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; CHECK-NEXT: movq %xmm1, %rax ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq %rcx @@ -7031,7 +7031,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 { ; CHECK-NEXT: movd %xmm1, %eax ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsi2sd %rax, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0 @@ -7082,7 +7082,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2ss %rax, %xmm2 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0 @@ -7157,7 +7157,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0] @@ -7458,7 +7458,7 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: addss %xmm2, %xmm2 ; CHECK-NEXT: .LBB182_2: # %entry -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; CHECK-NEXT: movq %xmm1, %rax ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq %rcx @@ -7487,7 +7487,7 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; CHECK-NEXT: addss %xmm1, %xmm1 ; CHECK-NEXT: .LBB182_6: # %entry ; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq %rcx diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index d8442048f65ecb..0192d1e8137c1f 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -30,7 +30,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psllq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psllq %xmm4, %xmm5 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] @@ -38,7 +38,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE2-NEXT: psubq %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrlq %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE2-NEXT: psrlq %xmm3, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE2-NEXT: orpd %xmm5, %xmm1 @@ -56,7 +56,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: psllq %xmm5, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm4[4,5,6,7] @@ -64,7 +64,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE41-NEXT: psubq %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: psrlq %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: psrlq %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: por %xmm1, %xmm4 @@ -78,13 +78,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 @@ -212,7 +212,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psllq %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = 
xmm2[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm0, %xmm5 ; X32-SSE-NEXT: psllq %xmm4, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] @@ -220,7 +220,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; X32-SSE-NEXT: psubq %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm1, %xmm4 ; X32-SSE-NEXT: psrlq %xmm3, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psrlq %xmm3, %xmm1 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm1 @@ -249,7 +249,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld %xmm6, %xmm3 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: psrld %xmm5, %xmm6 @@ -285,7 +285,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm7 ; SSE41-NEXT: psrld %xmm6, %xmm7 @@ -465,7 +465,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 ; X32-SSE-NEXT: psrld %xmm6, %xmm3 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm1, %xmm6 ; X32-SSE-NEXT: psrld %xmm5, %xmm6 @@ -1366,7 +1366,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; X32-SSE-NEXT: psubq %xmm3, %xmm4 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 ; X32-SSE-NEXT: psrlq %xmm4, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; X32-SSE-NEXT: psrlq %xmm4, %xmm1 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 0688107ed5c0b7..0cf4c172412a67 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -26,11 +26,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 @@ -38,12 +38,12 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vpsubq %xmm4, 
%xmm8, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm6 ; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index c560f99916bed4..59bef3a97b1faa 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -31,14 +31,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psllq %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psllq %xmm1, %xmm5 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE2-NEXT: psrlq %xmm2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 @@ -52,14 +52,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psllq %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psllq %xmm1, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlq %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: psrlq %xmm2, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 @@ -70,14 +70,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 @@ -136,14 +136,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm1, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: 
movdqa %xmm0, %xmm5 ; X32-SSE-NEXT: psllq %xmm1, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psrlq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm0 @@ -745,14 +745,14 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm1, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm0, %xmm5 ; X32-SSE-NEXT: psllq %xmm1, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psrlq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 655b6e4c250485..fd0e1c7e2f3abe 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -24,11 +24,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -38,13 +38,13 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63] ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -520,7 +520,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll 
b/llvm/test/CodeGen/X86/vector-fshr-128.ll index b7cc39a32d718e..817bca051e0ae3 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -30,7 +30,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrlq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrlq %xmm4, %xmm5 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] @@ -38,7 +38,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE2-NEXT: psubq %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psllq %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE2-NEXT: psllq %xmm3, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 @@ -58,7 +58,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psrlq %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: psrlq %xmm4, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7] @@ -66,7 +66,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE41-NEXT: psubq %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: psllq %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: psllq %xmm0, %xmm3 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm3 @@ -80,13 +80,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -215,7 +215,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 ; X32-SSE-NEXT: psrlq %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm1, %xmm5 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] @@ -223,7 +223,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; X32-SSE-NEXT: psubq %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm3, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm3, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm0 @@ 
-251,7 +251,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld %xmm5, %xmm3 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: psrld %xmm5, %xmm6 @@ -287,7 +287,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psrld %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrld %xmm5, %xmm6 @@ -469,7 +469,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 ; X32-SSE-NEXT: psrld %xmm5, %xmm3 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm1, %xmm6 ; X32-SSE-NEXT: psrld %xmm5, %xmm6 @@ -1380,7 +1380,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; X32-SSE-NEXT: psubq %xmm4, %xmm5 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm5, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm5, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; X32-SSE-NEXT: movdqa %xmm1, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index e9cb0a0586f096..f0848cfd2e49aa 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -26,11 +26,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 @@ -38,12 +38,12 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vpsubq %xmm4, %xmm8, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm6 ; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, 
%ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index eadc8544f8afb7..8fe7ba9e471a9f 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -31,14 +31,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrlq %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psrlq %xmm1, %xmm5 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psllq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE2-NEXT: psllq %xmm2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 @@ -52,14 +52,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psrlq %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrlq %xmm1, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psllq %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: psllq %xmm2, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 @@ -70,14 +70,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -138,14 +138,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrlq %xmm1, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm0, %xmm5 ; X32-SSE-NEXT: psrlq %xmm1, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psllq %xmm3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm0 @@ -789,14 +789,14 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrlq 
%xmm1, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm0, %xmm5 ; X32-SSE-NEXT: psrlq %xmm1, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psllq %xmm3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index 61c45a118e47af..4e92bfc4f91369 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -24,11 +24,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -38,13 +38,13 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63] ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -566,7 +566,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index ac8e1998ceb88b..22a38956873565 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -20,7 +20,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: sarq %rdx ; SSE2-NEXT: addq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: imulq %rcx ; SSE2-NEXT: movq %rdx, %rax @@ -199,7 +199,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: movdqa 
{{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] ; SSE41-NEXT: pmullw %xmm2, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 ; SSE41-NEXT: pmullw %xmm2, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 @@ -223,7 +223,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -327,7 +327,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] @@ -371,7 +371,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -457,7 +457,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: subq %rax, %rdx ; SSE2-NEXT: addq %rcx, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rsi @@ -674,7 +674,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] ; SSE41-NEXT: pmullw %xmm2, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 ; SSE41-NEXT: pmullw %xmm2, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 @@ -702,7 +702,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -836,7 +836,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm4, %xmm4 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm4 ; SSE41-NEXT: psrlw $8, %xmm4 @@ -879,7 +879,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) 
nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll index a9cf1aa80af8db..e06140f988d54f 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -163,7 +163,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 @@ -182,7 +182,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 ; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -260,7 +260,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm4 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -289,7 +289,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -574,7 +574,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 @@ -598,7 +598,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -688,7 +688,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm4 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxbw 
%xmm5, %xmm5 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 @@ -725,7 +725,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm5 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm6, %xmm6 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 16be83c84fb3c1..65131c0e3cb735 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -20,7 +20,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: addq %rdx, %rcx ; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi @@ -445,7 +445,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: subq %rdx, %rax ; SSE2-NEXT: addq %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll index 2900ce2c8661ea..724f6007623dbd 100644 --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -452,7 +452,7 @@ define <8 x i32> @cmpne_knownzeros_zext_v8i16_v8i32(<8 x i16> %x) { ; SSE42: # %bb.0: ; SSE42-NEXT: psrlw $15, %xmm0 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll index e4f785dca2b1aa..a00e74fa1cacc4 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -14,21 +14,21 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddq %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: retq @@ -40,7 +40,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: ; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddq %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq @@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1: # 
%bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -84,7 +84,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: paddq %xmm2, %xmm1 ; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -96,7 +96,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -107,7 +107,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -119,7 +119,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -138,7 +138,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: paddq %xmm4, %xmm2 ; SSE-NEXT: paddq %xmm3, %xmm2 ; SSE-NEXT: paddq %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -156,7 +156,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -169,7 +169,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; 
AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -182,7 +182,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -236,7 +236,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE-LABEL: test_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: paddd %xmm1, %xmm0 @@ -245,7 +245,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX1-SLOW-LABEL: test_v4i32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -261,7 +261,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -270,7 +270,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -284,7 +284,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE-LABEL: test_v8i32: ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: paddd %xmm1, %xmm0 @@ -295,7 +295,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -317,7 +317,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -329,7 +329,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -346,7 +346,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE-NEXT: paddd %xmm3, %xmm1 ; 
SSE-NEXT: paddd %xmm2, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 @@ -360,7 +360,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -375,7 +375,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax @@ -387,7 +387,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -401,7 +401,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -422,7 +422,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE-NEXT: paddd %xmm4, %xmm2 ; SSE-NEXT: paddd %xmm3, %xmm2 ; SSE-NEXT: paddd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 @@ -442,7 +442,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -463,7 +463,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax @@ -477,7 +477,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -492,7 +492,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -608,7 +608,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE-LABEL: test_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: paddw %xmm1, %xmm0 @@ -621,7 +621,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX1-SLOW-LABEL: test_v8i16: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -642,7 +642,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX2-LABEL: test_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -654,7 +654,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX512-LABEL: test_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -671,7 +671,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE-LABEL: test_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: paddw %xmm1, %xmm0 @@ -686,7 +686,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -713,7 +713,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -728,7 +728,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -748,7 +748,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE-NEXT: paddw %xmm3, %xmm1 ; SSE-NEXT: paddw %xmm2, %xmm1 ; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 @@ -766,7 +766,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 ; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -784,7 +784,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 ; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -799,7 +799,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -816,7 +816,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -840,7 +840,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE-NEXT: paddw %xmm4, %xmm2 ; SSE-NEXT: paddw %xmm3, %xmm2 ; SSE-NEXT: paddw %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: paddw %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 @@ -864,7 +864,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -888,7 +888,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -905,7 +905,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpaddw 
%ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -923,7 +923,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -1142,7 +1142,7 @@ define i8 @test_v8i8_load(<8 x i8>* %p) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE-LABEL: test_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddb %xmm0, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: psadbw %xmm1, %xmm0 @@ -1152,7 +1152,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX-LABEL: test_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1162,7 +1162,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX512-LABEL: test_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1177,7 +1177,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE-LABEL: test_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddb %xmm0, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: psadbw %xmm1, %xmm0 @@ -1189,7 +1189,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1202,7 +1202,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1215,7 +1215,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1233,7 +1233,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: paddb %xmm2, %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddb %xmm1, 
%xmm0 ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: psadbw %xmm0, %xmm1 @@ -1248,7 +1248,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1262,7 +1262,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1277,7 +1277,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1299,7 +1299,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE-NEXT: paddb %xmm4, %xmm2 ; SSE-NEXT: paddb %xmm3, %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: paddb %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: psadbw %xmm0, %xmm1 @@ -1320,7 +1320,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1336,7 +1336,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1352,7 +1352,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll index c94f96958f5b0a..17a3d6f46e98c3 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -13,7 +13,7 @@ define i1 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax @@ -22,7 +22,7 @@ define i1 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: testq %rax, %rax @@ -37,7 +37,7 @@ define i1 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax @@ -48,7 +48,7 @@ define i1 @test_v4i64(<4 x i64> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax @@ -60,7 +60,7 @@ define i1 @test_v4i64(<4 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax @@ -72,7 +72,7 @@ define i1 @test_v4i64(<4 x i64> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: testq %rax, %rax @@ -90,7 +90,7 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax @@ -102,7 +102,7 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax @@ -115,7 +115,7 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax @@ -129,7 +129,7 @@ define i1 @test_v8i64(<8 x i64> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: testq %rax, %rax @@ -151,7 +151,7 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax @@ -165,7 +165,7 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax @@ -180,7 +180,7 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax @@ -195,7 +195,7 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: testq %rax, %rax @@ -237,7 +237,7 @@ define i1 @test_v2i32(<2 x i32> %a0) { define i1 @test_v4i32(<4 x i32> %a0) { ; SSE-LABEL: test_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -248,7 +248,7 @@ define i1 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -265,7 +265,7 @@ define i1 @test_v8i32(<8 x i32> %a0) { ; SSE-LABEL: test_v8i32: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -278,7 +278,7 @@ define i1 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -292,7 +292,7 @@ define i1 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -306,7 +306,7 @@ define i1 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -326,7 +326,7 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pand 
%xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -340,7 +340,7 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -355,7 +355,7 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -371,7 +371,7 @@ define i1 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -395,7 +395,7 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -411,7 +411,7 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -428,7 +428,7 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -445,7 +445,7 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -518,7 +518,7 @@ define i1 @test_v4i16(<4 x i16> %a0) { define i1 @test_v8i16(<8 x i16> %a0) { ; SSE-LABEL: test_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -532,7 +532,7 @@ define i1 @test_v8i16(<8 x i16> %a0) { ; ; AVX-LABEL: test_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; 
AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -551,7 +551,7 @@ define i1 @test_v16i16(<16 x i16> %a0) { ; SSE-LABEL: test_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -567,7 +567,7 @@ define i1 @test_v16i16(<16 x i16> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -583,7 +583,7 @@ define i1 @test_v16i16(<16 x i16> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -599,7 +599,7 @@ define i1 @test_v16i16(<16 x i16> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -621,7 +621,7 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -638,7 +638,7 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -655,7 +655,7 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -673,7 +673,7 @@ define i1 @test_v32i16(<32 x i16> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -699,7 +699,7 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; 
SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -718,7 +718,7 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -737,7 +737,7 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -756,7 +756,7 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -865,7 +865,7 @@ define i1 @test_v8i8(<8 x i8> %a0) { define i1 @test_v16i8(<16 x i8> %a0) { ; SSE-LABEL: test_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -882,7 +882,7 @@ define i1 @test_v16i8(<16 x i8> %a0) { ; ; AVX-LABEL: test_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -903,7 +903,7 @@ define i1 @test_v32i8(<32 x i8> %a0) { ; SSE-LABEL: test_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -922,7 +922,7 @@ define i1 @test_v32i8(<32 x i8> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -940,7 +940,7 @@ define i1 @test_v32i8(<32 x i8> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -958,7 +958,7 @@ define i1 @test_v32i8(<32 x i8> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; 
AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -982,7 +982,7 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -1002,7 +1002,7 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -1021,7 +1021,7 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1041,7 +1041,7 @@ define i1 @test_v64i8(<64 x i8> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1069,7 +1069,7 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -1091,7 +1091,7 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -1112,7 +1112,7 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1133,7 +1133,7 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll index 2fc924b1b12570..4a00c22a267011 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ 
-13,14 +13,14 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq @@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq @@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -51,7 +51,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE-LABEL: test_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE-LABEL: test_v8i32: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -275,7 +275,7 @@ define 
i32 @test_v16i32(<16 x i32> %a0) { ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE-LABEL: test_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX-LABEL: test_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE-LABEL: test_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ 
-611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE-LABEL: test_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX-LABEL: test_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE-LABEL: test_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 @@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; 
AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 @@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index bafe112c5dfc8b..f7e1a72f9a91f1 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -15,7 +15,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 @@ -31,7 +31,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -45,7 +45,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -59,7 +59,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512BWVL-LABEL: test_v2i64: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -74,7 +74,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512DQ-LABEL: test_v2i64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovq %xmm0, %rax ; AVX512DQ-NEXT: vzeroupper @@ -82,7 +82,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512DQVL-LABEL: test_v2i64: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: retq @@ -103,7 +103,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 ; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 @@ -128,7 +128,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -152,7 +152,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -176,7 +176,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -200,7 +200,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -218,7 +218,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovq %xmm0, %rax ; AVX512DQ-NEXT: vzeroupper @@ -228,7 +228,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper @@ -270,7 +270,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 ; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 @@ -312,7 +312,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -344,7 +344,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -377,7 +377,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpsllq $32, %xmm2, 
%xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -410,7 +410,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -429,7 +429,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovq %xmm0, %rax ; AVX512DQ-NEXT: vzeroupper @@ -441,7 +441,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper @@ -523,7 +523,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 ; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 @@ -599,7 +599,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -647,7 +647,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -688,7 +688,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = 
xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -729,7 +729,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -749,7 +749,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovq %xmm0, %rax ; AVX512DQ-NEXT: vzeroupper @@ -762,7 +762,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper @@ -810,7 +810,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm3 @@ -821,7 +821,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; SSE41-LABEL: test_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmulld %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE41-NEXT: pmulld %xmm1, %xmm0 @@ -830,7 +830,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -839,7 +839,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -856,7 +856,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm3 ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,0,2,2] ; SSE2-NEXT: pmuludq %xmm3, %xmm0 @@ -867,7 +867,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE41-LABEL: test_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmulld %xmm0, %xmm1 ; SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE41-NEXT: pmulld %xmm1, %xmm0 @@ -878,7 +878,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -890,7 +890,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -902,7 +902,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -926,7 +926,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm5, %xmm2 ; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,0,2,2] ; SSE2-NEXT: pmuludq %xmm2, %xmm0 @@ -939,7 +939,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE41-NEXT: pmulld %xmm3, %xmm1 ; SSE41-NEXT: pmulld %xmm2, %xmm1 ; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmulld %xmm0, %xmm1 @@ -953,7 +953,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -966,7 +966,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -980,7 +980,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -1016,7 +1016,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pmuludq %xmm5, %xmm1 ; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 
= xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,0,2,2] ; SSE2-NEXT: pmuludq %xmm11, %xmm1 @@ -1033,7 +1033,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE41-NEXT: pmulld %xmm4, %xmm2 ; SSE41-NEXT: pmulld %xmm3, %xmm2 ; SSE41-NEXT: pmulld %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmulld %xmm0, %xmm1 @@ -1053,7 +1053,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -1068,7 +1068,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -1083,7 +1083,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -1165,7 +1165,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE-LABEL: test_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pmullw %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pmullw %xmm1, %xmm0 @@ -1178,7 +1178,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX-LABEL: test_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1190,7 +1190,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX512-LABEL: test_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1207,7 +1207,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE-LABEL: test_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pmullw %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pmullw %xmm1, %xmm0 @@ -1222,7 +1222,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: 
vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1237,7 +1237,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1252,7 +1252,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1272,7 +1272,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE-NEXT: pmullw %xmm3, %xmm1 ; SSE-NEXT: pmullw %xmm2, %xmm1 ; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pmullw %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pmullw %xmm0, %xmm1 @@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1306,7 +1306,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1323,7 +1323,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1340,7 +1340,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1357,7 +1357,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 
@@ -1374,7 +1374,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1398,7 +1398,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE-NEXT: pmullw %xmm4, %xmm2 ; SSE-NEXT: pmullw %xmm3, %xmm2 ; SSE-NEXT: pmullw %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pmullw %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pmullw %xmm0, %xmm1 @@ -1422,7 +1422,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1440,7 +1440,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1458,7 +1458,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1476,7 +1476,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1496,7 +1496,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1516,7 +1516,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, 
%xmm0
@@ -1634,7 +1634,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -1650,7 +1650,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1663,7 +1663,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1711,7 +1711,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm0
@@ -1735,7 +1735,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1764,7 +1764,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512BW-LABEL: test_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -1786,7 +1786,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512BWVL-LABEL: test_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -1808,7 +1808,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512DQ-LABEL: test_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -1833,7 +1833,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX512DQVL-LABEL: test_v16i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -1908,7 +1908,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE41-NEXT: packuswb %xmm4, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: pmullw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm2
@@ -1937,7 +1937,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1982,7 +1982,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
@@ -2010,7 +2010,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
@@ -2158,7 +2158,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm1
@@ -2196,7 +2196,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -2288,10 +2288,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpmovwb %ymm2, %xmm2
+; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
@@ -2495,7 +2493,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm0
@@ -2551,7 +2549,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -2657,10 +2655,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 8223e6bd23804b..e1253975d5a6ab 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -823,7 +823,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
define i1 @trunc_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: trunc_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: testw %ax, %ax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll
index 14eb3d27d8dffc..95bff8e03afaeb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll
@@ -13,14 +13,14 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
@@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
@@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -51,7 +51,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
@@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
@@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vzeroupper
@@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -275,7 +275,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
;
; AVX-LABEL: test_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
@@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
@@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
index c0705ab96e0391..26bbfed521968e 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
@@ -14,7 +14,7 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-LABEL: test_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
@@ -54,7 +54,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE42-LABEL: test_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE42-NEXT: movq %xmm2, %rax
@@ -62,7 +62,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
@@ -71,7 +71,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -79,7 +79,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512VL-LABEL: test_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
@@ -106,7 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
@@ -139,7 +139,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -158,7 +158,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -170,7 +170,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -182,7 +182,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -194,7 +194,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -204,7 +204,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -262,7 +262,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm4
@@ -317,7 +317,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm2
; SSE41-NEXT: pxor %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm5
@@ -343,7 +343,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE42-NEXT: movapd %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE42-NEXT: movdqa %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -360,7 +360,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -387,7 +387,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -399,7 +399,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -517,7 +517,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm8
@@ -616,7 +616,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm2
; SSE41-NEXT: pxor %xmm9, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm9
@@ -654,7 +654,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE42-NEXT: movapd %xmm6, %xmm0
; SSE42-NEXT: pcmpgtq %xmm7, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE42-NEXT: movdqa %xmm7, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1
@@ -681,7 +681,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -699,7 +699,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -713,7 +713,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -726,7 +726,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -778,7 +778,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -795,7 +795,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; SSE4-LABEL: test_v4i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
@@ -804,7 +804,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -813,7 +813,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -831,7 +831,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -849,7 +849,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE4-LABEL: test_v8i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
@@ -860,7 +860,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -884,7 +884,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -913,7 +913,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -933,7 +933,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE4-NEXT: pmaxsd %xmm3, %xmm1
; SSE4-NEXT: pmaxsd %xmm2, %xmm1
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
@@ -947,7 +947,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -960,7 +960,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -974,7 +974,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1023,7 +1023,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -1047,7 +1047,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE4-NEXT: pmaxsd %xmm4, %xmm2
; SSE4-NEXT: pmaxsd %xmm3, %xmm2
; SSE4-NEXT: pmaxsd %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE4-NEXT: pmaxsd %xmm2, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pmaxsd %xmm0, %xmm1
@@ -1067,7 +1067,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1082,7 +1082,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1097,7 +1097,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1179,7 +1179,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
define i16 @test_v8i16(<8 x i16> %a0) {
; SSE2-LABEL: test_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1224,7 +1224,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
; SSE2-LABEL: test_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
; SSE2-NEXT: pmaxsw %xmm3, %xmm1
; SSE2-NEXT: pmaxsw %xmm2, %xmm1
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1368,7 +1368,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
; SSE2-NEXT: pmaxsw %xmm4, %xmm2
; SSE2-NEXT: pmaxsw %xmm3, %xmm2
; SSE2-NEXT: pmaxsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: pmaxsw %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1622,7 +1622,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -1697,7 +1697,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -1803,7 +1803,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -1937,7 +1937,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
index e8259e147133c5..6f561c27d71dc5 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
@@ -14,7 +14,7 @@
define i64 @test_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: test_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE41-LABEL: test_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -53,7 +53,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE42-LABEL: test_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -62,7 +62,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
@@ -71,7 +71,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -79,7 +79,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
;
; AVX512VL-LABEL: test_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
@@ -106,7 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
@@ -138,7 +138,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm3
@@ -158,7 +158,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -170,7 +170,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -182,7 +182,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -194,7 +194,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -204,7 +204,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -262,7 +262,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm4
@@ -317,7 +317,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm5
@@ -343,7 +343,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; SSE42-NEXT: movapd %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -360,7 +360,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -387,7 +387,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -399,7 +399,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -517,7 +517,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm8
@@ -616,7 +616,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm9
@@ -654,7 +654,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; SSE42-NEXT: movapd %xmm7, %xmm0
; SSE42-NEXT: pcmpgtq %xmm6, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm7, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1
@@ -681,7 +681,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -699,7 +699,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -713,7 +713,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -726,7 +726,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
@@ -778,7 +778,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -795,7 +795,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; SSE4-LABEL: test_v4i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pminsd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pminsd %xmm1, %xmm0
@@ -804,7 +804,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -813,7 +813,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -831,7 +831,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -849,7 +849,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE4-LABEL: test_v8i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pminsd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pminsd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE4-NEXT: pminsd %xmm1, %xmm0
@@ -860,7 +860,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -884,7 +884,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -913,7 +913,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -933,7 +933,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; SSE4-NEXT: pminsd %xmm3, %xmm1
; SSE4-NEXT: pminsd %xmm2, %xmm1
; SSE4-NEXT: pminsd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: pminsd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-NEXT: pminsd %xmm0, %xmm1
@@ -947,7 +947,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
; AVX1-NEXT: vpminsd
%xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -960,7 +960,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -974,7 +974,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1023,7 +1023,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -1047,7 +1047,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE4-NEXT: pminsd %xmm4, %xmm2 ; SSE4-NEXT: pminsd %xmm3, %xmm2 ; SSE4-NEXT: pminsd %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE4-NEXT: pminsd %xmm2, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pminsd %xmm0, %xmm1 @@ -1067,7 +1067,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpminsd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpminsd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1082,7 +1082,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1097,7 +1097,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1179,7 +1179,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[1,1,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1224,7 +1224,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-LABEL: test_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE2-NEXT: pminsw %xmm3, %xmm1 ; SSE2-NEXT: pminsw %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 @@ -1368,7 +1368,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-NEXT: pminsw %xmm4, %xmm2 ; SSE2-NEXT: pminsw %xmm3, %xmm2 ; SSE2-NEXT: pminsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: pminsw %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 @@ -1622,7 +1622,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -1697,7 +1697,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 @@ -1803,7 +1803,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -1937,7 +1937,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index 75e81335813387..a11fff0a5b5c89 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -14,7 +14,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: test_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41-LABEL: test_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 @@ 
-54,7 +54,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE42-LABEL: test_v2i64: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm2, %xmm3 @@ -65,7 +65,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -77,7 +77,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -85,7 +85,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512VL-LABEL: test_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq @@ -112,7 +112,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 @@ -145,7 +145,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -168,7 +168,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm2, %xmm3 @@ -185,7 +185,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -202,7 +202,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -216,7 +216,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; 
AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -226,7 +226,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -284,7 +284,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm4 @@ -339,7 +339,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE41-NEXT: movdqa %xmm3, %xmm2 ; SSE41-NEXT: pxor %xmm5, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm5 @@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE42-NEXT: xorpd %xmm5, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE42-NEXT: movdqa %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm5, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm5 @@ -400,7 +400,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -421,7 +421,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -436,7 +436,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -448,7 +448,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -566,7 +566,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 ; 
SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm8 @@ -665,7 +665,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm9 @@ -725,7 +725,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE42-NEXT: xorpd %xmm9, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE42-NEXT: movdqa %xmm7, %xmm0 ; SSE42-NEXT: pxor %xmm9, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm9 @@ -769,7 +769,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -798,7 +798,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -814,7 +814,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -827,7 +827,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -882,7 +882,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -905,7 +905,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; SSE4-LABEL: test_v4i32: ; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pmaxud %xmm1, %xmm0 @@ -914,7 +914,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -923,7 +923,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -945,7 +945,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -968,7 +968,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE4-LABEL: test_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxud %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pmaxud %xmm1, %xmm0 @@ -979,7 +979,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -991,7 +991,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1003,7 +1003,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1042,7 +1042,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1067,7 +1067,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE4-NEXT: pmaxud %xmm3, %xmm1 ; SSE4-NEXT: pmaxud %xmm2, %xmm1 ; SSE4-NEXT: pmaxud %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 @@ -1081,7 +1081,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; 
AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1094,7 +1094,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1108,7 +1108,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1179,7 +1179,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1208,7 +1208,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE4-NEXT: pmaxud %xmm4, %xmm2 ; SSE4-NEXT: pmaxud %xmm3, %xmm2 ; SSE4-NEXT: pmaxud %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm2, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 @@ -1228,7 +1228,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmaxud %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1243,7 +1243,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1258,7 +1258,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1368,7 +1368,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1433,7 +1433,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1520,7 +1520,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE2-NEXT: pmaxsw %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1625,7 +1625,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-NEXT: pmaxsw %xmm5, %xmm1 ; SSE2-NEXT: pmaxsw %xmm4, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 @@ -1839,7 +1839,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -1909,7 +1909,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-LABEL: test_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -2004,7 +2004,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: pmaxub %xmm3, %xmm1 ; SSE2-NEXT: pmaxub %xmm2, %xmm1 ; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 @@ -2113,7 +2113,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: pmaxub %xmm4, %xmm2 ; SSE2-NEXT: pmaxub %xmm3, %xmm2 ; SSE2-NEXT: pmaxub %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index 8f6a7266d97b98..9da8d61223efd6 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -14,7 +14,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: test_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41-LABEL: test_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -53,7 +53,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE42-LABEL: test_v2i64: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; 
SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] ; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pxor %xmm0, %xmm3 @@ -65,7 +65,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -77,7 +77,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -85,7 +85,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512VL-LABEL: test_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq @@ -112,7 +112,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 @@ -144,7 +144,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -168,7 +168,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm2, %xmm3 @@ -186,7 +186,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -203,7 +203,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -217,7 +217,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminuq %zmm1, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -227,7 +227,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -285,7 +285,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm4 @@ -340,7 +340,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 @@ -376,7 +376,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE42-NEXT: xorpd %xmm4, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE42-NEXT: movdqa %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm4, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm4 @@ -403,7 +403,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -424,7 +424,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -439,7 +439,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -451,7 +451,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -569,7 +569,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, 
%xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm8 @@ -668,7 +668,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm9 @@ -728,7 +728,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE42-NEXT: xorpd %xmm8, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE42-NEXT: movdqa %xmm7, %xmm0 ; SSE42-NEXT: pxor %xmm8, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm8 @@ -773,7 +773,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -802,7 +802,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -818,7 +818,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -831,7 +831,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -886,7 +886,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -909,7 +909,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; SSE4-LABEL: test_v4i32: ; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pminud %xmm1, %xmm0 @@ -918,7 +918,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -927,7 +927,7 @@ define i32 
@test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -949,7 +949,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -972,7 +972,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE4-LABEL: test_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pminud %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pminud %xmm1, %xmm0 @@ -983,7 +983,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -995,7 +995,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1007,7 +1007,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1046,7 +1046,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm6, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1071,7 +1071,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE4-NEXT: pminud %xmm3, %xmm1 ; SSE4-NEXT: pminud %xmm2, %xmm1 ; SSE4-NEXT: pminud %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE4-NEXT: pminud %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 @@ -1085,7 +1085,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1098,7 +1098,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; 
AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1112,7 +1112,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1183,7 +1183,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1212,7 +1212,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE4-NEXT: pminud %xmm4, %xmm2 ; SSE4-NEXT: pminud %xmm3, %xmm2 ; SSE4-NEXT: pminud %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE4-NEXT: pminud %xmm2, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 @@ -1232,7 +1232,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1247,7 +1247,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1262,7 +1262,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1372,7 +1372,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1418,7 +1418,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1482,7 +1482,7 @@ define i16 
@test_v32i16(<32 x i16> %a0) { ; SSE2-NEXT: pminsw %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1562,7 +1562,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-NEXT: pminsw %xmm5, %xmm1 ; SSE2-NEXT: pminsw %xmm4, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 @@ -1750,7 +1750,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminub %xmm1, %xmm0 @@ -1799,7 +1799,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-LABEL: test_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminub %xmm1, %xmm0 @@ -1869,7 +1869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: pminub %xmm3, %xmm1 ; SSE2-NEXT: pminub %xmm2, %xmm1 ; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pminub %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 @@ -1951,7 +1951,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: pminub %xmm4, %xmm2 ; SSE2-NEXT: pminub %xmm3, %xmm2 ; SSE2-NEXT: pminub %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: pminub %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll index 35e6db38a584fa..2d69190d9d18a2 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -13,14 +13,14 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq @@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq @@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -51,7 +51,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vzeroupper
@@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: vzeroupper
@@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE-NEXT: pxor %xmm3, %xmm1
 ; SSE-NEXT: pxor %xmm2, %xmm1
 ; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
 ; SSE-NEXT: movq %xmm0, %rax
 ; SSE-NEXT: retq
@@ -86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vzeroupper
@@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: vzeroupper
@@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE-NEXT: pxor %xmm4, %xmm2
 ; SSE-NEXT: pxor %xmm3, %xmm2
 ; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pxor %xmm2, %xmm0
 ; SSE-NEXT: movq %xmm0, %rax
 ; SSE-NEXT: retq
@@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vzeroupper
@@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: vzeroupper
@@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE-LABEL: test_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
@@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ;
 ; AVX-LABEL: test_v4i32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; SSE-LABEL: test_v8i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
@@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -275,7 +275,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE-NEXT: pxor %xmm3, %xmm1
 ; SSE-NEXT: pxor %xmm2, %xmm1
 ; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
@@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE-NEXT: pxor %xmm4, %xmm2
 ; SSE-NEXT: pxor %xmm3, %xmm2
 ; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pxor %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
@@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
@@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ;
 ; AVX-LABEL: test_v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; SSE-LABEL: test_v16i16:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
@@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE-NEXT: pxor %xmm3, %xmm1
 ; SSE-NEXT: pxor %xmm2, %xmm1
 ; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
@@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE-NEXT: pxor %xmm4, %xmm2
 ; SSE-NEXT: pxor %xmm3, %xmm2
 ; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pxor %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
@@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE-LABEL: test_v16i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
@@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ;
 ; AVX-LABEL: test_v16i8:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; SSE-LABEL: test_v32i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
@@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE-NEXT: pxor %xmm3, %xmm1
 ; SSE-NEXT: pxor %xmm2, %xmm1
 ; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pxor %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
@@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE-NEXT: pxor %xmm4, %xmm2
 ; SSE-NEXT: pxor %xmm3, %xmm2
 ; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pxor %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pxor %xmm0, %xmm1
@@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rem.ll b/llvm/test/CodeGen/X86/vector-rem.ll
index 15e11986966591..deaab1c9161b98 100644
--- a/llvm/test/CodeGen/X86/vector-rem.ll
+++ b/llvm/test/CodeGen/X86/vector-rem.ll
@@ -11,9 +11,9 @@ define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) nounwind {
 ; CHECK-NEXT: cltd
 ; CHECK-NEXT: idivl %ecx
 ; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movd %xmm3, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; CHECK-NEXT: movd %xmm3, %ecx
 ; CHECK-NEXT: cltd
 ; CHECK-NEXT: idivl %ecx
@@ -49,9 +49,9 @@ define <4 x i32> @bar(<4 x i32> %t, <4 x i32> %u) nounwind {
 ; CHECK-NEXT: xorl %edx, %edx
 ; CHECK-NEXT: divl %ecx
 ; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movd %xmm3, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; CHECK-NEXT: movd %xmm3, %ecx
 ; CHECK-NEXT: xorl %edx, %edx
 ; CHECK-NEXT: divl %ecx
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 67dd15ee87abe6..d140fb5c092956 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -24,13 +24,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE2-NEXT: psubq %xmm1, %xmm2
 ; SSE2-NEXT: movdqa %xmm0, %xmm3
 ; SSE2-NEXT: psllq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psllq %xmm1, %xmm4
 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE2-NEXT: psrlq %xmm2, %xmm0
 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT: orpd %xmm4, %xmm0
@@ -42,13 +42,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE41-NEXT: psubq %xmm1, %xmm2
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: psllq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT: movdqa %xmm0, %xmm4
 ; SSE41-NEXT: psllq %xmm1, %xmm4
 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
 ; SSE41-NEXT: psrlq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE41-NEXT: psrlq %xmm2, %xmm0
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT: por %xmm4, %xmm0
@@ -59,11 +59,11 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
@@ -117,13 +117,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; X32-SSE-NEXT: psubq %xmm1, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm1, %xmm4
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; X32-SSE-NEXT: psrlq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm4, %xmm0
@@ -719,7 +719,7 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 31fe5750247535..a850ab5ba7822d 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -21,20 +21,20 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index d162f5c4a97ab9..2a4efccc07c7f6 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -72,7 +72,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
 ; SSE41-LABEL: sext_16i8_to_16i16:
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
@@ -80,7 +80,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
 ; AVX1-LABEL: sext_16i8_to_16i16:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -107,7 +107,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
 ; X32-SSE41-LABEL: sext_16i8_to_16i16:
 ; X32-SSE41: # %bb.0: # %entry
 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1
 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
 ; X32-SSE41-NEXT: retl
@@ -149,9 +149,9 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm5
 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -160,12 +160,12 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
 ; AVX1-LABEL: sext_32i8_to_32i16:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -210,9 +210,9 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
 ; X32-SSE41: # %bb.0: # %entry
 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5
 ; X32-SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm3
 ; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
 ; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -367,7 +367,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
 ; SSE41-NEXT: pmovsxbd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE41-NEXT: pmovsxbd %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxbd %xmm2, %xmm2
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE41-NEXT: pmovsxbd %xmm0, %xmm3
@@ -380,9 +380,9 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -391,7 +391,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
 ; AVX2-LABEL: sext_16i8_to_16i32:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1
 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0
 ; AVX2-NEXT: retq
@@ -421,7 +421,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
 ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4
 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X32-SSE41-NEXT: pmovsxbd %xmm1, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxbd %xmm2, %xmm2
 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm3
@@ -736,7 +736,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
 ; SSE41-LABEL: sext_8i16_to_8i32:
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
@@ -744,7 +744,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX1-LABEL: sext_8i16_to_8i32:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -771,7 +771,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
 ; X32-SSE41-LABEL: sext_8i16_to_8i32:
 ; X32-SSE41: # %bb.0: # %entry
 ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1
 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
 ; X32-SSE41-NEXT: retl
@@ -813,9 +813,9 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm5
 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -824,12 +824,12 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
 ; AVX1-LABEL: sext_16i16_to_16i32:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -866,9 +866,9 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
 ; X32-SSE41: # %bb.0: # %entry
 ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5
 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm3
 ; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
 ; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -1043,7 +1043,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; SSE41-NEXT: pmovsxwq %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxwq %xmm2, %xmm2
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE41-NEXT: pmovsxwq %xmm0, %xmm3
@@ -1056,9 +1056,9 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3]
 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -1067,7 +1067,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX2-LABEL: sext_8i16_to_8i64:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm1
 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0
 ; AVX2-NEXT: retq
@@ -1101,7 +1101,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4
 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X32-SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxwq %xmm2, %xmm2
 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm3
@@ -1160,7 +1160,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; SSE2-NEXT: pxor %xmm2, %xmm2
 ; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1171,7 +1171,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; SSSE3-NEXT: pxor %xmm2, %xmm2
 ; SSSE3-NEXT: pxor %xmm3, %xmm3
 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1180,7 +1180,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; SSE41-LABEL: sext_4i32_to_4i64:
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
@@ -1188,7 +1188,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; AVX1-LABEL: sext_4i32_to_4i64:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -1208,7 +1208,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; X32-SSE2-NEXT: pxor %xmm2, %xmm2
 ; X32-SSE2-NEXT: pxor %xmm3, %xmm3
 ; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1217,7 +1217,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ; X32-SSE41-LABEL: sext_4i32_to_4i64:
 ; X32-SSE41: # %bb.0: # %entry
 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
 ; X32-SSE41-NEXT: retl
@@ -1235,12 +1235,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
 ; SSE2-NEXT: pxor %xmm5, %xmm5
 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -1254,12 +1254,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
 ; SSSE3-NEXT: pxor %xmm5, %xmm5
 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSSE3-NEXT: pxor %xmm3, %xmm3
 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -1269,9 +1269,9 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm5
 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -1280,12 +1280,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; AVX1-LABEL: sext_8i32_to_8i64:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -1312,12 +1312,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
 ; X32-SSE2-NEXT: pxor %xmm5, %xmm5
 ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; X32-SSE2-NEXT: pxor %xmm3, %xmm3
 ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
 ; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -1327,9 +1327,9 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; X32-SSE41: # %bb.0: # %entry
 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5
 ; X32-SSE41-NEXT: pmovsxdq %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm3
 ; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
 ; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
@@ -1849,7 +1849,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
 ; AVX1-NEXT: negl %eax
 ; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -3046,7 +3046,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
 ; SSE2-NEXT: pxor %xmm2, %xmm2
 ; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3058,7 +3058,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
 ; SSSE3-NEXT: pxor %xmm2, %xmm2
 ; SSSE3-NEXT: pxor %xmm3, %xmm3
 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3094,7 +3094,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
 ; X32-SSE2-NEXT: pxor %xmm2, %xmm2
 ; X32-SSE2-NEXT: pxor %xmm3, %xmm3
 ; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3166,7 +3166,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; SSE2-NEXT: pxor %xmm2, %xmm2
 ; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3179,7 +3179,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; SSSE3-NEXT: pxor %xmm2, %xmm2
 ; SSSE3-NEXT: pxor %xmm3, %xmm3
 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3190,7 +3190,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
@@ -3200,7 +3200,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -3226,7 +3226,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; X32-SSE2-NEXT: pxor %xmm2, %xmm2
 ; X32-SSE2-NEXT: pxor %xmm3, %xmm3
 ; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3237,7 +3237,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; X32-SSE41-NEXT: pslld $31, %xmm0
 ; X32-SSE41-NEXT: psrad $31, %xmm0
 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
 ; X32-SSE41-NEXT: retl
@@ -3737,7 +3737,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; SSE41-NEXT: psrad $26, %xmm1
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; SSE41-NEXT: psllq $58, %xmm2
 ; SSE41-NEXT: movdqa %xmm2, %xmm4
@@ -3767,9 +3767,9 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: retq
@@ -3782,7 +3782,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
 ; AVX2-NEXT: vpsraw $10, %xmm0, %xmm1
 ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1
 ; AVX2-NEXT: retq
 ;
@@ -3861,7 +3861,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
 ; X32-SSE41-NEXT: psrad $26, %xmm1
 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; X32-SSE41-NEXT: psllq $58, %xmm2
 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm4
@@ -3916,7 +3916,7 @@ define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
 ; SSE41-NEXT: pxor %xmm1, %xmm1
 ; SSE41-NEXT: psubw %xmm0, %xmm1
 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
 ; SSE41-NEXT: retq
 ;
@@ -3962,7 +3962,7 @@ define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
 ; X32-SSE41-NEXT: pxor %xmm1, %xmm1
 ; X32-SSE41-NEXT: psubw %xmm0, %xmm1
 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1
 ; X32-SSE41-NEXT: retl
 %z = zext <8 x i8> %x to <8 x i16>
@@ -4002,7 +4002,7 @@ define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
 ; SSE41-NEXT: paddw %xmm0, %xmm1
 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
 ; SSE41-NEXT: retq
 ;
@@ -4049,7 +4049,7 @@ define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
 ; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1
 ; X32-SSE41-NEXT: paddw %xmm0, %xmm1
 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1
 ; X32-SSE41-NEXT: retl
 %z = zext <8 x i8> %x to <8 x i16>
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 5764d19f4c7f77..9b1fb29cb02979 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -23,7 +23,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; SSE2-NEXT: movdqa %xmm2, %xmm3
 ; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; SSE2-NEXT: psrlq %xmm4, %xmm2
 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
 ; SSE2-NEXT: movdqa %xmm0, %xmm3
@@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; SSE41-NEXT: movdqa %xmm2, %xmm3
 ; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; SSE41-NEXT: psrlq %xmm4, %xmm2
 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -54,7 +54,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
@@ -99,7 +99,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3
 ; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm2
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
@@ -123,7 +123,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psrad %xmm4, %xmm2
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psrad %xmm3, %xmm4
@@ -139,7 +139,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: psrad %xmm4, %xmm5
@@ -205,7 +205,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT: psrad %xmm4, %xmm2
 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrad %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index 358f9b8cc4dec6..ae9c375eec2540 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -22,7 +22,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
@@ -32,7 +32,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
@@ -92,7 +92,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; X32-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; X32-AVX1-NEXT: # xmm3 = mem[0,0]
 ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
 ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
@@ -102,7 +102,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
 ; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
 ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index 11d118bf31c3d0..a994d6610d73db 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -27,7 +27,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psrad %xmm4, %xmm2
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psrad %xmm3, %xmm4
@@ -43,7 +43,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: psrad %xmm4, %xmm5
@@ -109,7 +109,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT: psrad %xmm4, %xmm2
 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrad %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 1a2889ab861e28..2e19f753722d3b 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -22,7 +22,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: psrlq %xmm1, %xmm0
 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE2-NEXT: retq
@@ -31,7 +31,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT: psrlq %xmm1, %xmm0
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT: retq
@@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX1-LABEL: var_shift_v2i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: retq
@@ -75,7 +75,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: psrlq %xmm1, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X32-SSE-NEXT: retl
@@ -93,7 +93,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psrld %xmm4, %xmm2
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -109,7 +109,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: psrld %xmm4, %xmm5
@@ -175,7 +175,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT: psrld %xmm4, %xmm2
 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrld %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 154c35b51db9fe..9119e32bda3755 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -22,11 +22,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -69,11 +69,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index b540421fb3de94..8a843ef652e7fa 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -27,7 +27,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psrld %xmm4, %xmm2
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psrld %xmm3, %xmm4
@@ -43,7 +43,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: psrld %xmm4, %xmm5
@@ -109,7 +109,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT: psrld %xmm4, %xmm2
 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrld %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index eee45615047648..5f1325aacb4d1f 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -22,7 +22,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: psllq %xmm1, %xmm0
 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE2-NEXT: retq
@@ -31,7 +31,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT: psllq %xmm1, %xmm0
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT: retq
@@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX1-LABEL: var_shift_v2i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: retq
@@ -73,7 +73,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X32-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 56ebce709a8d7b..1296fcf8bb902e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -22,11 +22,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -66,11 +66,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
-; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 04064f1c1970d1..a11fc5b7c8ed35 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1013,19 +1013,19 @@ define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-LABEL: shuffle_v8i16_0c1d2e3f:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT: retq
 ;
 ; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f:
 ; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1OR2-NEXT: retq
 ;
 ; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f:
 ; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512VL-SLOW-NEXT: retq
 ;
@@ -1055,19 +1055,19 @@ define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-LABEL: shuffle_v8i16_48596a7b:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT: retq
 ;
 ; AVX1OR2-LABEL: shuffle_v8i16_48596a7b:
 ; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1OR2-NEXT: retq
 ;
 ; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b:
 ; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512VL-SLOW-NEXT: retq
 ;
@@ -1513,7 +1513,7 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pandn %xmm0, %xmm2
 ; SSE2-NEXT: por %xmm1, %xmm2
@@ -1529,25 +1529,25 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX:
 ; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
 ; AVX1OR2-NEXT: retq
 ;
 ; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX:
 ; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
 ; AVX512VL-SLOW-NEXT: retq
 ;
 ; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX:
 ; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,0,1,2,3]
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,4,5,6,7]
 ; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
 ; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
 ; AVX512VL-FAST-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 635d94b9e72397..131c621162a508 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3167,7 +3167,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z
 ; AVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -3191,7 +3191,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z
 ; XOPAVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -5064,7 +5064,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128
$1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,u,u,u,u,u,u,u,u] @@ -5090,7 +5090,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3 ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13],xmm3[14,15],xmm1[u,u,u,u,u,u,u,u] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -5161,7 +5161,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,14,15] @@ -5187,7 +5187,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2 ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,2,3,4,5],xmm2[6,7] ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -5752,9 +5752,9 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -5776,9 +5776,9 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, 
%ymm0, %ymm0 ; XOPAVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 04759bf104cb35..0508c6bac2bb11 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4053,7 +4053,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ; AVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -4083,7 +4083,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ; XOPAVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; XOPAVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; XOPAVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 973045696fbd5f..4798b4b1d38a27 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -3216,7 +3216,7 @@ entry: define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) { ; AVX1-LABEL: broadcast_concat_crash: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,1,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index b2d4dc76a10e02..4c8073614d6dd3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -434,7 +434,7 @@ define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_z ; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: ; KNL: ## %bb.0: ; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; KNL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 3199cc0fa9b9bf..4237c4107d4774 100644 --- 
a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -397,7 +397,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_ ; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq @@ -410,7 +410,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_ ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 72c1aefb5da7d0..622eb0881052ee 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3228,7 +3228,7 @@ define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) { ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll index 25c44964a55117..6adb6b0c2c0b86 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -372,14 +372,14 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) { ; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu: ; BTVER1: # %bb.0: ; BTVER1-NEXT: psrld $16, %xmm1 -; BTVER1-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; BTVER1-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; BTVER1-NEXT: punpcklbw {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; BTVER1-NEXT: retq ; ; BTVER2-LABEL: shuffle_8_18_uuuuuuuuuuuuuu: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vpsrld $16, %xmm1, %xmm1 -; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; BTVER2-NEXT: retq %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index 42bf2d9eac2846..5d065213f925b3 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -57,7 +57,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) { ; SSE41-LABEL: zext_16i8_to_16i16: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -113,9 +113,9 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -277,7 +277,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -290,9 +290,9 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -301,7 +301,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; AVX2-LABEL: zext_16i8_to_16i32: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -515,7 +515,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE41-LABEL: zext_8i16_to_8i32: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -571,9 +571,9 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -727,7 +727,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -740,9 +740,9 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -751,7 +751,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX2-LABEL: zext_8i16_to_8i64: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -813,7 +813,7 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; SSE41-LABEL: zext_4i32_to_4i64: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -869,9 +869,9 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -1885,7 +1885,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1894,7 +1894,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -2213,14 +2213,14 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = 
xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -2245,14 +2245,14 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-NEXT: vmovaps %ymm4, %ymm0 @@ -2263,9 +2263,9 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vmovdqa %ymm4, %ymm0 ; AVX2-NEXT: retq @@ -2499,7 +2499,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] @@ -2518,9 +2518,9 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq @@ -2532,7 +2532,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vsel-cmp-load.ll b/llvm/test/CodeGen/X86/vsel-cmp-load.ll index c809090c8955c4..89d1b549182ab1 100644 --- a/llvm/test/CodeGen/X86/vsel-cmp-load.ll +++ b/llvm/test/CodeGen/X86/vsel-cmp-load.ll @@ -80,7 +80,7 @@ define <16 x i16> @sgt_zero(<16 x i8>* %p, <16 x i16> %x, <16 x i16> %y) { ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 3cd5654771c5a3..24849c2b850cde 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -42,7 +42,7 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX1: ## %bb.0: ## %bb ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq (%rdi,%rsi,8), %rax diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll index 4c56c654defaec..42fbdb186357fb 100644 --- 
a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -364,7 +364,7 @@ define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double> ; AVX1-LABEL: signbit_sel_v4f64_small_mask: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 @@ -397,7 +397,7 @@ define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double> ; XOP-LABEL: signbit_sel_v4f64_small_mask: ; XOP: # %bb.0: ; XOP-NEXT: vpmovsxdq %xmm2, %xmm3 -; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; XOP-NEXT: vpmovsxdq %xmm2, %xmm2 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; XOP-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vshift-4.ll b/llvm/test/CodeGen/X86/vshift-4.ll index 9362c41a3dc8a0..1131d07b15d35c 100644 --- a/llvm/test/CodeGen/X86/vshift-4.ll +++ b/llvm/test/CodeGen/X86/vshift-4.ll @@ -32,7 +32,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movdqa %xmm0, %xmm2 ; X32-NEXT: psllq %xmm1, %xmm2 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-NEXT: psllq %xmm1, %xmm0 ; X32-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X32-NEXT: movapd %xmm0, (%eax) @@ -42,7 +42,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: psllq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-NEXT: psllq %xmm1, %xmm0 ; X64-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X64-NEXT: movapd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll index 9c567fc3841570..d5022b24eabe1a 100644 --- a/llvm/test/CodeGen/X86/widen_conv-4.ll +++ b/llvm/test/CodeGen/X86/widen_conv-4.ll @@ -28,7 +28,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin ; X86-SSE42-LABEL: convert_v7i16_v7f32: ; X86-SSE42: # %bb.0: # %entry ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X86-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1 ; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -55,7 +55,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin ; ; X64-SSE42-LABEL: convert_v7i16_v7f32: ; X64-SSE42: # %bb.0: # %entry -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X64-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1 ; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll index 6f335c00b589f0..e258cdf0035a00 100644 --- a/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll +++ b/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll @@ 
-30,7 +30,7 @@ define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwin ; CHECK: ## %bb.0: ; CHECK-NEXT: cmpeqps %xmm1, %xmm0 ; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index e836b00bb62069..e1d6929503f15b 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -407,7 +407,7 @@ define i32 @PR17487(i1 %tobool) { ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X32-NEXT: pandn {{\.LCPI.*}}, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-NEXT: movd %xmm0, %ecx ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: cmpl $1, %ecx