diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 098ecc158e0c97..d2d8176838fcee 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW @@ -602,95 +602,50 @@ vector.ph: ; FIXME: match this to UMIN+TRUNC+PSUBUS define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { -; SSE2-LABEL: test14: -; SSE2: # %bb.0: # %vector.ph -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: packuswb %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm5, %xmm9 -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm10 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: packssdw %xmm6, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 -; SSE2-NEXT: packsswb %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test14: -; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: movdqa %xmm3, %xmm10 -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: packuswb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: packuswb %xmm3, %xmm1 -; SSSE3-NEXT: psubb %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; SSSE3-NEXT: movdqa %xmm6, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: pxor %xmm5, %xmm9 -; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm10 -; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: packssdw %xmm6, %xmm3 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 -; SSSE3-NEXT: packssdw %xmm2, %xmm0 -; SSSE3-NEXT: packsswb %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSE2OR3-LABEL: test14: +; SSE2OR3: # %bb.0: # %vector.ph +; SSE2OR3-NEXT: pxor %xmm8, %xmm8 +; SSE2OR3-NEXT: movdqa %xmm0, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm7 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2OR3-NEXT: pand %xmm5, %xmm4 +; SSE2OR3-NEXT: pand %xmm5, %xmm3 +; SSE2OR3-NEXT: packuswb %xmm4, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm1, %xmm4 +; SSE2OR3-NEXT: pand %xmm5, %xmm2 +; SSE2OR3-NEXT: pand %xmm5, %xmm1 +; SSE2OR3-NEXT: packuswb %xmm2, %xmm1 +; SSE2OR3-NEXT: packuswb %xmm3, %xmm1 +; SSE2OR3-NEXT: psubb %xmm0, %xmm1 +; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 +; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE2OR3-NEXT: movdqa %xmm2, %xmm0 +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; SSE2OR3-NEXT: movdqa %xmm6, %xmm3 +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE2OR3-NEXT: pxor %xmm5, %xmm9 +; SSE2OR3-NEXT: por %xmm5, %xmm6 +; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2OR3-NEXT: pxor %xmm5, %xmm10 +; SSE2OR3-NEXT: por %xmm5, %xmm3 +; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2OR3-NEXT: packssdw %xmm6, %xmm3 +; SSE2OR3-NEXT: pxor %xmm5, %xmm7 +; SSE2OR3-NEXT: por %xmm5, %xmm2 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm2 +; SSE2OR3-NEXT: pxor %xmm5, %xmm4 +; SSE2OR3-NEXT: por %xmm5, %xmm0 +; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE2OR3-NEXT: packssdw %xmm2, %xmm0 +; SSE2OR3-NEXT: packsswb %xmm3, %xmm0 +; SSE2OR3-NEXT: pandn %xmm1, %xmm0 +; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: test14: ; SSE41: # %bb.0: # %vector.ph @@ -1475,145 +1430,75 @@ vector.ph: } define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { -; SSE2-LABEL: psubus_8i64_max: -; SSE2: # %bb.0: # %vector.ph -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm8, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm8, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pxor %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE2-NEXT: psubusw %xmm3, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: psubus_8i64_max: -; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] -; SSSE3-NEXT: movdqa %xmm8, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm9, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: movdqa %xmm8, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm9, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm8, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pandn %xmm9, %xmm6 -; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm8, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm9, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSSE3-NEXT: psubusw %xmm3, %xmm0 -; SSSE3-NEXT: retq +; SSE2OR3-LABEL: psubus_8i64_max: +; SSE2OR3: # %bb.0: # %vector.ph +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE2OR3-NEXT: movdqa %xmm2, %xmm7 +; SSE2OR3-NEXT: pxor %xmm5, %xmm7 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] +; SSE2OR3-NEXT: movdqa %xmm8, %xmm6 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm9, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: por %xmm7, %xmm6 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] +; SSE2OR3-NEXT: pand %xmm6, %xmm2 +; SSE2OR3-NEXT: pandn %xmm9, %xmm6 +; SSE2OR3-NEXT: por %xmm2, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] +; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] +; SSE2OR3-NEXT: movdqa %xmm1, %xmm6 +; SSE2OR3-NEXT: pxor %xmm5, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm8, %xmm7 +; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm2, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: por %xmm6, %xmm2 +; SSE2OR3-NEXT: pand %xmm2, %xmm1 +; SSE2OR3-NEXT: pandn %xmm9, %xmm2 +; SSE2OR3-NEXT: por %xmm1, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 +; SSE2OR3-NEXT: pxor %xmm5, %xmm2 +; SSE2OR3-NEXT: movdqa %xmm8, %xmm6 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm7, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: por %xmm2, %xmm6 +; SSE2OR3-NEXT: pand %xmm6, %xmm4 +; SSE2OR3-NEXT: pandn %xmm9, %xmm6 +; SSE2OR3-NEXT: por %xmm4, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] +; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE2OR3-NEXT: pxor %xmm3, %xmm5 +; SSE2OR3-NEXT: movdqa %xmm8, %xmm4 +; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm6, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2OR3-NEXT: por %xmm5, %xmm4 +; SSE2OR3-NEXT: pand %xmm4, %xmm3 +; SSE2OR3-NEXT: pandn %xmm9, %xmm4 +; SSE2OR3-NEXT: por %xmm3, %xmm4 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2OR3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE2OR3-NEXT: psubusw %xmm3, %xmm0 +; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph @@ -1730,95 +1615,50 @@ vector.ph: } define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { -; SSE2-LABEL: psubus_16i32_max: -; SSE2: # %bb.0: # %vector.ph -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: pxor %xmm9, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm10, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pslld $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: packssdw %xmm6, %xmm7 -; SSE2-NEXT: psubusw %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm10, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm10, %xmm8 -; SSE2-NEXT: pand %xmm4, %xmm10 -; SSE2-NEXT: por %xmm8, %xmm10 -; SSE2-NEXT: pslld $16, %xmm10 -; SSE2-NEXT: psrad $16, %xmm10 -; SSE2-NEXT: packssdw %xmm3, %xmm10 -; SSE2-NEXT: psubusw %xmm10, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: psubus_16i32_max: -; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSSE3-NEXT: pxor %xmm9, %xmm8 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm10, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm3, %xmm6 -; SSSE3-NEXT: pslld $16, %xmm6 -; SSSE3-NEXT: psrad $16, %xmm6 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSSE3-NEXT: movdqa %xmm10, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: pslld $16, %xmm7 -; SSSE3-NEXT: psrad $16, %xmm7 -; SSSE3-NEXT: packssdw %xmm6, %xmm7 -; SSSE3-NEXT: psubusw %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: movdqa %xmm10, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: pslld $16, %xmm3 -; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm10, %xmm8 -; SSSE3-NEXT: pand %xmm4, %xmm10 -; SSSE3-NEXT: por %xmm8, %xmm10 -; SSSE3-NEXT: pslld $16, %xmm10 -; SSSE3-NEXT: psrad $16, %xmm10 -; SSSE3-NEXT: packssdw %xmm3, %xmm10 -; SSSE3-NEXT: psubusw %xmm10, %xmm1 -; SSSE3-NEXT: retq +; SSE2OR3-LABEL: psubus_16i32_max: +; SSE2OR3: # %bb.0: # %vector.ph +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] +; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 +; SSE2OR3-NEXT: pxor %xmm9, %xmm8 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] +; SSE2OR3-NEXT: movdqa %xmm10, %xmm6 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2OR3-NEXT: pand %xmm6, %xmm3 +; SSE2OR3-NEXT: pxor %xmm8, %xmm6 +; SSE2OR3-NEXT: por %xmm3, %xmm6 +; SSE2OR3-NEXT: pslld $16, %xmm6 +; SSE2OR3-NEXT: psrad $16, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 +; SSE2OR3-NEXT: pxor %xmm9, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm10, %xmm7 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm7 +; SSE2OR3-NEXT: pand %xmm7, %xmm2 +; SSE2OR3-NEXT: pxor %xmm8, %xmm7 +; SSE2OR3-NEXT: por %xmm2, %xmm7 +; SSE2OR3-NEXT: pslld $16, %xmm7 +; SSE2OR3-NEXT: psrad $16, %xmm7 +; SSE2OR3-NEXT: packssdw %xmm6, %xmm7 +; SSE2OR3-NEXT: psubusw %xmm7, %xmm0 +; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 +; SSE2OR3-NEXT: pxor %xmm9, %xmm2 +; SSE2OR3-NEXT: movdqa %xmm10, %xmm3 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2OR3-NEXT: pand %xmm3, %xmm5 +; SSE2OR3-NEXT: pxor %xmm8, %xmm3 +; SSE2OR3-NEXT: por %xmm5, %xmm3 +; SSE2OR3-NEXT: pslld $16, %xmm3 +; SSE2OR3-NEXT: psrad $16, %xmm3 +; SSE2OR3-NEXT: pxor %xmm4, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2OR3-NEXT: pxor %xmm10, %xmm8 +; SSE2OR3-NEXT: pand %xmm4, %xmm10 +; SSE2OR3-NEXT: por %xmm8, %xmm10 +; SSE2OR3-NEXT: pslld $16, %xmm10 +; SSE2OR3-NEXT: psrad $16, %xmm10 +; SSE2OR3-NEXT: packssdw %xmm3, %xmm10 +; SSE2OR3-NEXT: psubusw %xmm10, %xmm1 +; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph @@ -2115,23 +1955,14 @@ define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) { } define void @subus_v2i8(<2 x i8>* %p1, <2 x i8>* %p2) { -; SSE2-LABEL: subus_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: psubusb %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: subus_v2i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: psubusb %xmm1, %xmm0 -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: movw %ax, (%rdi) -; SSSE3-NEXT: retq +; SSE2OR3-LABEL: subus_v2i8: +; SSE2OR3: # %bb.0: +; SSE2OR3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2OR3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2OR3-NEXT: psubusb %xmm1, %xmm0 +; SSE2OR3-NEXT: movd %xmm0, %eax +; SSE2OR3-NEXT: movw %ax, (%rdi) +; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: subus_v2i8: ; SSE41: # %bb.0: @@ -2663,159 +2494,82 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) { ; v8i32/v8i64 - sub(x,trunc(umin(y,zext(x)))) define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { -; SSE2-LABEL: test33: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pxor %xmm12, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm14, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pandn %xmm11, %xmm7 -; SSE2-NEXT: por %xmm3, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pxor %xmm12, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE2-NEXT: psubd %xmm3, %xmm0 -; SSE2-NEXT: psubd %xmm5, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test33: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm9 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSSE3-NEXT: movdqa %xmm0, %xmm11 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm12, %xmm6 -; SSSE3-NEXT: movdqa %xmm11, %xmm7 -; SSSE3-NEXT: pxor %xmm12, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm14, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm3 -; SSSE3-NEXT: pandn %xmm11, %xmm7 -; SSSE3-NEXT: por %xmm3, %xmm7 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm12, %xmm3 -; SSSE3-NEXT: movdqa %xmm10, %xmm6 -; SSSE3-NEXT: pxor %xmm12, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm10, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm12, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pxor %xmm12, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm5 -; SSSE3-NEXT: pandn %xmm9, %xmm6 -; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm12, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm12 -; SSSE3-NEXT: movdqa %xmm12, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSSE3-NEXT: psubd %xmm3, %xmm0 -; SSSE3-NEXT: psubd %xmm5, %xmm1 -; SSSE3-NEXT: retq +; SSE2OR3-LABEL: test33: +; SSE2OR3: # %bb.0: +; SSE2OR3-NEXT: pxor %xmm7, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm1, %xmm8 +; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE2OR3-NEXT: movdqa %xmm1, %xmm9 +; SSE2OR3-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE2OR3-NEXT: movdqa %xmm0, %xmm10 +; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE2OR3-NEXT: movdqa %xmm0, %xmm11 +; SSE2OR3-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259456,9223372039002259456] +; SSE2OR3-NEXT: movdqa %xmm3, %xmm6 +; SSE2OR3-NEXT: pxor %xmm12, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm11, %xmm7 +; SSE2OR3-NEXT: pxor %xmm12, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm13 +; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm13 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm14, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,3,3] +; SSE2OR3-NEXT: por %xmm6, %xmm7 +; SSE2OR3-NEXT: pand %xmm7, %xmm3 +; SSE2OR3-NEXT: pandn %xmm11, %xmm7 +; SSE2OR3-NEXT: por %xmm3, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 +; SSE2OR3-NEXT: pxor %xmm12, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm10, %xmm6 +; SSE2OR3-NEXT: pxor %xmm12, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm11 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm13, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2OR3-NEXT: por %xmm6, %xmm3 +; SSE2OR3-NEXT: pand %xmm3, %xmm2 +; SSE2OR3-NEXT: pandn %xmm10, %xmm3 +; SSE2OR3-NEXT: por %xmm2, %xmm3 +; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2] +; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 +; SSE2OR3-NEXT: pxor %xmm12, %xmm2 +; SSE2OR3-NEXT: movdqa %xmm9, %xmm6 +; SSE2OR3-NEXT: pxor %xmm12, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm7 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm10, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: por %xmm2, %xmm6 +; SSE2OR3-NEXT: pand %xmm6, %xmm5 +; SSE2OR3-NEXT: pandn %xmm9, %xmm6 +; SSE2OR3-NEXT: por %xmm5, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 +; SSE2OR3-NEXT: pxor %xmm12, %xmm2 +; SSE2OR3-NEXT: pxor %xmm8, %xmm12 +; SSE2OR3-NEXT: movdqa %xmm12, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm2, %xmm12 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm7, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2OR3-NEXT: por %xmm2, %xmm5 +; SSE2OR3-NEXT: pand %xmm5, %xmm4 +; SSE2OR3-NEXT: pandn %xmm8, %xmm5 +; SSE2OR3-NEXT: por %xmm4, %xmm5 +; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] +; SSE2OR3-NEXT: psubd %xmm3, %xmm0 +; SSE2OR3-NEXT: psubd %xmm5, %xmm1 +; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: test33: ; SSE41: # %bb.0: @@ -2970,165 +2724,85 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; v8i32/v8i64 - sub(x,trunc(umin(zext(and(x,1)),y))) define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { -; SSE2-LABEL: test34: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pxor %xmm11, %xmm7 -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: por %xmm11, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm14, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm12 -; SSE2-NEXT: pandn %xmm3, %xmm7 -; SSE2-NEXT: por %xmm12, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm11, %xmm3 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: por %xmm11, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm10 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm10, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: por %xmm11, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm9 -; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: por %xmm9, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 -; SSE2-NEXT: por %xmm8, %xmm11 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm8 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm8, %xmm5 -; SSE2-NEXT: packuswb %xmm6, %xmm5 -; SSE2-NEXT: psubd %xmm3, %xmm0 -; SSE2-NEXT: psubd %xmm5, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test34: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSSE3-NEXT: movdqa %xmm1, %xmm9 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSSE3-NEXT: movdqa %xmm0, %xmm12 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm3, %xmm7 -; SSSE3-NEXT: pxor %xmm11, %xmm7 -; SSSE3-NEXT: movdqa %xmm12, %xmm6 -; SSSE3-NEXT: por %xmm11, %xmm6 -; SSSE3-NEXT: movdqa %xmm7, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm14, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm12 -; SSSE3-NEXT: pandn %xmm3, %xmm7 -; SSSE3-NEXT: por %xmm12, %xmm7 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm11, %xmm3 -; SSSE3-NEXT: movdqa %xmm10, %xmm6 -; SSSE3-NEXT: por %xmm11, %xmm6 -; SSSE3-NEXT: movdqa %xmm3, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm10 -; SSSE3-NEXT: pandn %xmm2, %xmm3 -; SSSE3-NEXT: por %xmm10, %xmm3 -; SSSE3-NEXT: packuswb %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm11, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: por %xmm11, %xmm6 -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm9 -; SSSE3-NEXT: pandn %xmm5, %xmm6 -; SSSE3-NEXT: por %xmm9, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm11, %xmm2 -; SSSE3-NEXT: por %xmm8, %xmm11 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm8 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm8, %xmm5 -; SSSE3-NEXT: packuswb %xmm6, %xmm5 -; SSSE3-NEXT: psubd %xmm3, %xmm0 -; SSSE3-NEXT: psubd %xmm5, %xmm1 -; SSSE3-NEXT: retq +; SSE2OR3-LABEL: test34: +; SSE2OR3: # %bb.0: +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] +; SSE2OR3-NEXT: pand %xmm6, %xmm0 +; SSE2OR3-NEXT: pand %xmm6, %xmm1 +; SSE2OR3-NEXT: pxor %xmm7, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm1, %xmm8 +; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE2OR3-NEXT: movdqa %xmm1, %xmm9 +; SSE2OR3-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE2OR3-NEXT: movdqa %xmm0, %xmm10 +; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE2OR3-NEXT: movdqa %xmm0, %xmm12 +; SSE2OR3-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] +; SSE2OR3-NEXT: movdqa %xmm3, %xmm7 +; SSE2OR3-NEXT: pxor %xmm11, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm12, %xmm6 +; SSE2OR3-NEXT: por %xmm11, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm13 +; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm13 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm14, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,3,3] +; SSE2OR3-NEXT: por %xmm6, %xmm7 +; SSE2OR3-NEXT: pand %xmm7, %xmm12 +; SSE2OR3-NEXT: pandn %xmm3, %xmm7 +; SSE2OR3-NEXT: por %xmm12, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 +; SSE2OR3-NEXT: pxor %xmm11, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm10, %xmm6 +; SSE2OR3-NEXT: por %xmm11, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm12 +; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm12 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm13, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2OR3-NEXT: por %xmm6, %xmm3 +; SSE2OR3-NEXT: pand %xmm3, %xmm10 +; SSE2OR3-NEXT: pandn %xmm2, %xmm3 +; SSE2OR3-NEXT: por %xmm10, %xmm3 +; SSE2OR3-NEXT: packuswb %xmm7, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 +; SSE2OR3-NEXT: pxor %xmm11, %xmm2 +; SSE2OR3-NEXT: movdqa %xmm9, %xmm6 +; SSE2OR3-NEXT: por %xmm11, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm7 +; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm10, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: por %xmm2, %xmm6 +; SSE2OR3-NEXT: pand %xmm6, %xmm9 +; SSE2OR3-NEXT: pandn %xmm5, %xmm6 +; SSE2OR3-NEXT: por %xmm9, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 +; SSE2OR3-NEXT: pxor %xmm11, %xmm2 +; SSE2OR3-NEXT: por %xmm8, %xmm11 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm11, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm2, %xmm11 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm7, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2OR3-NEXT: por %xmm2, %xmm5 +; SSE2OR3-NEXT: pand %xmm5, %xmm8 +; SSE2OR3-NEXT: pandn %xmm4, %xmm5 +; SSE2OR3-NEXT: por %xmm8, %xmm5 +; SSE2OR3-NEXT: packuswb %xmm6, %xmm5 +; SSE2OR3-NEXT: psubd %xmm3, %xmm0 +; SSE2OR3-NEXT: psubd %xmm5, %xmm1 +; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: test34: ; SSE41: # %bb.0: