diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 429175a10818b3..5f15d07d0bab39 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSE3,SSE3_SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSE3,SSE3_FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSSE3,SSSE3_SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSSE3,SSSE3_FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW @@ -10,10 +12,10 @@ ; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111 define <4 x float> @hadd_v4f32(<4 x float> %a) { -; SSSE3-LABEL: hadd_v4f32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSE-LABEL: hadd_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm0, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: hadd_v4f32: ; AVX: # %bb.0: @@ -27,21 +29,21 @@ define <4 x float> @hadd_v4f32(<4 x float> %a) { } define <8 x float> @hadd_v8f32a(<8 x float> %a) { -; SSSE3_SLOW-LABEL: hadd_v8f32a: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1 -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hadd_v8f32a: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2 -; SSSE3_FAST-NEXT: haddps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0 -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hadd_v8f32a: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movaps %xmm0, %xmm2 +; SSE_SLOW-NEXT: haddps %xmm1, %xmm2 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSE_SLOW-NEXT: movaps %xmm2, %xmm1 +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hadd_v8f32a: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: movaps %xmm0, %xmm2 +; SSE_FAST-NEXT: haddps %xmm1, %xmm2 +; SSE_FAST-NEXT: haddps %xmm0, %xmm0 +; SSE_FAST-NEXT: movaps %xmm2, %xmm1 +; SSE_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: hadd_v8f32a: ; AVX1_SLOW: # %bb.0: @@ -71,11 +73,11 @@ define <8 x float> @hadd_v8f32a(<8 x float> %a) { } define <8 x float> @hadd_v8f32b(<8 x float> %a) { -; SSSE3-LABEL: hadd_v8f32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm0, %xmm0 -; SSSE3-NEXT: haddps %xmm1, %xmm1 -; SSSE3-NEXT: retq +; SSE-LABEL: hadd_v8f32b: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm0, %xmm0 +; SSE-NEXT: haddps %xmm1, %xmm1 +; SSE-NEXT: retq ; ; AVX-LABEL: hadd_v8f32b: ; AVX: # %bb.0: @@ -89,10 +91,10 @@ define <8 x float> @hadd_v8f32b(<8 x float> %a) { } define <4 x float> @hsub_v4f32(<4 x float> %a) { -; SSSE3-LABEL: hsub_v4f32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubps %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSE-LABEL: hsub_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: hsubps %xmm0, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: hsub_v4f32: ; AVX: # %bb.0: @@ -106,21 +108,21 @@ define <4 x float> @hsub_v4f32(<4 x float> %a) { } define <8 x float> @hsub_v8f32a(<8 x float> %a) { -; SSSE3_SLOW-LABEL: hsub_v8f32a: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: hsubps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1 -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hsub_v8f32a: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2 -; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0 -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hsub_v8f32a: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movaps %xmm0, %xmm2 +; SSE_SLOW-NEXT: hsubps %xmm1, %xmm2 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSE_SLOW-NEXT: movaps %xmm2, %xmm1 +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hsub_v8f32a: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: movaps %xmm0, %xmm2 +; SSE_FAST-NEXT: hsubps %xmm1, %xmm2 +; SSE_FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE_FAST-NEXT: movaps %xmm2, %xmm1 +; SSE_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: hsub_v8f32a: ; AVX1_SLOW: # %bb.0: @@ -150,11 +152,11 @@ define <8 x float> @hsub_v8f32a(<8 x float> %a) { } define <8 x float> @hsub_v8f32b(<8 x float> %a) { -; SSSE3-LABEL: hsub_v8f32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubps %xmm0, %xmm0 -; SSSE3-NEXT: hsubps %xmm1, %xmm1 -; SSSE3-NEXT: retq +; SSE-LABEL: hsub_v8f32b: +; SSE: # %bb.0: +; SSE-NEXT: hsubps %xmm0, %xmm0 +; SSE-NEXT: hsubps %xmm1, %xmm1 +; SSE-NEXT: retq ; ; AVX-LABEL: hsub_v8f32b: ; AVX: # %bb.0: @@ -168,18 +170,18 @@ define <8 x float> @hsub_v8f32b(<8 x float> %a) { } define <2 x double> @hadd_v2f64(<2 x double> %a) { -; SSSE3_SLOW-LABEL: hadd_v2f64: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hadd_v2f64: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hadd_v2f64: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE_SLOW-NEXT: addsd %xmm0, %xmm1 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hadd_v2f64: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: hadd_v2f64: ; AVX1_SLOW: # %bb.0: @@ -212,18 +214,18 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) { } define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) { -; SSSE3_SLOW-LABEL: hadd_v2f64_scalar_splat: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hadd_v2f64_scalar_splat: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hadd_v2f64_scalar_splat: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE_SLOW-NEXT: addsd %xmm0, %xmm1 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hadd_v2f64_scalar_splat: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat: ; AVX1_SLOW: # %bb.0: @@ -257,23 +259,23 @@ define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) { } define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) { -; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_splat: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm3 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hadd_v4f64_scalar_splat: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 -; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hadd_v4f64_scalar_splat: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movapd %xmm0, %xmm2 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE_SLOW-NEXT: addsd %xmm0, %xmm2 +; SSE_SLOW-NEXT: movapd %xmm1, %xmm3 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE_SLOW-NEXT: addsd %xmm1, %xmm3 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hadd_v4f64_scalar_splat: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE_FAST-NEXT: haddpd %xmm1, %xmm1 +; SSE_FAST-NEXT: retq ; ; AVX-LABEL: hadd_v4f64_scalar_splat: ; AVX: # %bb.0: @@ -292,20 +294,20 @@ define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) { } define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) { -; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_broadcast: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] -; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hadd_v4f64_scalar_broadcast: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 -; SSSE3_FAST-NEXT: movapd %xmm0, %xmm1 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hadd_v4f64_scalar_broadcast: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE_SLOW-NEXT: addsd %xmm0, %xmm1 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] +; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hadd_v4f64_scalar_broadcast: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE_FAST-NEXT: movapd %xmm0, %xmm1 +; SSE_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast: ; AVX1_SLOW: # %bb.0: @@ -346,23 +348,23 @@ define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) { } define <4 x double> @hadd_v4f64(<4 x double> %a) { -; SSSE3_SLOW-LABEL: hadd_v4f64: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] -; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0] -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hadd_v4f64: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 -; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hadd_v4f64: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movapd %xmm0, %xmm2 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE_SLOW-NEXT: addsd %xmm0, %xmm2 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSE_SLOW-NEXT: movapd %xmm1, %xmm2 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE_SLOW-NEXT: addsd %xmm1, %xmm2 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0] +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hadd_v4f64: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE_FAST-NEXT: haddpd %xmm1, %xmm1 +; SSE_FAST-NEXT: retq ; ; AVX-LABEL: hadd_v4f64: ; AVX: # %bb.0: @@ -376,18 +378,18 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) { } define <2 x double> @hsub_v2f64(<2 x double> %a) { -; SSSE3_SLOW-LABEL: hsub_v2f64: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3_SLOW-NEXT: subsd %xmm1, %xmm0 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hsub_v2f64: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hsub_v2f64: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE_SLOW-NEXT: subsd %xmm1, %xmm0 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hsub_v2f64: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: hsubpd %xmm0, %xmm0 +; SSE_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: hsub_v2f64: ; AVX1_SLOW: # %bb.0: @@ -420,23 +422,23 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) { } define <4 x double> @hsub_v4f64(<4 x double> %a) { -; SSSE3_SLOW-LABEL: hsub_v4f64: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm0 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] -; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm1 -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0] -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: hsub_v4f64: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0 -; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1 -; SSSE3_FAST-NEXT: retq +; SSE_SLOW-LABEL: hsub_v4f64: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: movapd %xmm0, %xmm2 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE_SLOW-NEXT: subsd %xmm2, %xmm0 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSE_SLOW-NEXT: movapd %xmm1, %xmm2 +; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE_SLOW-NEXT: subsd %xmm2, %xmm1 +; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0] +; SSE_SLOW-NEXT: retq +; +; SSE_FAST-LABEL: hsub_v4f64: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: hsubpd %xmm0, %xmm0 +; SSE_FAST-NEXT: hsubpd %xmm1, %xmm1 +; SSE_FAST-NEXT: retq ; ; AVX-LABEL: hsub_v4f64: ; AVX: # %bb.0: @@ -450,6 +452,13 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) { } define <4 x i32> @hadd_v4i32(<4 x i32> %a) { +; SSE3-LABEL: hadd_v4i32: +; SSE3: # %bb.0: +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE3-NEXT: paddd %xmm1, %xmm0 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hadd_v4i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddd %xmm0, %xmm0 @@ -467,6 +476,16 @@ define <4 x i32> @hadd_v4i32(<4 x i32> %a) { } define <8 x i32> @hadd_v8i32a(<8 x i32> %a) { +; SSE3-LABEL: hadd_v8i32a: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm2 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] +; SSE3-NEXT: paddd %xmm0, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3_SLOW-LABEL: hadd_v8i32a: ; SSSE3_SLOW: # %bb.0: ; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 @@ -513,6 +532,16 @@ define <8 x i32> @hadd_v8i32a(<8 x i32> %a) { } define <8 x i32> @hadd_v8i32b(<8 x i32> %a) { +; SSE3-LABEL: hadd_v8i32b: +; SSE3: # %bb.0: +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] +; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2] +; SSE3-NEXT: paddd %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hadd_v8i32b: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddd %xmm0, %xmm0 @@ -539,6 +568,13 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) { } define <4 x i32> @hsub_v4i32(<4 x i32> %a) { +; SSE3-LABEL: hsub_v4i32: +; SSE3: # %bb.0: +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,1,3] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; SSE3-NEXT: psubd %xmm1, %xmm0 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hsub_v4i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phsubd %xmm0, %xmm0 @@ -556,6 +592,16 @@ define <4 x i32> @hsub_v4i32(<4 x i32> %a) { } define <8 x i32> @hsub_v8i32a(<8 x i32> %a) { +; SSE3-LABEL: hsub_v8i32a: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm2 +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE3-NEXT: psubd %xmm0, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3_SLOW-LABEL: hsub_v8i32a: ; SSSE3_SLOW: # %bb.0: ; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 @@ -602,6 +648,16 @@ define <8 x i32> @hsub_v8i32a(<8 x i32> %a) { } define <8 x i32> @hsub_v8i32b(<8 x i32> %a) { +; SSE3-LABEL: hsub_v8i32b: +; SSE3: # %bb.0: +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] +; SSE3-NEXT: psubd %xmm2, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2] +; SSE3-NEXT: psubd %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hsub_v8i32b: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phsubd %xmm0, %xmm0 @@ -628,6 +684,18 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) { } define <8 x i16> @hadd_v8i16(<8 x i16> %a) { +; SSE3-LABEL: hadd_v8i16: +; SSE3: # %bb.0: +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE3-NEXT: paddw %xmm1, %xmm0 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hadd_v8i16: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddw %xmm0, %xmm0 @@ -645,6 +713,28 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %a) { } define <16 x i16> @hadd_v16i16a(<16 x i16> %a) { +; SSE3-LABEL: hadd_v16i16a: +; SSE3: # %bb.0: +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,0,3,2,4,5,6,7] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE3-NEXT: paddw %xmm3, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSE3-NEXT: retq +; ; SSSE3_SLOW-LABEL: hadd_v16i16a: ; SSSE3_SLOW: # %bb.0: ; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 @@ -691,6 +781,32 @@ define <16 x i16> @hadd_v16i16a(<16 x i16> %a) { } define <16 x i16> @hadd_v16i16b(<16 x i16> %a) { +; SSE3-LABEL: hadd_v16i16b: +; SSE3: # %bb.0: +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5] +; SSE3-NEXT: paddw %xmm2, %xmm0 +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5] +; SSE3-NEXT: paddw %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hadd_v16i16b: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddw %xmm0, %xmm0 @@ -717,6 +833,14 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) { } define <8 x i16> @hsub_v8i16(<8 x i16> %a) { +; SSE3-LABEL: hsub_v8i16: +; SSE3: # %bb.0: +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; SSE3-NEXT: psubw %xmm1, %xmm0 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hsub_v8i16: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phsubw %xmm0, %xmm0 @@ -734,6 +858,29 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %a) { } define <16 x i16> @hsub_v16i16a(<16 x i16> %a) { +; SSE3-LABEL: hsub_v16i16a: +; SSE3: # %bb.0: +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE3-NEXT: psubw %xmm0, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3_SLOW-LABEL: hsub_v16i16a: ; SSSE3_SLOW: # %bb.0: ; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 @@ -780,6 +927,32 @@ define <16 x i16> @hsub_v16i16a(<16 x i16> %a) { } define <16 x i16> @hsub_v16i16b(<16 x i16> %a) { +; SSE3-LABEL: hsub_v16i16b: +; SSE3: # %bb.0: +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5] +; SSE3-NEXT: psubw %xmm2, %xmm0 +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5] +; SSE3-NEXT: psubw %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hsub_v16i16b: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phsubw %xmm0, %xmm0 @@ -806,11 +979,11 @@ define <16 x i16> @hsub_v16i16b(<16 x i16> %a) { } define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) { -; SSSE3-LABEL: broadcast_haddps_v4f32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm0, %xmm0 -; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] -; SSSE3-NEXT: retq +; SSE-LABEL: broadcast_haddps_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm0, %xmm0 +; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE-NEXT: retq ; ; AVX1-LABEL: broadcast_haddps_v4f32: ; AVX1: # %bb.0: @@ -831,10 +1004,10 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) { declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { -; SSSE3-LABEL: PR34724_1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSE-LABEL: PR34724_1: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: PR34724_1: ; AVX: # %bb.0: @@ -851,10 +1024,10 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { } define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { -; SSSE3-LABEL: PR34724_2: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSE-LABEL: PR34724_2: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: PR34724_2: ; AVX: # %bb.0: @@ -876,11 +1049,11 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { ; define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) { -; SSSE3-LABEL: hadd_4f32_v8f32_shuffle: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm1, %xmm0 -; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: retq +; SSE-LABEL: hadd_4f32_v8f32_shuffle: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: retq ; ; AVX-LABEL: hadd_4f32_v8f32_shuffle: ; AVX: # %bb.0: @@ -899,11 +1072,11 @@ define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) { } define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) { -; SSSE3-LABEL: hsub_4f32_v8f32_shuffle: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm1, %xmm0 -; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: retq +; SSE-LABEL: hsub_4f32_v8f32_shuffle: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: retq ; ; AVX-LABEL: hsub_4f32_v8f32_shuffle: ; AVX: # %bb.0: @@ -922,6 +1095,14 @@ define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) { } define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) { +; SSE3-LABEL: hadd_4i32_v8i32_shuffle: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm2 +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hadd_4i32_v8i32_shuffle: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddd %xmm1, %xmm0 @@ -953,6 +1134,14 @@ define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) { } define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) { +; SSE3-LABEL: hsub_4i32_v8i32_shuffle: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm2 +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hsub_4i32_v8i32_shuffle: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddd %xmm1, %xmm0 @@ -988,12 +1177,12 @@ define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) { ; define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) { -; SSSE3-LABEL: hadd_4f64_v4f64_shuffle: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddpd %xmm1, %xmm0 -; SSSE3-NEXT: haddpd %xmm3, %xmm2 -; SSSE3-NEXT: movapd %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSE-LABEL: hadd_4f64_v4f64_shuffle: +; SSE: # %bb.0: +; SSE-NEXT: haddpd %xmm1, %xmm0 +; SSE-NEXT: haddpd %xmm3, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_4f64_v4f64_shuffle: ; AVX1: # %bb.0: @@ -1016,12 +1205,12 @@ define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) } define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) { -; SSSE3-LABEL: hsub_4f64_v4f64_shuffle: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubpd %xmm1, %xmm0 -; SSSE3-NEXT: hsubpd %xmm3, %xmm2 -; SSSE3-NEXT: movapd %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSE-LABEL: hsub_4f64_v4f64_shuffle: +; SSE: # %bb.0: +; SSE-NEXT: hsubpd %xmm1, %xmm0 +; SSE-NEXT: hsubpd %xmm3, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: hsub_4f64_v4f64_shuffle: ; AVX1: # %bb.0: @@ -1044,12 +1233,12 @@ define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) } define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) { -; SSSE3-LABEL: hadd_8f32_v8f32_shuffle: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm1, %xmm0 -; SSSE3-NEXT: haddps %xmm3, %xmm2 -; SSSE3-NEXT: movaps %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSE-LABEL: hadd_8f32_v8f32_shuffle: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: haddps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_8f32_v8f32_shuffle: ; AVX1: # %bb.0: @@ -1072,12 +1261,12 @@ define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) { } define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) { -; SSSE3-LABEL: hsub_8f32_v8f32_shuffle: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm1, %xmm0 -; SSSE3-NEXT: haddps %xmm3, %xmm2 -; SSSE3-NEXT: movaps %xmm2, %xmm1 -; SSSE3-NEXT: retq +; SSE-LABEL: hsub_8f32_v8f32_shuffle: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: haddps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: hsub_8f32_v8f32_shuffle: ; AVX1: # %bb.0: @@ -1100,6 +1289,19 @@ define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) { } define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) { +; SSE3-LABEL: hadd_8i32_v8i32_shuffle: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm2, %xmm4 +; SSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] +; SSE3-NEXT: movaps %xmm0, %xmm5 +; SSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE3-NEXT: paddd %xmm4, %xmm2 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE3-NEXT: paddd %xmm5, %xmm0 +; SSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hadd_8i32_v8i32_shuffle: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddd %xmm1, %xmm0 @@ -1130,6 +1332,20 @@ define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) { } define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) { +; SSE3-LABEL: hsub_8i32_v8i32_shuffle: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm2, %xmm4 +; SSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] +; SSE3-NEXT: movaps %xmm0, %xmm5 +; SSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE3-NEXT: psubd %xmm2, %xmm4 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE3-NEXT: psubd %xmm0, %xmm5 +; SSE3-NEXT: movdqa %xmm5, %xmm0 +; SSE3-NEXT: movdqa %xmm4, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hsub_8i32_v8i32_shuffle: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phsubd %xmm1, %xmm0 @@ -1160,6 +1376,45 @@ define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) { } define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) { +; SSE3-LABEL: hadd_16i16_16i16_shuffle: +; SSE3: # %bb.0: +; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE3-NEXT: paddw %xmm5, %xmm2 +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE3-NEXT: paddw %xmm6, %xmm0 +; SSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE3-NEXT: retq +; ; SSSE3-LABEL: hadd_16i16_16i16_shuffle: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddw %xmm1, %xmm0