From 49ba9b8e2ffcbe78de5971074a76a60191d59d86 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 6 Apr 2015 18:39:00 +0000 Subject: [PATCH] [X86][SSE] Use (V)PINSRB for direct byte insertion in 16i8 buildvector on SSE4.1 targets This patch allows SSE4.1 targets to use (V)PINSRB to create 16i8 vectors by inserting i8 scalars directly into an XMM register instead of merging pairs of i8 scalars into an i16 and using the SSE2 PINSRW instruction. This allows folding of byte loads and reduces scalar register usage as well. Differential Revision: http://reviews.llvm.org/D8839 llvm-svn: 234193 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++++ llvm/test/CodeGen/X86/vec_cast2.ll | 74 +++++++--------- .../CodeGen/X86/vector-shuffle-128-v16.ll | 86 +++++++++++++------ 3 files changed, 115 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3055256605a18..2101724588b9f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4460,6 +4460,29 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, SDLoc dl(Op); SDValue V; bool First = true; + + // SSE4.1 - use PINSRB to insert each byte directly. + if (Subtarget->hasSSE41()) { + for (unsigned i = 0; i < 16; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v16i8); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v16i8, V, Op.getOperand(i), + DAG.getIntPtrConstant(i)); + } + } + + return V; + } + + // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. 
for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll index 07cd1951365dd..e50789570ed03 100644 --- a/llvm/test/CodeGen/X86/vec_cast2.ll +++ b/llvm/test/CodeGen/X86/vec_cast2.ll @@ -100,37 +100,29 @@ define <8 x i8> @foo3_8(<8 x float> %src) { ; ; CHECK-WIDE-LABEL: foo3_8: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx -; CHECK-WIDE-NEXT: movzbl %cl, %ecx -; CHECK-WIDE-NEXT: orl %eax, %ecx -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx -; CHECK-WIDE-NEXT: movzbl %dl, %edx -; CHECK-WIDE-NEXT: orl %eax, %edx -; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1 -; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax +; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax +; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3] ; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax 
-; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: movzbl %cl, %ecx -; CHECK-WIDE-NEXT: orl %eax, %ecx -; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0] ; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax -; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx -; CHECK-WIDE-NEXT: movzbl %cl, %ecx -; CHECK-WIDE-NEXT: orl %eax, %ecx -; CHECK-WIDE-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax +; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 ; CHECK-WIDE-NEXT: vzeroupper ; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i8> @@ -145,21 +137,17 @@ define <4 x i8> @foo3_4(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: foo3_4: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx -; CHECK-WIDE-NEXT: movzbl %cl, %ecx -; CHECK-WIDE-NEXT: orl %eax, %ecx -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax -; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx -; CHECK-WIDE-NEXT: movzbl %dl, %edx -; CHECK-WIDE-NEXT: orl %eax, %edx -; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax +; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: 
vpinsrb $1, %eax, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax +; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 ; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i8> ret <4 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 60cfec688fd23..53d13c86657b5 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -651,18 +651,30 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( } define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { -; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE: # BB#0: -; SSE-NEXT: shll $8, %edi -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pinsrw $2, %edi, %xmm0 -; SSE-NEXT: retq - +; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: shll $8, %edi +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pinsrw $2, %edi, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: shll $8, %edi +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pinsrw $2, %edi, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pinsrb $5, %edi, %xmm0 +; SSE41-NEXT: retq +; ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: shll $8, %edi ; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, %edi, %xmm0 +; AVX-NEXT: vpinsrb $5, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> 
undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> @@ -670,18 +682,30 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( } define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { -; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; SSE: # BB#0: -; SSE-NEXT: shll $8, %edi -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pinsrw $7, %edi, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; SSE2: # BB#0: +; SSE2-NEXT: shll $8, %edi +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pinsrw $7, %edi, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; SSSE3: # BB#0: +; SSSE3-NEXT: shll $8, %edi +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pinsrw $7, %edi, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pinsrb $15, %edi, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; AVX: # BB#0: -; AVX-NEXT: shll $8, %edi ; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, %edi, %xmm0 +; AVX-NEXT: vpinsrb $15, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> @@ -689,18 +713,30 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16( } define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { -; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE: # BB#0: -; SSE-NEXT: movzbl %dil, %eax -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pinsrw $1, %eax, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: 
movzbl %dil, %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pinsrw $1, %eax, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pinsrb $2, %edi, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: movzbl %dil, %eax ; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $1, %eax, %xmm0 +; AVX-NEXT: vpinsrb $2, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 3 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>