Skip to content

Commit

Permalink
[X86][SSE] Use (V)PINSRB for direct byte insertion in 16i8 buildvecto…
Browse files Browse the repository at this point in the history
…r on SSE4.1 targets

This patch allows SSE4.1 targets to use (V)PINSRB to create 16i8 vectors by inserting i8 scalars directly into a XMM register instead of merging pairs of i8 scalars into a i16 and using the SSE2 PINSRW instruction.

This allows folding of byte loads and reduces scalar register usage as well.

Differential Revision: http://reviews.llvm.org/D8839

llvm-svn: 234193
  • Loading branch information
RKSimon committed Apr 6, 2015
1 parent 7431fb0 commit 49ba9b8
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 68 deletions.
23 changes: 23 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -4460,6 +4460,29 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
SDLoc dl(Op);
SDValue V;
bool First = true;

// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget->hasSSE41()) {
for (unsigned i = 0; i < 16; ++i) {
bool isNonZero = (NonZeros & (1 << i)) != 0;
if (isNonZero) {
if (First) {
if (NumZero)
V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
else
V = DAG.getUNDEF(MVT::v16i8);
First = false;
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
MVT::v16i8, V, Op.getOperand(i),
DAG.getIntPtrConstant(i));
}
}

return V;
}

// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
if (ThisIsNonZero && First) {
Expand Down
74 changes: 31 additions & 43 deletions llvm/test/CodeGen/X86/vec_cast2.ll
Expand Up @@ -100,37 +100,29 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
;
; CHECK-WIDE-LABEL: foo3_8:
; CHECK-WIDE: ## BB#0:
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
; CHECK-WIDE-NEXT: orl %eax, %ecx
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
; CHECK-WIDE-NEXT: movzbl %dl, %edx
; CHECK-WIDE-NEXT: orl %eax, %edx
; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
; CHECK-WIDE-NEXT: orl %eax, %ecx
; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
; CHECK-WIDE-NEXT: orl %eax, %ecx
; CHECK-WIDE-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i8>
Expand All @@ -145,21 +137,17 @@ define <4 x i8> @foo3_4(<4 x float> %src) {
;
; CHECK-WIDE-LABEL: foo3_4:
; CHECK-WIDE: ## BB#0:
; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
; CHECK-WIDE-NEXT: movzbl %cl, %ecx
; CHECK-WIDE-NEXT: orl %eax, %ecx
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: shll $8, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
; CHECK-WIDE-NEXT: movzbl %dl, %edx
; CHECK-WIDE-NEXT: orl %eax, %edx
; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i8>
ret <4 x i8> %res
Expand Down
86 changes: 61 additions & 25 deletions llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
Expand Up @@ -651,56 +651,92 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
}

define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE: # BB#0:
; SSE-NEXT: shll $8, %edi
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $2, %edi, %xmm0
; SSE-NEXT: retq

; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $2, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $5, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: shll $8, %edi
; AVX-NEXT: vpxor %xmm0, %xmm0
; AVX-NEXT: vpinsrw $2, %edi, %xmm0
; AVX-NEXT: vpinsrb $5, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE: # BB#0:
; SSE-NEXT: shll $8, %edi
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $7, %edi, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE2: # BB#0:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $7, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSSE3: # BB#0:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $15, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; AVX: # BB#0:
; AVX-NEXT: shll $8, %edi
; AVX-NEXT: vpxor %xmm0, %xmm0
; AVX-NEXT: vpinsrw $7, %edi, %xmm0
; AVX-NEXT: vpinsrb $15, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE: # BB#0:
; SSE-NEXT: movzbl %dil, %eax
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $1, %eax, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: movzbl %dil, %eax
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0
; AVX-NEXT: vpinsrw $1, %eax, %xmm0
; AVX-NEXT: vpinsrb $2, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 3
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
Expand Down

0 comments on commit 49ba9b8

Please sign in to comment.