[X86][SSE] Improve i16 splatting shuffles

Better handling of the annoying pshuflw/pshufhw ops, which only shuffle the lower or upper half of a vector.

Added vXi16 unary shuffle support for cases where i16 elements (from the same half of the source) are being splatted to the whole of one of the halves. This avoids the general lowering path, which has to shuffle the 32-bit elements first and so used to leave us with unnecessary duplicate pshuflw/pshufhw shuffles.
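
As a rough illustration (a sketch of mine, not part of this patch), the scalar model below mimics the new two-instruction lowering for a v8i16 mask such as <1,1,1,1,3,3,3,3>, where both splatted elements come from the low half: a pshuflw-style step splats each input to a dword within that half, then a pshufd-style step broadcasts those dwords to the two halves. The helper functions and sample values are illustrative assumptions only.

  #include <array>
  #include <cassert>
  #include <cstdint>

  using V8I16 = std::array<uint16_t, 8>;

  // Model of PSHUFLW: permute the low 4 words by Mask, pass the high 4 through.
  static V8I16 pshuflw(const V8I16 &V, const std::array<int, 4> &Mask) {
    V8I16 R = V;
    for (int i = 0; i != 4; ++i)
      R[i] = V[Mask[i]];
    return R;
  }

  // Model of PSHUFD: permute the 4 dwords (adjacent word pairs) by Mask.
  static V8I16 pshufd(const V8I16 &V, const std::array<int, 4> &Mask) {
    V8I16 R;
    for (int i = 0; i != 4; ++i) {
      R[2 * i + 0] = V[2 * Mask[i] + 0];
      R[2 * i + 1] = V[2 * Mask[i] + 1];
    }
    return R;
  }

  int main() {
    V8I16 Src = {10, 11, 12, 13, 14, 15, 16, 17};
    // Target shuffle mask <1,1,1,1,3,3,3,3>: element 1 splatted across the low
    // half, element 3 across the high half. The masks below are the ones the
    // new SplatHalfs helper would build: {1,1,3,3} for PSHUFLW and {0,0,1,1}
    // for PSHUFD.
    V8I16 Tmp = pshuflw(Src, {1, 1, 3, 3});
    V8I16 Res = pshufd(Tmp, {0, 0, 1, 1});
    for (int i = 0; i != 4; ++i) {
      assert(Res[i] == Src[1]);     // low half is a splat of element 1
      assert(Res[i + 4] == Src[3]); // high half is a splat of element 3
    }
    return 0;
  }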

Note this has the side effect that many SSSE3 test cases no longer need PSHUFB, as the new sequence falls below the 3-op combine threshold at which PSHUFB is typically considered worthwhile. I've raised PR26183 to discuss whether the threshold should be changed and whether it needs to be more specific to the target CPU.

Differential Revision: http://reviews.llvm.org/D14901

llvm-svn: 258440
RKSimon committed Jan 21, 2016
1 parent ca05160 commit 5ba1c12
Showing 10 changed files with 189 additions and 212 deletions.
20 changes: 20 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9052,6 +9052,26 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

  // If we are splatting two values from one half - one to each half, then
  // we can shuffle that half so each is splatted to a dword, then splat those
  // to their respective halves.
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
                        int DOffset) {
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);

  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
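As a quick cross-check (again a sketch of mine, not from the patch), the immediates SplatHalfs builds for the high-half case can be computed directly; for a splat of element 5 they come out as {1,1,1,1} and {2,2,3,3}, matching the updated vpshufhw/vpshufd CHECK lines in avx-splat.ll's funcB below. The standalone program and its hard-coded inputs are assumptions for demonstration.

  #include <cstdio>

  int main() {
    // Splatting element 5 gives HToLInputs[0] == HToHInputs[0] == 5, so the
    // new code path calls SplatHalfs(5, 5, X86ISD::PSHUFHW, /*DOffset=*/2).
    int LoInput = 5, HiInput = 5, DOffset = 2;
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
    // Prints 1,1,1,1 and 2,2,3,3 -- i.e. vpshufhw xmm0[0,1,2,3,5,5,5,5]
    // followed by vpshufd xmm0[2,2,3,3], as in funcB.
    printf("PSHUFHW mask: %d,%d,%d,%d\n", PSHUFHalfMask[0], PSHUFHalfMask[1],
           PSHUFHalfMask[2], PSHUFHalfMask[3]);
    printf("PSHUFD mask:  %d,%d,%d,%d\n", PSHUFDMask[0], PSHUFDMask[1],
           PSHUFDMask[2], PSHUFDMask[3]);
    return 0;
  }
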
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/X86/avx-splat.ll
@@ -15,7 +15,8 @@ entry:
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
95 changes: 36 additions & 59 deletions llvm/test/CodeGen/X86/psubus.ll
@@ -54,30 +54,21 @@ vector.ph:
}

define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE2-LABEL: test3:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test3:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
; SSE-LABEL: test3:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movd %esi, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: movdqu (%rdi), %xmm1
; SSE-NEXT: psubusw %xmm0, %xmm1
; SSE-NEXT: movdqu %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
@@ -159,9 +150,8 @@ define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: psubusb %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rdi)
@@ -304,46 +294,34 @@ vector.ph:
}

define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE2-LABEL: test9:
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: psubusw %xmm0, %xmm2
; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
; SSE2-NEXT: movdqu %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test9:
; SSSE3: ## BB#0: ## %vector.ph
; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: movdqu (%rdi), %xmm1
; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
; SSSE3-NEXT: psubusw %xmm0, %xmm1
; SSSE3-NEXT: psubusw %xmm0, %xmm2
; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqu %xmm1, (%rdi)
; SSSE3-NEXT: retq
; SSE-LABEL: test9:
; SSE: ## BB#0: ## %vector.ph
; SSE-NEXT: movd %esi, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: movdqu (%rdi), %xmm1
; SSE-NEXT: movdqu 16(%rdi), %xmm2
; SSE-NEXT: psubusw %xmm0, %xmm1
; SSE-NEXT: psubusw %xmm0, %xmm2
; SSE-NEXT: movdqu %xmm2, 16(%rdi)
; SSE-NEXT: movdqu %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovd %esi, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
@@ -471,9 +449,8 @@ define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: psubusb %xmm0, %xmm1
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -745,9 +745,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -949,9 +948,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X32-SSE-NEXT: psllw $5, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -597,9 +597,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -727,9 +726,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -545,9 +545,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -667,9 +666,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
