[X86][SSE] getFauxShuffle - support insert(truncate/extend(extract(vec0,c0)),vec1,c1) shuffle patterns at the byte level

Follow-up to the PR45604 fix at rGe71dd7c011a3, where we disabled most of these cases.

By creating the shuffle at the byte level, we can handle any extension/truncation as long as we track how small the scalar got and assume that the upper bytes will need to be zero.
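As a rough standalone sketch of that byte-level construction (illustrative only: the buildInsertByteMask helper, the std::vector container, and the single-input simplification where source and destination bytes share one index space are assumptions for this sketch, not the in-tree getFauxShuffleMask code):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Byte positions that are "don't care" or known to be zero.
constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// Build a byte-level mask for: insert(trunc/ext(extract(Src, SrcIdx)), Dst, DstIdx).
// MinBytesPerElt is how many low bytes of the scalar survived the
// truncate/extend chain; the rest of the destination element is known zero.
std::vector<int> buildInsertByteMask(unsigned VecSizeInBytes,
                                     unsigned SrcIdx, unsigned SrcEltBytes,
                                     unsigned DstIdx, unsigned DstEltBytes,
                                     unsigned MinBytesPerElt) {
  std::vector<int> Mask(VecSizeInBytes);
  // Start from an identity mask over the destination vector's bytes.
  for (unsigned i = 0; i != VecSizeInBytes; ++i)
    Mask[i] = (int)i;

  unsigned SrcByte = SrcIdx * SrcEltBytes;
  unsigned DstByte = DstIdx * DstEltBytes;
  MinBytesPerElt = std::min(MinBytesPerElt, DstEltBytes);

  // Low bytes come straight from the extracted element...
  for (unsigned i = 0; i != MinBytesPerElt; ++i)
    Mask[DstByte + i] = (int)(SrcByte + i);
  // ...and the remaining bytes of the destination element are zero.
  for (unsigned i = MinBytesPerElt; i != DstEltBytes; ++i)
    Mask[DstByte + i] = SentinelZero;
  return Mask;
}

int main() {
  // Example: extract i32 element 2 of a <4 x i32>, zext to i64, insert as
  // element 1 of a <2 x i64> (compare extract2_i32_zext_insert1_i64_zero below).
  std::vector<int> Mask = buildInsertByteMask(16, /*SrcIdx=*/2, /*SrcEltBytes=*/4,
                                              /*DstIdx=*/1, /*DstEltBytes=*/8,
                                              /*MinBytesPerElt=*/4);
  for (int M : Mask)
    std::printf("%d ", M); // bytes 0..7 identity, 8..11 from the source, 12..15 zero
  std::printf("\n");
  return 0;
}
```

The old element-level approach had to bail out whenever the trunc/ext chain changed the element size; tracking the surviving width in bytes and padding with known-zero bytes keeps the pattern expressible as an ordinary shuffle.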
RKSimon committed Apr 26, 2020
1 parent 33f043c commit acbc5ed
Showing 5 changed files with 142 additions and 81 deletions.
36 changes: 18 additions & 18 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7462,16 +7462,18 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}

// Peek through trunc/aext/zext.
// TODO: handle elements smaller than VT.
// TODO: aext shouldn't require SM_SentinelZero padding.
// TODO: handle shift of scalars.
unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
while (Scl.getOpcode() == ISD::TRUNCATE ||
Scl.getOpcode() == ISD::ANY_EXTEND ||
Scl.getOpcode() == ISD::ZERO_EXTEND) {
Scl = Scl.getOperand(0);
if (Scl.getScalarValueSizeInBits() < NumBitsPerElt)
return false;
if (MinBitsPerElt > Scl.getScalarValueSizeInBits())
MinBitsPerElt = Scl.getScalarValueSizeInBits();
}
if ((MinBitsPerElt % 8) != 0)
return false;

// Attempt to find the source vector the scalar was extracted from.
SDValue SrcExtract;
@@ -7486,31 +7488,29 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,

SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumZeros =
std::max<int>((NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1, 0);

if ((NumSrcElts % NumElts) != 0)
if (!SrcVT.getScalarType().isByteSized())
return false;

unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
if (NumSrcElts <= SrcIdx)
return false;
unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
unsigned DstByte = DstIdx * NumBytesPerElt;

// Create 'identity' byte level shuffle mask and then add inserted bytes.
if (Opcode == ISD::SCALAR_TO_VECTOR) {
Ops.push_back(SrcVec);
Mask.append(NumSrcElts, SM_SentinelUndef);
Mask.append(NumSizeInBytes, SM_SentinelUndef);
} else {
Ops.push_back(SrcVec);
Ops.push_back(N.getOperand(0));
for (int i = 0; i != (int)NumSrcElts; ++i)
Mask.push_back(NumSrcElts + i);
for (int i = 0; i != (int)NumSizeInBytes; ++i)
Mask.push_back(NumSizeInBytes + i);
}

int Scale = NumSrcElts / NumElts;
Mask[Scale * DstIdx] = SrcIdx;
for (int i = 0; i != (int)NumZeros; ++i)
Mask[(Scale * DstIdx) + i + 1] = SM_SentinelZero;
unsigned MinBytesPerElts = MinBitsPerElt / 8;
MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
for (unsigned i = 0; i != MinBytesPerElts; ++i)
Mask[DstByte + i] = SrcByte + i;
for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
Mask[DstByte + i] = SM_SentinelZero;
return true;
}
case X86ISD::PACKSS:
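To make the sentinel values concrete, here is a minimal model of how such a byte mask resolves against its two inputs (a sketch under assumed conventions — applyByteShuffle, the fixed 16-byte width, and the operand ordering are invented for illustration, not the X86 shuffle-combining code):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// Resolve a 16-entry byte mask against two 16-byte inputs. Indices 0..15
// pick bytes of Op0, 16..31 pick bytes of Op1, and sentinel entries are
// rendered as zero here to keep the model deterministic.
std::array<uint8_t, 16> applyByteShuffle(const std::array<int, 16> &Mask,
                                         const std::array<uint8_t, 16> &Op0,
                                         const std::array<uint8_t, 16> &Op1) {
  std::array<uint8_t, 16> Out{};
  for (unsigned i = 0; i != 16; ++i) {
    int M = Mask[i];
    if (M == SentinelZero || M == SentinelUndef)
      Out[i] = 0;
    else if (M < 16)
      Out[i] = Op0[M];
    else
      Out[i] = Op1[M - 16];
  }
  return Out;
}

int main() {
  // Two-input version of the earlier example: keep the base vector's bytes,
  // splice in bytes 8..11 of the second input, zero bytes 12..15.
  std::array<int, 16> Mask;
  for (unsigned i = 0; i != 16; ++i)
    Mask[i] = (int)i;            // identity over Op0
  for (unsigned i = 8; i != 12; ++i)
    Mask[i] = 16 + (int)i;       // bytes of Op1's i32 element 2
  for (unsigned i = 12; i != 16; ++i)
    Mask[i] = SentinelZero;      // zext padding

  std::array<uint8_t, 16> Base{}, Src{};
  for (unsigned i = 0; i != 16; ++i)
    Src[i] = (uint8_t)(0xA0 + i);
  std::array<uint8_t, 16> Out = applyByteShuffle(Mask, Base, Src);
  for (uint8_t B : Out)
    std::printf("%02x ", B); // 00 x8, a8 a9 aa ab, 00 x4
  std::printf("\n");
  return 0;
}
```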
35 changes: 18 additions & 17 deletions llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -293,24 +293,19 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract2_i32_zext_insert1_i64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract2_i32_zext_insert1_i64_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: extractps $2, %xmm0, %eax
; SSE41-NEXT: movq %rax, %xmm0
; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: extract2_i32_zext_insert1_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractps $2, %xmm0, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 2
%z = zext i32 %e to i64
@@ -386,16 +381,22 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
}

define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $0, %xmm0, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: extract0_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $0, %xmm0, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 0
%z = zext i16 %e to i64
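As a concrete case, the last test above reduces to a scalar reference like the following sketch (semantics read off the IR shown in the diff; the _ref suffix and the small harness are invented):

```cpp
#include <cstdint>
#include <cstdio>

// Scalar reference for extract0_i16_zext_insert0_i64_zero: zero-extend x[0]
// into lane 0 of a <2 x i64> whose other lane is zero.
void extract0_i16_zext_insert0_i64_zero_ref(const uint16_t x[8], uint64_t out[2]) {
  out[0] = (uint64_t)x[0]; // zext i16 -> i64: bytes 2..7 of the lane are zero
  out[1] = 0;
}

int main() {
  uint16_t x[8] = {0x1234, 0x5678, 0, 0, 0, 0, 0, 0};
  uint64_t out[2];
  extract0_i16_zext_insert0_i64_zero_ref(x, out);
  std::printf("%016llx %016llx\n", (unsigned long long)out[0],
              (unsigned long long)out[1]);
  // Only bytes 0..1 of the 16-byte result can be nonzero, so the whole
  // operation is a byte-level blend against zero - hence the single
  // pblendw in the SSE41/AVX output above.
  return 0;
}
```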
5 changes: 1 addition & 4 deletions llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -21,10 +21,7 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
; SSE41-LABEL: foo:
; SSE41: # %bb.0:
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: pextrb $8, %xmm0, %eax
; SSE41-NEXT: pextrb $4, %xmm0, %ecx
; SSE41-NEXT: pinsrb $1, %ecx, %xmm0
; SSE41-NEXT: pinsrb $2, %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movl $255, %eax
; SSE41-NEXT: pinsrb $3, %eax, %xmm0
; SSE41-NEXT: movd %xmm0, (%rdi)
10 changes: 2 additions & 8 deletions llvm/test/CodeGen/X86/extract-concat.ll
@@ -24,10 +24,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
; SSE42-LABEL: foo:
; SSE42: # %bb.0:
; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
; SSE42-NEXT: pextrb $8, %xmm0, %eax
; SSE42-NEXT: pextrb $4, %xmm0, %ecx
; SSE42-NEXT: pinsrb $1, %ecx, %xmm0
; SSE42-NEXT: pinsrb $2, %eax, %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movl $255, %eax
; SSE42-NEXT: pinsrb $3, %eax, %xmm0
; SSE42-NEXT: movd %xmm0, (%rdi)
@@ -36,10 +33,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
; AVX-LABEL: foo:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpextrb $8, %xmm0, %eax
; AVX-NEXT: vpextrb $4, %xmm0, %ecx
; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: movl $255, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
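In both foo tests the pextrb/pinsrb chain collapses into a single pshufb once the inserts are recognised as a byte-level shuffle. A small scalar model of PSHUFB's per-byte behaviour (the pshufbModel name and harness are invented; the real mask leaves the upper bytes undef where this model zeroes them) shows why one shuffle covers all three inserted bytes:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Per-byte PSHUFB semantics for a single 128-bit register: a control byte
// with its top bit set produces zero, otherwise its low 4 bits index into
// the source bytes.
std::array<uint8_t, 16> pshufbModel(const std::array<uint8_t, 16> &Src,
                                    const std::array<uint8_t, 16> &Ctrl) {
  std::array<uint8_t, 16> Out{};
  for (unsigned i = 0; i != 16; ++i)
    Out[i] = (Ctrl[i] & 0x80) ? 0 : Src[Ctrl[i] & 0x0F];
  return Out;
}

int main() {
  // After cvttps2dq the truncated values sit at bytes 0, 4, 8 (and 12).
  // The control {0,4,8,zero,...} gathers their low bytes into bytes 0..2
  // and zeroes byte 3, which the following pinsrb overwrites with 255 -
  // matching the xmm0[0,4,8],zero mask printed in the tests above.
  std::array<uint8_t, 16> Src{}, Ctrl{};
  for (unsigned i = 0; i != 16; ++i) {
    Src[i] = (uint8_t)(i * 3); // arbitrary recognisable payload
    Ctrl[i] = 0x80;            // default: zero the output byte
  }
  Ctrl[0] = 0;
  Ctrl[1] = 4;
  Ctrl[2] = 8;
  std::array<uint8_t, 16> Out = pshufbModel(Src, Ctrl);
  for (uint8_t B : Out)
    std::printf("%u ", B); // 0 12 24 0 0 ...
  std::printf("\n");
  return 0;
}
```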
137 changes: 103 additions & 34 deletions llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3028,40 +3028,109 @@ define void @PR43024() {
}

define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
; SSE-LABEL: PR45604:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rsi), %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: movzwl %ax, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movl $11, %eax
; SSE-NEXT: pinsrw $2, %eax, %xmm0
; SSE-NEXT: pextrw $1, %xmm1, %ecx
; SSE-NEXT: pinsrw $4, %ecx, %xmm0
; SSE-NEXT: pinsrw $6, %eax, %xmm0
; SSE-NEXT: pextrw $2, %xmm1, %ecx
; SSE-NEXT: movd %ecx, %xmm2
; SSE-NEXT: pinsrw $2, %eax, %xmm2
; SSE-NEXT: pextrw $3, %xmm1, %ecx
; SSE-NEXT: pinsrw $4, %ecx, %xmm2
; SSE-NEXT: pinsrw $6, %eax, %xmm2
; SSE-NEXT: pextrw $4, %xmm1, %ecx
; SSE-NEXT: movd %ecx, %xmm3
; SSE-NEXT: pinsrw $2, %eax, %xmm3
; SSE-NEXT: pextrw $5, %xmm1, %ecx
; SSE-NEXT: pinsrw $4, %ecx, %xmm3
; SSE-NEXT: pinsrw $6, %eax, %xmm3
; SSE-NEXT: pextrw $6, %xmm1, %ecx
; SSE-NEXT: movd %ecx, %xmm4
; SSE-NEXT: pinsrw $2, %eax, %xmm4
; SSE-NEXT: pextrw $7, %xmm1, %ecx
; SSE-NEXT: pinsrw $4, %ecx, %xmm4
; SSE-NEXT: pinsrw $6, %eax, %xmm4
; SSE-NEXT: movdqa %xmm4, 48(%rdi)
; SSE-NEXT: movdqa %xmm3, 32(%rdi)
; SSE-NEXT: movdqa %xmm2, 16(%rdi)
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
; SSE2-LABEL: PR45604:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movl $11, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
; SSE2-NEXT: pextrw $2, %xmm1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pinsrw $2, %eax, %xmm2
; SSE2-NEXT: pextrw $3, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm2
; SSE2-NEXT: pinsrw $6, %eax, %xmm2
; SSE2-NEXT: pextrw $4, %xmm1, %ecx
; SSE2-NEXT: movd %ecx, %xmm3
; SSE2-NEXT: pinsrw $2, %eax, %xmm3
; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm3
; SSE2-NEXT: pinsrw $6, %eax, %xmm3
; SSE2-NEXT: pextrw $6, %xmm1, %ecx
; SSE2-NEXT: movd %ecx, %xmm4
; SSE2-NEXT: pinsrw $2, %eax, %xmm4
; SSE2-NEXT: pextrw $7, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm4
; SSE2-NEXT: pinsrw $6, %eax, %xmm4
; SSE2-NEXT: movdqa %xmm4, 48(%rdi)
; SSE2-NEXT: movdqa %xmm3, 32(%rdi)
; SSE2-NEXT: movdqa %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR45604:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rsi), %xmm1
; SSSE3-NEXT: movd %xmm1, %eax
; SSSE3-NEXT: movzwl %ax, %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: movl $11, %eax
; SSSE3-NEXT: pinsrw $2, %eax, %xmm0
; SSSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
; SSSE3-NEXT: pextrw $2, %xmm1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: pinsrw $2, %eax, %xmm2
; SSSE3-NEXT: pextrw $3, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2
; SSSE3-NEXT: pinsrw $6, %eax, %xmm2
; SSSE3-NEXT: pextrw $4, %xmm1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm3
; SSSE3-NEXT: pinsrw $2, %eax, %xmm3
; SSSE3-NEXT: pextrw $5, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm3
; SSSE3-NEXT: pinsrw $6, %eax, %xmm3
; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm4
; SSSE3-NEXT: pinsrw $2, %eax, %xmm4
; SSSE3-NEXT: pextrw $7, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4
; SSSE3-NEXT: pinsrw $6, %eax, %xmm4
; SSSE3-NEXT: movdqa %xmm4, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm3, 32(%rdi)
; SSSE3-NEXT: movdqa %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR45604:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rsi), %xmm1
; SSE41-NEXT: pextrw $2, %xmm1, %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: movl $11, %eax
; SSE41-NEXT: pinsrw $2, %eax, %xmm0
; SSE41-NEXT: pextrw $3, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
; SSE41-NEXT: pinsrw $6, %eax, %xmm0
; SSE41-NEXT: pextrw $4, %xmm1, %ecx
; SSE41-NEXT: movd %ecx, %xmm2
; SSE41-NEXT: pinsrw $2, %eax, %xmm2
; SSE41-NEXT: pextrw $5, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm2
; SSE41-NEXT: pinsrw $6, %eax, %xmm2
; SSE41-NEXT: pextrw $6, %xmm1, %ecx
; SSE41-NEXT: movd %ecx, %xmm3
; SSE41-NEXT: pinsrw $2, %eax, %xmm3
; SSE41-NEXT: pextrw $7, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm3
; SSE41-NEXT: pinsrw $6, %eax, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
; SSE41-NEXT: pinsrw $2, %eax, %xmm4
; SSE41-NEXT: pextrw $1, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm4
; SSE41-NEXT: pinsrw $6, %eax, %xmm4
; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR45604:
; AVX1: # %bb.0: