[X86] Teach combineTruncatedArithmetic to handle some cases of ISD::SUB
We can safely avoid interfering with the subus combine if both inputs are freely truncatable: either both are extends, or one is an extend and the other is a constant vector.

Differential Revision: https://reviews.llvm.org/D50878

llvm-svn: 340212
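
To illustrate the pattern this enables, here is a minimal IR sketch modeled on the trunc_ext_sub_v16i16_v16i8 test updated below (the trailing trunc/ret lines are assumed from the declared return type). Both sub operands are zero-extends, so both inputs are freely truncatable and, per the updated checks, the subtract now lowers to a single psubb/vpsubb instead of widening to i16:

```llvm
; Truncated subtract of two zero-extended values: both inputs are freely
; truncatable, so combineTruncatedArithmetic can now narrow the sub itself.
define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
  %a = zext <16 x i8> %x to <16 x i16>
  %b = zext <16 x i8> %y to <16 x i16>
  %c = sub <16 x i16> %a, %b
  %d = trunc <16 x i16> %c to <16 x i8>
  ret <16 x i8> %d
}
```

Running this through llc for an x86-64 target should reproduce the updated FileCheck lines in vector-trunc-math.ll below.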
topperc committed Aug 20, 2018
1 parent 08e7e04 commit 7dcb2c4
Showing 2 changed files with 38 additions and 227 deletions.
llvm/lib/Target/X86/X86ISelLowering.cpp: 47 changes (21 additions, 26 deletions)
@@ -36795,34 +36795,20 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   EVT SrcVT = Src.getValueType();
 
-  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
+  auto IsFreeTruncation = [VT](SDValue Op) {
     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
 
-    // Repeated operand, so we are only trading one output truncation for
-    // one input truncation.
-    if (Op0 == Op1)
-      return true;
-
-    // See if either operand has been extended from a smaller/equal size to
+    // See if this has been extended from a smaller/equal size to
     // the truncation size, allowing a truncation to combine with the extend.
-    unsigned Opcode0 = Op0.getOpcode();
-    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
-         Opcode0 == ISD::ZERO_EXTEND) &&
-        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
-      return true;
-
-    unsigned Opcode1 = Op1.getOpcode();
-    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
-         Opcode1 == ISD::ZERO_EXTEND) &&
-        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+    unsigned Opcode = Op.getOpcode();
+    if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
+         Opcode == ISD::ZERO_EXTEND) &&
+        Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
       return true;
 
-    // See if either operand is a single use constant which can be constant
-    // folded.
-    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
-    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
-    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
-           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+    // See if this is a single use constant which can be constant folded.
+    SDValue BC = peekThroughOneUseBitcasts(Op);
+    return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
   };
 
   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
@@ -36850,7 +36836,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
     SDValue Op0 = Src.getOperand(0);
     SDValue Op1 = Src.getOperand(1);
     if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
-        IsRepeatedOpOrFreeTruncation(Op0, Op1))
+        (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
       return TruncateArithmetic(Op0, Op1);
     break;
   }
@@ -36863,11 +36849,20 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
     LLVM_FALLTHROUGH;
   case ISD::ADD: {
-    // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
     SDValue Op0 = Src.getOperand(0);
     SDValue Op1 = Src.getOperand(1);
     if (TLI.isOperationLegal(Opcode, VT) &&
-        IsRepeatedOpOrFreeTruncation(Op0, Op1))
+        (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
       return TruncateArithmetic(Op0, Op1);
     break;
   }
+  case ISD::SUB: {
+    // TODO: ISD::SUB We are conservative and require both sides to be freely
+    // truncatable to avoid interfering with combineSubToSubus.
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegal(Opcode, VT) &&
+        (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
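Unlike the ADD case just above, the new ISD::SUB case requires both operands to be freely truncatable, not merely one, so subtract patterns that combineSubToSubus still wants to turn into unsigned saturating subtracts are left alone. A sketch of a qualifying extend-plus-constant input, modeled on the trunc_ext_sub_const_rhs_v16i16_v16i8 test in the file below (the trunc/ret lines are assumed), looks like this:

```llvm
; One operand is a zero-extend, the other a build vector of constants; both are
; freely truncatable, so the i16 subtract can be replaced by an i8 subtract.
define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
  %a = zext <16 x i8> %x to <16 x i16>
  %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7,
                           i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %c = trunc <16 x i16> %b to <16 x i8>
  ret <16 x i8> %c
}
```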
llvm/test/CodeGen/X86/vector-trunc-math.ll: 218 changes (17 additions, 201 deletions)
@@ -1324,79 +1324,13 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; SSE-NEXT: psubw %xmm4, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm3, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
 %a = zext <16 x i8> %x to <16 x i16>
 %b = zext <16 x i8> %y to <16 x i16>
 %c = sub <16 x i16> %a, %b
Expand Down Expand Up @@ -1856,69 +1790,13 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
; SSE-NEXT: psubw {{.*}}(%rip), %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%a = zext <16 x i8> %x to <16 x i16>
%b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%c = trunc <16 x i16> %b to <16 x i8>
@@ -1928,78 +1806,16 @@ define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
 define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
 ; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
-; SSE-NEXT: psubw %xmm0, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [8,9,10,11,12,13,14,15]
-; SSE-NEXT: psubw %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE-NEXT: psubb %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,12,13,14,15]
-; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
 %a = zext <16 x i8> %x to <16 x i16>
 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
 %c = trunc <16 x i16> %b to <16 x i8>
