[DAGCombiner] narrow vector binops when extraction is cheap
Narrowing vector binops came up in the demanded bits discussion in D52912.

I don't think we're going to be able to do this transform in IR as a canonicalization 
because of the risk of creating unsupported widths for vector ops, but we already have 
a DAG TLI hook to allow what I was hoping for: isExtractSubvectorCheap(). This is 
currently enabled for x86, ARM, and AArch64 (although only x86 has existing regression 
test diffs).

This is artificially limited to not look through bitcasts because there are so many 
test diffs already, but that's marked with a TODO and is a small follow-up.

Differential Revision: https://reviews.llvm.org/D53784

llvm-svn: 345602
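To make the transform concrete, here is a hand-written IR sketch (not taken from the patch; the function and value names are invented) of the kind of pattern this combine targets, where only the low half of a wide binop result is demanded:

; Only the low <4 x float> half of the wide fadd is used.
define <4 x float> @low_half_of_wide_fadd(<8 x float> %a, <8 x float> %b) {
  %wide = fadd <8 x float> %a, %b
  %lo = shufflevector <8 x float> %wide, <8 x float> undef,
                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %lo
}

When the target reports that the subvector extract is cheap, the extract of the wide fadd can be rewritten as a <4 x float> fadd of the extracted operand halves, which is the ymm-to-xmm shrinking visible in several of the x86 test diffs below.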
rotateright committed Oct 30, 2018
1 parent bd74554 commit 8b207de
Showing 24 changed files with 387 additions and 468 deletions.
41 changes: 30 additions & 11 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16673,10 +16673,8 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
return SDValue();
}

/// If we are extracting a subvector produced by a wide binary operator with at
/// at least one operand that was the result of a vector concatenation, then try
/// to use the narrow vector operands directly to avoid the concatenation and
/// extraction.
/// If we are extracting a subvector produced by a wide binary operator try
/// to use a narrow binary operator and/or avoid concatenation and extraction.
static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
// TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
// some of these bailouts with other transforms.
@@ -16697,22 +16695,43 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
if (!WideBVT.isVector())
return SDValue();

EVT VT = Extract->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
unsigned ExtractIndex = ExtractIndexC->getZExtValue();
assert(ExtractIndex % NumElems == 0 &&
"Extract index is not a multiple of the vector length.");
EVT SrcVT = Extract->getOperand(0).getValueType();
unsigned NumSrcElems = SrcVT.getVectorNumElements();
unsigned NarrowingRatio = NumSrcElems / NumElems;

// Bail out if the target does not support a narrower version of the binop.
unsigned BOpcode = BinOp.getOpcode();
unsigned WideNumElts = WideBVT.getVectorNumElements();
EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
WideBVT.getVectorNumElements() / 2);
WideNumElts / NarrowingRatio);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
return SDValue();

// If extraction is cheap, we don't need to look at the binop operands
// for concat ops. The narrow binop alone makes this transform profitable.
// TODO: We're not dealing with the bitcasted pattern here. That limitation
// should be lifted.
if (Extract->getOperand(0) == BinOp && BinOp.hasOneUse() &&
TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtractIndex)) {
// extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
SDLoc DL(Extract);
SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
BinOp.getOperand(0), Extract->getOperand(1));
SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
BinOp.getOperand(1), Extract->getOperand(1));
return DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
BinOp.getNode()->getFlags());
}

// Only handle the case where we are doubling and then halving. A larger ratio
// may require more than two narrow binops to replace the wide binop.
EVT VT = Extract->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
unsigned ExtractIndex = ExtractIndexC->getZExtValue();
assert(ExtractIndex % NumElems == 0 &&
"Extract index is not a multiple of the vector length.");
if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
if (NarrowingRatio != 2)
return SDValue();

// TODO: The motivating case for this transform is an x86 AVX1 target. That
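The early exit added above applies even when neither binop operand is a concat_vectors node, as long as the target says the subvector extract is cheap. A minimal sketch of such a case (again hypothetical, with invented names), this time extracting the high half:

; extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
define <2 x double> @high_half_of_wide_fmul(<4 x double> %a, <4 x double> %b) {
  %wide = fmul <4 x double> %a, %b
  %hi = shufflevector <4 x double> %wide, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  ret <2 x double> %hi
}

Here %a and %b are plain function arguments rather than concatenations, so the pre-existing concat-based path would not fire; the isExtractSubvectorCheap() check is what justifies narrowing the fmul to a <2 x double> operation on the extracted high halves of %a and %b.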
20 changes: 8 additions & 12 deletions llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -7,23 +7,19 @@
define void @func(<4 x float> %a, <16 x i8> %b, <16 x i8> %c, <8 x float> %d, <8 x float> %e, <8 x float>* %f) nounwind ssp {
; CHECK-LABEL: func:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu 0, %xmm3
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; CHECK-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; CHECK-NEXT: vmovdqu 32, %xmm3
; CHECK-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT: vmulps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vmulps %ymm1, %ymm1, %ymm1
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vmovdqu 0, %xmm0
; CHECK-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vmulps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vhaddps %ymm4, %ymm0, %ymm0
; CHECK-NEXT: vsubps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vhaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vhaddps %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/X86/avx-logic.ll
@@ -338,17 +338,17 @@ define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: andn_disguised_i8_elts:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: andn_disguised_i8_elts:
@@ -417,17 +417,17 @@ define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %
define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
; AVX1-LABEL: andn_variable_mask_operand_concat:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; INT256-LABEL: andn_variable_mask_operand_concat:
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/X86/avx-vzeroupper.ll
@@ -96,28 +96,24 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
; VZ-LABEL: test02:
; VZ: # %bb.0:
; VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VZ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
; VZ-NEXT: vzeroupper
; VZ-NEXT: jmp do_sse # TAILCALL
;
; FAST-ymm-zmm-LABEL: test02:
; FAST-ymm-zmm: # %bb.0:
; FAST-ymm-zmm-NEXT: vaddps %ymm1, %ymm0, %ymm0
; FAST-ymm-zmm-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; FAST-ymm-zmm-NEXT: vaddps %xmm1, %xmm0, %xmm0
; FAST-ymm-zmm-NEXT: jmp do_sse # TAILCALL
;
; BDVER2-LABEL: test02:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; BDVER2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; BDVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; BDVER2-NEXT: vzeroupper
; BDVER2-NEXT: jmp do_sse # TAILCALL
;
; BTVER2-LABEL: test02:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: jmp do_sse # TAILCALL
%add.i = fadd <8 x float> %a, %b
%add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
56 changes: 24 additions & 32 deletions llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -8,7 +8,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovd %xmm0, %eax
; KNL-NEXT: retq
;
@@ -17,7 +17,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vmovd %xmm0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -35,7 +35,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; KNL-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovd %xmm0, %eax
; KNL-NEXT: retq
;
@@ -44,7 +44,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vmovd %xmm0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -62,17 +62,15 @@ define float @fhadd_16(<16 x float> %x225) {
; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhadd_16:
; SKX: # %bb.0:
; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -89,17 +87,15 @@ define float @fhsub_16(<16 x float> %x225) {
; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; KNL-NEXT: vsubps %zmm1, %zmm0, %zmm0
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; KNL-NEXT: vsubps %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhsub_16:
; SKX: # %bb.0:
; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SKX-NEXT: vsubps %zmm1, %zmm0, %zmm0
; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; SKX-NEXT: vsubps %xmm1, %xmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -180,16 +176,14 @@ define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) {
; KNL: # %bb.0:
; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: fadd_noundef_low:
; SKX: # %bb.0:
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; SKX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; SKX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
@@ -203,16 +197,18 @@ define <4 x double> @fadd_noundef_high(<8 x double> %x225, <8 x double> %x227) {
; KNL: # %bb.0:
; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vextractf64x4 $1, %zmm2, %ymm1
; KNL-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: fadd_noundef_high:
; SKX: # %bb.0:
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX-NEXT: vextractf64x4 $1, %zmm2, %ymm1
; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
@@ -227,16 +223,14 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) {
; KNL: # %bb.0:
; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: hadd_16_3_sv:
; SKX: # %bb.0:
; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; SKX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
@@ -253,15 +247,13 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
; KNL-LABEL: fadd_noundef_eel:
; KNL: # %bb.0:
; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; KNL-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fadd_noundef_eel:
; SKX: # %bb.0:
; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -277,18 +269,18 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
; KNL-LABEL: fsub_noundef_ee:
; KNL: # %bb.0:
; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm0
; KNL-NEXT: vbroadcastsd %xmm0, %zmm0
; KNL-NEXT: vsubpd %zmm1, %zmm0, %zmm0
; KNL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; KNL-NEXT: vbroadcastsd %xmm0, %zmm1
; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; KNL-NEXT: vsubpd %xmm0, %xmm1, %xmm0
; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; KNL-NEXT: retq
;
; SKX-LABEL: fsub_noundef_ee:
; SKX: # %bb.0:
; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0
; SKX-NEXT: vbroadcastsd %xmm0, %zmm0
; SKX-NEXT: vsubpd %zmm1, %zmm0, %zmm0
; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; SKX-NEXT: vbroadcastsd %xmm0, %zmm1
; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; SKX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
; SKX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -851,7 +851,7 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; SKX-NEXT: kxorb %k2, %k1, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $5, %k1, %k1
; SKX-NEXT: kxorb %k1, %k0, %k0
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
@@ -890,7 +890,7 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftlb $1, %k1, %k1
; SKX-NEXT: korb %k1, %k0, %k0
; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: def $al killed $al killed $eax
; SKX-NEXT: retq
@@ -1019,8 +1019,8 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL: ## %bb.0:
; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
@@ -1054,8 +1054,8 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL: ## %bb.0:
; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0