[DAGCombiner][x86] scalarize splatted vector FP ops
There are a variety of vector patterns that may be profitably reduced to a
scalar op when scalar ops are performed using a subset (typically, the
first lane) of the vector register file.

For x86, this is true for float/double ops and element 0 because
insert/extract is just a sub-register rename.

Other targets should likely enable the hook in a similar way.

Differential Revision: https://reviews.llvm.org/D60150

llvm-svn: 357760
rotateright committed Apr 5, 2019
1 parent 2b2f35a commit 50a8652
Showing 5 changed files with 82 additions and 96 deletions.
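A rough sketch of the fold this commit adds, written in the same notation as the new DAGCombiner comment below (the v2f64/fadd choice is only an illustrative assumption, matching the hadd_v2f64 test):

    splat (fadd (v2f64 L), (v2f64 R)), 0
      --> splat (scalar_to_vector (fadd (extelt L, 0), (extelt R, 0))), 0

On x86 this is what turns a full-width addpd feeding a splat into a single addsd followed by the splat, as the haddsub-shuf.ll diffs below show.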
8 changes: 8 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2441,6 +2441,14 @@ class TargetLoweringBase {
return false;
}

+/// Return true if extraction of a scalar element from the given vector type
+/// at the given index is cheap. For example, if scalar operations occur on
+/// the same register file as vector operations, then an extract element may
+/// be a sub-register rename rather than an actual instruction.
+virtual bool isExtractVecEltCheap(EVT VT, unsigned Index) const {
+return false;
+}
+
/// Try to convert math with an overflow comparison into the corresponding DAG
/// node operation. Targets may want to override this independently of whether
/// the operation is legal/custom for the given type because it may obscure
21 changes: 19 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18078,11 +18078,28 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// If it is a splat, check if the argument vector is another splat or a
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
-SDNode *V = N0.getNode();
+int SplatIndex = SVN->getSplatIndex();
+if (TLI.isExtractVecEltCheap(VT, SplatIndex) &&
+ISD::isBinaryOp(N0.getNode())) {
+// splat (vector_bo L, R), Index -->
+// splat (scalar_bo (extelt L, Index), (extelt R, Index))
+SDValue L = N0.getOperand(0), R = N0.getOperand(1);
+SDLoc DL(N);
+EVT EltVT = VT.getScalarType();
+SDValue Index = DAG.getIntPtrConstant(SplatIndex, DL);
+SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
+SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
+SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR,
+N0.getNode()->getFlags());
+SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO);
+SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
+return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
+}

// If this is a bit convert that changes the element type of the vector but
// not the number of vector elements, look through it. Be careful not to
// look though conversions that change things like v4f32 to v2f64.
+SDNode *V = N0.getNode();
if (V->getOpcode() == ISD::BITCAST) {
SDValue ConvInput = V->getOperand(0);
if (ConvInput.getValueType().isVector() &&
@@ -18115,7 +18132,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return N0;

// Canonicalize any other splat as a build_vector.
-const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
+SDValue Splatted = V->getOperand(SplatIndex);
SmallVector<SDValue, 8> Ops(NumElts, Splatted);
SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);

6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
@@ -1074,6 +1074,12 @@ namespace llvm {
/// supported.
bool shouldScalarizeBinop(SDValue) const override;

+/// Extract of a scalar FP value from index 0 of a vector is free.
+bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
+EVT EltVT = VT.getScalarType();
+return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+}
+
/// Overflow nodes should get combined/lowered to optimal instructions
/// (they should allow eliminating explicit compares by getting flags from
/// math ops).
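On the commit message's point that other targets should likely enable the hook in a similar way: a minimal sketch of such an opt-in for a hypothetical backend (the MyTargetISelLowering name and the f32/f64-at-lane-0 condition are assumptions for illustration, not part of this commit):

// Hypothetical MyTargetISelLowering.h, mirroring the x86 override above.
// Return true only when lane 0 of an FP vector aliases the scalar FP
// register, so the "extract" is just a sub-register rename.
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
  EVT EltVT = VT.getScalarType();
  return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
}

A target whose scalar FP unit uses a separate register file should keep the default (false), since the extract would then be a real cross-register-file move rather than a rename.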
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -274,7 +274,7 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
;
@@ -286,7 +286,7 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
; AVX1_SLOW-LABEL: hadd_v2f64:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT: retq
;
@@ -298,7 +298,7 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
; AVX2_SLOW-LABEL: hadd_v2f64:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT: retq
;
@@ -398,12 +398,12 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSSE3_SLOW-NEXT: addpd %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
-; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v4f64:
@@ -447,7 +447,7 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3_SLOW-NEXT: subpd %xmm1, %xmm0
+; SSSE3_SLOW-NEXT: subsd %xmm1, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT: retq
;
@@ -459,7 +459,7 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
; AVX1_SLOW-LABEL: hsub_v2f64:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT: retq
;
@@ -471,7 +471,7 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
; AVX2_SLOW-LABEL: hsub_v2f64:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT: retq
;
@@ -491,11 +491,11 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSSE3_SLOW-NEXT: subpd %xmm3, %xmm1
-; SSSE3_SLOW-NEXT: subpd %xmm2, %xmm0
+; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
;
