Skip to content

Commit

Permalink
[x86] lower shuffle of extracts to AVX2 vperm instructions
Browse files Browse the repository at this point in the history
I was trying to prevent shuffle regressions while matching more horizontal ops 
and ended up here:
  shuf (extract X, 0), (extract X, 4), Mask --> extract (shuf X, undef, Mask'), 0

The affected tests were added for:
https://bugs.llvm.org/show_bug.cgi?id=34380

This patch won't change the examples in the bug report itself, but we should be 
able to extend this to catch more types.

Differential Revision: https://reviews.llvm.org/D56756

llvm-svn: 351346
  • Loading branch information
rotateright committed Jan 16, 2019
1 parent cbdb4ef commit 0dbecd0
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 112 deletions.
113 changes: 91 additions & 22 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -11629,6 +11629,81 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}

/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
// This routine only handles 128-bit shufps.
assert(Mask.size() == 4 && "Unsupported mask size!");
assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

// To lower with a single SHUFPS we need to have the low half and high half
// each requiring a single input.
if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;

return true;
}

/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
SDValue N1, ArrayRef<int> Mask,
SelectionDAG &DAG) {
EVT VT = N0.getValueType();
assert((VT.is128BitVector() &&
(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
"VPERM* family of shuffles requires 32-bit or 64-bit elements");

// Check that both sources are extracts of the same source vector.
if (!N0.hasOneUse() || !N1.hasOneUse() ||
N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N0.getOperand(0) != N1.getOperand(0))
return SDValue();

SDValue WideVec = N0.getOperand(0);
EVT WideVT = WideVec.getValueType();
if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
!isa<ConstantSDNode>(N1.getOperand(1)))
return SDValue();

// Match extracts of each half of the wide source vector. Commute the shuffle
// if the extract of the low half is N1.
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
APInt ExtIndex0 = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
APInt ExtIndex1 = cast<ConstantSDNode>(N1.getOperand(1))->getAPIntValue();
if (ExtIndex1 == 0 && ExtIndex0 == NumElts) {
std::swap(ExtIndex0, ExtIndex1);
ShuffleVectorSDNode::commuteMask(NewMask);
}
if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
return SDValue();

// Final bailout: if the mask is simple, we are better off using an extract
// and a simple narrow shuffle.
if (NumElts == 4 && isSingleSHUFPSMask(NewMask))
return SDValue();

// Extend the shuffle mask with undef elements.
NewMask.append(NumElts, -1);

// shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
NewMask);
// This is free: ymm -> xmm.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
DAG.getIntPtrConstant(0, DL));
}

/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
Expand Down Expand Up @@ -12116,6 +12191,10 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;

// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
Expand Down Expand Up @@ -12193,6 +12272,10 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;

// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
Expand Down Expand Up @@ -12252,28 +12335,6 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}

/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
// This routine only handles 128-bit shufps.
assert(Mask.size() == 4 && "Unsupported mask size!");
assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

// To lower with a single SHUFPS we need to have the low half and high half
// each requiring a single input.
if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;

return true;
}

/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
Expand Down Expand Up @@ -12413,6 +12474,10 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}

if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;

// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
Expand Down Expand Up @@ -12501,6 +12566,10 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}

if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;

// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
Expand Down

0 comments on commit 0dbecd0

Please sign in to comment.