Skip to content

Commit

Permalink
[X86] lowerShuffleAsElementInsertion - fold to or(vzext_movl(scalar_to_vector(zext(x))), and(constant, mask))
Browse files Browse the repository at this point in the history

The logic in this function is a bit of a mess, but masking a vector constant should allow us to OR the zero-extended i8/i16 scalar value in place.

We can do more here (e.g. reusing the OR pattern if the relevant unused elements are known to be zero), but this is enough to address a regression from D127115.
  • Loading branch information
RKSimon committed May 7, 2023
1 parent 5014830 commit 17dd1ad
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 17 deletions.
43 changes: 31 additions & 12 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -14693,20 +14693,31 @@ static SDValue lowerShuffleAsElementInsertion(
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
unsigned EltBits = VT.getScalarSizeInBits();

if (isSoftFP16(EltVT, Subtarget))
return SDValue();

int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (i != V2Index && !Zeroable[i]) {
IsV1Zeroable = false;
break;
}

// Bail if a non-zero V1 isn't used in place.
if (!IsV1Zeroable) {
SmallVector<int, 8> V1Mask(Mask);
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
}

// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
Expand All @@ -14719,13 +14730,26 @@ static SDValue lowerShuffleAsElementInsertion(
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
if (!IsV1Zeroable)
// insertions. But we can use a masked constant vector if we're
// inserting V2 into the bottom of V1.
if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
return SDValue();

// Zero-extend directly to i32.
ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);

// If we're inserting into a constant, mask off the inserted index
// and OR with the zero-extended scalar.
if (!IsV1Zeroable) {
SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
Bits[V2Index] = APInt::getZero(EltBits);
SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
}
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
Expand All @@ -14737,15 +14761,10 @@ static SDValue lowerShuffleAsElementInsertion(

if (!IsV1Zeroable) {
// If V1 can't be treated as a zero vector we have fewer options to lower
// this. We can't support integer vectors or non-zero targets cheaply, and
// the V1 elements can't be permuted in any way.
// this. We can't support integer vectors or non-zero targets cheaply.
assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
if (!VT.isFloatingPoint() || V2Index != 0)
return SDValue();
SmallVector<int, 8> V1Mask(Mask);
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
if (!VT.is128BitVector())
return SDValue();

Expand Down Expand Up @@ -14775,15 +14794,15 @@ static SDValue lowerShuffleAsElementInsertion(
// the desired position. Otherwise it is more efficient to do a vector
// shift left. We know that we can do a vector shift left because all
// the inputs are zero.
if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
if (VT.isFloatingPoint() || NumElts <= 4) {
SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getTargetConstant(
V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/X86/insert-into-constant-vector.ll
Expand Up @@ -13,15 +13,15 @@
define <16 x i8> @elt0_v16i8(i8 %x) {
; X86-SSE2-LABEL: elt0_v16i8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movd %eax, %xmm0
; X86-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: elt0_v16i8:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movd %edi, %xmm0
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: movzbl %dil, %eax
; X64-SSE2-NEXT: movd %eax, %xmm0
; X64-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
Expand Down

0 comments on commit 17dd1ad

Please sign in to comment.