[X86] Shrink zero extends of gather indices from types smaller than i32 to types larger than i32.

Gather instructions can use i32 or i64 elements for indices. If
the index is zero extended from a type smaller than i32 to i64, we
can shrink the extend to just extend to i32.

llvm-svn: 373982
topperc committed Oct 7, 2019
1 parent 7647d3e commit be7f81e
Showing 2 changed files with 48 additions and 86 deletions.
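Before the diff, a quick aside on the arithmetic the new check relies on. The sketch below is a minimal standalone model, not the LLVM implementation: signBitsAfterZext and canShrinkIndexTo32 are hypothetical helpers, and the value they compute is only a lower bound on what DAG.ComputeNumSignBits would report for a zero-extended index. It shows why an index zero extended from i8 or i16 can always be truncated to i32 elements, while one zero extended from i32 can only be shrunk when the source's sign bit is known to be zero.

#include <cassert>

// Redundant sign bits of a 64-bit value that was zero extended from an
// unsigned SrcBits-wide value: the top (64 - SrcBits) bits are known zero.
// This is a lower bound on what DAG.ComputeNumSignBits would report.
unsigned signBitsAfterZext(unsigned SrcBits) {
  assert(SrcBits > 0 && SrcBits < 64);
  return 64 - SrcBits;
}

// The combine's test: truncating the index from 64 to 32 bits drops 32 bits,
// which is lossless only if more than 32 of the top bits are sign copies.
bool canShrinkIndexTo32(unsigned SrcBits) {
  const unsigned IndexWidth = 64;
  return signBitsAfterZext(SrcBits) > IndexWidth - 32;
}

int main() {
  assert(canShrinkIndexTo32(8));    // zext i8  -> i64 index: shrink to i32
  assert(canShrinkIndexTo32(16));   // zext i16 -> i64 index: shrink to i32
  assert(!canShrinkIndexTo32(32));  // zext i32 -> i64: only shrinkable when the
                                    // source's sign bit is known to be zero
}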
70 changes: 26 additions & 44 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42572,16 +42572,17 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
SDValue Base = GorS->getBasePtr();
SDValue Scale = GorS->getScale();

// Shrink constant indices if they are larger than 32-bits.
// Only do this before legalize types since v2i64 could become v2i32.
// FIXME: We could check that the type is legal if we're after legalize types,
// but then we would need to construct test cases where that happens.
// FIXME: We could support more than just constant vectors, but we need to
// careful with costing. A truncate that can be optimized out would be fine.
// Otherwise we might only want to create a truncate if it avoids a split.
if (DCI.isBeforeLegalize()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();

// Shrink constant indices if they are larger than 32-bits.
// Only do this before legalize types since v2i64 could become v2i32.
// FIXME: We could check that the type is legal if we're after legalize
// types, but then we would need to construct test cases where that happens.
// FIXME: We could support more than just constant vectors, but we need to
// careful with costing. A truncate that can be optimized out would be fine.
// Otherwise we might only want to create a truncate if it avoids a split.
if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
if (BV->isConstant() && IndexWidth > 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
@@ -42604,16 +42605,18 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Scatter->getIndexType());
}
}
}

if (DCI.isBeforeLegalizeOps()) {
// Remove any sign extends from 32 or smaller to larger than 32.
// Only do this before LegalizeOps in case we need the sign extend for
// legalization.
if (Index.getOpcode() == ISD::SIGN_EXTEND &&
Index.getScalarValueSizeInBits() > 32 &&
Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
Index = Index.getOperand(0);
// Shrink any sign/zero extends from 32 or smaller to larger than 32 if
// there are sufficient sign bits. Only do this before legalize types to
// avoid creating illegal types in truncate.
if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
Index.getOpcode() == ISD::ZERO_EXTEND) &&
IndexWidth > 32 &&
Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
SDValue Ops[] = { Chain, Gather->getPassThru(),
Mask, Base, Index, Scale } ;
@@ -42630,11 +42633,14 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}
}

if (DCI.isBeforeLegalizeOps()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();

// Make sure the index is either i32 or i64
unsigned ScalarSize = Index.getScalarValueSizeInBits();
if (ScalarSize != 32 && ScalarSize != 64) {
MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
if (IndexWidth != 32 && IndexWidth != 64) {
MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
@@ -42654,30 +42660,6 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}

// Try to remove zero extends from 32->64 if we know the sign bit of
// the input is zero.
if (Index.getOpcode() == ISD::ZERO_EXTEND &&
Index.getScalarValueSizeInBits() == 64 &&
Index.getOperand(0).getScalarValueSizeInBits() == 32 &&
DAG.SignBitIsZero(Index.getOperand(0))) {
Index = Index.getOperand(0);
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
SDValue Ops[] = { Chain, Gather->getPassThru(),
Mask, Base, Index, Scale } ;
return DAG.getMaskedGather(Gather->getVTList(),
Gather->getMemoryVT(), DL, Ops,
Gather->getMemOperand(),
Gather->getIndexType());
}
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
SDValue Ops[] = { Chain, Scatter->getValue(),
Mask, Base, Index, Scale };
return DAG.getMaskedScatter(Scatter->getVTList(),
Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}
}

// With vector masks we only demand the upper bit of the mask.
64 changes: 22 additions & 42 deletions llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2693,56 +2693,32 @@ declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1
define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_64-LABEL: zext_i8_index:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: zext_i8_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: kxnorw %k0, %k0, %k2
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: zext_i8_index:
; SKX: # %bb.0:
; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; SKX-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: zext_i8_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX_32-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: kxnorw %k0, %k0, %k2
; SKX_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; SKX_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl

%zext_ind = zext <16 x i8> %ind to <16 x i64>
@@ -2756,32 +2732,36 @@ define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_64-LABEL: zext_v8i8_index:
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL_64-NEXT: movw $255, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: zext_v8i8_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL_32-NEXT: movw $255, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: zext_v8i8_index:
; SKX: # %bb.0:
; SKX-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: zext_v8i8_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; SKX_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl

%zext_ind = zext <8 x i8> %ind to <8 x i64>
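The test churn above reduces to a lane-count argument: with i64 indices only 8 lanes fit in a 512-bit index register, so a 16-element gather has to be split into two vgatherqps and recombined with vinsertf64x4, whereas i32 indices fit all 16 lanes and a single vgatherdps suffices. A tiny standalone sketch of that arithmetic follows; it assumes 512-bit index registers, and gathersNeeded is a hypothetical helper, not an LLVM API.

#include <iostream>

// Number of gather instructions needed for NumElts lanes when each index
// element is IndexBits wide and the index vector register is 512 bits.
unsigned gathersNeeded(unsigned NumElts, unsigned IndexBits) {
  unsigned LanesPerReg = 512 / IndexBits;
  return (NumElts + LanesPerReg - 1) / LanesPerReg;
}

int main() {
  std::cout << gathersNeeded(16, 64) << "\n";  // 2: old zext_i8_index codegen (two vgatherqps)
  std::cout << gathersNeeded(16, 32) << "\n";  // 1: new zext_i8_index codegen (one vgatherdps)
}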
