Skip to content

Commit

Permalink
[DAG] convert vector select-of-constants to logic/math
Browse files Browse the repository at this point in the history
This goes back to a discussion about IR canonicalization. We'd like to preserve and convert
more IR to 'select' than we currently do because that's likely the best choice in IR:
http://lists.llvm.org/pipermail/llvm-dev/2016-September/105335.html
...but that's often not true for codegen, so we need to account for this pattern coming in
to the backend and transform it to better DAG ops.

Steps in this patch:

  1. Add an EVT param to the existing convertSelectOfConstantsToMath() TLI hook to more finely
     enable this transform. Other targets will probably want that anyway to distinguish scalars
     from vectors. We're using that here to exclude AVX512 targets, but it may not be necessary.

  2. Convert a vselect to ext+add. This eliminates a constant load/materialization, and the
     vector ext is often free.

Implementing a more general fold using xor+and can be a follow-up for targets that don't have
a legal vselect. It's also possible that we can remove the TLI hook for the special case fold
implemented here because we're eliminating a constant, but it needs to be tested on other
targets.

Differential Revision: https://reviews.llvm.org/D36840

llvm-svn: 311731
  • Loading branch information
rotateright committed Aug 24, 2017
1 parent 872f689 commit e404cbf
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 114 deletions.
2 changes: 1 addition & 1 deletion llvm/include/llvm/Target/TargetLowering.h
Expand Up @@ -1591,7 +1591,7 @@ class TargetLoweringBase {
/// Return true if a select of constants (select Cond, C1, C2) should be
/// transformed into simple math ops with the condition value. For example:
/// select Cond, C1, C1-1 --> add (zext Cond), C1-1
virtual bool convertSelectOfConstantsToMath() const {
virtual bool convertSelectOfConstantsToMath(EVT VT) const {
return false;
}

Expand Down
59 changes: 57 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -348,6 +348,7 @@ namespace {
SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);

SDValue foldSelectOfConstants(SDNode *N);
SDValue foldVSelectOfConstants(SDNode *N);
SDValue foldBinOpIntoSelect(SDNode *BO);
bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
Expand Down Expand Up @@ -6191,7 +6192,7 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
// For any constants that differ by 1, we can transform the select into an
// extend and add. Use a target hook because some targets may prefer to
// transform in the other direction.
if (TLI.convertSelectOfConstantsToMath()) {
if (TLI.convertSelectOfConstantsToMath(VT)) {
if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) {
// select Cond, C1, C1-1 --> add (zext Cond), C1-1
if (VT != MVT::i1)
Expand Down Expand Up @@ -6760,6 +6761,57 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
return SDValue();
}

/// A vector select of 2 constant vectors can be simplified to math/logic to
/// avoid a variable select instruction and possibly avoid constant loads.
SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
SDValue Cond = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
!TLI.convertSelectOfConstantsToMath(VT) ||
!ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
!ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
return SDValue();

// Check if we can use the condition value to increment/decrement a single
// constant value. This simplifies a select to an add and removes a constant
// load/materialization from the general case.
bool AllAddOne = true;
bool AllSubOne = true;
unsigned Elts = VT.getVectorNumElements();
for (unsigned i = 0; i != Elts; ++i) {
SDValue N1Elt = N1.getOperand(i);
SDValue N2Elt = N2.getOperand(i);
if (N1Elt.isUndef() || N2Elt.isUndef())
continue;

const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
if (C1 != C2 + 1)
AllAddOne = false;
if (C1 != C2 - 1)
AllSubOne = false;
}

// Further simplifications for the extra-special cases where the constants are
// all 0 or all -1 should be implemented as folds of these patterns.
SDLoc DL(N);
if (AllAddOne || AllSubOne) {
// vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
// vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
}

// The general case for select-of-constants:
// vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
// ...but that only makes sense if a vselect is slower than 2 logic ops, so
// leave that to a machine-specific pass.
return SDValue();
}

SDValue DAGCombiner::visitVSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
Expand Down Expand Up @@ -6824,6 +6876,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
return CV;
}

if (SDValue V = foldVSelectOfConstants(N))
return V;

return SDValue();
}

Expand Down Expand Up @@ -7409,7 +7464,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
return SCC;

if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath()) {
if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
EVT SetCCVT = getSetCCResultType(N00VT);
// Don't do this transform for i1 because there's a select transform
// that would reverse it.
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/PowerPC/PPCISelLowering.h
Expand Up @@ -765,7 +765,7 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;

bool convertSelectOfConstantsToMath() const override {
bool convertSelectOfConstantsToMath(EVT VT) const override {
return true;
}

Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -4574,6 +4574,15 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}

bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
// TODO: It might be a win to ease or lift this restriction, but the generic
// folds in DAGCombiner conflict with vector folds for an AVX512 target.
if (VT.isVector() && Subtarget.hasAVX512())
return false;

return true;
}

bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
Expand Down
4 changes: 1 addition & 3 deletions llvm/lib/Target/X86/X86ISelLowering.h
Expand Up @@ -1030,9 +1030,7 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;

bool convertSelectOfConstantsToMath() const override {
return true;
}
bool convertSelectOfConstantsToMath(EVT VT) const override;

/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
Expand Down
80 changes: 20 additions & 60 deletions llvm/test/CodeGen/PowerPC/vselect-constants.ll
Expand Up @@ -47,18 +47,12 @@ define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_Cplus1_or_C_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_Cplus1_or_C_vec:
; CHECK: # BB#0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: vspltisw 3, 1
; CHECK-NEXT: addis 3, 2, .LCPI2_0@toc@ha
; CHECK-NEXT: addis 4, 2, .LCPI2_1@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI2_0@toc@l
; CHECK-NEXT: addi 4, 4, .LCPI2_1@toc@l
; CHECK-NEXT: lvx 18, 0, 3
; CHECK-NEXT: lvx 19, 0, 4
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: xxsel 34, 51, 50, 34
; CHECK-NEXT: lvx 19, 0, 3
; CHECK-NEXT: xxland 34, 34, 35
; CHECK-NEXT: vadduwm 2, 2, 19
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
ret <4 x i32> %add
Expand All @@ -69,12 +63,9 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK: # BB#0:
; CHECK-NEXT: vcmpequw 2, 2, 3
; CHECK-NEXT: addis 3, 2, .LCPI3_0@toc@ha
; CHECK-NEXT: addis 4, 2, .LCPI3_1@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI3_0@toc@l
; CHECK-NEXT: addi 4, 4, .LCPI3_1@toc@l
; CHECK-NEXT: lvx 19, 0, 3
; CHECK-NEXT: lvx 4, 0, 4
; CHECK-NEXT: xxsel 34, 36, 51, 34
; CHECK-NEXT: vsubuwm 2, 19, 2
; CHECK-NEXT: blr
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
Expand All @@ -87,15 +78,12 @@ define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha
; CHECK-NEXT: addis 4, 2, .LCPI4_1@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l
; CHECK-NEXT: addi 4, 4, .LCPI4_1@toc@l
; CHECK-NEXT: lvx 18, 0, 3
; CHECK-NEXT: lvx 19, 0, 4
; CHECK-NEXT: lvx 19, 0, 3
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: xxsel 34, 51, 50, 34
; CHECK-NEXT: vadduwm 2, 2, 19
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
ret <4 x i32> %add
Expand All @@ -106,12 +94,9 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK: # BB#0:
; CHECK-NEXT: vcmpequw 2, 2, 3
; CHECK-NEXT: addis 3, 2, .LCPI5_0@toc@ha
; CHECK-NEXT: addis 4, 2, .LCPI5_1@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI5_0@toc@l
; CHECK-NEXT: addi 4, 4, .LCPI5_1@toc@l
; CHECK-NEXT: lvx 19, 0, 3
; CHECK-NEXT: lvx 4, 0, 4
; CHECK-NEXT: xxsel 34, 36, 51, 34
; CHECK-NEXT: vadduwm 2, 2, 19
; CHECK-NEXT: blr
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
Expand All @@ -123,12 +108,9 @@ define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
; CHECK: # BB#0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: vspltisb 19, -1
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: xxsel 34, 0, 51, 34
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
ret <4 x i32> %add
Expand All @@ -138,9 +120,6 @@ define <4 x i32> @cmp_sel_minus1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: cmp_sel_minus1_or_0_vec:
; CHECK: # BB#0:
; CHECK-NEXT: vcmpequw 2, 2, 3
; CHECK-NEXT: vspltisb 19, -1
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: xxsel 34, 0, 51, 34
; CHECK-NEXT: blr
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
Expand All @@ -150,14 +129,10 @@ define <4 x i32> @cmp_sel_minus1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_0_or_minus1_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_0_or_minus1_vec:
; CHECK: # BB#0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: vspltisb 19, -1
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: xxsel 34, 51, 0, 34
; CHECK-NEXT: vspltisw 3, 1
; CHECK-NEXT: vspltisb 4, -1
; CHECK-NEXT: xxland 34, 34, 35
; CHECK-NEXT: vadduwm 2, 2, 4
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %add
Expand All @@ -167,9 +142,7 @@ define <4 x i32> @cmp_sel_0_or_minus1_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: cmp_sel_0_or_minus1_vec:
; CHECK: # BB#0:
; CHECK-NEXT: vcmpequw 2, 2, 3
; CHECK-NEXT: vspltisb 19, -1
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: xxsel 34, 51, 0, 34
; CHECK-NEXT: xxlnor 34, 34, 34
; CHECK-NEXT: blr
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
Expand All @@ -179,14 +152,8 @@ define <4 x i32> @cmp_sel_0_or_minus1_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_1_or_0_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_1_or_0_vec:
; CHECK: # BB#0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: vspltisw 19, 1
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: xxsel 34, 0, 51, 34
; CHECK-NEXT: vspltisw 3, 1
; CHECK-NEXT: xxland 34, 34, 35
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
ret <4 x i32> %add
Expand All @@ -197,8 +164,7 @@ define <4 x i32> @cmp_sel_1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK: # BB#0:
; CHECK-NEXT: vcmpequw 2, 2, 3
; CHECK-NEXT: vspltisw 19, 1
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: xxsel 34, 0, 51, 34
; CHECK-NEXT: xxland 34, 34, 51
; CHECK-NEXT: blr
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
Expand All @@ -208,14 +174,8 @@ define <4 x i32> @cmp_sel_1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_0_or_1_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_0_or_1_vec:
; CHECK: # BB#0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: vspltisw 19, 1
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: xxsel 34, 51, 0, 34
; CHECK-NEXT: vspltisw 3, 1
; CHECK-NEXT: xxlandc 34, 35, 34
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %add
Expand All @@ -226,8 +186,8 @@ define <4 x i32> @cmp_sel_0_or_1_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK: # BB#0:
; CHECK-NEXT: vcmpequw 2, 2, 3
; CHECK-NEXT: vspltisw 19, 1
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: xxsel 34, 51, 0, 34
; CHECK-NEXT: xxlnor 0, 34, 34
; CHECK-NEXT: xxland 34, 0, 51
; CHECK-NEXT: blr
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
Expand Down
11 changes: 6 additions & 5 deletions llvm/test/CodeGen/X86/vselect-avx.ll
Expand Up @@ -151,21 +151,22 @@ define <32 x i8> @PR22706(<32 x i1> %x) {
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR22706:
; AVX2: ## BB#0:
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX2-NEXT: vpblendvb %ymm0, {{.*}}(%rip), %ymm1, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
%tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
ret <32 x i8> %tmp
Expand Down

0 comments on commit e404cbf

Please sign in to comment.