Skip to content

Commit

Permalink
[X86] Unsigned saturation subtraction canonicalization [the backend p…
Browse files Browse the repository at this point in the history
…art]

Summary:
On behalf of julia.koval@intel.com

The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)

There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).

The example of special case code:

```
void foo(unsigned short *p, int max, int n) {

  int i;
  unsigned m;
  for (i = 0; i < n; i++) {
    m = *--p;
    *p = (unsigned short)(m >= max ? m-max : 0);
  }
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.

Here is the table of types, I try to support, special case items are bold:

| Size | 128 | 256 | 512
| -----  | -----  | -----   | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**

Reviewers: zvi, spatel, DavidKreitzer, RKSimon

Reviewed By: zvi

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D37534

llvm-svn: 315237
  • Loading branch information
Zvi Rackover committed Oct 9, 2017
1 parent 663ba15 commit c1d5955
Show file tree
Hide file tree
Showing 2 changed files with 248 additions and 236 deletions.
87 changes: 87 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -35901,6 +35901,89 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);

// PSUBUS is supported, starting from SSE2, but special preprocessing
// for v8i32 requires umin, which appears in SSE41.
if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
!(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
!(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
!(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
(VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
VT == MVT::v8i64)))
return SDValue();

SDValue SubusLHS, SubusRHS;
// Try to find umax(a,b) - b or a - umin(a,b) patterns
// they may be converted to subus(a,b).
// TODO: Need to add IR cannonicialization for this code.
if (Op0.getOpcode() == ISD::UMAX) {
SubusRHS = Op1;
SDValue MaxLHS = Op0.getOperand(0);
SDValue MaxRHS = Op0.getOperand(1);
if (DAG.isEqualTo(MaxLHS, Op1))
SubusLHS = MaxRHS;
else if (DAG.isEqualTo(MaxRHS, Op1))
SubusLHS = MaxLHS;
else
return SDValue();
} else if (Op1.getOpcode() == ISD::UMIN) {
SubusLHS = Op0;
SDValue MinLHS = Op1.getOperand(0);
SDValue MinRHS = Op1.getOperand(1);
if (DAG.isEqualTo(MinLHS, Op0))
SubusRHS = MinRHS;
else if (DAG.isEqualTo(MinRHS, Op0))
SubusRHS = MinLHS;
else
return SDValue();
} else
return SDValue();

// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

// Special preprocessing case can be only applied
// if the value was zero extended from 16 bit,
// so we require first 16 bits to be zeros for 32 bit
// values, or first 48 bits for 64 bit values.
KnownBits Known;
DAG.computeKnownBits(SubusLHS, Known);
unsigned NumZeros = Known.countMinLeadingZeros();
if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
return SDValue();

EVT ExtType = SubusLHS.getValueType();
EVT ShrinkedType;
if (VT == MVT::v8i32 || VT == MVT::v8i64)
ShrinkedType = MVT::v8i16;
else
ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

// If SubusLHS is zeroextended - truncate SubusRHS to it's
// size SubusRHS = umin(0xFFF.., SubusRHS).
SDValue SaturationConst =
DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
ShrinkedType.getScalarSizeInBits()),
SDLoc(SubusLHS), ExtType);
SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
SaturationConst);
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
NewSubusLHS, NewSubusRHS);
// Zero extend the result, it may be used somewhere as 32 bit,
// if not zext and following trunc will shrink.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
Expand Down Expand Up @@ -35934,6 +36017,10 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineIncDecVector(N, DAG))
return V;

// Try to create PSUBUS if SUB's argument is max/min
if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
return V;

return combineAddOrSubToADCOrSBB(N, DAG);
}

Expand Down

0 comments on commit c1d5955

Please sign in to comment.