Skip to content

Commit

Permalink
[PowerPC] Improve vec_abs on P9
Browse files Browse the repository at this point in the history
Improve the current vec_abs support on P9: generate ISD::ABS nodes for vector types,
combine an ABS node into a VABSD node in some special cases to make use of the P9 VABSD* instructions,
and custom-lower it to vsub (vneg later) + vmax when there is no combining opportunity.

Differential Revision: https://reviews.llvm.org/D54783

llvm-svn: 349437
  • Loading branch information
jedilyn committed Dec 18, 2018
1 parent 0aa260d commit 3dac125
Show file tree
Hide file tree
Showing 6 changed files with 376 additions and 175 deletions.
95 changes: 0 additions & 95 deletions llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
Expand Up @@ -327,7 +327,6 @@ namespace {

bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
void transferMemOperands(SDNode *N, SDNode *Result);
MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr);
};

} // end anonymous namespace
Expand Down Expand Up @@ -4138,51 +4137,6 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp});
}

/// Return a machine node that flips the most significant bit of every element
/// of the integer vector \p N.
///
/// \param N          The vector value whose per-element sign bits are flipped.
///                   Must be of type v4i32, v8i16 or v16i8.
/// \param SignBitVec Optional out-parameter. When non-null it receives a node
///                   that has a one in the MSB of every element and zeros in
///                   all other bits.
/// \returns The node computing \p N with each element's MSB flipped.
///
/// Any other vector type is a programming error and hits llvm_unreachable.
MachineSDNode *
PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) {
  SDLoc dl(N);
  EVT VecVT = N.getValueType();

  if (VecVT == MVT::v4i32) {
    // XVNEGSP is used here to flip the top bit of each word; applying it to an
    // all-zero vector therefore produces the sign-bit mask itself.
    if (SignBitVec) {
      SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32);
      *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT,
                                           SDValue(ZV, 0));
    }
    return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N);
  }

  if (VecVT == MVT::v8i16) {
    // Materialize the scalar 0x80008000 (LIS sets the upper half, ORI the
    // lower half) and splat it so every halfword holds 0x8000.
    SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32,
                                        getI32Imm(0x8000, dl));
    SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
                                            SDValue(Hi, 0),
                                            getI32Imm(0x8000, dl));
    SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT,
                                            SDValue(ScaImm, 0));
    /*
      Alternatively, we can do this as follow to use VRF instead of GPR.
        vspltish 5, 1
        vspltish 6, 15
        vslh 5, 6, 5
    */
    if (SignBitVec)
      *SignBitVec = VecImm;
    // Adding 0x8000 modulo 2^16 flips the MSB of each halfword.
    return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N,
                                  SDValue(VecImm, 0));
  }

  if (VecVT == MVT::v16i8) {
    // Splat 0x80 into every byte. The splat produces a vector, so the node is
    // typed with the vector type (it was previously mistyped as MVT::i32 even
    // though it feeds VADDUBM and is handed out as the sign-bit vector).
    SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, VecVT,
                                            getI32Imm(0x80, dl));
    if (SignBitVec)
      *SignBitVec = VecImm;
    // Adding 0x80 modulo 2^8 flips the MSB of each byte.
    return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N,
                                  SDValue(VecImm, 0));
  }

  llvm_unreachable("Unsupported vector data type for flipSignBit");
}

// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
void PPCDAGToDAGISel::Select(SDNode *N) {
Expand Down Expand Up @@ -4993,55 +4947,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
}
case ISD::ABS: {
assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector");

// For vector absolute difference, we use VABSDUW instruction of POWER9.
// Since VABSDU instructions are for unsigned integers, we need adjustment
// for signed integers.
// For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000).
// Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1.
// For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
EVT VecVT = N->getOperand(0).getValueType();
SDNode *AbsOp = nullptr;
unsigned AbsOpcode;

if (VecVT == MVT::v4i32)
AbsOpcode = PPC::VABSDUW;
else if (VecVT == MVT::v8i16)
AbsOpcode = PPC::VABSDUH;
else if (VecVT == MVT::v16i8)
AbsOpcode = PPC::VABSDUB;
else
llvm_unreachable("Unsupported vector data type for ISD::ABS");

// Even for signed integers, we can skip adjustment if all values are
// known to be positive (as signed integer) due to zero-extended inputs.
if (N->getOperand(0).getOpcode() == ISD::SUB &&
N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) {
AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
SDValue(N->getOperand(0)->getOperand(0)),
SDValue(N->getOperand(0)->getOperand(1)));
ReplaceNode(N, AbsOp);
return;
}
if (N->getOperand(0).getOpcode() == ISD::SUB) {
SDValue SubVal = N->getOperand(0);
SDNode *Op0 = flipSignBit(SubVal->getOperand(0));
SDNode *Op1 = flipSignBit(SubVal->getOperand(1));
AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
SDValue(Op0, 0), SDValue(Op1, 0));
}
else {
SDNode *Op1 = nullptr;
SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1);
AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0),
SDValue(Op1, 0));
}
ReplaceNode(N, AbsOp);
return;
}
}

SelectCode(N);
Expand Down
164 changes: 129 additions & 35 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Expand Up @@ -251,12 +251,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UREM, MVT::i64, Expand);
}

if (Subtarget.hasP9Vector()) {
setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::ABS, MVT::v8i16, Legal);
setOperationAction(ISD::ABS, MVT::v16i8, Legal);
}

// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
Expand Down Expand Up @@ -556,6 +550,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
setOperationAction(ISD::ABS, VT, Custom);

// Vector instructions introduced in P8
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
Expand Down Expand Up @@ -661,6 +656,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

// Without hasP8Altivec set, v2i64 SMAX isn't available.
// But ABS custom lowering requires SMAX support.
if (!Subtarget.hasP8Altivec())
setOperationAction(ISD::ABS, MVT::v2i64, Expand);

addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
Expand Down Expand Up @@ -1083,6 +1083,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine(ISD::FSQRT);
}

if (Subtarget.hasP9Altivec()) {
setTargetDAGCombine(ISD::ABS);
}

// Darwin long double math library functions have $LDBL128 appended.
if (Subtarget.isDarwin()) {
setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
Expand Down Expand Up @@ -1343,6 +1347,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::RFEBB: return "PPCISD::RFEBB";
case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
case PPCISD::VABSD: return "PPCISD::VABSD";
case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
Expand Down Expand Up @@ -9003,35 +9008,6 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getRegister(PPC::R2, MVT::i32);
}

// We are looking for absolute values here.
// The idea is to try to fit one of two patterns:
// max (a, (0-a)) OR max ((0-a), a)
if (Subtarget.hasP9Vector() &&
(IntrinsicID == Intrinsic::ppc_altivec_vmaxsw ||
IntrinsicID == Intrinsic::ppc_altivec_vmaxsh ||
IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) {
SDValue V1 = Op.getOperand(1);
SDValue V2 = Op.getOperand(2);
if (V1.getSimpleValueType() == V2.getSimpleValueType() &&
(V1.getSimpleValueType() == MVT::v4i32 ||
V1.getSimpleValueType() == MVT::v8i16 ||
V1.getSimpleValueType() == MVT::v16i8)) {
if ( V1.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
V1.getOperand(1) == V2 ) {
// Generate the abs instruction with the operands
return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2);
}

if ( V2.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
V2.getOperand(1) == V1 ) {
// Generate the abs instruction with the operands
return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1);
}
}
}

// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
int CompareOpc;
Expand Down Expand Up @@ -9572,6 +9548,44 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
}
}

/// Custom lowering for vector ISD::ABS.
///
/// The absolute value is computed as a signed maximum of the input and its
/// negation:
///   abs x  ==>  smax(x, 0 - x)
///
/// Only v2i64, v4i32, v8i16 and v16i8 are expected here, and v2i64 is only
/// expected when the subtarget has P8 Altivec.
SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");

  const EVT VT = Op.getValueType();
  assert(VT.isVector() &&
         "Only set vector abs as custom, scalar abs shouldn't reach here!");
  assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
          VT == MVT::v16i8) &&
         "Unexpected vector element type!");
  assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
         "Current subtarget doesn't support smax v2i64!");

  SDLoc dl(Op);

  // Build the negation of the input as (0 - x).
  SDValue Input = Op.getOperand(0);
  SDValue Negated =
      DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Input);

  // SMAX patch https://reviews.llvm.org/D47332
  // hasn't landed yet, so use intrinsic first here.
  // TODO: Should use SMAX directly once SMAX patch landed
  //
  // Pick the signed-max intrinsic matching the element width; the word
  // variant is the fallback for every type not listed explicitly.
  const Intrinsic::ID MaxID =
      (VT == MVT::v2i64)   ? Intrinsic::ppc_altivec_vmaxsd
      : (VT == MVT::v8i16) ? Intrinsic::ppc_altivec_vmaxsh
      : (VT == MVT::v16i8) ? Intrinsic::ppc_altivec_vmaxsb
                           : Intrinsic::ppc_altivec_vmaxsw;

  return BuildIntrinsicOp(MaxID, Input, Negated, DAG, dl, VT);
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
Expand Down Expand Up @@ -9624,6 +9638,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);

// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
Expand Down Expand Up @@ -12985,6 +13000,39 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}

// Combine vmaxsw/h/b(a, a's negation) to abs(a)
// Expose the vabsduw/h/b opportunity for down stream
if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
(IID == Intrinsic::ppc_altivec_vmaxsw ||
IID == Intrinsic::ppc_altivec_vmaxsh ||
IID == Intrinsic::ppc_altivec_vmaxsb)) {
SDValue V1 = N->getOperand(1);
SDValue V2 = N->getOperand(2);
if ((V1.getSimpleValueType() == MVT::v4i32 ||
V1.getSimpleValueType() == MVT::v8i16 ||
V1.getSimpleValueType() == MVT::v16i8) &&
V1.getSimpleValueType() == V2.getSimpleValueType()) {
// (0-a, a)
if (V1.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
V1.getOperand(1) == V2) {
return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
}
// (a, 0-a)
if (V2.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
V2.getOperand(1) == V1) {
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
}
// (x-y, y-x)
if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
V1.getOperand(0) == V2.getOperand(1) &&
V1.getOperand(1) == V2.getOperand(0)) {
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
}
}
}
}

break;
Expand Down Expand Up @@ -13217,6 +13265,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::BUILD_VECTOR:
return DAGCombineBuildVector(N, DCI);
case ISD::ABS:
return combineABS(N, DCI);
}

return SDValue();
Expand Down Expand Up @@ -14503,3 +14553,47 @@ isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
// For non-constant masks, we can always use the record-form and.
return true;
}

// Try to fold a vector ABS of a SUB into a PPCISD::VABSD node.
//
// When both SUB operands are zero extensions (ZERO_EXTEND or
// ZERO_EXTEND_VECTOR_INREG, in any combination), the SUB operands are used
// directly and the third operand is 0:
//   (abs (sub x, y)) -> (vabsd x, y, 0)
//
// Otherwise, for v4i32 only and when the SUB has a single use, the third
// operand is 1:
//   (abs (sub a, b)) -> (vabsd a, b, 1)
//
// Returns an empty SDValue when no combine applies.
SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
  assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
  assert(Subtarget.hasP9Altivec() &&
         "Only combine this when P9 altivec supported!");

  const EVT VT = N->getValueType(0);
  const bool IsCandidateVT =
      VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8;
  if (!IsCandidateVT)
    return SDValue();

  // Only (abs (sub ...)) is of interest.
  SDValue Sub = N->getOperand(0);
  if (Sub.getOpcode() != ISD::SUB)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue LHS = Sub.getOperand(0);
  SDValue RHS = Sub.getOperand(1);
  const EVT SubVT = Sub.getValueType();

  auto IsZeroExtension = [](SDValue V) {
    const unsigned Opc = V.getOpcode();
    return Opc == ISD::ZERO_EXTEND || Opc == ISD::ZERO_EXTEND_VECTOR_INREG;
  };

  // Zero-extended inputs are known to be positive even when treated as signed
  // integers.
  if (IsZeroExtension(LHS) && IsZeroExtension(RHS))
    return DAG.getNode(PPCISD::VABSD, dl, SubVT, LHS, RHS,
                       DAG.getTargetConstant(0, dl, MVT::i32));

  // For type v4i32, it can be optimized with xvnegsp + vabsduw.
  if (SubVT == MVT::v4i32 && Sub.hasOneUse())
    return DAG.getNode(PPCISD::VABSD, dl, SubVT, LHS, RHS,
                       DAG.getTargetConstant(1, dl, MVT::i32));

  return SDValue();
}

17 changes: 17 additions & 0 deletions llvm/lib/Target/PowerPC/PPCISelLowering.h
Expand Up @@ -373,6 +373,21 @@ namespace llvm {
/// An SDNode for swaps that are not associated with any loads/stores
/// and thereby have no chain.
SWAP_NO_CHAIN,

/// An SDNode for Power9 vector absolute value difference.
/// operand #0 vector
/// operand #1 vector
/// operand #2 constant i32 0 or 1, indicating whether the most significant
/// bit needs to be patched (flipped) for signed i32
///
/// Power9 VABSD* instructions are designed to support unsigned integer
/// vectors (byte/halfword/word). To make use of them for signed integer
/// vectors, we have to flip the sign bits first. Flipping the sign bit is
/// inefficient for byte/halfword integer vectors, but for word integer
/// vectors we can leverage XVNEGSP to do it efficiently, e.g.:
/// abs(sub(a,b)) => VABSDUW(a+0x80000000, b+0x80000000)
/// => VABSDUW((XVNEGSP a), (XVNEGSP b))
VABSD,

/// QVFPERM = This corresponds to the QPX qvfperm instruction.
QVFPERM,
Expand Down Expand Up @@ -998,6 +1013,7 @@ namespace llvm {
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
Expand Down Expand Up @@ -1101,6 +1117,7 @@ namespace llvm {
SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;

/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
Expand Down

0 comments on commit 3dac125

Please sign in to comment.