diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 17f04d0fd05e8..f303d237e5cc2 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -652,6 +652,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
   setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
 
+  if (Subtarget.isISA3_0() && isPPC64) {
+    setOperationAction(ISD::VP_STORE, MVT::v16i1, Custom);
+    setOperationAction(ISD::VP_STORE, MVT::v8i1, Custom);
+    setOperationAction(ISD::VP_STORE, MVT::v4i1, Custom);
+    setOperationAction(ISD::VP_STORE, MVT::v2i1, Custom);
+    setOperationAction(ISD::VP_LOAD, MVT::v16i1, Custom);
+    setOperationAction(ISD::VP_LOAD, MVT::v8i1, Custom);
+    setOperationAction(ISD::VP_LOAD, MVT::v4i1, Custom);
+    setOperationAction(ISD::VP_LOAD, MVT::v2i1, Custom);
+  }
+
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
@@ -11909,6 +11920,59 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
   return getDataClassTest(LHS, Category, Dl, DAG, Subtarget);
 }
 
+static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left,
+                            SelectionDAG &DAG) {
+  SDLoc dl(Val);
+  EVT VT = Val->getValueType(0);
+  unsigned LeftAdj = Left ? VT.getSizeInBits() - 8 : 0;
+  unsigned TypeAdj = llvm::countr_zero(Bits / 8);
+  SDValue SHLAmt = DAG.getConstant(LeftAdj + TypeAdj, dl, VT);
+  return DAG.getNode(ISD::SHL, dl, VT, Val, SHLAmt);
+}
+
+SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const {
+  auto VPLD = cast<VPLoadSDNode>(Op);
+  bool Future = Subtarget.isISAFuture();
+  SDLoc dl(Op);
+  assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) &&
+         "Mask predication not supported");
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPLD->getOperand(4));
+  unsigned IID = Future ? Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl;
+  unsigned EltBits = Op->getValueType(0).getScalarType().getSizeInBits();
+  Len = AdjustLength(Len, EltBits, !Future, DAG);
+  SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(IID, dl, MVT::i32),
+                   VPLD->getOperand(1), Len};
+  SDVTList Tys = DAG.getVTList(Op->getValueType(0), MVT::Other);
+  SDValue VPL =
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys, Ops,
+                              VPLD->getMemoryVT(), VPLD->getMemOperand());
+  return VPL;
+}
+
+SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const {
+  auto VPST = cast<VPStoreSDNode>(Op);
+  assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) &&
+         "Mask predication not supported");
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc dl(Op);
+  SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPST->getOperand(5));
+  unsigned EltBits =
+      Op->getOperand(1).getValueType().getScalarType().getSizeInBits();
+  bool Future = Subtarget.isISAFuture();
+  unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl;
+  Len = AdjustLength(Len, EltBits, !Future, DAG);
+  SDValue Ops[] = {
+      VPST->getChain(), DAG.getConstant(IID, dl, MVT::i32),
+      DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, VPST->getOperand(1)),
+      VPST->getOperand(2), Len};
+  SDVTList Tys = DAG.getVTList(MVT::Other);
+  SDValue VPS =
+      DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops,
+                              VPST->getMemoryVT(), VPST->getMemOperand());
+  return VPS;
+}
+
 SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                  SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -12763,6 +12827,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     if (Op->getFlags().hasNoFPExcept())
       return Op;
     return SDValue();
+  case ISD::VP_LOAD:
+    return LowerVP_LOAD(Op, DAG);
+  case ISD::VP_STORE:
+    return LowerVP_STORE(Op, DAG);
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 880aca751d7d6..d967018982734 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1345,6 +1345,9 @@ namespace llvm {
    SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const;
+
    SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2fba090f2d501..6373343f2b2e3 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -24,6 +24,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ppctti"
 
+static cl::opt<bool> Pwr9EVL("ppc-pwr9-evl",
+                             cl::desc("Allow vp.load and vp.store for pwr9"),
+                             cl::init(false), cl::Hidden);
+
 static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
                                  cl::desc("add masking cost for i1 vectors"),
                                  cl::init(true), cl::Hidden);
@@ -1031,3 +1035,39 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
 bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
   return TLI->supportsTailCallFor(CB);
 }
+
+TargetTransformInfo::VPLegalization
+PPCTTIImpl::getVPLegalizationStrategy(const VPIntrinsic &PI) const {
+  using VPLegalization = TargetTransformInfo::VPLegalization;
+  unsigned Directive = ST->getCPUDirective();
+  VPLegalization DefaultLegalization = BaseT::getVPLegalizationStrategy(PI);
+  if (Directive != PPC::DIR_PWR10 && Directive != PPC::DIR_PWR_FUTURE &&
+      (!Pwr9EVL || Directive != PPC::DIR_PWR9))
+    return DefaultLegalization;
+
+  if (!ST->isPPC64())
+    return DefaultLegalization;
+
+  unsigned IID = PI.getIntrinsicID();
+  if (IID != Intrinsic::vp_load && IID != Intrinsic::vp_store)
+    return DefaultLegalization;
+
+  bool IsLoad = IID == Intrinsic::vp_load;
+  Type *VecTy = IsLoad ? PI.getType() : PI.getOperand(0)->getType();
+  EVT VT = TLI->getValueType(DL, VecTy, true);
+  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+      VT != MVT::v16i8)
+    return DefaultLegalization;
+
+  auto IsAllTrueMask = [](Value *MaskVal) {
+    if (Value *SplattedVal = getSplatValue(MaskVal))
+      if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+        return ConstValue->isAllOnesValue();
+    return false;
+  };
+  unsigned MaskIx = IsLoad ? 1 : 2;
+  if (!IsAllTrueMask(PI.getOperand(MaskIx)))
+    return DefaultLegalization;
+
+  return VPLegalization(VPLegalization::Legal, VPLegalization::Legal);
+}
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 475472ac3720f..385ad89876b93 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -150,6 +150,9 @@ class PPCTTIImpl final : public BasicTTIImplBase<PPCTTIImpl> {
                              const ArrayRef<Type *> &Types) const override;
   bool supportsTailCallFor(const CallBase *CB) const override;
 
+  TargetTransformInfo::VPLegalization
+  getVPLegalizationStrategy(const VPIntrinsic &PI) const override;
+
 private:
   // The following constant is used for estimating costs on power9.
   static const InstructionCost::CostType P9PipelineFlushEstimate = 80;
diff --git a/llvm/test/CodeGen/PowerPC/vp-ld-st.ll b/llvm/test/CodeGen/PowerPC/vp-ld-st.ll
new file mode 100644
index 0000000000000..f0f9943e901ec
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vp-ld-st.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 \
+; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=future \
+; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck -check-prefix=FUTURE %s
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 \
+; RUN:   -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=future \
+; RUN:   -mtriple=powerpc64-unknown-unknown < %s | FileCheck --check-prefix=FUTURE %s
+
+; Function Attrs: nounwind readnone
+define void @stxvl1(<16 x i8> %a, ptr %b, i64 %c) {
+; CHECK-LABEL: stxvl1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi 3, 6, 56
+; CHECK-NEXT:    stxvl 34, 5, 3
+; CHECK-NEXT:    blr
+;
+; FUTURE-LABEL: stxvl1:
+; FUTURE:       # %bb.0: # %entry
+; FUTURE-NEXT:    stxvrl 34, 5, 6
+; FUTURE-NEXT:    blr
+entry:
+  %cconv = trunc i64 %c to i32
+  tail call void @llvm.vp.store.v16i8.p0(<16 x i8> %a, ptr %b, <16 x i1> splat (i1 true), i32 %cconv)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @stxvl2(<8 x i16> %a, ptr %b, i64 %c) {
+; CHECK-LABEL: stxvl2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi 3, 6, 57
+; CHECK-NEXT:    stxvl 34, 5, 3
+; CHECK-NEXT:    blr
+;
+; FUTURE-LABEL: stxvl2:
+; FUTURE:       # %bb.0: # %entry
+; FUTURE-NEXT:    sldi 3, 6, 1
+; FUTURE-NEXT:    stxvrl 34, 5, 3
+; FUTURE-NEXT:    blr
+entry:
+  %cconv = trunc i64 %c to i32
+  tail call void @llvm.vp.store.v8i16.p0(<8 x i16> %a, ptr %b, <8 x i1> splat (i1 true), i32 %cconv)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @stxvl4(<4 x i32> %a, ptr %b, i64 %c) {
+; CHECK-LABEL: stxvl4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi 3, 6, 58
+; CHECK-NEXT:    stxvl 34, 5, 3
+; CHECK-NEXT:    blr
+;
+; FUTURE-LABEL: stxvl4:
+; FUTURE:       # %bb.0: # %entry
+; FUTURE-NEXT:    sldi 3, 6, 2
+; FUTURE-NEXT:    stxvrl 34, 5, 3
+; FUTURE-NEXT:    blr
+entry:
+  %cconv = trunc i64 %c to i32
+  tail call void @llvm.vp.store.v4i32.p0(<4 x i32> %a, ptr %b, <4 x i1> splat (i1 true), i32 %cconv)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @stxvl8(<2 x i64> %a, ptr %b, i64 %c) {
+; CHECK-LABEL: stxvl8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi 3, 6, 59
+; CHECK-NEXT:    stxvl 34, 5, 3
+; CHECK-NEXT:    blr
+;
+; FUTURE-LABEL: stxvl8:
+; FUTURE:       # %bb.0: # %entry
+; FUTURE-NEXT:    sldi 3, 6, 3
+; FUTURE-NEXT:    stxvrl 34, 5, 3
+; FUTURE-NEXT:    blr
+entry:
+  %cconv = trunc i64 %c to i32
+  tail call void @llvm.vp.store.v2i64.p0(<2 x i64> %a, ptr %b, <2 x i1> splat (i1 true), i32 %cconv)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+define <16 x i8> @lxvl1(ptr %a, i64 %b) {
+; CHECK-LABEL: lxvl1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi 4, 4, 56
+; CHECK-NEXT:    lxvl 34, 3, 4
+; CHECK-NEXT:    blr
+;
+; FUTURE-LABEL: lxvl1:
+; FUTURE:       # %bb.0: # %entry
+; FUTURE-NEXT:    lxvrl 34, 3, 4
+; FUTURE-NEXT:    blr
+entry:
+  %bconv = trunc i64 %b to i32
+  %0 = tail call <16 x i8> @llvm.vp.load.v16i8.p0(ptr %a, <16 x i1> splat (i1 true), i32 %bconv)
+  ret <16 x i8> %0
+}
+
+; Function Attrs: nounwind readnone
+define <8 x i16> @lxvl2(ptr %a, i64 %b) {
+; CHECK-LABEL: lxvl2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi 4, 4, 57
+; CHECK-NEXT:    lxvl 34, 3, 4
+; CHECK-NEXT:    blr
+;
+; FUTURE-LABEL: lxvl2:
+; FUTURE:       # %bb.0: # %entry
+; FUTURE-NEXT:    sldi 4, 4, 1
+; FUTURE-NEXT:    lxvrl 34, 3, 4
+; FUTURE-NEXT:    blr
+entry:
+  %bconv = trunc i64 %b to i32
+  %0 = tail call <8 x i16> @llvm.vp.load.v8i16.p0(ptr %a, <8 x i1> splat (i1 true), i32 %bconv)
+  ret <8 x i16> %0
+}
+
+; Function Attrs: nounwind readnone
+define <4 x i32> @lxvl4(ptr %a, i64 %b) {
+; CHECK-LABEL: lxvl4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi 4, 4, 58
+; CHECK-NEXT:    lxvl 34, 3, 4
+; CHECK-NEXT:    blr
+;
+; FUTURE-LABEL: lxvl4:
+; FUTURE:       # %bb.0: # %entry
+; FUTURE-NEXT:    sldi 4, 4, 2
+; FUTURE-NEXT:    lxvrl 34, 3, 4
+; FUTURE-NEXT:    blr
+entry:
+  %bconv = trunc i64 %b to i32
+  %0 = tail call <4 x i32> @llvm.vp.load.v4i32.p0(ptr %a, <4 x i1> splat (i1 true), i32 %bconv)
+  ret <4 x i32> %0
+}
+
+; Function Attrs: nounwind readnone
+define <2 x i64> @lxvl8(ptr %a, i64 %b) {
+; CHECK-LABEL: lxvl8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi 4, 4, 59
+; CHECK-NEXT:    lxvl 34, 3, 4
+; CHECK-NEXT:    blr
+;
+; FUTURE-LABEL: lxvl8:
+; FUTURE:       # %bb.0: # %entry
+; FUTURE-NEXT:    sldi 4, 4, 3
+; FUTURE-NEXT:    lxvrl 34, 3, 4
+; FUTURE-NEXT:    blr
+entry:
+  %bconv = trunc i64 %b to i32
+  %0 = tail call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %a, <2 x i1> splat (i1 true), i32 %bconv)
+  ret <2 x i64> %0
+}