diff --git a/llvm/include/llvm/IR/IntrinsicsHexagon.td b/llvm/include/llvm/IR/IntrinsicsHexagon.td index 67b873d16cb5a..20ba51ade35a7 100644 --- a/llvm/include/llvm/IR/IntrinsicsHexagon.td +++ b/llvm/include/llvm/IR/IntrinsicsHexagon.td @@ -447,3 +447,15 @@ def int_hexagon_instrprof_custom include "llvm/IR/IntrinsicsHexagonDep.td" + +class Hexagon__ptri32i32v64i16_Intrinsic intr_properties = [IntrNoMem]> + : Hexagon_Intrinsic; + +def int_hexagon_V6_vgather_vscattermh : +Hexagon__ptri32i32v64i16_Intrinsic<"HEXAGON_V6_vgather_vscattermh", [IntrArgMemOnly]>; + +def int_hexagon_V6_vgather_vscattermh_128B : +Hexagon__ptri32i32v32i32_Intrinsic<"HEXAGON_V6_vgather_vscattermh_128B", [IntrArgMemOnly]>; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index e285e04543694..7ee280d8fc8b0 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -654,7 +654,9 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) { IntNo == Intrinsic::hexagon_V6_vgathermh || IntNo == Intrinsic::hexagon_V6_vgathermh_128B || IntNo == Intrinsic::hexagon_V6_vgathermhw || - IntNo == Intrinsic::hexagon_V6_vgathermhw_128B) { + IntNo == Intrinsic::hexagon_V6_vgathermhw_128B || + IntNo == Intrinsic::hexagon_V6_vgather_vscattermh || + IntNo == Intrinsic::hexagon_V6_vgather_vscattermh_128B) { SelectV65Gather(N); return; } diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index c7a4f6803a243..3cc146b13f8f8 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -2953,6 +2953,10 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { case Intrinsic::hexagon_V6_vgathermhw_128B: Opcode = Hexagon::V6_vgathermhw_pseudo; break; + case Intrinsic::hexagon_V6_vgather_vscattermh: + case Intrinsic::hexagon_V6_vgather_vscattermh_128B: + Opcode = Hexagon::V6_vgather_vscatter_mh_pseudo; + break; } SDVTList VTs = CurDAG->getVTList(MVT::Other); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 9f7f434b66fa1..526b4de975915 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2145,7 +2145,9 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::hexagon_V6_vgathermhq: case Intrinsic::hexagon_V6_vgathermhq_128B: case Intrinsic::hexagon_V6_vgathermhwq: - case Intrinsic::hexagon_V6_vgathermhwq_128B: { + case Intrinsic::hexagon_V6_vgathermhwq_128B: + case Intrinsic::hexagon_V6_vgather_vscattermh: + case Intrinsic::hexagon_V6_vgather_vscattermh_128B: { const Module &M = *I.getParent()->getParent()->getParent(); Info.opc = ISD::INTRINSIC_W_CHAIN; Type *VecTy = I.getArgOperand(1)->getType(); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 939841ae817c3..47726d6447ad8 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1554,80 +1554,93 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const { MachineBasicBlock::iterator First; switch (Opc) { - case Hexagon::V6_vgathermh_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - 
.addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermw_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermhw_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermhq_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)) - .add(MI.getOperand(5)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermwq_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)) - .add(MI.getOperand(5)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermhwq_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)) - .add(MI.getOperand(5)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); + case Hexagon::V6_vgather_vscatter_mh_pseudo: + // This is mainly a place holder. It will be extended. 
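+    // Current expansion (a sketch that mirrors the plain vgather pseudos
+    // below): a V6_vgathermh that fills VTMP from the base/modifier/offset
+    // operands, followed by a V6_vscattermh that reads VTMP back out through
+    // the same address operands.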
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vscattermh)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + case Hexagon::V6_vgathermh_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermw_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermhw_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermhq_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermwq_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermhwq_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); } return MI.getIterator(); @@ -2806,6 +2819,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vL32b_nt_tmp_npred_ai: case Hexagon::V6_vS32Ub_npred_ai: case Hexagon::V6_vgathermh_pseudo: + case Hexagon::V6_vgather_vscatter_mh_pseudo: case Hexagon::V6_vgathermw_pseudo: case Hexagon::V6_vgathermhw_pseudo: case Hexagon::V6_vgathermhq_pseudo: diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td index f927f9b9e7c34..42393d081f1a7 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td @@ -40,6 +40,19 @@ defm V6_vgathermh_pseudo : vgathermh; defm V6_vgathermw_pseudo : vgathermw; defm V6_vgathermhw_pseudo : vgathermhw; + +multiclass vgather_scatter_mh { + let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, + mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess 
in + def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), + (ins IntRegs:$_dst_, s4_0Imm:$Ii, + IntRegs:$Rt, ModRegs:$Mu, RC:$Vv), + ".error \"should not emit\" ", + []>; +} + +defm V6_vgather_vscatter_mh_pseudo : vgather_scatter_mh; + multiclass vgathermhq { let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 171e2949366ad..e925e041eb64e 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -31,6 +31,10 @@ using namespace llvm; static cl::opt HexagonAutoHVX("hexagon-autohvx", cl::init(false), cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); +cl::opt HexagonAllowScatterGatherHVX( + "hexagon-allow-scatter-gather-hvx", cl::init(false), cl::Hidden, + cl::desc("Allow auto-generation of HVX scatter-gather")); + static cl::opt EnableV68FloatAutoHVX( "force-hvx-float", cl::Hidden, cl::desc("Enable auto-vectorization of floatint point types on v68.")); @@ -354,6 +358,61 @@ bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/, return HexagonMaskedVMem && ST.isTypeForHVX(DataType); } +bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const { + // For now assume we can not deal with all HVX datatypes. + if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) || + !HexagonAllowScatterGatherHVX) + return false; + // This must be in sync with HexagonVectorCombine pass. + switch (Ty->getScalarSizeInBits()) { + case 8: + return (getTypeNumElements(Ty) == 128); + case 16: + if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32) + return (Alignment >= 2); + break; + case 32: + if (getTypeNumElements(Ty) == 32) + return (Alignment >= 4); + break; + default: + break; + } + return false; +} + +bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const { + if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) || + !HexagonAllowScatterGatherHVX) + return false; + // This must be in sync with HexagonVectorCombine pass. 
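+  // Shapes accepted below: v128i8, v64i16 (alignment >= 2) and v32i32
+  // (alignment >= 4); unlike the gather case, v32i16 is not allowed here.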
+ switch (Ty->getScalarSizeInBits()) { + case 8: + return (getTypeNumElements(Ty) == 128); + case 16: + if (getTypeNumElements(Ty) == 64) + return (Alignment >= 2); + break; + case 32: + if (getTypeNumElements(Ty) == 32) + return (Alignment >= 4); + break; + default: + break; + } + return false; +} + +bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy, + Align Alignment) const { + return !isLegalMaskedGather(VTy, Alignment); +} + +bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy, + Align Alignment) const { + return !isLegalMaskedScatter(VTy, Alignment); +} + /// --- Vector TTI end --- unsigned HexagonTTIImpl::getPrefetchDistance() const { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index dbf16c99c314c..cec2bf9656ffc 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -169,6 +169,12 @@ class HexagonTTIImpl final : public BasicTTIImplBase { unsigned AddressSpace) const override; bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const override; + bool isLegalMaskedGather(Type *Ty, Align Alignment) const override; + bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override; + bool forceScalarizeMaskedGather(VectorType *VTy, + Align Alignment) const override; + bool forceScalarizeMaskedScatter(VectorType *VTy, + Align Alignment) const override; /// @} diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 9ab52020e2e36..1f68fa4ff1356 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -57,6 +57,11 @@ #define DEBUG_TYPE "hexagon-vc" +// This is a const that represents default HVX VTCM page size. 
+// It is boot time configurable, so we probably want an API to +// read it, but for now assume 128KB +#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072 + using namespace llvm; namespace { @@ -418,6 +423,18 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) { class HvxIdioms { public: + enum DstQualifier { + Undefined = 0, + Arithmetic, + LdSt, + LLVM_Gather, + LLVM_Scatter, + HEX_Gather_Scatter, + HEX_Gather, + HEX_Scatter, + Call + }; + HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) { auto *Int32Ty = HVC.getIntTy(32); HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false); @@ -473,6 +490,11 @@ class HvxIdioms { auto createMulLong(IRBuilderBase &Builder, ArrayRef WordX, Signedness SgnX, ArrayRef WordY, Signedness SgnY) const -> SmallVector; + // Vector manipulations for Ripple + bool matchScatter(Instruction &In) const; + bool matchGather(Instruction &In) const; + Value *processVScatter(Instruction &In) const; + Value *processVGather(Instruction &In) const; VectorType *HvxI32Ty; VectorType *HvxP32Ty; @@ -1545,7 +1567,7 @@ auto AlignVectors::isSectorTy(Type *Ty) const -> bool { } auto AlignVectors::run() -> bool { - LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors on " << HVC.F.getName() + LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName() << '\n'); if (!createAddressGroups()) return false; @@ -1797,6 +1819,840 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const return Ext; } +inline bool HvxIdioms::matchScatter(Instruction &In) const { + IntrinsicInst *II = dyn_cast(&In); + if (!II) + return false; + return (II->getIntrinsicID() == Intrinsic::masked_scatter); +} + +inline bool HvxIdioms::matchGather(Instruction &In) const { + IntrinsicInst *II = dyn_cast(&In); + if (!II) + return false; + return (II->getIntrinsicID() == Intrinsic::masked_gather); +} + +Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual); + +// Binary instructions we want to handle as users of gather/scatter. +inline bool isArithmetic(unsigned Opc) { + switch (Opc) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::Shl: + case Instruction::UDiv: + return true; + } + return false; +} + +// TODO: Maybe use MemoryLocation for this. See getLocOrNone above. 
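+// Best-effort extraction of the pointer a value ultimately refers to:
+// address-producing values are returned as-is, memory accesses yield their
+// pointer operand, and anything unrecognized yields nullptr.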
+inline Value *getPointer(Value *Ptr) { + assert(Ptr && "Unable to extract pointer"); + if (isa(Ptr) || isa(Ptr) || isa(Ptr)) + return Ptr; + if (isa(Ptr) || isa(Ptr)) + return getLoadStorePointerOperand(Ptr); + if (IntrinsicInst *II = dyn_cast(Ptr)) { + if (II->getIntrinsicID() == Intrinsic::masked_store) + return II->getOperand(1); + } + return nullptr; +} + +static Instruction *selectDestination(Instruction *In, + HvxIdioms::DstQualifier &Qual) { + Instruction *Destination = nullptr; + if (!In) + return Destination; + if (isa(In)) { + Destination = In; + Qual = HvxIdioms::LdSt; + } else if (IntrinsicInst *II = dyn_cast(In)) { + if (II->getIntrinsicID() == Intrinsic::masked_gather) { + Destination = In; + Qual = HvxIdioms::LLVM_Gather; + } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) { + Destination = In; + Qual = HvxIdioms::LLVM_Scatter; + } else if (II->getIntrinsicID() == Intrinsic::masked_store) { + Destination = In; + Qual = HvxIdioms::LdSt; + } else if (II->getIntrinsicID() == + Intrinsic::hexagon_V6_vgather_vscattermh) { + Destination = In; + Qual = HvxIdioms::HEX_Gather_Scatter; + } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) { + Destination = In; + Qual = HvxIdioms::HEX_Scatter; + } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) { + Destination = In; + Qual = HvxIdioms::HEX_Gather; + } + } else if (isa(In)) { + return locateDestination(In, Qual); + } else if (isa(In)) { + return locateDestination(In, Qual); + } else if (isa(In)) { + Destination = In; + Qual = HvxIdioms::Call; + } else if (isa(In)) { + return locateDestination(In, Qual); + } else if (isArithmetic(In->getOpcode())) { + Destination = In; + Qual = HvxIdioms::Arithmetic; + } else { + LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n"); + } + return Destination; +} + +// This method attempts to find destination (user) for a given intrinsic. +// Given that these are produced only by Ripple, the number of options is +// limited. Simplest case is explicit store which in fact is redundant (since +// HVX gater creates its own store during packetization). Nevertheless we need +// to figure address where we storing. Other cases are more complicated, but +// still few. +Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) { + Instruction *Destination = nullptr; + if (!In) + return Destination; + // Get all possible destinations + SmallVector Users; + // Iterate over the uses of the instruction + for (auto &U : In->uses()) { + if (auto *UI = dyn_cast(U.getUser())) { + Destination = selectDestination(UI, Qual); + if (Destination) + Users.push_back(Destination); + } + } + // Now see which of the users (if any) is a memory destination. + for (auto *I : Users) + if (getPointer(I)) + return I; + return Destination; +} + +// The two intrinsics we handle here have GEP in a different position. +inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) { + assert(In && "Bad instruction"); + IntrinsicInst *IIn = dyn_cast(In); + assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather || + IIn->getIntrinsicID() == Intrinsic::masked_scatter)) && + "Not a gather Intrinsic"); + GetElementPtrInst *GEPIndex = nullptr; + if (IIn->getIntrinsicID() == Intrinsic::masked_gather) + GEPIndex = dyn_cast(IIn->getOperand(0)); + else + GEPIndex = dyn_cast(IIn->getOperand(1)); + return GEPIndex; +} + +// Given the intrinsic find its GEP argument and extract base address it uses. 
+// The method relies on the way how Ripple typically forms the GEP for +// scatter/gather. +static Value *locateAddressFromIntrinsic(Instruction *In) { + GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In); + if (!GEPIndex) { + LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n"); + return nullptr; + } + Value *BaseAddress = GEPIndex->getPointerOperand(); + auto *IndexLoad = dyn_cast(BaseAddress); + if (IndexLoad) + return IndexLoad; + + auto *IndexZEx = dyn_cast(BaseAddress); + if (IndexZEx) { + IndexLoad = dyn_cast(IndexZEx->getOperand(0)); + if (IndexLoad) + return IndexLoad; + IntrinsicInst *II = dyn_cast(IndexZEx->getOperand(0)); + if (II && II->getIntrinsicID() == Intrinsic::masked_gather) + return locateAddressFromIntrinsic(II); + } + auto *BaseShuffle = dyn_cast(BaseAddress); + if (BaseShuffle) { + IndexLoad = dyn_cast(BaseShuffle->getOperand(0)); + if (IndexLoad) + return IndexLoad; + auto *IE = dyn_cast(BaseShuffle->getOperand(0)); + if (IE) { + auto *Src = IE->getOperand(1); + IndexLoad = dyn_cast(Src); + if (IndexLoad) + return IndexLoad; + auto *Alloca = dyn_cast(Src); + if (Alloca) + return Alloca; + if (isa(Src)) { + return Src; + } + if (isa(Src)) { + return Src; + } + } + } + LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n"); + return nullptr; +} + +static Type *getIndexType(Value *In) { + if (!In) + return nullptr; + + if (isa(In) || isa(In)) + return getLoadStoreType(In); + + if (IntrinsicInst *II = dyn_cast(In)) { + if (II->getIntrinsicID() == Intrinsic::masked_load) + return II->getType(); + if (II->getIntrinsicID() == Intrinsic::masked_store) + return II->getOperand(0)->getType(); + } + return In->getType(); +} + +static Value *locateIndexesFromGEP(Value *In) { + if (!In) + return nullptr; + if (isa(In)) + return In; + if (IntrinsicInst *II = dyn_cast(In)) { + if (II->getIntrinsicID() == Intrinsic::masked_load) + return In; + if (II->getIntrinsicID() == Intrinsic::masked_gather) + return In; + } + if (auto *IndexZEx = dyn_cast(In)) + return locateIndexesFromGEP(IndexZEx->getOperand(0)); + if (auto *IndexSEx = dyn_cast(In)) + return locateIndexesFromGEP(IndexSEx->getOperand(0)); + if (auto *BaseShuffle = dyn_cast(In)) + return locateIndexesFromGEP(BaseShuffle->getOperand(0)); + if (auto *IE = dyn_cast(In)) + return locateIndexesFromGEP(IE->getOperand(1)); + if (auto *cstDataVector = dyn_cast(In)) + return cstDataVector; + if (auto *GEPIndex = dyn_cast(In)) + return GEPIndex->getOperand(0); + return nullptr; +} + +// Given the intrinsic find its GEP argument and extract offsetts from the base +// address it uses. +static Value *locateIndexesFromIntrinsic(Instruction *In) { + GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In); + if (!GEPIndex) { + LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n"); + return nullptr; + } + Value *Indexes = GEPIndex->getOperand(1); + if (auto *IndexLoad = locateIndexesFromGEP(Indexes)) + return IndexLoad; + + LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n"); + return nullptr; +} + +// Because of aukward definition of many Hex intrinsics we often have to +// reinterprete HVX native <64 x i16> as <32 x i32> which in practice is a NOP +// for all use cases, so this only exist to make IR builder happy. 
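+// In IR terms the helper below amounts to "bitcast <64 x i16> to <32 x i32>"
+// behind an identity shuffle; no lanes are moved or modified.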
+inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC, + IRBuilderBase &Builder, + LLVMContext &Ctx, Value *I) { + assert(I && "Unable to reinterprete cast"); + Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); + std::vector shuffleMask; + for (unsigned i = 0; i < 64; ++i) + shuffleMask.push_back(i); + Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask); + Value *CastShuffle = + Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle"); + return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32"); +} + +// Recast <128 x i8> as <32 x i32> +inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC, + IRBuilderBase &Builder, + LLVMContext &Ctx, Value *I) { + assert(I && "Unable to reinterprete cast"); + Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); + std::vector shuffleMask; + for (unsigned i = 0; i < 128; ++i) + shuffleMask.push_back(i); + Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask); + Value *CastShuffle = + Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle"); + return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32"); +} + +// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern +inline Value *get_i32_Mask(const HexagonVectorCombine &HVC, + IRBuilderBase &Builder, LLVMContext &Ctx, + unsigned int pattern) { + std::vector byteMask; + for (unsigned i = 0; i < 32; ++i) + byteMask.push_back(pattern); + + return Builder.CreateIntrinsic( + HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt), + {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)}, + nullptr); +} + +Value *HvxIdioms::processVScatter(Instruction &In) const { + auto *InpTy = dyn_cast(In.getOperand(0)->getType()); + assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather"); + unsigned InpSize = HVC.getSizeOf(InpTy); + unsigned Elements = HVC.length(InpTy); + auto *ElemTy = dyn_cast(InpTy->getElementType()); + assert(ElemTy && "llvm.scatter needs integer type argument"); + unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy); + auto *F = In.getFunction(); + LLVMContext &Ctx = F->getContext(); + LLVM_DEBUG(dbgs() << "\n[Process scatter](" << In << ")\n" + << *In.getParent() << "\n"); + LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements(" << Elements + << ") VecLen(" << InpSize << ") type(" << *ElemTy + << ") ElemWidth(" << ElemWidth << ")\n"); + + IRBuilder Builder(In.getParent(), In.getIterator(), + InstSimplifyFolder(HVC.DL)); + + auto *ValueToScatter = In.getOperand(0); + LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n"); + + if (HVC.HST.getVectorLength() != InpSize) { + LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize + << ") for vscatter\n"); + return nullptr; + } + + // Base address of indexes. + auto *IndexLoad = locateAddressFromIntrinsic(&In); + if (!IndexLoad) + return nullptr; + LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n"); + + // Address of destination. Must be in VTCM. 
+ auto *Ptr = getPointer(IndexLoad); + if (!Ptr) + return nullptr; + LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n"); + // Indexes/offsets + auto *Indexes = locateIndexesFromIntrinsic(&In); + if (!Indexes) + return nullptr; + LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n"); + Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx), + "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n"); + // Adjust Indexes + auto *cstDataVector = dyn_cast(Indexes); + Value *CastIndex = nullptr; + if (cstDataVector) { + // Our indexes are represented as a constant. We need it in a reg. + AllocaInst *IndexesAlloca = + Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false)); + auto *StoreIndexes = Builder.CreateStore(cstDataVector, IndexesAlloca); + LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n"); + CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(), + IndexesAlloca, "reload_index"); + } else { + if (ElemWidth == 2) + CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); + else + CastIndex = Indexes; + } + LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n"); + + if (ElemWidth == 1) { + // v128i8 There is no native instruction for this. + // Do this as two Hi/Lo gathers with masking. + Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); + // Extend indexes. We assume that indexes are in 128i8 format - need to + // expand them to Hi/Lo 64i16 + Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32"); + auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub); + auto *UnpackedIndexes = Builder.CreateIntrinsic( + HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr); + LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n"); + + auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi); + auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo); + Value *IndexHi = + HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes); + Value *IndexLo = + HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes); + LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n"); + LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n"); + // Now unpack values to scatter + Value *CastSrc = + getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter); + LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n"); + auto *UnpackedValueToScatter = Builder.CreateIntrinsic( + HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr); + LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter + << ")\n"); + + Value *UVSHi = + HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter); + Value *UVSLo = + HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter); + LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n"); + LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n"); + + // Create the mask for individual bytes + auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff); + LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n"); + + auto *ResHi = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B, + {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + IndexHi, UVSHi}, + nullptr); + LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n"); + return Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B, + {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + IndexLo, UVSLo}, + nullptr); + } else if 
(ElemWidth == 2) { + Value *CastSrc = + getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter); + LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n"); + return Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B, + {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex, + CastSrc}, + nullptr); + } else if (ElemWidth == 4) { + return Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B, + {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex, + ValueToScatter}, + nullptr); + } else { + LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n"); + return nullptr; + } +} + +Value *HvxIdioms::processVGather(Instruction &In) const { + auto *InpTy = dyn_cast(In.getOperand(0)->getType()); + assert(InpTy && "Cannot handle no vector type for llvm.gather"); + auto *ElemTy = dyn_cast(InpTy->getElementType()); + assert(ElemTy && "llvm.gather needs vector of ptr argument"); + auto *F = In.getFunction(); + LLVMContext &Ctx = F->getContext(); + LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n" + << *In.getParent() << "\n"); + LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements(" + << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy) + << ") type(" << *ElemTy << ") Access alignment(" + << *In.getOperand(1) << ") AddressSpace(" + << ElemTy->getAddressSpace() << ")\n"); + + // TODO: Handle masking of elements. + auto *MaskTy = dyn_cast(In.getOperand(2)->getType()); + assert(MaskTy && "llvm.gather needs vector for mask"); + IRBuilder Builder(In.getParent(), In.getIterator(), + InstSimplifyFolder(HVC.DL)); + + // See who is using the result. The difference between LLVM and HVX vgather + // Intrinsic makes it impossible to handle all cases with temp storage. Alloca + // in VTCM is not yet supported, so for now we just bail out for those cases. + HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined; + Instruction *Dst = locateDestination(&In, Qual); + if (!Dst) { + LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual + << ")\n"); + + // Address of destination. Must be in VTCM. + auto *Ptr = getPointer(Dst); + if (!Ptr) { + LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n"); + return nullptr; + } + + // Result type. Assume it is a vector type. + auto *DstType = cast(getIndexType(Dst)); + assert(DstType && "Cannot handle non vector dst type for llvm.gather"); + + // Base address for sources to be loaded + auto *IndexLoad = locateAddressFromIntrinsic(&In); + if (!IndexLoad) + return nullptr; + LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n"); + + // Gather indexes/offsets + auto *Indexes = locateIndexesFromIntrinsic(&In); + if (!Indexes) + return nullptr; + LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n"); + + Instruction *Gather = nullptr; + Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); + if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) { + // We fully assume the address space is in VTCM. We also assume that all + // pointers in Operand(0) have the same base(!). + // This is the most basic case of all the above. 
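+    // Rewrite sketch for this case: cast the gather base to i32, scale the
+    // element indexes to byte offsets, and emit the matching V6_vgatherm*
+    // intrinsic so it deposits its result directly into the destination
+    // buffer (the byte case is emulated with two masked halfword gathers);
+    // the original store then becomes redundant and is erased.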
+ unsigned OutputSize = HVC.getSizeOf(DstType); + auto *DstElemTy = cast(DstType->getElementType()); + unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy); + LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType() + << " Address space (" + << Ptr->getType()->getPointerAddressSpace() << ")\n" + << " Result type : " << *DstType + << "\n Size in bytes : " << OutputSize + << " element type(" << *DstElemTy + << ")\n ElemWidth : " << ElemWidth << " bytes\n"); + + auto *IndexType = cast(getIndexType(Indexes)); + assert(IndexType && "Cannot handle non vector index type for llvm.gather"); + unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType()); + LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n"); + + // Intrinsic takes i32 instead of pointer so cast. + Value *CastedPtr = Builder.CreateBitOrPointerCast( + IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...] + // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty] + // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty] + // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty] + // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty] + // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty] + // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty] + if (HVC.HST.getVectorLength() == OutputSize) { + if (ElemWidth == 1) { + // v128i8 There is no native instruction for this. + // Do this as two Hi/Lo gathers with masking. + // Unpack indexes. We assume that indexes are in 128i8 format - need to + // expand them to Hi/Lo 64i16 + Value *CastIndexes = + Builder.CreateBitCast(Indexes, NT, "cast_to_32i32"); + auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub); + auto *UnpackedIndexes = + Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true), + V6_vunpack, CastIndexes, nullptr); + LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes + << ")\n"); + + auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi); + auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo); + Value *IndexHi = + HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes); + Value *IndexLo = + HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes); + LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n"); + LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n"); + // Create the mask for individual bytes + auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff); + LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n"); + // We use our destination allocation as a temp storage + // This is unlikely to work properly for masked gather. + auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq); + auto GatherHi = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), V6_vgather, + {Ptr, QByteMask, CastedPtr, + HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi}, + nullptr); + LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n"); + // Rematerialize the result + Value *LoadedResultHi = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi"); + LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n"); + // Same for the low part. Here we use Gather to return non-NULL result + // from this function and continue to iterate. We also are deleting Dst + // store below. 
+ Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), V6_vgather, + {Ptr, QByteMask, CastedPtr, + HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo}, + nullptr); + LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n"); + Value *LoadedResultLo = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo"); + LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n"); + // Now we have properly sized bytes in every other position + // B b A a c a A b B c f F g G h H is presented as + // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H + // Use vpack to gather them + auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb); + auto Res = Builder.CreateIntrinsic( + NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr); + LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n"); + auto *StoreRes = Builder.CreateStore(Res, Ptr); + LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n"); + } else if (ElemWidth == 2) { + // v32i16 + if (IndexWidth == 2) { + // Reinterprete 64i16 as 32i32. Only needed for syntactic IR match. + Value *CastIndex = + getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); + LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n"); + // shift all i16 left by 1 to match short addressing mode instead of + // byte. + auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); + Value *AdjustedIndex = HVC.createHvxIntrinsic( + Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)}); + LLVM_DEBUG(dbgs() + << " Shifted half index: " << *AdjustedIndex << ")\n"); + + auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh); + // The 3rd argument is the size of the region to gather from. Probably + // want to set it to max VTCM size. + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), V6_vgather, + {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + AdjustedIndex}, + nullptr); + for (auto &U : Dst->uses()) { + if (auto *UI = dyn_cast(U.getUser())) + dbgs() << " dst used by: " << *UI << "\n"; + } + for (auto &U : In.uses()) { + if (auto *UI = dyn_cast(U.getUser())) + dbgs() << " In used by : " << *UI << "\n"; + } + // Create temp load from result in case the result is used by any + // other instruction. 
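+          // The HVX gather intrinsics write their result to memory rather
+          // than returning a value, so the load below re-materializes the
+          // gathered vector for any remaining IR users of the original
+          // llvm.masked.gather.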
+ Value *LoadedResult = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result"); + LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n"); + In.replaceAllUsesWith(LoadedResult); + } else { + dbgs() << " Unhandled index type for vgather\n"; + return nullptr; + } + } else if (ElemWidth == 4) { + if (IndexWidth == 4) { + // v32i32 + auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); + Value *AdjustedIndex = HVC.createHvxIntrinsic( + Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)}); + LLVM_DEBUG(dbgs() + << " Shifted word index: " << *AdjustedIndex << ")\n"); + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B, + {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + AdjustedIndex}, + nullptr); + } else { + LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n"); + return nullptr; + } + } else { + LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n"); + return nullptr; + } + } else if (HVC.HST.getVectorLength() == OutputSize * 2) { + // This is half of the reg width, duplicate low in high + LLVM_DEBUG(dbgs() << " Unhandled half of register size\n"); + return nullptr; + } else if (HVC.HST.getVectorLength() * 2 == OutputSize) { + LLVM_DEBUG(dbgs() << " Unhandle twice the register size\n"); + return nullptr; + } + // Erase the original intrinsic and store that consumes it. + // HVX will create a pseudo for gather that is expanded to gather + store + // during packetization. + Dst->eraseFromParent(); + } else if (Qual == HvxIdioms::LLVM_Scatter) { + // Gather feeds directly into scatter. + auto *DstInpTy = cast(Dst->getOperand(1)->getType()); + assert(DstInpTy && "Cannot handle no vector type for llvm.scatter"); + unsigned DstInpSize = HVC.getSizeOf(DstInpTy); + unsigned DstElements = HVC.length(DstInpTy); + auto *DstElemTy = cast(DstInpTy->getElementType()); + assert(DstElemTy && "llvm.scatter needs vector of ptr argument"); + LLVM_DEBUG(dbgs() << " Gather feeds into scatter\n Values to scatter : " + << *Dst->getOperand(0) << "\n"); + LLVM_DEBUG(dbgs() << " Dst type(" << *DstInpTy << ") elements(" + << DstElements << ") VecLen(" << DstInpSize << ") type(" + << *DstElemTy << ") Access alignment(" + << *Dst->getOperand(2) << ")\n"); + // Address of source + auto *Src = getPointer(IndexLoad); + if (!Src) + return nullptr; + LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n"); + + if (!isa(Src->getType())) { + LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n"); + return nullptr; + } + + Value *CastedSrc = Builder.CreateBitOrPointerCast( + Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n"); + + auto *DstLoad = locateAddressFromIntrinsic(Dst); + if (!DstLoad) { + LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n"); + + Value *Ptr = getPointer(DstLoad); + if (!Ptr) + return nullptr; + LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n"); + Value *CastIndex = + getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad); + LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n"); + // Shift all i16 left by 1 to match short addressing mode instead of + // byte. 
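+    // In other words, scale each halfword element index by 2, presumably
+    // because the gather offsets are byte-based; this is the same adjustment
+    // applied in the LdSt path above.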
+ auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); + Value *AdjustedIndex = HVC.createHvxIntrinsic( + Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)}); + LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n"); + + return Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, + {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + AdjustedIndex}, + nullptr); + } else if (Qual == HvxIdioms::HEX_Gather_Scatter) { + // Gather feeds into previously inserted pseudo intrinsic. + // These could not be in the same packet, so we need to generate another + // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo + // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt, + // ModRegs:$Mu, HvxVR:$Vv) + if (isa(IndexLoad)) { + auto *cstDataVector = dyn_cast(Indexes); + if (cstDataVector) { + // Our indexes are represented as a constant. We need THEM in a reg. + // This most likely will not work properly since alloca gives us DDR + // stack location. This will be fixed once we teach compiler about VTCM. + AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT); + auto *StoreIndexes = Builder.CreateStore(cstDataVector, IndexesAlloca); + LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n"); + Value *LoadedIndex = Builder.CreateLoad( + IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index"); + AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); + LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n"); + + Value *CastedSrc = Builder.CreateBitOrPointerCast( + IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n"); + + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, + {ResultAlloca, CastedSrc, + HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex}, + nullptr); + Value *LoadedResult = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); + LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n"); + LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n"); + In.replaceAllUsesWith(LoadedResult); + } + } else { + // Address of source + auto *Src = getPointer(IndexLoad); + if (!Src) + return nullptr; + LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n"); + + Value *CastedSrc = Builder.CreateBitOrPointerCast( + Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n"); + + auto *DstLoad = locateAddressFromIntrinsic(Dst); + if (!DstLoad) + return nullptr; + LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n"); + auto *Ptr = getPointer(DstLoad); + if (!Ptr) + return nullptr; + LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n"); + + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh, + {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + Indexes}, + nullptr); + } + return Gather; + } else if (Qual == HvxIdioms::HEX_Scatter) { + // This is the case when result of a gather is used as an argument to + // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it + // ourselves. We have to create alloca, store to it, and replace all uses + // with that. 
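+    // Note: as with the constant-index case above, this alloca lives on the
+    // DDR stack rather than in VTCM, so it is a stopgap until VTCM
+    // allocation is modeled by the compiler.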
+ AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); + Value *CastedSrc = Builder.CreateBitOrPointerCast( + IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n"); + Value *CastIndex = + getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); + LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n"); + + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, + {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + CastIndex}, + nullptr); + Value *LoadedResult = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); + LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n"); + In.replaceAllUsesWith(LoadedResult); + } else if (Qual == HvxIdioms::HEX_Gather) { + // Gather feeds to another gather but already replaced with + // hexagon_V6_vgathermh_128B + if (isa(IndexLoad)) { + auto *cstDataVector = dyn_cast(Indexes); + if (cstDataVector) { + // Our indexes are represented as a constant. We need it in a reg. + AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT); + + auto *StoreIndexes = Builder.CreateStore(cstDataVector, IndexesAlloca); + LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n"); + Value *LoadedIndex = Builder.CreateLoad( + IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index"); + AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); + LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca + << "\n AddressSpace: " + << ResultAlloca->getAddressSpace() << "\n";); + + Value *CastedSrc = Builder.CreateBitOrPointerCast( + IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n"); + + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, + {ResultAlloca, CastedSrc, + HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex}, + nullptr); + Value *LoadedResult = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); + LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n"); + LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n"); + In.replaceAllUsesWith(LoadedResult); + } + } + } else if (Qual == HvxIdioms::LLVM_Gather) { + // Gather feeds into another gather + errs() << " Underimplemented vgather to vgather sequence\n"; + return nullptr; + } else + llvm_unreachable("Unhandled Qual enum"); + + return Gather; +} + auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In, const FxpOp &Op) const -> Value * { assert(Op.X.Val->getType() == Op.Y.Val->getType()); @@ -2138,6 +2994,26 @@ auto HvxIdioms::run() -> bool { It = StartOver ? B.rbegin() : cast(New)->getReverseIterator(); Changed = true; + } else if (matchGather(*It)) { + Value *New = processVGather(*It); + if (!New) + continue; + LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n"); + // We replace original intrinsic with a new pseudo call. + It->eraseFromParent(); + It = cast(New)->getReverseIterator(); + RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI); + Changed = true; + } else if (matchScatter(*It)) { + Value *New = processVScatter(*It); + if (!New) + continue; + LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n"); + // We replace original intrinsic with a new pseudo call. 
+ It->eraseFromParent(); + It = cast(New)->getReverseIterator(); + RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI); + Changed = true; } } } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll new file mode 100644 index 0000000000000..4385da3373de7 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll @@ -0,0 +1,63 @@ +; Make sure we do not assert for the cases we do not handle. +; RUN: llc -march=hexagon -mattr=+hvx,+hvx-length128b,+hvxv75,+v75,-long-calls < %s | FileCheck %s + +; Mainly make sure we do not core dump. +; CHECK-NOT: scatter + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: write, inaccessiblemem: readwrite) +define dso_local void @foo(ptr noundef writeonly captures(none) %cptr, i32 noundef %T, i32 noundef %W) local_unnamed_addr #0 { +entry: + %invariant.gep11 = getelementptr i8, ptr %cptr, i32 0 + %invariant.gep13 = getelementptr i8, ptr %invariant.gep11, i32 0 + %cmp.not15 = icmp ugt i32 8, %T + br i1 %cmp.not15, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: ; preds = %entry + %cmp3.not8 = icmp ugt i32 8, %W + %conv.ripple.LS.instance = trunc i32 %W to i8 + %conv.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <64 x i8> poison, i8 %conv.ripple.LS.instance, i64 0 + %conv.ripple.LS.instance.ripple.bcast.splat = shufflevector <64 x i8> %conv.ripple.LS.instance.ripple.bcast.splatinsert, <64 x i8> poison, <64 x i32> zeroinitializer + br label %for.cond1.preheader + +for.cond.loopexit: ; preds = %for.body5, %for.cond1.preheader + %add = add i32 %add17, 8 + %cmp.not = icmp ugt i32 %add, %T + br i1 %cmp.not, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.loopexit + %add17 = phi i32 [ 8, %for.cond1.preheader.lr.ph ], [ %add, %for.cond.loopexit ] + %t.016 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add17, %for.cond.loopexit ] + br i1 %cmp3.not8, label %for.cond.loopexit, label %for.body5.lr.ph + +for.body5.lr.ph: ; preds = %for.cond1.preheader + %gep14 = getelementptr i8, ptr %invariant.gep13, i32 %t.016 + br label %for.body5 + +for.cond.cleanup: ; preds = %for.cond.loopexit, %entry + ret void + +for.body5: ; preds = %for.body5.lr.ph, %for.body5 + %add210 = phi i32 [ 8, %for.body5.lr.ph ], [ %add2, %for.body5 ] + %w.09 = phi i32 [ 0, %for.body5.lr.ph ], [ %add210, %for.body5 ] + %gep = getelementptr i8, ptr %gep14, i32 %w.09 + %gep.ripple.LS.instance = getelementptr i8, ptr %gep, <64 x i32> + call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %conv.ripple.LS.instance.ripple.bcast.splat, <64 x ptr> %gep.ripple.LS.instance, i32 1, <64 x i1> splat (i1 true)) + %add2 = add i32 %add210, 8 + %cmp3.not = icmp ugt i32 %add2, %W + br i1 %cmp3.not, label %for.cond.loopexit, label %for.body5 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.ripple.block.setsize.i32(i32 immarg %0, i32 immarg %1, i32 %2) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare i32 @llvm.ripple.block.index.i32(i32 immarg %0, i32 immarg %1) #2 + +; Function Attrs: mustprogress 
nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.getsize.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #3
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
new file mode 100644
index 0000000000000..83fd63ebc037c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_gather_32:
+; CHECK: vtmp.w = vgather
+; CHECK-LABEL: Ripple_gather_16:
+; CHECK: vtmp.h = vgather
+; CHECK-LABEL: Ripple_gather_8:
+; CHECK: vand
+; CHECK: vpacke
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %source.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %source, i64 0
+  %source.ripple.bcast.splat = shufflevector <32 x ptr> %source.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+  %0 = load <32 x i32>, ptr %indexes, align 4
+  %arrayidx2.ripple.vectorized = getelementptr inbounds i32, <32 x ptr> %source.ripple.bcast.splat, <32 x i32> %0
+  %1 = tail call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> %arrayidx2.ripple.vectorized, i32 4, <32 x i1> splat (i1 true), <32 x i32> poison)
+  store <32 x i32> %1, ptr %destination, align 4
+  ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %source.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %source, i64 0
+  %source.ripple.bcast.splat = shufflevector <64 x ptr> %source.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+  %0 = load <64 x i16>, ptr %indexes, align 2
+  %idxprom.ripple.vectorized = zext <64 x i16> %0 to <64 x i32>
+  %arrayidx2.ripple.vectorized = getelementptr inbounds i16, <64 x ptr> %source.ripple.bcast.splat, <64 x i32> %idxprom.ripple.vectorized
+  %1 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.vectorized, i32 2, <64 x i1> splat (i1 true), <64 x i16> poison)
+  store <64 x i16> %1, ptr %destination, align 2
+  ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %source.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %source, i64 0
+  %source.ripple.bcast.splat = shufflevector <128 x ptr> %source.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+  %0 = load <128 x i8>, ptr %indexes, align 1
+  %idxprom.ripple.vectorized = zext <128 x i8> %0 to <128 x i32>
+  %arrayidx2.ripple.vectorized = getelementptr inbounds i8, <128 x ptr> %source.ripple.bcast.splat, <128 x i32> %idxprom.ripple.vectorized
+  %1 = tail call <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %arrayidx2.ripple.vectorized, i32 1, <128 x i1> splat (i1 true), <128 x i8> poison)
+  store <128 x i8> %1, ptr %destination, align 1
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i32>) #1
+declare <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i16>) #1
+declare <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %0, i32 immarg %1, <128 x i1> %2, <128 x i8> %3) #1
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
new file mode 100644
index 0000000000000..1bd79d7a46556
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
@@ -0,0 +1,54 @@
+; Verify that we generate HVX vgather for the given input.
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+; CHECK-LABEL: SpVV_Ripple:
+; CHECK: vtmp.h = vgather(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local i32 @SpVV_Ripple(ptr nocapture noundef writeonly %scratchpad, ptr nocapture noundef readonly %Source_1, ptr nocapture noundef readonly %S_index, i32 noundef %nS, ptr nocapture noundef readonly %Source_2) local_unnamed_addr #1 {
+entry:
+  %Source_2.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %Source_2, i64 0
+  %Source_2.ripple.bcast.splat = shufflevector <64 x ptr> %Source_2.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+  %div16 = lshr i32 %nS, 6
+  %cmp6.not = icmp ult i32 %nS, 64
+  br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %lsr.iv17 = phi ptr [ %scevgep18, %for.body ], [ %S_index, %entry ]
+  %lsr.iv = phi ptr [ %scevgep, %for.body ], [ %Source_1, %entry ]
+  %result.08.ripple.vectorized = phi <64 x i32> [ %add8.ripple.vectorized, %for.body ], [ zeroinitializer, %entry ]
+  %_ripple_block_0.07 = phi i32 [ %add9, %for.body ], [ 0, %entry ]
+  %.ripple.LS.instance = load <64 x i16>, ptr %lsr.iv17, align 2
+  %idxprom.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance to <64 x i32>
+  %arrayidx2.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %Source_2.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+  %.ripple.LS.instance13 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.LS.instance, i32 2, <64 x i1> splat (i1 true), <64 x i16> poison)
+  store <64 x i16> %.ripple.LS.instance13, ptr %scratchpad, align 2
+  %.ripple.LS.instance15 = load <64 x i16>, ptr %lsr.iv, align 2
+  %conv.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance15 to <64 x i32>
+  %conv6.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance13 to <64 x i32>
+  %mul7.ripple.LS.instance = mul nsw <64 x i32> %conv.ripple.LS.instance, %conv6.ripple.LS.instance
+  %add8.ripple.vectorized = add <64 x i32> %mul7.ripple.LS.instance, %result.08.ripple.vectorized
+  %add9 = add nuw nsw i32 %_ripple_block_0.07, 1
+  %scevgep = getelementptr i8, ptr %lsr.iv, i32 128
+  %scevgep18 = getelementptr i8, ptr %lsr.iv17, i32 128
+  %cmp = icmp ult i32 %add9, %div16
+  br i1 %cmp, label %for.body, label %for.end
+for.end:                                          ; preds = %for.body, %entry
+  %result.0.lcssa.ripple.LS.instance = phi <64 x i32> [ zeroinitializer, %entry ], [ %add8.ripple.vectorized, %for.body ]
+  %rdx.shuf = shufflevector <64 x i32> %result.0.lcssa.ripple.LS.instance, <64 x i32> poison, <64 x i32>
+  %bin.rdx = add <64 x i32> %result.0.lcssa.ripple.LS.instance, %rdx.shuf
+  %rdx.shuf19 = shufflevector <64 x i32> %bin.rdx, <64 x i32> poison, <64 x i32>
+  %bin.rdx20 = add <64 x i32> %bin.rdx, %rdx.shuf19
+  %rdx.shuf21 = shufflevector <64 x i32> %bin.rdx20, <64 x i32> poison, <64 x i32>
+  %bin.rdx22 = add <64 x i32> %bin.rdx20, %rdx.shuf21
+  %rdx.shuf23 = shufflevector <64 x i32> %bin.rdx22, <64 x i32> poison, <64 x i32>
+  %bin.rdx24 = add <64 x i32> %bin.rdx22, %rdx.shuf23
+  %rdx.shuf25 = shufflevector <64 x i32> %bin.rdx24, <64 x i32> poison, <64 x i32>
+  %bin.rdx26 = add <64 x i32> %bin.rdx24, %rdx.shuf25
+  %rdx.shuf27 = shufflevector <64 x i32> %bin.rdx26, <64 x i32> poison, <64 x i32>
+  %bin.rdx28 = add <64 x i32> %bin.rdx26, %rdx.shuf27
+  %0 = extractelement <64 x i32> %bin.rdx28, i32 0
+  ret i32 %0
+}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
new file mode 100644
index 0000000000000..85d299948b729
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon -mattr=+hvx-length128b,+hvxv73,+v73,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_scatter_8:
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK-LABEL: Ripple_scatter_16:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h = v{{[0-9]+}}
+; CHECK-LABEL: Ripple_scatter_32:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.w).w = v{{[0-9]+}}
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local void @Ripple_scatter_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %destination.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %destination, i64 0
+  %destination.ripple.bcast.splat = shufflevector <128 x ptr> %destination.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+  %.ripple.LS.instance11 = load <128 x i8>, ptr %source, align 1
+  %.ripple.LS.instance = load <128 x i8>, ptr %indexes, align 1
+  %idxprom.ripple.LS.instance = zext <128 x i8> %.ripple.LS.instance to <128 x i32>
+  %arrayidx3.ripple.LS.instance = getelementptr inbounds i8, <128 x ptr> %destination.ripple.bcast.splat, <128 x i32> %idxprom.ripple.LS.instance
+  %cst_ptr_to_i32 = ptrtoint ptr %destination to i32
+  tail call void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %.ripple.LS.instance11, <128 x ptr> %arrayidx3.ripple.LS.instance, i32 1, <128 x i1> splat (i1 true))
+  ret void
+}
+
+define dso_local void @Ripple_scatter_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %destination.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %destination, i64 0
+  %destination.ripple.bcast.splat = shufflevector <64 x ptr> %destination.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+  %.ripple.LS.instance11 = load <64 x i16>, ptr %source, align 2
+  %.ripple.LS.instance = load <64 x i16>, ptr %indexes, align 2
+  %idxprom.ripple.LS.instance = zext <64 x i16> %.ripple.LS.instance to <64 x i32>
+  %arrayidx3.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %destination.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+  tail call void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %.ripple.LS.instance11, <64 x ptr> %arrayidx3.ripple.LS.instance, i32 2, <64 x i1> splat (i1 true))
+  ret void
+}
+
+define dso_local void @Ripple_scatter_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %destination.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %destination, i64 0
+  %destination.ripple.bcast.splat = shufflevector <32 x ptr> %destination.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+  %.ripple.LS.instance11 = load <32 x i32>, ptr %source, align 4
+  %.ripple.LS.instance = load <32 x i32>, ptr %indexes, align 4
+  %arrayidx3.ripple.LS.instance = getelementptr inbounds i32, <32 x ptr> %destination.ripple.bcast.splat, <32 x i32> %.ripple.LS.instance
+  tail call void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %.ripple.LS.instance11, <32 x ptr> %arrayidx3.ripple.LS.instance, i32 4, <32 x i1> splat (i1 true))
+  ret void
+}
+
+declare void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %0, <128 x ptr> %1, i32 immarg %2, <128 x i1> %3) #2
+declare void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #2
+declare void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %0, <32 x ptr> %1, i32 immarg %2, <32 x i1> %3) #2
diff --git a/llvm/test/CodeGen/Hexagon/masked_gather.ll b/llvm/test/CodeGen/Hexagon/masked_gather.ll
new file mode 100644
index 0000000000000..461fd7921d796
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/masked_gather.ll
@@ -0,0 +1,58 @@
+; This produced a masked gather that we are not yet handling
+; REQUIRES: asserts
+; RUN: opt -march=hexagon -passes=loop-vectorize -hexagon-autohvx -mattr=+hvx-length128b,+hvxv68,+v68,+hvx-ieee-fp,-long-calls,-packets -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Original C++
+; clang -c -Os -mhvx -mhvx-ieee-fp -fvectorize -mno-packets -fno-strict-aliasing -Os -mhvx -mhvx-ieee-fp -mno-packets -mv68
+;typedef struct poptContext_s * poptContext;
+;typedef struct { unsigned int bits[1]; } pbm_set;
+;struct poptContext_s { pbm_set * arg_strip; };
+;
+;int poptStrippedArgv(poptContext con, int argc, char ** argv) {
+;  int numargs = argc;
+;  for (int i = 1; i < argc; i++) {
+;    if (((((con->arg_strip)->bits)[((i) / (8 * sizeof (unsigned int)))] & ((unsigned int) 1 << ((i) % (8 * sizeof (unsigned int))))) != 0))
+;      numargs--;
+;  }
+;  return numargs;
+;}
+
+; CHECK-NOT: masked_gather
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown-unknown-elf"
+
+; Function Attrs: nofree norecurse nosync nounwind optsize memory(read, inaccessiblemem: none)
+define dso_local i32 @poptStrippedArgv(ptr noundef readonly captures(none) %con, i32 noundef %argc, ptr noundef readnone captures(none) %argv) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %argc, 1
+  br i1 %cmp8, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = load ptr, ptr %con, align 4
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %numargs.0.lcssa = phi i32 [ %argc, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %numargs.0.lcssa
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.010 = phi i32 [ 1, %for.body.lr.ph ], [ %inc, %for.body ]
+  %numargs.09 = phi i32 [ %argc, %for.body.lr.ph ], [ %spec.select, %for.body ]
+  %div7 = lshr i32 %i.010, 5
+  %arrayidx = getelementptr inbounds nuw [1 x i32], ptr %0, i32 0, i32 %div7
+  %1 = load i32, ptr %arrayidx, align 4
+  %rem = and i32 %i.010, 31
+  %shl = shl nuw i32 1, %rem
+  %and = and i32 %1, %shl
+  %cmp1.not = icmp ne i32 %and, 0
+  %dec = sext i1 %cmp1.not to i32
+  %spec.select = add nsw i32 %numargs.09, %dec
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond.not = icmp eq i32 %inc, %argc
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/CodeGen/Hexagon/vector-gather.ll b/llvm/test/CodeGen/Hexagon/vector-gather.ll
new file mode 100644
index 0000000000000..5700380508a57
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vector-gather.ll
@@ -0,0 +1,27 @@
+; REQUIRES: hexagon-registered-target
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s
+
+target triple = "hexagon"
+
+@VTCM_SCATTER16_ADDRESS = dso_local global i32 0, align 4
+@region_len = dso_local global i32 16383, align 4
+
+; CHECK: [[ADR:r[0-9]+]] = memw(gp+#VTCM_SCATTER16_ADDRESS)
+; CHECK: vtmp.h = vgather([[ADR]],m0,v0.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+define dso_local void @vector_gather_16(ptr noundef %vgather, <32 x i32> noundef %offsets) #0 {
+entry:
+  %vgather.addr = alloca ptr, align 4
+  %offsets.addr = alloca <32 x i32>, align 128
+  store ptr %vgather, ptr %vgather.addr, align 4
+  store <32 x i32> %offsets, ptr %offsets.addr, align 128
+  %0 = load ptr, ptr %vgather.addr, align 4
+  %1 = load i32, ptr @VTCM_SCATTER16_ADDRESS, align 4
+  %2 = load i32, ptr @region_len, align 4
+  %3 = load <32 x i32>, ptr %offsets.addr, align 128
+  call void @llvm.hexagon.V6.vgathermh.128B(ptr %0, i32 %1, i32 %2, <32 x i32> %3)
+  ret void
+}
+
+declare <128 x i1> @llvm.hexagon.V6.vandvrt.128B(<32 x i32>, i32)