diff --git a/llvm/include/llvm/IR/IntrinsicsHexagon.td b/llvm/include/llvm/IR/IntrinsicsHexagon.td index 52c29ef31f0ad1..9c7c43144db5af 100644 --- a/llvm/include/llvm/IR/IntrinsicsHexagon.td +++ b/llvm/include/llvm/IR/IntrinsicsHexagon.td @@ -381,6 +381,32 @@ Hexagon_NonGCC_Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; def int_hexagon_V6_pred_typecast_128B : Hexagon_NonGCC_Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; +// HVX full-precision multiplication. +// V6_vmpyss_parts(Vu,Vv) = (MulHS(Vu,Vv), Mul(Vu,Vv)) +// V6_vmpyuu_parts(Vu,Vv) = (MulHU(Vu,Vv), Mul(Vu,Vv)) +// V6_vmpyus_parts(Vu,Vv) = (MulHUS(Vu,Vv), Mul(Vu,Vv)) +// +// Both the (purportedly) 64b and the _128B versions are exactly equivalent +// regardless of the HVX mode; they are both defined for consistency. +// The purpose of these intrinsics is to have a uniform way of multiplying two +// integer vectors in the LLVM IR. Many HVX multiply operations interleave +// the even-odd results, except for 32x32 multiplications. Also, different +// HVX versions have different instructions that can be used, so defer the +// instruction choice to the isel. +class Hexagon_vv_vv_pure: + Hexagon_NonGCC_Intrinsic< + [llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + +def int_hexagon_V6_vmpyss_parts: Hexagon_vv_vv_pure; +def int_hexagon_V6_vmpyss_parts_128B: Hexagon_vv_vv_pure; +def int_hexagon_V6_vmpyuu_parts: Hexagon_vv_vv_pure; +def int_hexagon_V6_vmpyuu_parts_128B: Hexagon_vv_vv_pure; +def int_hexagon_V6_vmpyus_parts: Hexagon_vv_vv_pure; +def int_hexagon_V6_vmpyus_parts_128B: Hexagon_vv_vv_pure; + + // Masked vector stores // // These are all deprecated, the intrinsics matching instruction names diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 8723ac678bdc8e..9bd377ab0a5beb 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1904,6 +1904,9 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::MFSHR: return "HexagonISD::MFSHR"; case HexagonISD::SSAT: return "HexagonISD::SSAT"; case HexagonISD::USAT: return "HexagonISD::USAT"; + case HexagonISD::SMUL_LOHI: return "HexagonISD::SMUL_LOHI"; + case HexagonISD::UMUL_LOHI: return "HexagonISD::UMUL_LOHI"; + case HexagonISD::USMUL_LOHI: return "HexagonISD::USMUL_LOHI"; case HexagonISD::VEXTRACTW: return "HexagonISD::VEXTRACTW"; case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0"; case HexagonISD::VROR: return "HexagonISD::VROR"; @@ -2189,6 +2192,16 @@ HexagonTargetLowering::getPreferredVectorAction(MVT VT) const { return TargetLoweringBase::TypeSplitVector; } +TargetLoweringBase::LegalizeAction +HexagonTargetLowering::getCustomOperationAction(SDNode &Op) const { + if (Subtarget.useHVXOps()) { + unsigned Action = getCustomHvxOperationAction(Op); + if (Action != ~0u) + return static_cast<TargetLoweringBase::LegalizeAction>(Action); + } + return TargetLoweringBase::Legal; +} + std::pair<SDValue, int> HexagonTargetLowering::getBaseAndOffset(SDValue Addr) const { if (Addr.getOpcode() == ISD::ADD) { diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 2c056992d5cadb..ac877364f9210e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -61,6 +61,12 @@ enum NodeType : unsigned { SSAT, // Signed saturate. USAT, // Unsigned saturate.
+ SMUL_LOHI, // Same as ISD::SMUL_LOHI, but opaque to the combiner. + UMUL_LOHI, // Same as ISD::UMUL_LOHI, but opaque to the combiner. + // We want to legalize MULH[SU] to [SU]MUL_LOHI, but the + // combiner will keep rewriting it back to MULH[SU]. + USMUL_LOHI, // Like SMUL_LOHI, but unsigned*signed. + TSTBIT, INSERT, EXTRACTU, @@ -164,8 +170,8 @@ class HexagonTargetLowering : public TargetLowering { unsigned Index) const override; bool isShuffleMaskLegal(ArrayRef Mask, EVT VT) const override; - TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) - const override; + LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + LegalizeAction getCustomOperationAction(SDNode &Op) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; void LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, @@ -355,6 +361,7 @@ class HexagonTargetLowering : public TargetLowering { private: void initializeHVXLowering(); unsigned getPreferredHvxVectorAction(MVT VecTy) const; + unsigned getCustomHvxOperationAction(SDNode &Op) const; bool validateConstPtrAlignment(SDValue Ptr, Align NeedAlign, const SDLoc &dl, SelectionDAG &DAG) const; @@ -485,6 +492,12 @@ class HexagonTargetLowering : public TargetLowering { bool Signed, SelectionDAG &DAG) const; VectorPair emitHvxShiftRightRnd(SDValue Val, unsigned Amt, bool Signed, SelectionDAG &DAG) const; + SDValue emitHvxMulHsV60(SDValue A, SDValue B, const SDLoc &dl, + SelectionDAG &DAG) const; + SDValue emitHvxMulLoHiV60(SDValue A, bool SignedA, SDValue B, bool SignedB, + const SDLoc &dl, SelectionDAG &DAG) const; + SDValue emitHvxMulLoHiV62(SDValue A, bool SignedA, SDValue B, bool SignedB, + const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) const; @@ -499,6 +512,7 @@ class HexagonTargetLowering : public TargetLowering { SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxMulLoHi(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSelect(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 7364e1822285f0..e15508e6038df6 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -90,6 +90,7 @@ HexagonTargetLowering::initializeHVXLowering() { ArrayRef LegalV = Use64b ? LegalV64 : LegalV128; ArrayRef LegalW = Use64b ? LegalW64 : LegalW128; MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8; + MVT WordV = Use64b ? MVT::v16i32 : MVT::v32i32; MVT ByteW = Use64b ? 
MVT::v128i8 : MVT::v256i8; auto setPromoteTo = [this] (unsigned Opc, MVT FromTy, MVT ToTy) { @@ -213,8 +214,11 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::LOAD, T, Custom); setOperationAction(ISD::MLOAD, T, Custom); setOperationAction(ISD::MSTORE, T, Custom); - setOperationAction(ISD::MULHS, T, Custom); - setOperationAction(ISD::MULHU, T, Custom); + if (T.getScalarType() != MVT::i32) { + setOperationAction(ISD::MULHS, T, Legal); + setOperationAction(ISD::MULHU, T, Legal); + } + setOperationAction(ISD::BUILD_VECTOR, T, Custom); // Make concat-vectors custom to handle concats of more than 2 vectors. setOperationAction(ISD::CONCAT_VECTORS, T, Custom); @@ -320,6 +324,12 @@ HexagonTargetLowering::initializeHVXLowering() { } } + // Legalize all of these to HexagonISD::[SU]MUL_LOHI. + setOperationAction(ISD::MULHS, WordV, Custom); // -> _LOHI + setOperationAction(ISD::MULHU, WordV, Custom); // -> _LOHI + setOperationAction(ISD::SMUL_LOHI, WordV, Custom); + setOperationAction(ISD::UMUL_LOHI, WordV, Custom); + setCondCodeAction(ISD::SETNE, MVT::v64f16, Expand); setCondCodeAction(ISD::SETLE, MVT::v64f16, Expand); setCondCodeAction(ISD::SETGE, MVT::v64f16, Expand); @@ -465,6 +475,18 @@ HexagonTargetLowering::getPreferredHvxVectorAction(MVT VecTy) const { return ~0u; } +unsigned +HexagonTargetLowering::getCustomHvxOperationAction(SDNode &Op) const { + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case HexagonISD::SMUL_LOHI: + case HexagonISD::UMUL_LOHI: + case HexagonISD::USMUL_LOHI: + return TargetLoweringBase::Custom; + } + return TargetLoweringBase::Legal; +} + SDValue HexagonTargetLowering::getInt(unsigned IntId, MVT ResTy, ArrayRef Ops, const SDLoc &dl, SelectionDAG &DAG) const { @@ -1882,144 +1904,62 @@ HexagonTargetLowering::LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const { - MVT ResTy = ty(Op); - assert(ResTy.isVector()); const SDLoc &dl(Op); - SmallVector ShuffMask; + MVT ResTy = ty(Op); + assert(ResTy.getVectorElementType() == MVT::i32); - MVT ElemTy = ResTy.getVectorElementType(); - unsigned VecLen = ResTy.getVectorNumElements(); SDValue Vs = Op.getOperand(0); SDValue Vt = Op.getOperand(1); - bool IsSigned = Op.getOpcode() == ISD::MULHS; - - if (ElemTy == MVT::i8 || ElemTy == MVT::i16) { - // For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...), - // V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo, - // where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...). - // For i16, use V6_vmpyhv, which behaves in an analogous way to - // V6_vmpybv: results Lo and Hi are products of even/odd elements - // respectively. - MVT ExtTy = typeExtElem(ResTy, 2); - unsigned MpyOpc = ElemTy == MVT::i8 - ? (IsSigned ? Hexagon::V6_vmpybv : Hexagon::V6_vmpyubv) - : (IsSigned ? Hexagon::V6_vmpyhv : Hexagon::V6_vmpyuhv); - SDValue M = getInstr(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG); - - // Discard low halves of the resulting values, collect the high halves. - for (unsigned I = 0; I < VecLen; I += 2) { - ShuffMask.push_back(I+1); // Pick even element. - ShuffMask.push_back(I+VecLen+1); // Pick odd element. 
- } - VectorPair P = opSplit(opCastElem(M, ElemTy, DAG), dl, DAG); - SDValue BS = getByteShuffle(dl, P.first, P.second, ShuffMask, DAG); - return DAG.getBitcast(ResTy, BS); - } - MVT PairTy = typeJoin({ResTy, ResTy}); + SDVTList ResTys = DAG.getVTList(ResTy, ResTy); + unsigned Opc = Op.getOpcode(); - assert(ElemTy == MVT::i32); - SDValue S16 = DAG.getConstant(16, dl, MVT::i32); + // On HVX v62+ producing the full product is cheap, so legalize MULH to LOHI. + if (Opc == ISD::MULHU) + return DAG.getNode(HexagonISD::UMUL_LOHI, dl, ResTys, {Vs, Vt}).getValue(1); + if (Opc == ISD::MULHS) + return DAG.getNode(HexagonISD::SMUL_LOHI, dl, ResTys, {Vs, Vt}).getValue(1); - auto LoVec = [&DAG,ResTy,dl] (SDValue Pair) { - return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, ResTy, Pair); - }; - auto HiVec = [&DAG,ResTy,dl] (SDValue Pair) { - return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, ResTy, Pair); - }; - - auto MulHS_V60 = [&](SDValue Vs, SDValue Vt) { - // mulhs(Vs,Vt) = - // = [(Hi(Vs)*2^16 + Lo(Vs)) *s (Hi(Vt)*2^16 + Lo(Vt))] >> 32 - // = [Hi(Vs)*2^16 *s Hi(Vt)*2^16 + Hi(Vs) *su Lo(Vt)*2^16 - // + Lo(Vs) *us (Hi(Vt)*2^16 + Lo(Vt))] >> 32 - // = [Hi(Vs) *s Hi(Vt)*2^32 + Hi(Vs) *su Lo(Vt)*2^16 - // + Lo(Vs) *us Vt] >> 32 - // The low half of Lo(Vs)*Lo(Vt) will be discarded (it's not added to - // anything, so it cannot produce any carry over to higher bits), - // so everything in [] can be shifted by 16 without loss of precision. - // = [Hi(Vs) *s Hi(Vt)*2^16 + Hi(Vs)*su Lo(Vt) + Lo(Vs)*Vt >> 16] >> 16 - // = [Hi(Vs) *s Hi(Vt)*2^16 + Hi(Vs)*su Lo(Vt) + V6_vmpyewuh(Vs,Vt)] >> 16 - // The final additions need to make sure to properly maintain any - // carry-out bits. - // - // Hi(Vt) Lo(Vt) - // Hi(Vs) Lo(Vs) - // -------------- - // Lo(Vt)*Lo(Vs) | T0 = V6_vmpyewuh(Vt,Vs) does this, - // Hi(Vt)*Lo(Vs) | + dropping the low 16 bits - // Hi(Vs)*Lo(Vt) | T2 - // Hi(Vt)*Hi(Vs) - - SDValue T0 = getInstr(Hexagon::V6_vmpyewuh, dl, ResTy, {Vt, Vs}, DAG); - // T1 = get Hi(Vs) into low halves. - SDValue T1 = getInstr(Hexagon::V6_vasrw, dl, ResTy, {Vs, S16}, DAG); - // P0 = interleaved T1.h*Vt.uh (full precision product) - SDValue P0 = getInstr(Hexagon::V6_vmpyhus, dl, PairTy, {T1, Vt}, DAG); - // T2 = T1.even(h) * Vt.even(uh), i.e. Hi(Vs)*Lo(Vt) - SDValue T2 = LoVec(P0); - // We need to add T0+T2, recording the carry-out, which will be 1<<16 - // added to the final sum. - // P1 = interleaved even/odd 32-bit (unsigned) sums of 16-bit halves - SDValue P1 = getInstr(Hexagon::V6_vadduhw, dl, PairTy, {T0, T2}, DAG); - // P2 = interleaved even/odd 32-bit (signed) sums of 16-bit halves - SDValue P2 = getInstr(Hexagon::V6_vaddhw, dl, PairTy, {T0, T2}, DAG); - // T3 = full-precision(T0+T2) >> 16 - // The low halves are added-unsigned, the high ones are added-signed. - SDValue T3 = getInstr(Hexagon::V6_vasrw_acc, dl, ResTy, - {HiVec(P2), LoVec(P1), S16}, DAG); - SDValue T4 = getInstr(Hexagon::V6_vasrw, dl, ResTy, {Vt, S16}, DAG); - // P3 = interleaved Hi(Vt)*Hi(Vs) (full precision), - // which is now Lo(T1)*Lo(T4), so we want to keep the even product. 
- SDValue P3 = getInstr(Hexagon::V6_vmpyhv, dl, PairTy, {T1, T4}, DAG); - SDValue T5 = LoVec(P3); - // Add: - SDValue T6 = DAG.getNode(ISD::ADD, dl, ResTy, {T3, T5}); - return T6; - }; +#ifndef NDEBUG + Op.dump(&DAG); +#endif + llvm_unreachable("Unexpected mulh operation"); +} - auto MulHS_V62 = [&](SDValue Vs, SDValue Vt) { - MVT PairTy = typeJoin({ResTy, ResTy}); - SDValue T0 = getInstr(Hexagon::V6_vmpyewuh_64, dl, PairTy, {Vs, Vt}, DAG); - SDValue T1 = getInstr(Hexagon::V6_vmpyowh_64_acc, dl, PairTy, - {T0, Vs, Vt}, DAG); - return opSplit(T1, dl, DAG).second; - }; +SDValue +HexagonTargetLowering::LowerHvxMulLoHi(SDValue Op, SelectionDAG &DAG) const { + const SDLoc &dl(Op); + unsigned Opc = Op.getOpcode(); + SDValue Vu = Op.getOperand(0); + SDValue Vv = Op.getOperand(1); + + // If the HI part is not used, convert it to a regular MUL. + if (auto HiVal = Op.getValue(1); HiVal.use_empty()) { + // Need to preserve the types and the number of values. + SDValue Hi = DAG.getUNDEF(ty(HiVal)); + SDValue Lo = DAG.getNode(ISD::MUL, dl, ty(Op), {Vu, Vv}); + return DAG.getMergeValues({Lo, Hi}, dl); + } + + bool SignedVu = Opc == HexagonISD::SMUL_LOHI; + bool SignedVv = Opc == HexagonISD::SMUL_LOHI || Opc == HexagonISD::USMUL_LOHI; + + // Legal on HVX v62+, but lower it here because patterns can't handle multi- + // valued nodes. + if (Subtarget.useHVXV62Ops()) + return emitHvxMulLoHiV62(Vu, SignedVu, Vv, SignedVv, dl, DAG); + + if (Opc == HexagonISD::SMUL_LOHI) { + // Direct MULHS expansion is cheaper than doing the whole SMUL_LOHI, + // for other signedness LOHI is cheaper. + if (auto LoVal = Op.getValue(0); LoVal.use_empty()) { + SDValue Hi = emitHvxMulHsV60(Vu, Vv, dl, DAG); + SDValue Lo = DAG.getUNDEF(ty(LoVal)); + return DAG.getMergeValues({Lo, Hi}, dl); + } + } - if (IsSigned) { - if (Subtarget.useHVXV62Ops()) - return MulHS_V62(Vs, Vt); - return MulHS_V60(Vs, Vt); - } - - // Unsigned mulhw. - - SDValue P = getInstr(Hexagon::V6_lvsplatw, dl, ResTy, - {DAG.getConstant(0x02020202, dl, MVT::i32)}, DAG); - // Multiply-unsigned halfwords: - // LoVec = Vs.uh[2i] * Vt.uh[2i], - // HiVec = Vs.uh[2i+1] * Vt.uh[2i+1] - SDValue T0 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {Vs, Vt}, DAG); - // The low halves in the LoVec of the pair can be discarded. They are - // not added to anything (in the full-precision product), so they cannot - // produce a carry into the higher bits. - SDValue T1 = getInstr(Hexagon::V6_vlsrw, dl, ResTy, {LoVec(T0), S16}, DAG); - // Swap low and high halves in Vt, and do the halfword multiplication - // to get products Vs.uh[2i] * Vt.uh[2i+1] and Vs.uh[2i+1] * Vt.uh[2i]. - SDValue D0 = getInstr(Hexagon::V6_vdelta, dl, ResTy, {Vt, P}, DAG); - SDValue T2 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {Vs, D0}, DAG); - // T2 has mixed products of halfwords: Lo(Vt)*Hi(Vs) and Hi(Vt)*Lo(Vs). - // These products are words, but cannot be added directly because the - // sums could overflow. Add these products, by halfwords, where each sum - // of a pair of halfwords gives a word. - SDValue T3 = getInstr(Hexagon::V6_vadduhw, dl, PairTy, - {LoVec(T2), HiVec(T2)}, DAG); - // Add the high halfwords from the products of the low halfwords. 
- SDValue T4 = DAG.getNode(ISD::ADD, dl, ResTy, {T1, LoVec(T3)}); - SDValue T5 = getInstr(Hexagon::V6_vlsrw, dl, ResTy, {T4, S16}, DAG); - SDValue T6 = DAG.getNode(ISD::ADD, dl, ResTy, {HiVec(T0), HiVec(T3)}); - SDValue T7 = DAG.getNode(ISD::ADD, dl, ResTy, {T5, T6}); - return T7; + return emitHvxMulLoHiV60(Vu, SignedVu, Vv, SignedVv, dl, DAG); } SDValue @@ -2196,21 +2136,38 @@ HexagonTargetLowering::LowerHvxFunnelShift(SDValue Op, SDValue HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); - MVT ResTy = ty(Op); - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); - bool Use64b = Subtarget.useHVX64BOps(); - unsigned IntPredCast = Use64b ? Intrinsic::hexagon_V6_pred_typecast - : Intrinsic::hexagon_V6_pred_typecast_128B; - if (IntNo == IntPredCast) { - SDValue Vs = Op.getOperand(1); - MVT OpTy = ty(Vs); - if (isHvxBoolTy(ResTy) && isHvxBoolTy(OpTy)) { - if (ResTy == OpTy) - return Vs; - return DAG.getNode(HexagonISD::TYPECAST, dl, ResTy, Vs); + SmallVector Ops(Op->ops().begin(), Op->ops().end()); + + auto Swap = [&](SDValue P) { + return DAG.getMergeValues({P.getValue(1), P.getValue(0)}, dl); + }; + + switch (IntNo) { + case Intrinsic::hexagon_V6_pred_typecast: + case Intrinsic::hexagon_V6_pred_typecast_128B: { + MVT ResTy = ty(Op), InpTy = ty(Ops[1]); + if (isHvxBoolTy(ResTy) && isHvxBoolTy(InpTy)) { + if (ResTy == InpTy) + return Ops[1]; + return DAG.getNode(HexagonISD::TYPECAST, dl, ResTy, Ops[1]); } + break; } + case Intrinsic::hexagon_V6_vmpyss_parts: + case Intrinsic::hexagon_V6_vmpyss_parts_128B: + return Swap(DAG.getNode(HexagonISD::SMUL_LOHI, dl, Op->getVTList(), + {Ops[1], Ops[2]})); + case Intrinsic::hexagon_V6_vmpyuu_parts: + case Intrinsic::hexagon_V6_vmpyuu_parts_128B: + return Swap(DAG.getNode(HexagonISD::UMUL_LOHI, dl, Op->getVTList(), + {Ops[1], Ops[2]})); + case Intrinsic::hexagon_V6_vmpyus_parts: + case Intrinsic::hexagon_V6_vmpyus_parts_128B: { + return Swap(DAG.getNode(HexagonISD::USMUL_LOHI, dl, Op->getVTList(), + {Ops[1], Ops[2]})); + } + } // switch return Op; } @@ -2525,6 +2482,212 @@ HexagonTargetLowering::emitHvxShiftRightRnd(SDValue Val, unsigned Amt, return {Mux, Ovf}; } +SDValue +HexagonTargetLowering::emitHvxMulHsV60(SDValue A, SDValue B, const SDLoc &dl, + SelectionDAG &DAG) const { + MVT VecTy = ty(A); + MVT PairTy = typeJoin({VecTy, VecTy}); + assert(VecTy.getVectorElementType() == MVT::i32); + + SDValue S16 = DAG.getConstant(16, dl, MVT::i32); + + auto LoVec = [&DAG, VecTy, dl](SDValue Pair) { + return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair); + }; + auto HiVec = [&DAG, VecTy, dl](SDValue Pair) { + return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair); + }; + + // mulhs(A,B) = + // = [(Hi(A)*2^16 + Lo(A)) *s (Hi(B)*2^16 + Lo(B))] >> 32 + // = [Hi(A)*2^16 *s Hi(B)*2^16 + Hi(A) *su Lo(B)*2^16 + // + Lo(A) *us (Hi(B)*2^16 + Lo(B))] >> 32 + // = [Hi(A) *s Hi(B)*2^32 + Hi(A) *su Lo(B)*2^16 + Lo(A) *us B] >> 32 + // The low half of Lo(A)*Lo(B) will be discarded (it's not added to + // anything, so it cannot produce any carry over to higher bits), + // so everything in [] can be shifted by 16 without loss of precision. + // = [Hi(A) *s Hi(B)*2^16 + Hi(A)*su Lo(B) + Lo(A)*B >> 16] >> 16 + // = [Hi(A) *s Hi(B)*2^16 + Hi(A)*su Lo(B) + V6_vmpyewuh(A,B)] >> 16 + // The final additions need to make sure to properly maintain any carry- + // out bits. 
+ // + // Hi(B) Lo(B) + // Hi(A) Lo(A) + // -------------- + // Lo(B)*Lo(A) | T0 = V6_vmpyewuh(B,A) does this, + // Hi(B)*Lo(A) | + dropping the low 16 bits + // Hi(A)*Lo(B) | T2 + // Hi(B)*Hi(A) + + SDValue T0 = getInstr(Hexagon::V6_vmpyewuh, dl, VecTy, {B, A}, DAG); + // T1 = get Hi(A) into low halves. + SDValue T1 = getInstr(Hexagon::V6_vasrw, dl, VecTy, {A, S16}, DAG); + // P0 = interleaved T1.h*B.uh (full precision product) + SDValue P0 = getInstr(Hexagon::V6_vmpyhus, dl, PairTy, {T1, B}, DAG); + // T2 = T1.even(h) * B.even(uh), i.e. Hi(A)*Lo(B) + SDValue T2 = LoVec(P0); + // We need to add T0+T2, recording the carry-out, which will be 1<<16 + // added to the final sum. + // P1 = interleaved even/odd 32-bit (unsigned) sums of 16-bit halves + SDValue P1 = getInstr(Hexagon::V6_vadduhw, dl, PairTy, {T0, T2}, DAG); + // P2 = interleaved even/odd 32-bit (signed) sums of 16-bit halves + SDValue P2 = getInstr(Hexagon::V6_vaddhw, dl, PairTy, {T0, T2}, DAG); + // T3 = full-precision(T0+T2) >> 16 + // The low halves are added-unsigned, the high ones are added-signed. + SDValue T3 = getInstr(Hexagon::V6_vasrw_acc, dl, VecTy, + {HiVec(P2), LoVec(P1), S16}, DAG); + SDValue T4 = getInstr(Hexagon::V6_vasrw, dl, VecTy, {B, S16}, DAG); + // P3 = interleaved Hi(B)*Hi(A) (full precision), + // which is now Lo(T1)*Lo(T4), so we want to keep the even product. + SDValue P3 = getInstr(Hexagon::V6_vmpyhv, dl, PairTy, {T1, T4}, DAG); + SDValue T5 = LoVec(P3); + // Add: + SDValue T6 = DAG.getNode(ISD::ADD, dl, VecTy, {T3, T5}); + return T6; +} + +SDValue +HexagonTargetLowering::emitHvxMulLoHiV60(SDValue A, bool SignedA, SDValue B, + bool SignedB, const SDLoc &dl, + SelectionDAG &DAG) const { + MVT VecTy = ty(A); + MVT PairTy = typeJoin({VecTy, VecTy}); + assert(VecTy.getVectorElementType() == MVT::i32); + + SDValue S16 = DAG.getConstant(16, dl, MVT::i32); + + auto LoVec = [&DAG, VecTy, dl](SDValue Pair) { + return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair); + }; + auto HiVec = [&DAG, VecTy, dl](SDValue Pair) { + return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair); + }; + + if (SignedA && !SignedB) { + // Make A:unsigned, B:signed. + std::swap(A, B); + std::swap(SignedA, SignedB); + } + + // Do halfword-wise multiplications for unsigned*unsigned product, then + // add corrections for signed and unsigned*signed. + + SDValue Lo, Hi; + + // P0:lo = (uu) products of low halves of A and B, + // P0:hi = (uu) products of high halves. + SDValue P0 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {A, B}, DAG); + + // Swap low/high halves in B + SDValue T0 = getInstr(Hexagon::V6_lvsplatw, dl, VecTy, + {DAG.getConstant(0x02020202, dl, MVT::i32)}, DAG); + SDValue T1 = getInstr(Hexagon::V6_vdelta, dl, VecTy, {B, T0}, DAG); + // P1 = products of even/odd halfwords. + // P1:lo = (uu) products of even(A.uh) * odd(B.uh) + // P1:hi = (uu) products of odd(A.uh) * even(B.uh) + SDValue P1 = getInstr(Hexagon::V6_vmpyuhv, dl, PairTy, {A, T1}, DAG); + + // P2:lo = low halves of P1:lo + P1:hi, + // P2:hi = high halves of P1:lo + P1:hi. + SDValue P2 = + getInstr(Hexagon::V6_vadduhw, dl, PairTy, {HiVec(P1), LoVec(P1)}, DAG); + // Still need to add the high halves of P0:lo to P2:lo + SDValue T2 = getInstr(Hexagon::V6_vlsrw, dl, VecTy, {LoVec(P0), S16}, DAG); + SDValue T3 = DAG.getNode(ISD::ADD, dl, VecTy, {LoVec(P2), T2}); + + // The high halves of T3 will contribute to the HI part of LOHI. 
+ SDValue T4 = + getInstr(Hexagon::V6_vasrw_acc, dl, VecTy, {HiVec(P2), T3, S16}, DAG); + + // The low halves of P2 need to be added to high halves of the LO part. + Lo = getInstr(Hexagon::V6_vaslw_acc, dl, VecTy, {LoVec(P0), LoVec(P2), S16}, + DAG); + Hi = DAG.getNode(ISD::ADD, dl, VecTy, {HiVec(P0), T4}); + + if (SignedA) { + assert(SignedB && "Signed A and unsigned B should have been inverted"); + + MVT PredTy = MVT::getVectorVT(MVT::i1, VecTy.getVectorNumElements()); + SDValue Zero = getZero(dl, VecTy, DAG); + SDValue Q0 = DAG.getSetCC(dl, PredTy, A, Zero, ISD::SETLT); + SDValue Q1 = DAG.getSetCC(dl, PredTy, B, Zero, ISD::SETLT); + SDValue X0 = DAG.getNode(ISD::VSELECT, dl, VecTy, {Q0, B, Zero}); + SDValue X1 = getInstr(Hexagon::V6_vaddwq, dl, VecTy, {Q1, X0, A}, DAG); + Hi = getInstr(Hexagon::V6_vsubw, dl, VecTy, {Hi, X1}, DAG); + } else if (SignedB) { + // Same correction as for mulhus: + // mulhus(A.uw,B.w) = mulhu(A.uw,B.uw) - (A.w if B < 0) + MVT PredTy = MVT::getVectorVT(MVT::i1, VecTy.getVectorNumElements()); + SDValue Zero = getZero(dl, VecTy, DAG); + SDValue Q1 = DAG.getSetCC(dl, PredTy, B, Zero, ISD::SETLT); + Hi = getInstr(Hexagon::V6_vsubwq, dl, VecTy, {Q1, Hi, A}, DAG); + } else { + assert(!SignedA && !SignedB); + } + + return DAG.getMergeValues({Lo, Hi}, dl); +} + +SDValue +HexagonTargetLowering::emitHvxMulLoHiV62(SDValue A, bool SignedA, SDValue B, + bool SignedB, const SDLoc &dl, + SelectionDAG &DAG) const { + MVT VecTy = ty(A); + MVT PairTy = typeJoin({VecTy, VecTy}); + assert(VecTy.getVectorElementType() == MVT::i32); + + auto LoVec = [&DAG, VecTy, dl](SDValue Pair) { + return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair); + }; + auto HiVec = [&DAG, VecTy, dl](SDValue Pair) { + return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair); + }; + + if (SignedA && !SignedB) { + // Make A:unsigned, B:signed. + std::swap(A, B); + std::swap(SignedA, SignedB); + } + + // Do S*S first, then make corrections for U*S or U*U if needed. + SDValue P0 = getInstr(Hexagon::V6_vmpyewuh_64, dl, PairTy, {A, B}, DAG); + SDValue P1 = + getInstr(Hexagon::V6_vmpyowh_64_acc, dl, PairTy, {P0, A, B}, DAG); + SDValue Lo = LoVec(P1); + SDValue Hi = HiVec(P1); + + if (!SignedB) { + assert(!SignedA && "Signed A and unsigned B should have been inverted"); + SDValue Zero = getZero(dl, VecTy, DAG); + MVT PredTy = MVT::getVectorVT(MVT::i1, VecTy.getVectorNumElements()); + + // Mulhu(X, Y) = Mulhs(X, Y) + (X, if Y < 0) + (Y, if X < 0). + // def: Pat<(VecI32 (mulhu HVI32:$A, HVI32:$B)), + // (V6_vaddw (HiVec (Muls64O $A, $B)), + // (V6_vaddwq (V6_vgtw (V6_vd0), $B), + // (V6_vandvqv (V6_vgtw (V6_vd0), $A), $B), + // $A))>; + SDValue Q0 = DAG.getSetCC(dl, PredTy, A, Zero, ISD::SETLT); + SDValue Q1 = DAG.getSetCC(dl, PredTy, B, Zero, ISD::SETLT); + SDValue T0 = getInstr(Hexagon::V6_vandvqv, dl, VecTy, {Q0, B}, DAG); + SDValue T1 = getInstr(Hexagon::V6_vaddwq, dl, VecTy, {Q1, T0, A}, DAG); + Hi = getInstr(Hexagon::V6_vaddw, dl, VecTy, {Hi, T1}, DAG); + } else if (!SignedA) { + SDValue Zero = getZero(dl, VecTy, DAG); + MVT PredTy = MVT::getVectorVT(MVT::i1, VecTy.getVectorNumElements()); + + // Mulhus(unsigned X, signed Y) = Mulhs(X, Y) + (Y, if X < 0). 
+ // def: Pat<(VecI32 (HexagonMULHUS HVI32:$A, HVI32:$B)), + // (V6_vaddwq (V6_vgtw (V6_vd0), $A), + // (HiVec (Muls64O $A, $B)), + // $B)>; + SDValue Q0 = DAG.getSetCC(dl, PredTy, A, Zero, ISD::SETLT); + Hi = getInstr(Hexagon::V6_vaddwq, dl, VecTy, {Q0, Hi, B}, DAG); + } + + return DAG.getMergeValues({Lo, Hi}, dl); +} + SDValue HexagonTargetLowering::EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG) const { @@ -3084,6 +3247,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FSHR: return LowerHvxFunnelShift(Op, DAG); case ISD::MULHS: case ISD::MULHU: return LowerHvxMulh(Op, DAG); + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: return LowerHvxMulLoHi(Op, DAG); case ISD::ANY_EXTEND_VECTOR_INREG: return LowerHvxExtend(Op, DAG); case ISD::SETCC: case ISD::INTRINSIC_VOID: return Op; @@ -3097,6 +3262,11 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_UINT: return LowerHvxFpToInt(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerHvxIntToFp(Op, DAG); + + // Special nodes: + case HexagonISD::SMUL_LOHI: + case HexagonISD::UMUL_LOHI: + case HexagonISD::USMUL_LOHI: return LowerHvxMulLoHi(Op, DAG); } #ifndef NDEBUG Op.dumpr(&DAG); diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 34e41a4e759943..bf6303fd165e98 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -97,6 +97,7 @@ def HexagonPTRUE: SDNode<"HexagonISD::PTRUE", SDTVecLeaf>; def HexagonPFALSE: SDNode<"HexagonISD::PFALSE", SDTVecLeaf>; def HexagonVALIGN: SDNode<"HexagonISD::VALIGN", SDTVecVecIntOp>; def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>; +def HexagonMULHUS: SDNode<"HexagonISD::MULHUS", SDTIntBinOp>; def SDTSaturate: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 4519c850278e8e..4a086beccb7c68 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -912,6 +912,20 @@ let Predicates = [UseHVXV68, UseHVXFloatingPoint] in { (V6_pred_not (V6_veqw HvxVR:$Vt, HvxVR:$Vu))>; } +// Multiply high for non-i32 types +def: Pat<(VecI8 (mulhs HVI8:$Vu, HVI8:$Vv)), + (V6_vshuffob (HiVec (V6_vmpybv $Vu, $Vv)), + (LoVec (V6_vmpybv $Vu, $Vv)))>; +def: Pat<(VecI16 (mulhs HVI16:$Vu, HVI16:$Vv)), + (V6_vshufoh (HiVec (V6_vmpyhv $Vu, $Vv)), + (LoVec (V6_vmpyhv $Vu, $Vv)))>; +def: Pat<(VecI8 (mulhu HVI8:$Vu, HVI8:$Vv)), + (V6_vshuffob (HiVec (V6_vmpyubv $Vu, $Vv)), + (LoVec (V6_vmpyubv $Vu, $Vv)))>; +def: Pat<(VecI16 (mulhu HVI16:$Vu, HVI16:$Vv)), + (V6_vshufoh (HiVec (V6_vmpyuhv $Vu, $Vv)), + (LoVec (V6_vmpyuhv $Vu, $Vv)))>; + let Predicates = [UseHVXV60] in { // V60 doesn't have vabsb or byte shifts. // Do the "mask = x >> width-1; abs = (x + mask) ^ mask" trick. 
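The V60 lowering added above (emitHvxMulHsV60 / emitHvxMulLoHiV60) rebuilds each 32x32 lane product from 16-bit halfword multiplies, as its comments describe. The scalar sketch below is only an illustration of that decomposition and is not part of the patch; the function name and the test values are made up, and the only assumption is the identity A*B = Ah*Bh*2^32 + (Ah*Bl + Al*Bh)*2^16 + Al*Bl quoted in the comments.

// Scalar model of the halfword decomposition used by emitHvxMulLoHiV60.
// The real code does this on whole HVX vectors and must propagate carries
// between halfword lanes with vadduhw/vaddhw and the shifted accumulates
// (vasrw_acc/vaslw_acc) seen in the patch.
#include <cassert>
#include <cstdint>

static uint64_t mul32x32FromHalves(uint32_t A, uint32_t B) {
  uint32_t Al = A & 0xFFFF, Ah = A >> 16;
  uint32_t Bl = B & 0xFFFF, Bh = B >> 16;
  uint64_t LoLo = uint64_t(Al) * Bl;                      // P0:lo (low halves)
  uint64_t Cross = uint64_t(Ah) * Bl + uint64_t(Al) * Bh; // P1 (cross products)
  uint64_t HiHi = uint64_t(Ah) * Bh;                      // P0:hi (high halves)
  return (HiHi << 32) + (Cross << 16) + LoLo;
}

int main() {
  uint32_t A = 0x89ABCDEFu, B = 0x12345678u;
  assert(mul32x32FromHalves(A, B) == uint64_t(A) * uint64_t(B));
  return 0;
}

The vector code additionally splits the 64-bit result into the LO and HI words returned by the HexagonISD::[SU]MUL_LOHI nodes, which is where the carry bookkeeping in the comments comes from.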
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll b/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll index 8c9f6a1ad47018..58e8cac9dfe5c6 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll @@ -86,19 +86,16 @@ define <32 x i32> @mulhu(<32 x i32> %a0, <32 x i32> %a1) #0 { ; V60-NEXT: v1:0.uw = vmpy(v0.uh,v1.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v1:0.w = vadd(v0.uh,v1.uh) +; V60-NEXT: v1:0.w = vadd(v1.uh,v0.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v0.w = vadd(v2.w,v0.w) +; V60-NEXT: v0.w = vadd(v0.w,v2.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v1.w = vadd(v3.w,v1.w) +; V60-NEXT: v1.w += vasr(v0.w,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v0.uw = vlsr(v0.uw,r2) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v0.w = vadd(v0.w,v1.w) +; V60-NEXT: v0.w = vadd(v3.w,v1.w) ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: jumpr r31 @@ -107,40 +104,28 @@ define <32 x i32> @mulhu(<32 x i32> %a0, <32 x i32> %a1) #0 { ; V65-LABEL: mulhu: ; V65: // %bb.0: ; V65-NEXT: { -; V65-NEXT: r0 = ##33686018 -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: r2 = #16 -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v4 = vsplat(r0) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v2.uw = vlsr(v2.uw,r2) +; V65-NEXT: v2 = vxor(v2,v2) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: v1 = vdelta(v1,v4) +; V65-NEXT: v5:4 = vmpye(v0.w,v1.uh) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: v1:0.uw = vmpy(v0.uh,v1.uh) +; V65-NEXT: q0 = vcmp.gt(v2.w,v0.w) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: v1:0.w = vadd(v0.uh,v1.uh) +; V65-NEXT: q1 = vcmp.gt(v2.w,v1.w) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: v0.w = vadd(v2.w,v0.w) +; V65-NEXT: v5:4 += vmpyo(v0.w,v1.h) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: v1.w = vadd(v3.w,v1.w) +; V65-NEXT: v31 = vand(q0,v1) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: v0.uw = vlsr(v0.uw,r2) +; V65-NEXT: if (q1) v31.w += v0.w ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: v0.w = vadd(v0.w,v1.w) +; V65-NEXT: v0.w = vadd(v5.w,v31.w) ; V65-NEXT: } ; V65-NEXT: { ; V65-NEXT: jumpr r31 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll new file mode 100644 index 00000000000000..e03bf942e44b39 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll @@ -0,0 +1,570 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=hexagon -mattr=+hvxv60,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V60 %s +; RUN: llc -march=hexagon -mattr=+hvxv62,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V62 %s + +define <32 x i32> @f0(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f0: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = #16 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w = vmpye(v1.w,v0.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.w = vasr(v0.w,r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3.w = vasr(v1.w,r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5:4.w = vmpy(v0.h,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31:30.w = vmpy(v0.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v7:6.w = vadd(v2.uh,v4.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v29:28.w = vadd(v2.h,v4.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v29.w += vasr(v6.w,r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.w = vadd(v29.w,v30.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f0: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v3:2 = vmpye(v0.w,v1.uh) +; V62-NEXT: } +; 
V62-NEXT: { +; V62-NEXT: v3:2 += vmpyo(v0.w,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = v3 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0 + ret <32 x i32> %v1 +} + +define <32 x i32> @f1(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f1: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: v2.w = vmpyieo(v0.h,v1.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w += vmpyie(v0.w,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = v2 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f1: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v2.w = vmpyieo(v0.h,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2.w += vmpyie(v0.w,v1.uh) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = v2 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1 + ret <32 x i32> %v1 +} + +define <32 x i32> @f2(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f2: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r1:0 = combine(#16,##33686018) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v30 = vxor(v30,v30) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.gt(v30.w,v1.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.uw = vlsr(v2.uw,r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vdelta(v1,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5:4.uw = vmpy(v0.uh,v4.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5:4.w = vadd(v5.uh,v4.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w = vadd(v4.w,v2.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5.w += vasr(v2.w,r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31.w = vadd(v3.w,v5.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: if (q0) v31.w -= v0.w +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = v31 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f2: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v3:2 = vmpye(v0.w,v1.uh) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v4 = vxor(v4,v4) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: q0 = vcmp.gt(v4.w,v0.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2 += vmpyo(v0.w,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: if (q0) v3.w += v1.w +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = v3 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyus.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0 + ret <32 x i32> %v1 +} + +define <32 x i32> @f3(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f3: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: v2.w = vmpyieo(v0.h,v1.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w += vmpyie(v0.w,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = v2 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f3: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v2.w = vmpyieo(v0.h,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2.w += vmpyie(v0.w,v1.uh) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = v2 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} 
@llvm.hexagon.V6.vmpyus.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1 + ret <32 x i32> %v1 +} + +define <32 x i32> @f4(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f4: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = ##33686018 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r2 = #16 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.uw = vlsr(v2.uw,r2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1 = vdelta(v1,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0.w = vadd(v1.uh,v0.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.w = vadd(v0.w,v2.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.w += vasr(v0.w,r2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.w = vadd(v3.w,v1.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f4: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v2 = vxor(v2,v2) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v5:4 = vmpye(v0.w,v1.uh) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: q0 = vcmp.gt(v2.w,v0.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: q1 = vcmp.gt(v2.w,v1.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v5:4 += vmpyo(v0.w,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v31 = vand(q0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: if (q1) v31.w += v0.w +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.w = vadd(v5.w,v31.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyuu.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0 + ret <32 x i32> %v1 +} + +define <32 x i32> @f5(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f5: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: v2.w = vmpyieo(v0.h,v1.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w += vmpyie(v0.w,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = v2 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f5: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v2.w = vmpyieo(v0.h,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2.w += vmpyie(v0.w,v1.uh) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = v2 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyuu.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1 + ret <32 x i32> %v1 +} + +define <64 x i32> @f10(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f10: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = ##33686018 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r2 = #16 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vxor(v4,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q1 = vcmp.gt(v4.w,v0.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.gt(v4.w,v1.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v6.uw = vlsr(v2.uw,r2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5 = vdelta(v1,v5) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1 = vmux(q1,v1,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: if (q0) v1.w += v0.w +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v9:8.uw = vmpy(v0.uh,v5.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v9:8.w = vadd(v9.uh,v8.uh) +; V60-NEXT: } +; V60-NEXT: { +; 
V60-NEXT: v31.w = vadd(v8.w,v6.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w += vasl(v8.w,r2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v9.w += vasr(v31.w,r2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.w = vadd(v3.w,v9.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3.w = vsub(v0.w,v1.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0 = vcombine(v3,v2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f10: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v3:2 = vmpye(v0.w,v1.uh) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2 += vmpyo(v0.w,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1:0 = vcombine(v3,v2) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0 + %v2 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1 + %v3 = shufflevector <32 x i32> %v2, <32 x i32> %v1, <64 x i32> + ret <64 x i32> %v3 +} + +define <64 x i32> @f11(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f11: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r1:0 = combine(#16,##33686018) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31 = vxor(v31,v31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.gt(v31.w,v1.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5.uw = vlsr(v2.uw,r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vdelta(v1,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v7:6.uw = vmpy(v0.uh,v4.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v7:6.w = vadd(v7.uh,v6.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v30.w = vadd(v6.w,v5.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w += vasl(v6.w,r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v7.w += vasr(v30.w,r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3.w = vadd(v3.w,v7.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: if (q0) v3.w -= v0.w +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0 = vcombine(v3,v2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f11: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v3:2 = vmpye(v0.w,v1.uh) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v4 = vxor(v4,v4) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: q0 = vcmp.gt(v4.w,v0.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2 += vmpyo(v0.w,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: if (q0) v3.w += v1.w +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1:0 = vcombine(v3,v2) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyus.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0 + %v2 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1 + %v3 = shufflevector <32 x i32> %v2, <32 x i32> %v1, <64 x i32> + ret <64 x i32> %v3 +} + +define <64 x i32> @f12(<32 x i32> %a0, <32 x i32> %a1) #0 { +; V60-LABEL: f12: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r1:0 = combine(#16,##33686018) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5.uw = vlsr(v2.uw,r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1 = vdelta(v1,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0.w = vadd(v1.uh,v0.uh) +; V60-NEXT: } +; V60-NEXT: { +; 
V60-NEXT: v31.w = vadd(v0.w,v5.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w += vasl(v0.w,r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.w += vasr(v31.w,r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3.w = vadd(v3.w,v1.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0 = vcombine(v3,v2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f12: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: v4 = vxor(v4,v4) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2 = vmpye(v0.w,v1.uh) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: q0 = vcmp.gt(v4.w,v0.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: q1 = vcmp.gt(v4.w,v1.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2 += vmpyo(v0.w,v1.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v31 = vand(q0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: if (q1) v31.w += v0.w +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.w = vadd(v3.w,v31.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1:0 = vcombine(v3,v2) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } + %v0 = call {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyuu.parts.128B(<32 x i32> %a0, <32 x i32> %a1) + %v1 = extractvalue {<32 x i32>, <32 x i32>} %v0, 0 + %v2 = extractvalue {<32 x i32>, <32 x i32>} %v0, 1 + %v3 = shufflevector <32 x i32> %v2, <32 x i32> %v1, <64 x i32> + ret <64 x i32> %v3 +} + +declare {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyss.parts.128B(<32 x i32>, <32 x i32>) +declare {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyus.parts.128B(<32 x i32>, <32 x i32>) +declare {<32 x i32>, <32 x i32>} @llvm.hexagon.V6.vmpyuu.parts.128B(<32 x i32>, <32 x i32>) + +attributes #0 = { nounwind }
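
The V60/V62 paths above derive the unsigned and mixed-sign high words from a single full product using the corrections quoted in their comments, e.g. Mulhu(X, Y) = Mulhs(X, Y) + (X, if Y < 0) + (Y, if X < 0) and mulhus(A.uw, B.w) = mulhu(A.uw, B.uw) - (A.w if B < 0). The scalar check below is a sketch that confirms those identities modulo 2^32; the helper names and test values are illustrative only and not part of the patch.

// Scalar check of the per-lane signedness corrections used above.
// All arithmetic wraps modulo 2^32, matching a 32-bit HVX lane; signed/unsigned
// reinterpretation of the same 32-bit word is two's complement.
#include <cassert>
#include <cstdint>

static uint32_t mulhs(int32_t X, int32_t Y) {   // signed high word
  return uint32_t(uint64_t(int64_t(X) * int64_t(Y)) >> 32);
}
static uint32_t mulhu(uint32_t X, uint32_t Y) { // unsigned high word
  return uint32_t((uint64_t(X) * uint64_t(Y)) >> 32);
}
static uint32_t mulhus(uint32_t X, int32_t Y) { // unsigned*signed high word
  return uint32_t(uint64_t(int64_t(X) * int64_t(Y)) >> 32);
}

int main() {
  const uint32_t Vals[] = {0u, 1u, 2u, 0x7FFFFFFFu, 0x80000000u,
                           0x89ABCDEFu, 0xFFFFFFFFu};
  for (uint32_t X : Vals) {
    for (uint32_t Y : Vals) {
      int32_t SX = int32_t(X), SY = int32_t(Y);
      // Mulhu(X,Y) = Mulhs(X,Y) + (X, if Y < 0) + (Y, if X < 0)
      assert(mulhu(X, Y) ==
             mulhs(SX, SY) + (SY < 0 ? X : 0u) + (SX < 0 ? Y : 0u));
      // mulhus(X.uw, Y.w) = mulhu(X.uw, Y.uw) - (X, if Y < 0)
      assert(mulhus(X, SY) == mulhu(X, Y) - (SY < 0 ? X : 0u));
      // Mulhus(X.uw, Y.w) = Mulhs(X, Y) + (Y, if X < 0)
      assert(mulhus(X, SY) == mulhs(SX, SY) + (SX < 0 ? Y : 0u));
    }
  }
  return 0;
}

Each assert mirrors one of the corrections applied to the Hi vector in emitHvxMulLoHiV60/V62; the low word of the product is identical under all three signedness interpretations, which is why only the Hi part needs fixing up.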