diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 32baa2d111270..801e557a22520 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -666,6 +666,7 @@ SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
   unsigned NumEles = Val.getSimpleValueType().getVectorNumElements();
   unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits();
+  unsigned ResBits = OpVT.getScalarSizeInBits();
   unsigned LegalVecSize = 128;
   bool isLASX256Vector =
@@ -691,10 +692,11 @@ SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
   if (isLASX256Vector) {
     SDValue Tmp = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, Val,
-                              DAG.getConstant(2, DL, MVT::i64));
+                              DAG.getConstant(2, DL, Subtarget.getGRLenVT()));
     Val = DAG.getNode(ISD::ADD, DL, MVT::v4i64, Tmp, Val);
   }
+  Val = DAG.getBitcast(MVT::getVectorVT(OpVT, LegalVecSize / ResBits), Val);
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val,
                      DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
 }
@@ -727,15 +729,16 @@ SDValue LoongArchTargetLowering::lowerVECREDUCE(SDValue Op,
   unsigned Opcode = ISD::getVecReduceBaseOpcode(Op.getOpcode());
   MVT VecTy = Val.getSimpleValueType();
+  MVT GRLenVT = Subtarget.getGRLenVT();
   for (int i = NumEles; i > 1; i /= 2) {
-    SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, MVT::i64);
+    SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, GRLenVT);
     SDValue Tmp = DAG.getNode(LoongArchISD::VBSRL, DL, VecTy, Val, ShiftAmt);
     Val = DAG.getNode(Opcode, DL, VecTy, Tmp, Val);
   }
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val,
-                     DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
+                     DAG.getConstant(0, DL, GRLenVT));
 }
 SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op,
@@ -1119,6 +1122,10 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
   SDValue Src = Op->getOperand(0);
   SDLoc DL(Op);
+  // LoongArchISD::BITREV_8B is not supported on LA32.
+  if (!Subtarget.is64Bit() && (ResTy == MVT::v16i8 || ResTy == MVT::v32i8))
+    return SDValue();
+
   EVT NewVT = ResTy.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
   unsigned int OrigEltNum = ResTy.getVectorNumElements();
   unsigned int NewEltNum = NewVT.getVectorNumElements();
@@ -1128,7 +1135,7 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
   SmallVector Ops;
   for (unsigned int i = 0; i < NewEltNum; i++) {
     SDValue Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, NewSrc,
-                             DAG.getConstant(i, DL, MVT::i64));
+                             DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
     unsigned RevOp = (ResTy == MVT::v16i8 || ResTy == MVT::v32i8)
                          ? (unsigned)LoongArchISD::BITREV_8B
                          : (unsigned)ISD::BITREVERSE;
@@ -1611,9 +1618,8 @@ lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
   if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) {
-    APInt Imm(64, SplatIndex);
     return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
-                       DAG.getConstant(Imm, DL, Subtarget.getGRLenVT()));
+                       DAG.getConstant(SplatIndex, DL, Subtarget.getGRLenVT()));
   }
   return SDValue();
@@ -1671,7 +1677,7 @@ lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   }
   // Calculate the immediate. Replace any remaining undefs with zero
-  APInt Imm(64, 0);
+  int Imm = 0;
   for (int i = SubVecSize - 1; i >= 0; --i) {
     int M = SubMask[i];
@@ -1946,11 +1952,12 @@ static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
 /// adding it as an operand to the resulting VSHUF.
 static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask,
                                          MVT VT, SDValue V1, SDValue V2,
-                                         SelectionDAG &DAG) {
+                                         SelectionDAG &DAG,
+                                         const LoongArchSubtarget &Subtarget) {
   SmallVector Ops;
   for (auto M : Mask)
-    Ops.push_back(DAG.getConstant(M, DL, MVT::i64));
+    Ops.push_back(DAG.getSignedConstant(M, DL, Subtarget.getGRLenVT()));
   EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
   SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
@@ -2030,7 +2037,8 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
       return Result;
     if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG))
       return NewShuffle;
-    if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
+    if ((Result =
+             lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG, Subtarget)))
       return Result;
     return SDValue();
   }
@@ -2088,7 +2096,8 @@ lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
 /// Lower VECTOR_SHUFFLE into XVPERM (if possible).
 static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
                                           MVT VT, SDValue V1, SDValue V2,
-                                          SelectionDAG &DAG) {
+                                          SelectionDAG &DAG,
+                                          const LoongArchSubtarget &Subtarget) {
   // LoongArch LASX only have XVPERM_W.
   if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
     return SDValue();
@@ -2119,9 +2128,10 @@ static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
     return SDValue();
   SmallVector Masks;
+  MVT GRLenVT = Subtarget.getGRLenVT();
   for (unsigned i = 0; i < NumElts; ++i)
-    Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(MVT::i64)
-                                  : DAG.getConstant(Mask[i], DL, MVT::i64));
+    Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(GRLenVT)
+                                  : DAG.getConstant(Mask[i], DL, GRLenVT));
   SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks);
   return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec);
@@ -2533,7 +2543,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG,
                                                Subtarget)))
       return Result;
-    if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG)))
+    if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG,
+                                             Subtarget)))
      return Result;
    if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
                                                             V1, V2, DAG)))
      return Result;
@@ -3102,12 +3113,33 @@ LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
     return SDValue();
   SDValue SplatElt = DAG.getSplatBuildVector(VT, DL, Op1);
-  SDValue SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2);
-  SmallVector RawIndices;
-  for (unsigned i = 0; i < NumElts; ++i)
-    RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
-  SDValue Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices);
+  SDValue SplatIdx;
+  SDValue Indices;
+
+  if (!Subtarget.is64Bit() && IdxTy == MVT::i64) {
+    MVT PairVTy = MVT::getVectorVT(MVT::i32, NumElts * 2);
+    for (unsigned i = 0; i < NumElts; ++i) {
+      RawIndices.push_back(Op2);
+      RawIndices.push_back(DAG.getConstant(0, DL, MVT::i32));
+    }
+    SplatIdx = DAG.getBuildVector(PairVTy, DL, RawIndices);
+    SplatIdx = DAG.getBitcast(IdxVTy, SplatIdx);
+
+    RawIndices.clear();
+    for (unsigned i = 0; i < NumElts; ++i) {
+      RawIndices.push_back(DAG.getConstant(i, DL, MVT::i32));
+      RawIndices.push_back(DAG.getConstant(0, DL, MVT::i32));
+    }
+    Indices = DAG.getBuildVector(PairVTy, DL, RawIndices);
+    Indices = DAG.getBitcast(IdxVTy, Indices);
+  } else {
+    SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2);
+
+    for (unsigned i = 0; i < NumElts; ++i)
+      RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
+    Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices);
+  }
   // insert vec, elt, idx
   // =>
@@ -5129,7 +5161,7 @@ performSETCC_BITCASTCombine(SDNode *N, SelectionDAG &DAG,
   if (Opc == ISD::DELETED_NODE)
     return SDValue();
-  SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src.getOperand(0));
+  SDValue V = DAG.getNode(Opc, DL, Subtarget.getGRLenVT(), Src.getOperand(0));
   EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
   V = DAG.getZExtOrTrunc(V, DL, T);
   return DAG.getBitcast(VT, V);
@@ -5142,6 +5174,7 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   SDValue Src = N->getOperand(0);
   EVT SrcVT = Src.getValueType();
+  MVT GRLenVT = Subtarget.getGRLenVT();
   if (!DCI.isBeforeLegalizeOps())
     return SDValue();
@@ -5209,11 +5242,11 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
     if (Src.getSimpleValueType() == MVT::v32i8) {
       SDValue Lo, Hi;
       std::tie(Lo, Hi) = DAG.SplitVector(Src, DL);
-      Lo = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Lo);
-      Hi = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Hi);
-      Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
+      Lo = DAG.getNode(LoongArchISD::VMSKLTZ, DL, GRLenVT, Lo);
+      Hi = DAG.getNode(LoongArchISD::VMSKLTZ, DL, GRLenVT, Hi);
+      Hi = DAG.getNode(ISD::SHL, DL, GRLenVT, Hi,
                        DAG.getConstant(16, DL, MVT::i8));
-      V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
+      V = DAG.getNode(ISD::OR, DL, GRLenVT, Lo, Hi);
     } else if (UseLASX) {
       return SDValue();
     }
@@ -5221,7 +5254,7 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
   if (!V) {
     Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
-    V = DAG.getNode(Opc, DL, MVT::i64, Src);
+    V = DAG.getNode(Opc, DL, GRLenVT, Src);
   }
   EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
@@ -5878,6 +5911,22 @@ static SDValue lowerVectorBitRevImm(SDNode *Node, SelectionDAG &DAG) {
   return DAG.getNode(ISD::XOR, DL, ResTy, Node->getOperand(1), BitImm);
 }
+template
+static SDValue lowerVectorPickVE2GR(SDNode *N, SelectionDAG &DAG,
+                                    unsigned ResOp) {
+  unsigned Imm = N->getConstantOperandVal(2);
+  if (!isUInt(Imm)) {
+    const StringRef ErrorMsg = "argument out of range";
+    DAG.getContext()->emitError(N->getOperationName(0) + ": " + ErrorMsg + ".");
+    return DAG.getUNDEF(N->getValueType(0));
+  }
+  SDLoc DL(N);
+  SDValue Vec = N->getOperand(1);
+  SDValue Idx = DAG.getConstant(Imm, DL, MVT::i32);
+  SDValue EltVT = DAG.getValueType(Vec.getValueType().getVectorElementType());
+  return DAG.getNode(ResOp, DL, N->getValueType(0), Vec, Idx, EltVT);
+}
+
 static SDValue
 performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
@@ -6367,6 +6416,68 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
                        N->getOperand(1),
                        DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getGRLenVT(),
                                    N->getOperand(2)));
+  case Intrinsic::loongarch_lsx_vpickve2gr_b:
+    if (!Subtarget.is64Bit())
+      return lowerVectorPickVE2GR<4>(N, DAG, LoongArchISD::VPICK_SEXT_ELT);
+    break;
+  case Intrinsic::loongarch_lsx_vpickve2gr_h:
+  case Intrinsic::loongarch_lasx_xvpickve2gr_w:
+    if (!Subtarget.is64Bit())
+      return lowerVectorPickVE2GR<3>(N, DAG, LoongArchISD::VPICK_SEXT_ELT);
+    break;
+  case Intrinsic::loongarch_lsx_vpickve2gr_w:
+    if (!Subtarget.is64Bit())
+      return lowerVectorPickVE2GR<2>(N, DAG, LoongArchISD::VPICK_SEXT_ELT);
+    break;
+  case Intrinsic::loongarch_lsx_vpickve2gr_bu:
+    if (!Subtarget.is64Bit())
+      return lowerVectorPickVE2GR<4>(N, DAG, LoongArchISD::VPICK_ZEXT_ELT);
+    break;
+  case Intrinsic::loongarch_lsx_vpickve2gr_hu:
+  case Intrinsic::loongarch_lasx_xvpickve2gr_wu:
+    if (!Subtarget.is64Bit())
+      return lowerVectorPickVE2GR<3>(N, DAG, LoongArchISD::VPICK_ZEXT_ELT);
+    break;
+  case Intrinsic::loongarch_lsx_vpickve2gr_wu:
+    if (!Subtarget.is64Bit())
+      return lowerVectorPickVE2GR<2>(N, DAG, LoongArchISD::VPICK_ZEXT_ELT);
+    break;
+  case Intrinsic::loongarch_lsx_bz_b:
+  case Intrinsic::loongarch_lsx_bz_h:
+  case Intrinsic::loongarch_lsx_bz_w:
+  case Intrinsic::loongarch_lsx_bz_d:
+  case Intrinsic::loongarch_lasx_xbz_b:
+  case Intrinsic::loongarch_lasx_xbz_h:
+  case Intrinsic::loongarch_lasx_xbz_w:
+  case Intrinsic::loongarch_lasx_xbz_d:
+    if (!Subtarget.is64Bit())
+      return DAG.getNode(LoongArchISD::VALL_ZERO, DL, N->getValueType(0),
+                         N->getOperand(1));
+    break;
+  case Intrinsic::loongarch_lsx_bz_v:
+  case Intrinsic::loongarch_lasx_xbz_v:
+    if (!Subtarget.is64Bit())
+      return DAG.getNode(LoongArchISD::VANY_ZERO, DL, N->getValueType(0),
+                         N->getOperand(1));
+    break;
+  case Intrinsic::loongarch_lsx_bnz_b:
+  case Intrinsic::loongarch_lsx_bnz_h:
+  case Intrinsic::loongarch_lsx_bnz_w:
+  case Intrinsic::loongarch_lsx_bnz_d:
+  case Intrinsic::loongarch_lasx_xbnz_b:
+  case Intrinsic::loongarch_lasx_xbnz_h:
+  case Intrinsic::loongarch_lasx_xbnz_w:
+  case Intrinsic::loongarch_lasx_xbnz_d:
+    if (!Subtarget.is64Bit())
+      return DAG.getNode(LoongArchISD::VALL_NONZERO, DL, N->getValueType(0),
+                         N->getOperand(1));
+    break;
+  case Intrinsic::loongarch_lsx_bnz_v:
+  case Intrinsic::loongarch_lasx_xbnz_v:
+    if (!Subtarget.is64Bit())
+      return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0),
+                         N->getOperand(1));
+    break;
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d99a57e562528..b0eb51a92c6c6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -26,7 +26,7 @@ def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>,
 def SDT_LoongArchV2RUimm
     : SDTypeProfile<1, 3,
                     [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
-                     SDTCisVT<3, i64>]>;
+                     SDTCisVT<3, GRLenVT>]>;
 def SDT_LoongArchVreplgr2vr : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>]>;
 def SDT_LoongArchVFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
 def SDT_LoongArchVFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
@@ -1482,7 +1482,7 @@ multiclass VldreplPat {
 }
 multiclass VstelmPat<PatFrag StoreOp, ValueType vt, LAInst Inst,
-                     Operand ImmOpnd, Operand IdxOpnd, ValueType elt = i64> {
+                     Operand ImmOpnd, Operand IdxOpnd, ValueType elt = GRLenVT> {
   def : Pat<(StoreOp(elt(vector_extract vt:$vd, IdxOpnd:$idx)), BaseAddr:$rj),
             (Inst vt:$vd, BaseAddr:$rj, 0, IdxOpnd:$idx)>;
@@ -2110,8 +2110,8 @@ def : Pat<(GRLenVT (vector_extract v4i32:$vj, GRLenVT:$rk)),
           (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (VREPLVE_W v4i32:$vj, GRLenVT:$rk),
                                                   sub_32)),
                             GPR)>;
-def : Pat<(i64 (vector_extract v2i64:$vj, i64:$rk)),
-          (COPY_TO_REGCLASS (f64 (EXTRACT_SUBREG (VREPLVE_D v2i64:$vj, i64:$rk),
+def : Pat<(GRLenVT (vector_extract v2i64:$vj, GRLenVT:$rk)),
+          (COPY_TO_REGCLASS (f64 (EXTRACT_SUBREG (VREPLVE_D v2i64:$vj, GRLenVT:$rk),
                                                   sub_64)),
                             GPR)>;
 def : Pat<(f32 (vector_extract v4f32:$vj, GRLenVT:$rk)),
diff --git a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
index 87ee4ad025395..8b12216d0f856 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
@@ -1,27 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefix=LA32
 ; RUN: llc --mtriple=loongarch64 -mattr=+lasx --verify-machineinstrs < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefix=LA64
 declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
 define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
-; CHECK-LABEL: test_bitreverse_v32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
-; CHECK-NEXT: bitrev.8b $a0, $a0
-; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: bitrev.8b $a0, $a0
-; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: bitrev.8b $a0, $a0
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: bitrev.8b $a0, $a0
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
-; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
-; CHECK-NEXT: ret
+; LA32-LABEL: test_bitreverse_v32i8:
+; LA32: # %bb.0:
+; LA32-NEXT: xvslli.b $xr1, $xr0, 4
+; LA32-NEXT: xvsrli.b $xr0, $xr0, 4
+; LA32-NEXT: xvor.v $xr0, $xr0, $xr1
+; LA32-NEXT: xvandi.b $xr1, $xr0, 51
+; LA32-NEXT: xvslli.b $xr1, $xr1, 2
+; LA32-NEXT: xvsrli.b $xr0, $xr0, 2
+; LA32-NEXT: xvandi.b $xr0, $xr0, 51
+; LA32-NEXT: xvor.v $xr0, $xr0, $xr1
+; LA32-NEXT: xvandi.b $xr1, $xr0, 85
+; LA32-NEXT: xvslli.b $xr1, $xr1, 1
+; LA32-NEXT: xvsrli.b $xr0, $xr0, 1
+; LA32-NEXT: xvandi.b $xr0, 
$xr0, 85 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 2 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 1 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvori.b $xr0, $xr1, 0 +; LA64-NEXT: ret %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ret <32 x i8> %b } @@ -29,23 +48,53 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1 -; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2 -; CHECK-NEXT: xvshuf4i.h $xr0, $xr2, 27 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v16i16: +; LA32: # %bb.0: +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 5 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 4 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 7 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 2 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 6 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 3 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 1 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 1 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 3 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 2 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 2 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA32-NEXT: xvshuf4i.h $xr0, $xr2, 27 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v16i16: +; LA64: # %bb.0: +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 2 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA64-NEXT: xvshuf4i.h $xr0, $xr2, 27 +; LA64-NEXT: ret %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ret <16 x i16> %b } @@ -53,23 +102,53 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v8i32: -; 
CHECK: # %bb.0: -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1 -; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2 -; CHECK-NEXT: xvshuf4i.w $xr0, $xr2, 177 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 4 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 5 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 1 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 6 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 2 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 7 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 3 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 1 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 2 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 2 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 3 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA32-NEXT: xvori.b $xr0, $xr1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 2 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA64-NEXT: xvshuf4i.w $xr0, $xr2, 177 +; LA64-NEXT: ret %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ret <8 x i32> %b } @@ -77,23 +156,43 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; LA32-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0) +; LA32-NEXT: xvshuf.b $xr0, $xr0, $xr0, $xr1 +; LA32-NEXT: xvslli.b $xr1, $xr0, 4 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 4 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvandi.b $xr1, $xr0, 51 +; LA32-NEXT: xvslli.b $xr1, $xr1, 2 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 2 +; LA32-NEXT: 
xvandi.b $xr0, $xr0, 51 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvandi.b $xr1, $xr0, 85 +; LA32-NEXT: xvslli.b $xr1, $xr1, 1 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 1 +; LA32-NEXT: xvandi.b $xr0, $xr0, 85 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 2 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 1 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvori.b $xr0, $xr1, 0 +; LA64-NEXT: ret %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ret <4 x i64> %b } diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll index 156c829c2dfb6..45b25013c9173 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll @@ -1,97 +1,178 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+lasx < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefix=LA64 declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32) define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { -; CHECK-LABEL: powi_v8f32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -128 -; CHECK-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill -; CHECK-NEXT: addi.w $fp, $a0, 0 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 5 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 4 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 -; CHECK-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr0, $vr1, 16 -; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 6 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr1, $vr0, 32 -; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 7 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: xvld $xr1, 
$sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr1, $vr0, 48 -; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 -; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr0, $vr1, 16 -; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 2 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr1, $vr0, 32 -; CHECK-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr1, $vr0, 48 -; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 2 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 -; CHECK-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 128 -; CHECK-NEXT: ret +; LA32-LABEL: powi_v8f32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -128 +; LA32-NEXT: st.w $ra, $sp, 124 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 120 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill +; LA32-NEXT: xvpickve.w $xr0, $xr0, 5 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 4 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA32-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload +; LA32-NEXT: vextrins.w $vr0, $vr1, 16 +; LA32-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 6 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; LA32-NEXT: vextrins.w $vr1, $vr0, 32 +; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w 
$xr0, $xr0, 7 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; LA32-NEXT: vextrins.w $vr1, $vr0, 48 +; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 1 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 0 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA32-NEXT: vextrins.w $vr0, $vr1, 16 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 2 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: vextrins.w $vr1, $vr0, 32 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 3 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: vextrins.w $vr1, $vr0, 48 +; LA32-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload +; LA32-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA32-NEXT: xvori.b $xr0, $xr1, 0 +; LA32-NEXT: ld.w $fp, $sp, 120 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 124 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 128 +; LA32-NEXT: ret +; +; LA64-LABEL: powi_v8f32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -128 +; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill +; LA64-NEXT: addi.w $fp, $a0, 0 +; LA64-NEXT: xvpickve.w $xr0, $xr0, 5 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 4 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload +; LA64-NEXT: vextrins.w $vr0, $vr1, 16 +; LA64-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 6 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; LA64-NEXT: vextrins.w $vr1, $vr0, 32 +; 
LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 7 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; LA64-NEXT: vextrins.w $vr1, $vr0, 48 +; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 1 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vextrins.w $vr0, $vr1, 16 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 2 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: vextrins.w $vr1, $vr0, 32 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 3 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: vextrins.w $vr1, $vr0, 48 +; LA64-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: xvori.b $xr0, $xr1, 0 +; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 128 +; LA64-NEXT: ret entry: %res = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %va, i32 %b) ret <8 x float> %res @@ -100,53 +181,96 @@ entry: declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32) define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { -; CHECK-LABEL: powi_v4f64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -112 -; CHECK-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill -; CHECK-NEXT: addi.w $fp, $a0, 0 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 3 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; CHECK-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 2 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; 
CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; CHECK-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload -; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 112 -; CHECK-NEXT: ret +; LA32-LABEL: powi_v4f64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -112 +; LA32-NEXT: st.w $ra, $sp, 108 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 104 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill +; LA32-NEXT: xvpickve.d $xr0, $xr0, 3 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA32-NEXT: bl __powidf2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA32-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.d $xr0, $xr0, 2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powidf2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA32-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload +; LA32-NEXT: vextrins.d $vr0, $vr1, 16 +; LA32-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.d $xr0, $xr0, 1 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powidf2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.d $xr0, $xr0, 0 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powidf2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA32-NEXT: vextrins.d $vr0, $vr1, 16 +; LA32-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload +; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA32-NEXT: ld.w $fp, $sp, 104 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 108 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 112 +; LA32-NEXT: ret +; +; LA64-LABEL: powi_v4f64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -112 +; LA64-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill +; LA64-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill +; LA64-NEXT: addi.w $fp, $a0, 0 +; LA64-NEXT: 
xvpickve.d $xr0, $xr0, 3 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.d $xr0, $xr0, 2 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: vextrins.d $vr0, $vr1, 16 +; LA64-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.d $xr0, $xr0, 1 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.d $xr0, $xr0, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vextrins.d $vr0, $vr1, 16 +; LA64-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload +; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA64-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 112 +; LA64-NEXT: ret entry: %res = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %va, i32 %b) ret <4 x double> %res diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll index 215436823af83..623a6de1bc402 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s declare <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll index ad36c3aa5c29d..743ab10cc9b00 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s declare <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll index a671e9979b2fe..e6688bacd3bf9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s 2>&1 | FileCheck %s ; RUN: not llc 
--mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s declare <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8>, i32) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll index 5ed4104c295fa..cfe9ec575222a 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s declare <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8>, i32) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d-invalid-imm.ll new file mode 100644 index 0000000000000..5a5af4356f714 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d-invalid-imm.ll @@ -0,0 +1,33 @@ +; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s + +declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) + +define i64 @lasx_xvpickve2gr_d_lo(<4 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 -1) + ret i64 %res +} + +define i64 @lasx_xvpickve2gr_d_hi(<4 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 4) + ret i64 %res +} + +declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) + +define i64 @lasx_xvpickve2gr_du_lo(<4 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 -1) + ret i64 %res +} + +define i64 @lasx_xvpickve2gr_du_hi(<4 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 4) + ret i64 %res +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d.ll new file mode 100644 index 0000000000000..178dd92cbdb80 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) + +define i64 @lasx_xvpickve2gr_d(<4 x i64> %va) nounwind { +; CHECK-LABEL: lasx_xvpickve2gr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; CHECK-NEXT: ret +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 1) + ret i64 %res +} + +declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) + +define i64 @lasx_xvpickve2gr_du(<4 x i64> %va) nounwind { +; CHECK-LABEL: lasx_xvpickve2gr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpickve2gr.du $a0, $xr0, 1 +; CHECK-NEXT: ret +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 1) + ret i64 %res +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll index 93056b272dfc5..0c91b56387f79 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll +++ 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s declare i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32>, i32) @@ -16,22 +17,6 @@ entry: ret i32 %res } -declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) - -define i64 @lasx_xvpickve2gr_d_lo(<4 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 -1) - ret i64 %res -} - -define i64 @lasx_xvpickve2gr_d_hi(<4 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 4) - ret i64 %res -} - declare i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32>, i32) define i32 @lasx_xvpickve2gr_wu_lo(<8 x i32> %va) nounwind { @@ -47,19 +32,3 @@ entry: %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> %va, i32 8) ret i32 %res } - -declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) - -define i64 @lasx_xvpickve2gr_du_lo(<4 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 -1) - ret i64 %res -} - -define i64 @lasx_xvpickve2gr_du_hi(<4 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 4) - ret i64 %res -} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll index 0617e7424321b..a6f19ce0c0140 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll @@ -1,9 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s - - - declare i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32>, i32) define i32 @lasx_xvpickve2gr_w(<8 x i32> %va) nounwind { @@ -16,18 +14,6 @@ entry: ret i32 %res } -declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) - -define i64 @lasx_xvpickve2gr_d(<4 x i64> %va) nounwind { -; CHECK-LABEL: lasx_xvpickve2gr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: ret -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 1) - ret i64 %res -} - declare i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32>, i32) define i32 @lasx_xvpickve2gr_wu(<8 x i32> %va) nounwind { @@ -39,15 +25,3 @@ entry: %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> %va, i32 1) ret i32 %res } - -declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) - -define i64 @lasx_xvpickve2gr_du(<4 x i64> %va) nounwind { -; CHECK-LABEL: lasx_xvpickve2gr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.du $a0, $xr0, 1 -; CHECK-NEXT: ret -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 1) - ret i64 %res -} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr-d.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr-d.ll new file mode 100644 index 0000000000000..79ec7b51f6278 --- /dev/null +++ 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr-d.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define <4 x i64> @xvrepl_ins_d(i64 %a, i64 %b) { +; CHECK-LABEL: xvrepl_ins_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 +; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 1 +; CHECK-NEXT: ret +entry: + %0 = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) + %1 = call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> %0, i64 %b, i32 1) + ret <4 x i64> %1 +} + +declare <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64>, i64, i32 immarg) +declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr.ll index 2e538ed66b250..31b809e016564 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s define <8 x i32> @xvrepl_ins_w(i32 %a, i32 %b) { @@ -13,19 +14,5 @@ entry: ret <8 x i32> %1 } -define <4 x i64> @xvrepl_ins_d(i64 %a, i64 %b) { -; CHECK-LABEL: xvrepl_ins_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 1 -; CHECK-NEXT: ret -entry: - %0 = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) - %1 = call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> %0, i64 %b, i32 1) - ret <4 x i64> %1 -} - declare <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32>, i32, i32 immarg) declare <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32) -declare <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64>, i64, i32 immarg) -declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr-d.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr-d.ll new file mode 100644 index 0000000000000..61bc89249d97e --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr-d.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) + +define <4 x i64> @lasx_xvreplgr2vr_d(i64 %a) nounwind { +; CHECK-LABEL: lasx_xvreplgr2vr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) + ret <4 x i64> %res +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll index c71abd2205c67..a3c0e261e7122 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32) @@ -36,15 +37,3 @@ entry: %res = call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 
%a) ret <8 x i32> %res } - -declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) - -define <4 x i64> @lasx_xvreplgr2vr_d(i64 %a) nounwind { -; CHECK-LABEL: lasx_xvreplgr2vr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 -; CHECK-NEXT: ret -entry: - %res = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) - ret <4 x i64> %res -} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll index 6e3e2e0330f52..5e234e4bd8210 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare i32 @llvm.loongarch.lasx.xbz.v(<32 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll index a466b78bf8d2d..38e3289ef4cba 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll index 36e65fc5b3281..f6917cffb36b5 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare i32 @llvm.loongarch.lasx.xbz.b(<32 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll index cf0496fb8fb89..60b51755681a4 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll @@ -3,18 +3,11 @@ ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @extract_32xi8(ptr %src, ptr %dst) nounwind { -; LA32-LABEL: extract_32xi8: -; LA32: # %bb.0: -; LA32-NEXT: xvld $xr0, $a0, 0 -; LA32-NEXT: vpickve2gr.b $a0, $vr0, 1 -; LA32-NEXT: st.b $a0, $a1, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: extract_32xi8: -; LA64: # %bb.0: -; LA64-NEXT: xvld $xr0, $a0, 0 -; LA64-NEXT: xvstelm.b $xr0, $a1, 0, 1 -; LA64-NEXT: ret +; CHECK-LABEL: extract_32xi8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 1 +; CHECK-NEXT: ret %v = load volatile <32 x i8>, ptr %src %e = extractelement <32 x i8> %v, i32 1 store i8 %e, ptr %dst @@ -22,18 +15,11 @@ define void @extract_32xi8(ptr %src, ptr %dst) nounwind { } define void @extract_16xi16(ptr %src, ptr %dst) nounwind { -; LA32-LABEL: extract_16xi16: -; LA32: # %bb.0: -; LA32-NEXT: xvld $xr0, $a0, 0 -; LA32-NEXT: vpickve2gr.h $a0, $vr0, 1 -; LA32-NEXT: st.h $a0, $a1, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: extract_16xi16: -; LA64: # %bb.0: -; LA64-NEXT: xvld $xr0, $a0, 0 -; LA64-NEXT: xvstelm.h $xr0, 
$a1, 0, 1 -; LA64-NEXT: ret +; CHECK-LABEL: extract_16xi16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 1 +; CHECK-NEXT: ret %v = load volatile <16 x i16>, ptr %src %e = extractelement <16 x i16> %v, i32 1 store i16 %e, ptr %dst @@ -111,8 +97,7 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; LA32-NEXT: movgr2fr.w $fa1, $a2 ; LA32-NEXT: xvpermi.q $xr2, $xr0, 1 ; LA32-NEXT: xvshuf.b $xr0, $xr2, $xr0, $xr1 -; LA32-NEXT: vpickve2gr.b $a0, $vr0, 0 -; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: xvstelm.b $xr0, $a1, 0, 0 ; LA32-NEXT: ret ; ; LA64-LABEL: extract_32xi8_idx: @@ -136,8 +121,7 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; LA32-NEXT: movgr2fr.w $fa1, $a2 ; LA32-NEXT: xvpermi.q $xr2, $xr0, 1 ; LA32-NEXT: xvshuf.h $xr1, $xr2, $xr0 -; LA32-NEXT: vpickve2gr.h $a0, $vr1, 0 -; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: xvstelm.h $xr1, $a1, 0, 0 ; LA32-NEXT: ret ; ; LA64-LABEL: extract_16xi16_idx: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll index ca405314686e6..af1598f69569e 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v32i8: @@ -68,11 +69,19 @@ entry: } define <4 x i64> @insert_extract_v4i64(<4 x i64> %a) nounwind { -; CHECK-LABEL: insert_extract_v4i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1 -; CHECK-NEXT: ret +; LA32-LABEL: insert_extract_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvpickve.w $xr1, $xr0, 6 +; LA32-NEXT: xvpickve.w $xr2, $xr0, 7 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 2 +; LA32-NEXT: xvinsve0.w $xr0, $xr2, 3 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_extract_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvpickve.d $xr1, $xr0, 3 +; LA64-NEXT: xvinsve0.d $xr0, $xr1, 1 +; LA64-NEXT: ret entry: %b = extractelement <4 x i64> %a, i32 3 %c = insertelement <4 x i64> %a, i64 %b, i32 1 @@ -80,10 +89,17 @@ entry: } define <4 x i64> @insert_extract0_v4i64(<4 x i64> %a) nounwind { -; CHECK-LABEL: insert_extract0_v4i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvinsve0.d $xr0, $xr0, 1 -; CHECK-NEXT: ret +; LA32-LABEL: insert_extract0_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvpickve.w $xr1, $xr0, 1 +; LA32-NEXT: xvinsve0.w $xr0, $xr0, 2 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 3 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_extract0_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvinsve0.d $xr0, $xr0, 1 +; LA64-NEXT: ret entry: %b = extractelement <4 x i64> %a, i32 0 %c = insertelement <4 x i64> %a, i64 %b, i32 1 diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll index 4e173c4feadba..c5d20003742e5 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll 
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v32i8: @@ -54,10 +55,22 @@ entry: } define <4 x i64> @insert_extract_v4i64(<4 x i64> %a) nounwind { -; CHECK-LABEL: insert_extract_v4i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvextrins.d $xr0, $xr0, 1 -; CHECK-NEXT: ret +; LA32-LABEL: insert_extract_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvpickve.w $xr1, $xr0, 2 +; LA32-NEXT: xvpickve.w $xr2, $xr0, 3 +; LA32-NEXT: xvpickve.w $xr3, $xr0, 6 +; LA32-NEXT: xvpickve.w $xr4, $xr0, 7 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 0 +; LA32-NEXT: xvinsve0.w $xr0, $xr2, 1 +; LA32-NEXT: xvinsve0.w $xr0, $xr3, 4 +; LA32-NEXT: xvinsve0.w $xr0, $xr4, 5 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_extract_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvextrins.d $xr0, $xr0, 1 +; LA64-NEXT: ret entry: %b_lo = extractelement <4 x i64> %a, i32 1 %b_hi = extractelement <4 x i64> %a, i32 3 diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll index aa29264924df9..2f1db43e68fef 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @insert_32xi8(ptr %src, ptr %dst, i8 %in) nounwind { ; CHECK-LABEL: insert_32xi8: @@ -121,12 +122,20 @@ define void @insert_8xi32(ptr %src, ptr %dst, i32 %in) nounwind { } define void @insert_4xi64(ptr %src, ptr %dst, i64 %in) nounwind { -; CHECK-LABEL: insert_4xi64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 1 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xi64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvinsgr2vr.w $xr0, $a2, 2 +; LA32-NEXT: xvinsgr2vr.w $xr0, $a3, 3 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xi64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvinsgr2vr.d $xr0, $a2, 1 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x i64>, ptr %src %v_new = insertelement <4 x i64> %v, i64 %in, i32 1 store <4 x i64> %v_new, ptr %dst @@ -162,18 +171,30 @@ define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind { } define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_32xi8_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI12_0) -; CHECK-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI12_0) -; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: xvreplgr2vr.b $xr2, $a0 -; CHECK-NEXT: xvseq.b $xr0, $xr2, $xr0 -; CHECK-NEXT: xvreplgr2vr.b 
$xr2, $a2 -; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_32xi8_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI12_0) +; LA32-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI12_0) +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvreplgr2vr.b $xr2, $a3 +; LA32-NEXT: xvseq.b $xr0, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.b $xr2, $a2 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_32xi8_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI12_0) +; LA64-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI12_0) +; LA64-NEXT: xvld $xr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: xvreplgr2vr.b $xr2, $a0 +; LA64-NEXT: xvseq.b $xr0, $xr2, $xr0 +; LA64-NEXT: xvreplgr2vr.b $xr2, $a2 +; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <32 x i8>, ptr %src %v_new = insertelement <32 x i8> %v, i8 %in, i32 %idx store <32 x i8> %v_new, ptr %dst @@ -181,18 +202,30 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { } define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_16xi16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI13_0) -; CHECK-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI13_0) -; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: xvreplgr2vr.h $xr2, $a0 -; CHECK-NEXT: xvseq.h $xr0, $xr2, $xr0 -; CHECK-NEXT: xvreplgr2vr.h $xr2, $a2 -; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_16xi16_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI13_0) +; LA32-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI13_0) +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvreplgr2vr.h $xr2, $a3 +; LA32-NEXT: xvseq.h $xr0, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.h $xr2, $a2 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_16xi16_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI13_0) +; LA64-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI13_0) +; LA64-NEXT: xvld $xr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: xvreplgr2vr.h $xr2, $a0 +; LA64-NEXT: xvseq.h $xr0, $xr2, $xr0 +; LA64-NEXT: xvreplgr2vr.h $xr2, $a2 +; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <16 x i16>, ptr %src %v_new = insertelement <16 x i16> %v, i16 %in, i32 %idx store <16 x i16> %v_new, ptr %dst @@ -200,18 +233,30 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { } define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_8xi32_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI14_0) -; CHECK-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI14_0) -; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: xvreplgr2vr.w $xr2, $a0 -; CHECK-NEXT: xvseq.w $xr0, $xr2, $xr0 -; CHECK-NEXT: xvreplgr2vr.w $xr2, $a2 -; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_8xi32_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI14_0) +; LA32-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI14_0) +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a3 +; LA32-NEXT: xvseq.w $xr0, 
$xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a2 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_8xi32_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI14_0) +; LA64-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI14_0) +; LA64-NEXT: xvld $xr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: xvreplgr2vr.w $xr2, $a0 +; LA64-NEXT: xvseq.w $xr0, $xr2, $xr0 +; LA64-NEXT: xvreplgr2vr.w $xr2, $a2 +; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <8 x i32>, ptr %src %v_new = insertelement <8 x i32> %v, i32 %in, i32 %idx store <8 x i32> %v_new, ptr %dst @@ -219,18 +264,36 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { } define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_4xi64_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI15_0) -; CHECK-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI15_0) -; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: xvreplgr2vr.d $xr2, $a0 -; CHECK-NEXT: xvseq.d $xr0, $xr2, $xr0 -; CHECK-NEXT: xvreplgr2vr.d $xr2, $a2 -; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xi64_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a5, %pc_hi20(.LCPI15_0) +; LA32-NEXT: xvld $xr0, $a5, %pc_lo12(.LCPI15_0) +; LA32-NEXT: add.w $a4, $a4, $a4 +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a4 +; LA32-NEXT: xvseq.w $xr2, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.w $xr3, $a2 +; LA32-NEXT: xvbitsel.v $xr1, $xr1, $xr3, $xr2 +; LA32-NEXT: addi.w $a0, $a4, 1 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a0 +; LA32-NEXT: xvseq.w $xr0, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a3 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xi64_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI15_0) +; LA64-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI15_0) +; LA64-NEXT: xvld $xr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: xvreplgr2vr.d $xr2, $a0 +; LA64-NEXT: xvseq.d $xr0, $xr2, $xr0 +; LA64-NEXT: xvreplgr2vr.d $xr2, $a2 +; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x i64>, ptr %src %v_new = insertelement <4 x i64> %v, i64 %in, i32 %idx store <4 x i64> %v_new, ptr %dst @@ -238,19 +301,32 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { } define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_8xfloat_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 -; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI16_0) -; CHECK-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI16_0) -; CHECK-NEXT: xvld $xr2, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 -; CHECK-NEXT: xvreplgr2vr.w $xr3, $a0 -; CHECK-NEXT: xvseq.w $xr1, $xr3, $xr1 -; CHECK-NEXT: xvreplve0.w $xr0, $xr0 -; CHECK-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_8xfloat_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI16_0) +; LA32-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI16_0) +; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA32-NEXT: xvld $xr2, $a0, 0 +; LA32-NEXT: xvreplgr2vr.w $xr3, $a2 +; LA32-NEXT: xvseq.w $xr1, $xr3, $xr1 +; LA32-NEXT: 
xvreplve0.w $xr0, $xr0 +; LA32-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_8xfloat_idx: +; LA64: # %bb.0: +; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA64-NEXT: pcalau12i $a3, %pc_hi20(.LCPI16_0) +; LA64-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI16_0) +; LA64-NEXT: xvld $xr2, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a2, 31, 0 +; LA64-NEXT: xvreplgr2vr.w $xr3, $a0 +; LA64-NEXT: xvseq.w $xr1, $xr3, $xr1 +; LA64-NEXT: xvreplve0.w $xr0, $xr0 +; LA64-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <8 x float>, ptr %src %v_new = insertelement <8 x float> %v, float %in, i32 %idx store <8 x float> %v_new, ptr %dst @@ -258,19 +334,36 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin } define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_4xdouble_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI17_0) -; CHECK-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI17_0) -; CHECK-NEXT: xvld $xr2, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 -; CHECK-NEXT: xvreplgr2vr.d $xr3, $a0 -; CHECK-NEXT: xvseq.d $xr1, $xr3, $xr1 -; CHECK-NEXT: xvreplve0.d $xr0, $xr0 -; CHECK-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xdouble_idx: +; LA32: # %bb.0: +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvrepli.b $xr2, 0 +; LA32-NEXT: xvinsgr2vr.w $xr2, $a2, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_0) +; LA32-NEXT: xvld $xr3, $a0, %pc_lo12(.LCPI17_0) +; LA32-NEXT: xvinsgr2vr.w $xr2, $a2, 2 +; LA32-NEXT: xvinsgr2vr.w $xr2, $a2, 4 +; LA32-NEXT: xvinsgr2vr.w $xr2, $a2, 6 +; LA32-NEXT: xvseq.d $xr2, $xr2, $xr3 +; LA32-NEXT: xvreplve0.d $xr0, $xr0 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xdouble_idx: +; LA64: # %bb.0: +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA64-NEXT: pcalau12i $a3, %pc_hi20(.LCPI17_0) +; LA64-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI17_0) +; LA64-NEXT: xvld $xr2, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a2, 31, 0 +; LA64-NEXT: xvreplgr2vr.d $xr3, $a0 +; LA64-NEXT: xvseq.d $xr1, $xr3, $xr1 +; LA64-NEXT: xvreplve0.d $xr0, $xr0 +; LA64-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x double>, ptr %src %v_new = insertelement <4 x double> %v, double %in, i32 %idx store <4 x double> %v_new, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll index 935a30a3e54ed..e498358cf4d19 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ;; xvrepl128vei.b diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll index 6a88805148715..4900146b69a25 100644 --- 
a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ;; xvshuf.b diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll index 02186d23e31e5..37b62ca989edb 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ;; xxvshuf4i.b @@ -40,4 +41,4 @@ define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) ; CHECK-NEXT: ret %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %c -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll index 5f76d9951df9c..24f1b31702b71 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s define <32 x i8> @shuffle_v32i8(<32 x i8> %a) { diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll index 7268eb24ee51c..3e815a174d232 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll @@ -1,19 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefix=LA64 define void @vec_reduce_add_v32i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvhaddw.h.b $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 -; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvhaddw.h.b $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA32-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA32-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v32i8: +; LA64: 
# %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvhaddw.h.b $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA64-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA64-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <32 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %v) store i8 %res, ptr %dst @@ -21,17 +35,29 @@ define void @vec_reduce_add_v32i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v16i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 -; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: st.h $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v16i16: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA32-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA32-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v16i16: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA64-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA64-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret %v = load <16 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %v) store i16 %res, ptr %dst @@ -39,16 +65,27 @@ define void @vec_reduce_add_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 -; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: st.w $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA32-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA32-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA64-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA64-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: st.w $a0, $a1, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -56,14 +93,31 @@ define void @vec_reduce_add_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 
2 -; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: add.w $a0, $a2, $a0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 0 +; LA32-NEXT: add.w $a2, $a3, $a2 +; LA32-NEXT: sltu $a3, $a2, $a3 +; LA32-NEXT: add.w $a0, $a0, $a3 +; LA32-NEXT: st.w $a2, $a1, 0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA64-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA64-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvstelm.d $xr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll index fd64beab57bf0..23cc230f04503 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_and_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_and_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_and_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vand.v $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vand.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,30 @@ define void @vec_reduce_and_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, 
$vr0, 8 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vand.v $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: and $a0, $a2, $a0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: and $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vand.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll index cdb08d9de3821..d7d3afc6dd1da 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_or_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_or_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_or_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,30 @@ define void @vec_reduce_or_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: 
vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smax.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smax.ll index 1d182731c93be..8cbbb52884865 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smax.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_smax_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_smax_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_smax_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmax.w $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmax.w $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,41 @@ define void @vec_reduce_smax_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.d $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; 
CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmax.d $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: slt $a3, $a2, $a0 +; LA32-NEXT: xor $a4, $a0, $a2 +; LA32-NEXT: sltui $a4, $a4, 1 +; LA32-NEXT: masknez $a3, $a3, $a4 +; LA32-NEXT: vpickve2gr.w $a5, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a6, $vr0, 2 +; LA32-NEXT: sltu $a7, $a6, $a5 +; LA32-NEXT: maskeqz $a4, $a7, $a4 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: masknez $a4, $a6, $a3 +; LA32-NEXT: maskeqz $a5, $a5, $a3 +; LA32-NEXT: or $a4, $a5, $a4 +; LA32-NEXT: masknez $a2, $a2, $a3 +; LA32-NEXT: maskeqz $a0, $a0, $a3 +; LA32-NEXT: or $a0, $a0, $a2 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a4, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmax.d $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.d $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smin.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smin.ll index 369afdd1fc7bc..c34852aa8a28f 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smin.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_smin_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_smin_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_smin_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmin.w $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmin.w $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,41 @@ define void @vec_reduce_smin_v8i32(ptr %src, ptr %dst) nounwind { } 
define void @vec_reduce_smin_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmin.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.d $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmin.d $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: slt $a3, $a2, $a0 +; LA32-NEXT: xor $a4, $a2, $a0 +; LA32-NEXT: sltui $a4, $a4, 1 +; LA32-NEXT: masknez $a3, $a3, $a4 +; LA32-NEXT: vpickve2gr.w $a5, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a6, $vr0, 0 +; LA32-NEXT: sltu $a7, $a6, $a5 +; LA32-NEXT: maskeqz $a4, $a7, $a4 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: masknez $a4, $a5, $a3 +; LA32-NEXT: maskeqz $a5, $a6, $a3 +; LA32-NEXT: or $a4, $a5, $a4 +; LA32-NEXT: masknez $a0, $a0, $a3 +; LA32-NEXT: maskeqz $a2, $a2, $a3 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a4, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmin.d $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.d $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umax.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umax.ll index 5256a72ad7d97..c44f83a909a68 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umax.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_umax_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_umax_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_umax_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmax.wu $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmax.wu $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; 
LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,41 @@ define void @vec_reduce_umax_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.du $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmax.du $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: sltu $a3, $a2, $a0 +; LA32-NEXT: xor $a4, $a0, $a2 +; LA32-NEXT: sltui $a4, $a4, 1 +; LA32-NEXT: masknez $a3, $a3, $a4 +; LA32-NEXT: vpickve2gr.w $a5, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a6, $vr0, 2 +; LA32-NEXT: sltu $a7, $a6, $a5 +; LA32-NEXT: maskeqz $a4, $a7, $a4 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: masknez $a4, $a6, $a3 +; LA32-NEXT: maskeqz $a5, $a5, $a3 +; LA32-NEXT: or $a4, $a5, $a4 +; LA32-NEXT: masknez $a2, $a2, $a3 +; LA32-NEXT: maskeqz $a0, $a0, $a3 +; LA32-NEXT: or $a0, $a0, $a2 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a4, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmax.du $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.du $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umin.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umin.ll index a82c886d8eed1..f91a1b34dffe9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umin.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_umin_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_umin_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_umin_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmin.wu $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmin.wu $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 
+; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmin.wu $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,41 @@ define void @vec_reduce_umin_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmin.du $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.du $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmin.du $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: sltu $a3, $a2, $a0 +; LA32-NEXT: xor $a4, $a2, $a0 +; LA32-NEXT: sltui $a4, $a4, 1 +; LA32-NEXT: masknez $a3, $a3, $a4 +; LA32-NEXT: vpickve2gr.w $a5, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a6, $vr0, 0 +; LA32-NEXT: sltu $a7, $a6, $a5 +; LA32-NEXT: maskeqz $a4, $a7, $a4 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: masknez $a4, $a5, $a3 +; LA32-NEXT: maskeqz $a5, $a6, $a3 +; LA32-NEXT: or $a4, $a5, $a4 +; LA32-NEXT: masknez $a0, $a0, $a3 +; LA32-NEXT: maskeqz $a2, $a2, $a3 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a4, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmin.du $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.du $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-xor.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-xor.ll index 429fadcdd156e..af1a66b574c03 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-xor.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-xor.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_xor_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_xor_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_xor_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: 
vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,30 @@ define void @vec_reduce_xor_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: xor $a0, $a2, $a0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: xor $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll index b697a2fd07435..c0fa734034114 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ;; TODO For these special shuffle mask, we can lower it to xvbsll + xvbsrl + xvor. 
diff --git a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll index 06d4a5d03f276..09908f619fa1f 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll @@ -1,15 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LA64 define i32 @xmsk_eq_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_eq_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmsknz.b $xr0, $xr0 -; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_eq_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmsknz.b $xr0, $xr0 +; LA32-NEXT: xvnor.v $xr0, $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_eq_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmsknz.b $xr0, $xr0 +; LA64-NEXT: xvnor.v $xr0, $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp eq <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -17,15 +27,25 @@ entry: } define i32 @xmsk_sgt_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sgt_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvrepli.b $xr1, 0 -; CHECK-NEXT: xvslt.b $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sgt_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvrepli.b $xr1, 0 +; LA32-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sgt_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvrepli.b $xr1, 0 +; LA64-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sgt <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -33,13 +53,21 @@ entry: } define i32 @xmsk_sgt_allones_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sgt_allones_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskgez.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sgt_allones_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskgez.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sgt_allones_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskgez.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 
+; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sgt <32 x i8> %a, splat (i8 -1) %2 = bitcast <32 x i1> %1 to i32 @@ -47,13 +75,21 @@ entry: } define i32 @xmsk_sge_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sge_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskgez.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sge_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskgez.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sge_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskgez.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sge <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -61,13 +97,21 @@ entry: } define i32 @xmsk_slt_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_slt_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_slt_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_slt_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp slt <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -75,13 +119,21 @@ entry: } define i16 @xmsk_slt_allzeros_i16(<16 x i16 > %a) { -; CHECK-LABEL: xmsk_slt_allzeros_i16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_slt_allzeros_i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_slt_allzeros_i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret entry: %1 = icmp slt <16 x i16> %a, splat (i16 0) %2 = bitcast <16 x i1> %1 to i16 @@ -89,13 +141,21 @@ entry: } define i8 @xmsk_slt_allzeros_i32(<8 x i32 > %a) { -; CHECK-LABEL: xmsk_slt_allzeros_i32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_slt_allzeros_i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_slt_allzeros_i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, 
$xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret entry: %1 = icmp slt <8 x i32> %a, splat (i32 0) %2 = bitcast <8 x i1> %1 to i8 @@ -103,13 +163,21 @@ entry: } define i4 @xmsk_slt_allzeros_i64(<4 x i64 > %a) { -; CHECK-LABEL: xmsk_slt_allzeros_i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_slt_allzeros_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_slt_allzeros_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret entry: %1 = icmp slt <4 x i64> %a, splat (i64 0) %2 = bitcast <4 x i1> %1 to i4 @@ -117,14 +185,23 @@ entry: } define i32 @xmsk_sle_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sle_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvslei.b $xr0, $xr0, 0 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvslei.b $xr0, $xr0, 0 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvslei.b $xr0, $xr0, 0 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sle <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -132,13 +209,21 @@ entry: } define i32 @xmsk_sle_allones_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sle_allones_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allones_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allones_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sle <32 x i8> %a, splat (i8 -1) %2 = bitcast <32 x i1> %1 to i32 @@ -146,13 +231,21 @@ entry: } define i16 @xmsk_sle_allones_i32(<16 x i16 > %a) { -; CHECK-LABEL: xmsk_sle_allones_i32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allones_i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; 
LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allones_i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret entry: %1 = icmp sle <16 x i16> %a, splat (i16 -1) %2 = bitcast <16 x i1> %1 to i16 @@ -160,13 +253,21 @@ entry: } define i8 @xmsk_sle_allones_i16(<8 x i32 > %a) { -; CHECK-LABEL: xmsk_sle_allones_i16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allones_i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allones_i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret entry: %1 = icmp sle <8 x i32> %a, splat (i32 -1) %2 = bitcast <8 x i1> %1 to i8 @@ -174,13 +275,21 @@ entry: } define i4 @xmsk_sle_allones_i64(<4 x i64 > %a) { -; CHECK-LABEL: xmsk_sle_allones_i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allones_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allones_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret entry: %1 = icmp sle <4 x i64> %a, splat (i64 -1) %2 = bitcast <4 x i1> %1 to i4 @@ -188,13 +297,21 @@ entry: } define i32 @xmsk_ne_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_ne_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmsknz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_ne_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmsknz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_ne_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmsknz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp ne <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -202,100 +319,165 @@ entry: } define i4 @xvmsk_sgt_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: xvmsk_sgt_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 
0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %x = icmp sgt <4 x i64> %a, %b %res = bitcast <4 x i1> %x to i4 ret i4 %res } define i4 @xvmsk_ogt_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: xvmsk_ogt_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_v4f64: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_v4f64: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %x = fcmp ogt <4 x double> %a, %b %res = bitcast <4 x i1> %x to i4 ret i4 %res } define i8 @xvmsk_sgt_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: xvmsk_sgt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x = icmp sgt <8 x i32> %a, %b %res = bitcast <8 x i1> %x to i8 ret i8 %res } define i8 @xvmsk_ogt_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: xvmsk_ogt_v8f32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_v8f32: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_v8f32: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x = fcmp ogt <8 x float> %a, %b %res = bitcast <8 x i1> %x to i8 ret i8 %res } define i16 @xvmsk_sgt_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: xvmsk_sgt_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.h $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, 
$xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_v16i16: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.h $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_v16i16: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.h $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret %x = icmp sgt <16 x i16> %a, %b %res = bitcast <16 x i1> %x to i16 ret i16 %res } define i32 @xvmsk_sgt_v32i8(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: xvmsk_sgt_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.b $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret %x = icmp sgt <32 x i8> %a, %b %res = bitcast <32 x i1> %x to i32 ret i32 %res } define i4 @xvmsk_sgt_and_sgt_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { -; CHECK-LABEL: xvmsk_sgt_and_sgt_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.d $xr2, $xr3, $xr2 -; CHECK-NEXT: xvslt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_and_sgt_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.d $xr2, $xr3, $xr2 +; LA32-NEXT: xvslt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_and_sgt_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.d $xr2, $xr3, $xr2 +; LA64-NEXT: xvslt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %x0 = icmp sgt <4 x i64> %a, %b %x1 = icmp sgt <4 x i64> %c, %d %y = and <4 x i1> %x0, %x1 @@ -304,16 +486,27 @@ define i4 @xvmsk_sgt_and_sgt_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 } define i4 @xvmsk_ogt_and_ogt_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { -; CHECK-LABEL: xvmsk_ogt_and_ogt_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.d $xr2, $xr3, $xr2 -; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: 
xvmsk_ogt_and_ogt_v4f64: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.d $xr2, $xr3, $xr2 +; LA32-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_and_ogt_v4f64: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.d $xr2, $xr3, $xr2 +; LA64-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %x0 = fcmp ogt <4 x double> %a, %b %x1 = fcmp ogt <4 x double> %c, %d %y = and <4 x i1> %x0, %x1 @@ -322,16 +515,27 @@ define i4 @xvmsk_ogt_and_ogt_v4f64(<4 x double> %a, <4 x double> %b, <4 x double } define i8 @xvmsk_sgt_and_sgt_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { -; CHECK-LABEL: xvmsk_sgt_and_sgt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.w $xr2, $xr3, $xr2 -; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_and_sgt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.w $xr2, $xr3, $xr2 +; LA32-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_and_sgt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.w $xr2, $xr3, $xr2 +; LA64-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = icmp sgt <8 x i32> %a, %b %x1 = icmp sgt <8 x i32> %c, %d %y = and <8 x i1> %x0, %x1 @@ -340,16 +544,27 @@ define i8 @xvmsk_sgt_and_sgt_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 } define i8 @xvmsk_sgt_or_sgt_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { -; CHECK-LABEL: xvmsk_sgt_or_sgt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.w $xr2, $xr3, $xr2 -; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_or_sgt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.w $xr2, $xr3, $xr2 +; LA32-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_or_sgt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.w $xr2, $xr3, $xr2 +; LA64-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA64-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = icmp sgt <8 x i32> %a, %b %x1 = icmp sgt <8 x i32> %c, %d %y = or <8 x i1> %x0, %x1 @@ -358,18 +573,31 @@ define i8 
@xvmsk_sgt_or_sgt_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x } define i8 @xvmsk_sgt_or_slt_and_eq_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d, <8 x i32> %e, <8 x i32> %f) { -; CHECK-LABEL: xvmsk_sgt_or_slt_and_eq_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.w $xr2, $xr2, $xr3 -; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvseq.w $xr1, $xr4, $xr5 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_or_slt_and_eq_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.w $xr2, $xr2, $xr3 +; LA32-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvseq.w $xr1, $xr4, $xr5 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_or_slt_and_eq_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.w $xr2, $xr2, $xr3 +; LA64-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA64-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvseq.w $xr1, $xr4, $xr5 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = icmp sgt <8 x i32> %a, %b %x1 = icmp slt <8 x i32> %c, %d %x2 = icmp eq <8 x i32> %e, %f @@ -380,15 +608,25 @@ define i8 @xvmsk_sgt_or_slt_and_eq_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> % } define i8 @xvmsk_eq_vsel_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { -; CHECK-LABEL: xvmsk_eq_vsel_slt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_eq_vsel_slt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvseq.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_eq_vsel_slt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvseq.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %cmp = icmp eq <8 x i32> %a0, %a1 %slt = icmp slt <8 x i32> %a2, zeroinitializer %sel = select <8 x i1> %cmp, <8 x i1> , <8 x i1> %slt @@ -397,22 +635,39 @@ define i8 @xvmsk_eq_vsel_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) } define i8 @xvmsk_sel_eq_or_eq_or_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3, i1 %a4) { -; CHECK-LABEL: xvmsk_sel_eq_or_eq_or_slt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: andi $a0, $a0, 1 -; CHECK-NEXT: xvseq.w $xr2, $xr0, $xr2 -; CHECK-NEXT: addi.d $a1, $zero, -1 -; CHECK-NEXT: maskeqz $a0, $a1, $a0 -; CHECK-NEXT: xvreplgr2vr.w $xr4, $a0 -; CHECK-NEXT: xvand.v $xr2, $xr2, $xr4 -; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvor.v $xr0, $xr3, $xr0 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; 
CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sel_eq_or_eq_or_slt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: andi $a0, $a0, 1 +; LA32-NEXT: xvseq.w $xr2, $xr0, $xr2 +; LA32-NEXT: addi.w $a1, $zero, -1 +; LA32-NEXT: maskeqz $a0, $a1, $a0 +; LA32-NEXT: xvreplgr2vr.w $xr4, $a0 +; LA32-NEXT: xvand.v $xr2, $xr2, $xr4 +; LA32-NEXT: xvseq.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvor.v $xr0, $xr3, $xr0 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sel_eq_or_eq_or_slt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: andi $a0, $a0, 1 +; LA64-NEXT: xvseq.w $xr2, $xr0, $xr2 +; LA64-NEXT: addi.d $a1, $zero, -1 +; LA64-NEXT: maskeqz $a0, $a1, $a0 +; LA64-NEXT: xvreplgr2vr.w $xr4, $a0 +; LA64-NEXT: xvand.v $xr2, $xr2, $xr4 +; LA64-NEXT: xvseq.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvor.v $xr0, $xr3, $xr0 +; LA64-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %cmp0 = icmp eq <8 x i32> %a0, %a1 %cmp1 = icmp eq <8 x i32> %a0, %a2 %cmp2 = icmp slt <8 x i32> %a3, zeroinitializer @@ -424,16 +679,27 @@ define i8 @xvmsk_sel_eq_or_eq_or_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 } define i8 @xvmsk_ogt_and_ogt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { -; CHECK-LABEL: xvmsk_ogt_and_ogt_v8f32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 -; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_and_ogt_v8f32: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 +; LA32-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_and_ogt_v8f32: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 +; LA64-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = fcmp ogt <8 x float> %a, %b %x1 = fcmp ogt <8 x float> %c, %d %y = and <8 x i1> %x0, %x1 @@ -442,16 +708,27 @@ define i8 @xvmsk_ogt_and_ogt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> % } define i8 @xvmsk_sgt_xor_sgt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { -; CHECK-LABEL: xvmsk_sgt_xor_sgt_v8f32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 -; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 -; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_xor_sgt_v8f32: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 +; LA32-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA32-NEXT: 
xvxor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_xor_sgt_v8f32: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 +; LA64-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA64-NEXT: xvxor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = fcmp ogt <8 x float> %a, %b %x1 = fcmp ogt <8 x float> %c, %d %y = xor <8 x i1> %x0, %x1 @@ -460,18 +737,31 @@ define i8 @xvmsk_sgt_xor_sgt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> % } define i8 @xvmsk_ugt_xor_ueq_and_ogt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, <8 x float> %e, <8 x float> %f) { -; CHECK-LABEL: xvmsk_ugt_xor_ueq_and_ogt_v8f32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.cueq.s $xr2, $xr2, $xr3 -; CHECK-NEXT: xvfcmp.cult.s $xr0, $xr1, $xr0 -; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvfcmp.clt.s $xr1, $xr5, $xr4 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ugt_xor_ueq_and_ogt_v8f32: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.cueq.s $xr2, $xr2, $xr3 +; LA32-NEXT: xvfcmp.cult.s $xr0, $xr1, $xr0 +; LA32-NEXT: xvxor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvfcmp.clt.s $xr1, $xr5, $xr4 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ugt_xor_ueq_and_ogt_v8f32: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.cueq.s $xr2, $xr2, $xr3 +; LA64-NEXT: xvfcmp.cult.s $xr0, $xr1, $xr0 +; LA64-NEXT: xvxor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvfcmp.clt.s $xr1, $xr5, $xr4 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = fcmp ugt <8 x float> %a, %b %x1 = fcmp ueq <8 x float> %c, %d %x2 = fcmp ogt <8 x float> %e, %f @@ -482,16 +772,27 @@ define i8 @xvmsk_ugt_xor_ueq_and_ogt_v8f32(<8 x float> %a, <8 x float> %b, <8 x } define i16 @xvmsk_sgt_and_sgt_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { -; CHECK-LABEL: xvmsk_sgt_and_sgt_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.h $xr2, $xr3, $xr2 -; CHECK-NEXT: xvslt.h $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_and_sgt_v16i16: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.h $xr2, $xr3, $xr2 +; LA32-NEXT: xvslt.h $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_and_sgt_v16i16: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.h $xr2, $xr3, $xr2 +; LA64-NEXT: xvslt.h $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu 
$a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret %x0 = icmp sgt <16 x i16> %a, %b %x1 = icmp sgt <16 x i16> %c, %d %y = and <16 x i1> %x0, %x1 @@ -500,16 +801,27 @@ define i16 @xvmsk_sgt_and_sgt_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c } define i32 @xvmsk_sgt_and_sgt_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { -; CHECK-LABEL: xvmsk_sgt_and_sgt_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.b $xr0, $xr1, $xr0 -; CHECK-NEXT: xvslt.b $xr1, $xr3, $xr2 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_and_sgt_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA32-NEXT: xvslt.b $xr1, $xr3, $xr2 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_and_sgt_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA64-NEXT: xvslt.b $xr1, $xr3, $xr2 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret %x0 = icmp sgt <32 x i8> %a, %b %x1 = icmp sgt <32 x i8> %c, %d %y = and <32 x i1> %x0, %x1 @@ -518,17 +830,29 @@ define i32 @xvmsk_sgt_and_sgt_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3 } define i8 @xvmsk_eq_v2i64_concat_poison(<2 x i64> %vec) { -; CHECK-LABEL: xvmsk_eq_v2i64_concat_poison: -; CHECK: # %bb.0: -; CHECK-NEXT: vseqi.d $vr0, $vr0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1 -; CHECK-NEXT: vslli.h $vr0, $vr1, 15 -; CHECK-NEXT: vmskltz.h $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_eq_v2i64_concat_poison: +; LA32: # %bb.0: +; LA32-NEXT: vseqi.d $vr0, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: vinsgr2vr.h $vr1, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vinsgr2vr.h $vr1, $a0, 1 +; LA32-NEXT: vslli.h $vr0, $vr1, 15 +; LA32-NEXT: vmskltz.h $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_eq_v2i64_concat_poison: +; LA64: # %bb.0: +; LA64-NEXT: vseqi.d $vr0, $vr0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: vinsgr2vr.h $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: vinsgr2vr.h $vr1, $a0, 1 +; LA64-NEXT: vslli.h $vr0, $vr1, 15 +; LA64-NEXT: vmskltz.h $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: ret %tobool = icmp eq <2 x i64> %vec, zeroinitializer %insertvec = shufflevector <2 x i1> %tobool, <2 x i1> poison, <8 x i32> %res = bitcast <8 x i1> %insertvec to i8 @@ -560,22 +884,39 @@ define i8 @xvmsk_ne_v4i32_concat_poison(<4 x i32> %vec) { } define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) { -; CHECK-LABEL: xvmsk_ogt_v4f64_concat_poison: -; CHECK: # %bb.0: -; CHECK-NEXT: xvrepli.b $xr1, 0 -; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 2 -; CHECK-NEXT: xvpickve2gr.d $a2, $xr0, 1 -; CHECK-NEXT: xvpickve2gr.d $a3, $xr0, 0 -; CHECK-NEXT: 
vinsgr2vr.h $vr0, $a3, 0 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 3 -; CHECK-NEXT: vslli.h $vr0, $vr0, 15 -; CHECK-NEXT: vmskltz.h $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_v4f64_concat_poison: +; LA32: # %bb.0: +; LA32-NEXT: xvrepli.b $xr1, 0 +; LA32-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 6 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 4 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 0 +; LA32-NEXT: vinsgr2vr.h $vr0, $a3, 0 +; LA32-NEXT: vinsgr2vr.h $vr0, $a2, 1 +; LA32-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; LA32-NEXT: vinsgr2vr.h $vr0, $a0, 3 +; LA32-NEXT: vslli.h $vr0, $vr0, 15 +; LA32-NEXT: vmskltz.h $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_v4f64_concat_poison: +; LA64: # %bb.0: +; LA64-NEXT: xvrepli.b $xr1, 0 +; LA64-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 1 +; LA64-NEXT: xvpickve2gr.d $a3, $xr0, 0 +; LA64-NEXT: vinsgr2vr.h $vr0, $a3, 0 +; LA64-NEXT: vinsgr2vr.h $vr0, $a2, 1 +; LA64-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; LA64-NEXT: vinsgr2vr.h $vr0, $a0, 3 +; LA64-NEXT: vslli.h $vr0, $vr0, 15 +; LA64-NEXT: vmskltz.h $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: ret %tobool = fcmp ogt <4 x double> %vec, zeroinitializer %insertvec = shufflevector <4 x i1> %tobool, <4 x i1> poison, <8 x i32> %res = bitcast <8 x i1> %insertvec to i8 @@ -583,56 +924,92 @@ define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) { } define i32 @xvmsk_trunc_i8(<32 x i8> %a) { -; CHECK-LABEL: xvmsk_trunc_i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslli.b $xr0, $xr0, 7 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_trunc_i8: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.b $xr0, $xr0, 7 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_trunc_i8: +; LA64: # %bb.0: +; LA64-NEXT: xvslli.b $xr0, $xr0, 7 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret %y = trunc <32 x i8> %a to <32 x i1> %res = bitcast <32 x i1> %y to i32 ret i32 %res } define i16 @xvmsk_trunc_i16(<16 x i16> %a) { -; CHECK-LABEL: xvmsk_trunc_i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslli.h $xr0, $xr0, 15 -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_trunc_i16: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.h $xr0, $xr0, 15 +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_trunc_i16: +; LA64: # %bb.0: +; LA64-NEXT: xvslli.h $xr0, $xr0, 15 +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret %y = trunc <16 x i16> %a to <16 x i1> %res = 
bitcast <16 x i1> %y to i16 ret i16 %res } define i8 @xvmsk_trunc_i32(<8 x i32> %a) { -; CHECK-LABEL: xvmsk_trunc_i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslli.w $xr0, $xr0, 31 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_trunc_i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.w $xr0, $xr0, 31 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_trunc_i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslli.w $xr0, $xr0, 31 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %y = trunc <8 x i32> %a to <8 x i1> %res = bitcast <8 x i1> %y to i8 ret i8 %res } define i4 @xvmsk_trunc_i64(<4 x i64> %a) { -; CHECK-LABEL: xvmsk_trunc_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslli.d $xr0, $xr0, 63 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_trunc_i64: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.d $xr0, $xr0, 63 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_trunc_i64: +; LA64: # %bb.0: +; LA64-NEXT: xvslli.d $xr0, $xr0, 63 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %y = trunc <4 x i64> %a to <4 x i1> %res = bitcast <4 x i1> %y to i4 ret i4 %res diff --git a/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll index 4c17d3fd8d7b2..b0d36a8143fa1 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll @@ -1,20 +1,39 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lsx --verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefix=LA32 ; RUN: llc --mtriple=loongarch64 -mattr=+lsx --verify-machineinstrs < %s \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefix=LA64 declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: bitrev.8b $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: bitrev.8b $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: vori.b $vr0, $vr1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v16i8: +; LA32: # %bb.0: +; LA32-NEXT: vslli.b $vr1, $vr0, 4 +; LA32-NEXT: vsrli.b $vr0, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandi.b $vr1, $vr0, 51 +; LA32-NEXT: vslli.b $vr1, $vr1, 2 +; LA32-NEXT: vsrli.b $vr0, $vr0, 2 +; LA32-NEXT: vandi.b $vr0, $vr0, 51 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandi.b $vr1, $vr0, 85 +; LA32-NEXT: vslli.b $vr1, $vr1, 1 +; LA32-NEXT: vsrli.b $vr0, $vr0, 1 +; LA32-NEXT: vandi.b $vr0, $vr0, 85 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: ret +; 
+; LA64-LABEL: test_bitreverse_v16i8: +; LA64: # %bb.0: +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: vori.b $vr0, $vr1, 0 +; LA64-NEXT: ret %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ret <16 x i8> %b } @@ -22,16 +41,33 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: vshuf4i.h $vr0, $vr1, 27 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v8i16: +; LA32: # %bb.0: +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 2 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 3 +; LA32-NEXT: vshuf4i.h $vr0, $vr1, 27 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v8i16: +; LA64: # %bb.0: +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: vshuf4i.h $vr0, $vr1, 27 +; LA64-NEXT: ret %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ret <8 x i16> %b } @@ -39,16 +75,33 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: vshuf4i.w $vr0, $vr1, 177 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 2 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 3 +; LA32-NEXT: vori.b $vr0, $vr1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: vshuf4i.w $vr0, $vr1, 177 +; LA64-NEXT: ret %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ret <4 x i32> %b } @@ -56,16 +109,36 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { declare <2 x i64> 
@llvm.bitreverse.v2i64(<2 x i64>) define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: vori.b $vr0, $vr1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI3_0) +; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA32-NEXT: vslli.b $vr1, $vr0, 4 +; LA32-NEXT: vsrli.b $vr0, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandi.b $vr1, $vr0, 51 +; LA32-NEXT: vslli.b $vr1, $vr1, 2 +; LA32-NEXT: vsrli.b $vr0, $vr0, 2 +; LA32-NEXT: vandi.b $vr0, $vr0, 51 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandi.b $vr1, $vr0, 85 +; LA32-NEXT: vslli.b $vr1, $vr1, 1 +; LA32-NEXT: vsrli.b $vr0, $vr0, 1 +; LA32-NEXT: vandi.b $vr0, $vr0, 85 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: vori.b $vr0, $vr1, 0 +; LA64-NEXT: ret %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ret <2 x i64> %b } diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll index 669c53b73b16f..92981211adeb8 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll index 1b7a97d9f9720..324098b918890 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s declare <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll index 3cd6c78e87d78..ad46b47c82c86 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s declare <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll index 667ba32723fc4..2ecbe685ff20b 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll 
+++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s declare <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8>, i32) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll index b73bada4f06fb..f4348f57442e6 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s declare <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8>, i32) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d-invalid-imm.ll new file mode 100644 index 0000000000000..4dc5163e721ce --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d-invalid-imm.ll @@ -0,0 +1,33 @@ +; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s + +declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) + +define i64 @lsx_vpickve2gr_d_lo(<2 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 -1) + ret i64 %res +} + +define i64 @lsx_vpickve2gr_d_hi(<2 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 2) + ret i64 %res +} + +declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) + +define i64 @lsx_vpickve2gr_du_lo(<2 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 -1) + ret i64 %res +} + +define i64 @lsx_vpickve2gr_du_hi(<2 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 2) + ret i64 %res +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d.ll new file mode 100644 index 0000000000000..78f4e3c1bc18b --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) + +define i64 @lsx_vpickve2gr_d(<2 x i64> %va) nounwind { +; CHECK-LABEL: lsx_vpickve2gr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 +; CHECK-NEXT: ret +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 1) + ret i64 %res +} + +declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) + +define i64 @lsx_vpickve2gr_du(<2 x i64> %va) nounwind { +; CHECK-LABEL: lsx_vpickve2gr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpickve2gr.du $a0, $vr0, 1 +; CHECK-NEXT: ret +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 1) + ret i64 %res +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll index 
3430c54d21941..492b97c8316c1 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32) @@ -48,22 +49,6 @@ entry: ret i32 %res } -declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) - -define i64 @lsx_vpickve2gr_d_lo(<2 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 -1) - ret i64 %res -} - -define i64 @lsx_vpickve2gr_d_hi(<2 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 2) - ret i64 %res -} - declare i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8>, i32) define i32 @lsx_vpickve2gr_bu_lo(<16 x i8> %va) nounwind { @@ -111,19 +96,3 @@ entry: %res = call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> %va, i32 4) ret i32 %res } - -declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) - -define i64 @lsx_vpickve2gr_du_lo(<2 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 -1) - ret i64 %res -} - -define i64 @lsx_vpickve2gr_du_hi(<2 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 2) - ret i64 %res -} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll index ed56d30ce3c46..4e77f6b72fed9 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32) @@ -37,18 +38,6 @@ entry: ret i32 %res } -declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) - -define i64 @lsx_vpickve2gr_d(<2 x i64> %va) nounwind { -; CHECK-LABEL: lsx_vpickve2gr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: ret -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 1) - ret i64 %res -} - declare i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8>, i32) define i32 @lsx_vpickve2gr_bu(<16 x i8> %va) nounwind { @@ -84,15 +73,3 @@ entry: %res = call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> %va, i32 3) ret i32 %res } - -declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) - -define i64 @lsx_vpickve2gr_du(<2 x i64> %va) nounwind { -; CHECK-LABEL: lsx_vpickve2gr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.du $a0, $vr0, 1 -; CHECK-NEXT: ret -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 1) - ret i64 %res -} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr-d.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr-d.ll new file mode 100644 index 0000000000000..51533e4b2474c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr-d.ll @@ 
-0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define <2 x i64> @vrepl_ins_d(i64 %a, i64 %b) { +; CHECK-LABEL: vrepl_ins_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 +; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1 +; CHECK-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) + %1 = call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> %0, i64 %b, i32 1) + ret <2 x i64> %1 +} + +declare <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64>, i64, i32 immarg) +declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr.ll index aee7492946829..9d7ab6e1ab5ef 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define <16 x i8> @vrepl_ins_b(i32 %a, i32 %b) { @@ -37,23 +38,9 @@ entry: ret <4 x i32> %1 } -define <2 x i64> @vrepl_ins_d(i64 %a, i64 %b) { -; CHECK-LABEL: vrepl_ins_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1 -; CHECK-NEXT: ret -entry: - %0 = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) - %1 = call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> %0, i64 %b, i32 1) - ret <2 x i64> %1 -} - declare <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8>, i32, i32 immarg) declare <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32) declare <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16>, i32, i32 immarg) declare <8 x i16> @llvm.loongarch.lsx.vreplgr2vr.h(i32) declare <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32>, i32, i32 immarg) declare <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32) -declare <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64>, i64, i32 immarg) -declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr-d.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr-d.ll new file mode 100644 index 0000000000000..c8d0fce6ed5a2 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr-d.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) + +define <2 x i64> @lsx_vreplgr2vr_d(i64 %a) nounwind { +; CHECK-LABEL: lsx_vreplgr2vr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 +; CHECK-NEXT: ret +entry: + %res = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) + ret <2 x i64> %res +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll index 091f1c98c2289..edaa20792012d 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s 
declare <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32) @@ -36,15 +37,3 @@ entry: %res = call <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32 %a) ret <4 x i32> %res } - -declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) - -define <2 x i64> @lsx_vreplgr2vr_d(i64 %a) nounwind { -; CHECK-LABEL: lsx_vreplgr2vr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 -; CHECK-NEXT: ret -entry: - %res = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) - ret <2 x i64> %res -} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll index 3188fb4e2c2ef..004bcde90907a 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare i32 @llvm.loongarch.lsx.bz.v(<16 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll index 22e01922e87bb..6544f91f045a7 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare i32 @llvm.loongarch.lsx.bnz.b(<16 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll index 96c79c10e4688..5ba3eb788c1d7 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare i32 @llvm.loongarch.lsx.bz.b(<16 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll index 3fb55d4806160..b17a90e71e85a 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll @@ -3,18 +3,11 @@ ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @extract_16xi8(ptr %src, ptr %dst) nounwind { -; LA32-LABEL: extract_16xi8: -; LA32: # %bb.0: -; LA32-NEXT: vld $vr0, $a0, 0 -; LA32-NEXT: vpickve2gr.b $a0, $vr0, 1 -; LA32-NEXT: st.b $a0, $a1, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: extract_16xi8: -; LA64: # %bb.0: -; LA64-NEXT: vld $vr0, $a0, 0 -; LA64-NEXT: vstelm.b $vr0, $a1, 0, 1 -; LA64-NEXT: ret +; CHECK-LABEL: extract_16xi8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 1 +; CHECK-NEXT: ret %v = load volatile <16 x i8>, ptr %src %e = extractelement <16 x i8> %v, i32 1 store i8 %e, ptr %dst @@ -22,18 +15,11 @@ define void @extract_16xi8(ptr %src, ptr %dst) nounwind { } define void @extract_8xi16(ptr %src, ptr %dst) nounwind { -; LA32-LABEL: extract_8xi16: -; LA32: # %bb.0: -; LA32-NEXT: vld $vr0, $a0, 0 -; LA32-NEXT: vpickve2gr.h $a0, $vr0, 1 -; LA32-NEXT: st.h $a0, $a1, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: 
extract_8xi16: -; LA64: # %bb.0: -; LA64-NEXT: vld $vr0, $a0, 0 -; LA64-NEXT: vstelm.h $vr0, $a1, 0, 1 -; LA64-NEXT: ret +; CHECK-LABEL: extract_8xi16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 1 +; CHECK-NEXT: ret %v = load volatile <8 x i16>, ptr %src %e = extractelement <8 x i16> %v, i32 1 store i16 %e, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll index 4bb1941724dc6..496a1aed39fb5 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @insert_16xi8(ptr %src, ptr %dst, i8 %ins) nounwind { ; CHECK-LABEL: insert_16xi8: @@ -41,12 +42,20 @@ define void @insert_4xi32(ptr %src, ptr %dst, i32 %ins) nounwind { } define void @insert_2xi64(ptr %src, ptr %dst, i64 %ins) nounwind { -; CHECK-LABEL: insert_2xi64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a2, 1 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_2xi64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 2 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 3 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_2xi64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a2, 1 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <2 x i64>, ptr %src %v_new = insertelement <2 x i64> %v, i64 %ins, i32 1 store <2 x i64> %v_new, ptr %dst @@ -82,18 +91,30 @@ define void @insert_2xdouble(ptr %src, ptr %dst, double %ins) nounwind { } define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_16xi8_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI6_0) -; CHECK-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI6_0) -; CHECK-NEXT: vld $vr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: vreplgr2vr.b $vr2, $a0 -; CHECK-NEXT: vseq.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vreplgr2vr.b $vr2, $a2 -; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_16xi8_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI6_0) +; LA32-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI6_0) +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: vreplgr2vr.b $vr2, $a3 +; LA32-NEXT: vseq.b $vr0, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.b $vr2, $a2 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_16xi8_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI6_0) +; LA64-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI6_0) +; LA64-NEXT: vld $vr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: vreplgr2vr.b $vr2, $a0 +; LA64-NEXT: vseq.b $vr0, $vr2, $vr0 +; LA64-NEXT: vreplgr2vr.b $vr2, $a2 +; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <16 x i8>, ptr %src %v_new = insertelement <16 x i8> %v, i8 %ins, i32 %idx store <16 x i8> %v_new, ptr %dst @@ 
-101,18 +122,30 @@ define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind { } define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_8xi16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI7_0) -; CHECK-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI7_0) -; CHECK-NEXT: vld $vr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: vreplgr2vr.h $vr2, $a0 -; CHECK-NEXT: vseq.h $vr0, $vr2, $vr0 -; CHECK-NEXT: vreplgr2vr.h $vr2, $a2 -; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_8xi16_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI7_0) +; LA32-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI7_0) +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: vreplgr2vr.h $vr2, $a3 +; LA32-NEXT: vseq.h $vr0, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.h $vr2, $a2 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_8xi16_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI7_0) +; LA64-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI7_0) +; LA64-NEXT: vld $vr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: vreplgr2vr.h $vr2, $a0 +; LA64-NEXT: vseq.h $vr0, $vr2, $vr0 +; LA64-NEXT: vreplgr2vr.h $vr2, $a2 +; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <8 x i16>, ptr %src %v_new = insertelement <8 x i16> %v, i16 %ins, i32 %idx store <8 x i16> %v_new, ptr %dst @@ -120,18 +153,30 @@ define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind { } define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_4xi32_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI8_0) -; CHECK-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI8_0) -; CHECK-NEXT: vld $vr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: vreplgr2vr.w $vr2, $a0 -; CHECK-NEXT: vseq.w $vr0, $vr2, $vr0 -; CHECK-NEXT: vreplgr2vr.w $vr2, $a2 -; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xi32_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI8_0) +; LA32-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI8_0) +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: vreplgr2vr.w $vr2, $a3 +; LA32-NEXT: vseq.w $vr0, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.w $vr2, $a2 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xi32_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI8_0) +; LA64-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI8_0) +; LA64-NEXT: vld $vr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: vreplgr2vr.w $vr2, $a0 +; LA64-NEXT: vseq.w $vr0, $vr2, $vr0 +; LA64-NEXT: vreplgr2vr.w $vr2, $a2 +; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x i32>, ptr %src %v_new = insertelement <4 x i32> %v, i32 %ins, i32 %idx store <4 x i32> %v_new, ptr %dst @@ -139,18 +184,36 @@ define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind { } define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_2xi64_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI9_0) -; CHECK-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI9_0) -; CHECK-NEXT: vld $vr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; 
CHECK-NEXT: vreplgr2vr.d $vr2, $a0 -; CHECK-NEXT: vseq.d $vr0, $vr2, $vr0 -; CHECK-NEXT: vreplgr2vr.d $vr2, $a2 -; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_2xi64_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a5, %pc_hi20(.LCPI9_0) +; LA32-NEXT: vld $vr0, $a5, %pc_lo12(.LCPI9_0) +; LA32-NEXT: add.w $a4, $a4, $a4 +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: vreplgr2vr.w $vr2, $a4 +; LA32-NEXT: vseq.w $vr2, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.w $vr3, $a2 +; LA32-NEXT: vbitsel.v $vr1, $vr1, $vr3, $vr2 +; LA32-NEXT: addi.w $a0, $a4, 1 +; LA32-NEXT: vreplgr2vr.w $vr2, $a0 +; LA32-NEXT: vseq.w $vr0, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.w $vr2, $a3 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_2xi64_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI9_0) +; LA64-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI9_0) +; LA64-NEXT: vld $vr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: vreplgr2vr.d $vr2, $a0 +; LA64-NEXT: vseq.d $vr0, $vr2, $vr0 +; LA64-NEXT: vreplgr2vr.d $vr2, $a2 +; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <2 x i64>, ptr %src %v_new = insertelement <2 x i64> %v, i64 %ins, i32 %idx store <2 x i64> %v_new, ptr %dst @@ -158,19 +221,32 @@ define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind { } define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_4xfloat_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI10_0) -; CHECK-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI10_0) -; CHECK-NEXT: vld $vr2, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 -; CHECK-NEXT: vreplgr2vr.w $vr3, $a0 -; CHECK-NEXT: vseq.w $vr1, $vr3, $vr1 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xfloat_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI10_0) +; LA32-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI10_0) +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: vld $vr2, $a0, 0 +; LA32-NEXT: vreplgr2vr.w $vr3, $a2 +; LA32-NEXT: vseq.w $vr1, $vr3, $vr1 +; LA32-NEXT: vreplvei.w $vr0, $vr0, 0 +; LA32-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xfloat_idx: +; LA64: # %bb.0: +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: pcalau12i $a3, %pc_hi20(.LCPI10_0) +; LA64-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI10_0) +; LA64-NEXT: vld $vr2, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a2, 31, 0 +; LA64-NEXT: vreplgr2vr.w $vr3, $a0 +; LA64-NEXT: vseq.w $vr1, $vr3, $vr1 +; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 +; LA64-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x float>, ptr %src %v_new = insertelement <4 x float> %v, float %ins, i32 %idx store <4 x float> %v_new, ptr %dst @@ -178,19 +254,34 @@ define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwi } define void @insert_2xdouble_idx(ptr %src, ptr %dst, double %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_2xdouble_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI11_0) -; CHECK-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI11_0) -; CHECK-NEXT: vld 
$vr2, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 -; CHECK-NEXT: vreplgr2vr.d $vr3, $a0 -; CHECK-NEXT: vseq.d $vr1, $vr3, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_2xdouble_idx: +; LA32: # %bb.0: +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) +; LA32-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI11_0) +; LA32-NEXT: vrepli.b $vr3, 0 +; LA32-NEXT: vinsgr2vr.w $vr3, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr3, $a2, 2 +; LA32-NEXT: vseq.d $vr2, $vr3, $vr2 +; LA32-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_2xdouble_idx: +; LA64: # %bb.0: +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA64-NEXT: pcalau12i $a3, %pc_hi20(.LCPI11_0) +; LA64-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI11_0) +; LA64-NEXT: vld $vr2, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a2, 31, 0 +; LA64-NEXT: vreplgr2vr.d $vr3, $a0 +; LA64-NEXT: vseq.d $vr1, $vr3, $vr1 +; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA64-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <2 x double>, ptr %src %v_new = insertelement <2 x double> %v, double %ins, i32 %idx store <2 x double> %v_new, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll index 10510786f3216..40961bc9a08b9 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ;; vreplvei.b diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll index d1c071b45ddff..b13433ee5d159 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) { diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll index cd80dcb44e433..bee4ba6a84334 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ;; vshuf4i.b diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll index 57fd09ed2e09b..9c3a6f7be0542 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll +++ 
b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll @@ -1,17 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefix=LA64 define void @vec_reduce_add_v16i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v16i8: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v16i8: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <16 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) store i8 %res, ptr %dst @@ -19,16 +31,29 @@ define void @vec_reduce_add_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0 -; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -36,15 +61,25 @@ define void @vec_reduce_add_v8i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v4i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.w $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 0 -; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v4i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a0, $a0, 0 +; 
LA32-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA32-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v4i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA64-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <4 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %v) store i8 %res, ptr %dst @@ -52,13 +87,23 @@ define void @vec_reduce_add_v4i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v2i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.h $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v2i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.h $a0, $a0, 0 +; LA32-NEXT: vinsgr2vr.h $vr0, $a0, 0 +; LA32-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v2i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.h $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.h $vr0, $a0, 0 +; LA64-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <2 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v) store i8 %res, ptr %dst @@ -66,15 +111,25 @@ define void @vec_reduce_add_v2i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v8i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: st.h $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v8i16: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v8i16: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret %v = load <8 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) store i16 %res, ptr %dst @@ -82,15 +137,27 @@ define void @vec_reduce_add_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0 -; CHECK-NEXT: st.h $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; 
LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -98,13 +165,23 @@ define void @vec_reduce_add_v4i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v2i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.w $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v2i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v2i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret %v = load <2 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %v) store i16 %res, ptr %dst @@ -112,14 +189,23 @@ define void @vec_reduce_add_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: st.w $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.w $a0, $a1, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -127,13 +213,25 @@ define void @vec_reduce_add_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.w $a0, $a1, 0 +; LA64-NEXT: ret %v = load 
<2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -141,12 +239,27 @@ define void @vec_reduce_add_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 1 +; LA32-NEXT: add.w $a3, $a4, $a3 +; LA32-NEXT: add.w $a0, $a2, $a0 +; LA32-NEXT: sltu $a2, $a0, $a2 +; LA32-NEXT: add.w $a2, $a3, $a2 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-and.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-and.ll index cca4ce30758f1..734ecba843a4e 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-and.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-and.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_and_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_and_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_and_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define 
void @vec_reduce_and_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_and_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,27 @@ define void @vec_reduce_and_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vori.b $vr1, $vr0, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr1, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; 
LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +207,26 @@ define void @vec_reduce_and_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 0 +; LA32-NEXT: and $a3, $a4, $a3 +; LA32-NEXT: and $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a3, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-or.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-or.ll index ce431f0cf6a74..e833930830c3f 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-or.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-or.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_or_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_or_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_or_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define 
void @vec_reduce_or_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_or_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,27 @@ define void @vec_reduce_or_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vori.b $vr1, $vr0, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr1, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x 
i32>, ptr %src %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +207,26 @@ define void @vec_reduce_or_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 0 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a3, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smax.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smax.ll index bdf153ad7794f..2220df68cddfd 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smax.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_smax_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_smax_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_smax_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmax.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vmax.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void 
@vec_reduce_smax_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmax.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.h $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmax.h $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.h $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmax.h $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_smax_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,26 @@ define void @vec_reduce_smax_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = 
load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +206,37 @@ define void @vec_reduce_smax_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.d $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 3 +; LA32-NEXT: slt $a5, $a4, $a3 +; LA32-NEXT: xor $a6, $a3, $a4 +; LA32-NEXT: sltui $a6, $a6, 1 +; LA32-NEXT: masknez $a5, $a5, $a6 +; LA32-NEXT: sltu $a7, $a2, $a0 +; LA32-NEXT: maskeqz $a6, $a7, $a6 +; LA32-NEXT: or $a5, $a6, $a5 +; LA32-NEXT: masknez $a2, $a2, $a5 +; LA32-NEXT: maskeqz $a0, $a0, $a5 +; LA32-NEXT: or $a0, $a0, $a2 +; LA32-NEXT: masknez $a2, $a4, $a5 +; LA32-NEXT: maskeqz $a3, $a3, $a5 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.d $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smin.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smin.ll index e3b3c5e6f2410..50d76a3872e1e 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smin.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_smin_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_smin_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_smin_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmin.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vmin.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v 
$vr1, $vr0, 4 +; LA64-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_smin_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmin.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.h $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmin.h $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.h $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmin.h $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_smin_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,26 @@ define void @vec_reduce_smin_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, 
$vr0, 4 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +206,37 @@ define void @vec_reduce_smin_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.d $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 1 +; LA32-NEXT: slt $a5, $a4, $a3 +; LA32-NEXT: xor $a6, $a4, $a3 +; LA32-NEXT: sltui $a6, $a6, 1 +; LA32-NEXT: masknez $a5, $a5, $a6 +; LA32-NEXT: sltu $a7, $a2, $a0 +; LA32-NEXT: maskeqz $a6, $a7, $a6 +; LA32-NEXT: or $a5, $a6, $a5 +; LA32-NEXT: masknez $a0, $a0, $a5 +; LA32-NEXT: maskeqz $a2, $a2, $a5 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: masknez $a2, $a3, $a5 +; LA32-NEXT: maskeqz $a3, $a4, $a5 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.d $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umax.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umax.ll index fff2304befd68..88146c78a969d 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umax.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_umax_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_umax_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_umax_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmax.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vmax.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, 
$vr0, 4 +; LA32-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_umax_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.hu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmax.hu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.hu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmax.hu $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.hu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmax.hu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_umax_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,26 @@ define void @vec_reduce_umax_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: 
ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +206,37 @@ define void @vec_reduce_umax_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.du $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 3 +; LA32-NEXT: sltu $a5, $a4, $a3 +; LA32-NEXT: xor $a6, $a3, $a4 +; LA32-NEXT: sltui $a6, $a6, 1 +; LA32-NEXT: masknez $a5, $a5, $a6 +; LA32-NEXT: sltu $a7, $a2, $a0 +; LA32-NEXT: maskeqz $a6, $a7, $a6 +; LA32-NEXT: or $a5, $a6, $a5 +; LA32-NEXT: masknez $a2, $a2, $a5 +; LA32-NEXT: maskeqz $a0, $a0, $a5 +; LA32-NEXT: or $a0, $a0, $a2 +; LA32-NEXT: masknez $a2, $a4, $a5 +; LA32-NEXT: maskeqz $a3, $a3, $a5 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.du $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umin.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umin.ll index e14a294cbcfb6..e9d4b4aab6f91 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umin.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_umin_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_umin_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_umin_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: 
vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmin.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vmin.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_umin_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.hu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmin.hu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.hu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmin.hu $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.hu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmin.hu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_umin_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: 
vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,26 @@ define void @vec_reduce_umin_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +206,37 @@ define void @vec_reduce_umin_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.du $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 1 +; LA32-NEXT: sltu $a5, $a4, $a3 +; LA32-NEXT: xor $a6, $a4, $a3 +; LA32-NEXT: sltui $a6, $a6, 1 +; LA32-NEXT: masknez $a5, $a5, $a6 +; LA32-NEXT: sltu $a7, $a2, $a0 +; LA32-NEXT: maskeqz $a6, $a7, $a6 +; LA32-NEXT: or $a5, $a6, $a5 +; LA32-NEXT: masknez $a0, $a0, $a5 +; LA32-NEXT: maskeqz $a2, $a2, $a5 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: masknez $a2, $a3, $a5 +; LA32-NEXT: maskeqz $a3, $a4, $a5 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.du $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-xor.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-xor.ll index ae2bb8f91de05..ed965e9e10ee7 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-xor.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-xor.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_xor_v16i8(ptr %src, ptr %dst) 
nounwind { ; CHECK-LABEL: vec_reduce_xor_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_xor_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_xor_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_xor_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vxor.v $vr0, $vr1, 
$vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,27 @@ define void @vec_reduce_xor_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vori.b $vr1, $vr0, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr1, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +207,26 @@ define void @vec_reduce_xor_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 0 +; LA32-NEXT: xor $a3, $a4, $a3 +; LA32-NEXT: xor $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a3, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll index 9485df746ff1c..dce6dc9f2aa37 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s - +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @load_sext_2i8_to_2i64(ptr %ptr, ptr %dst) 
{ ; CHECK-LABEL: load_sext_2i8_to_2i64: @@ -40,15 +40,27 @@ entry: } define void @load_sext_8i8_to_8i16(ptr %ptr, ptr %dst) { -; CHECK-LABEL: load_sext_8i8_to_8i16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vslli.h $vr0, $vr0, 8 -; CHECK-NEXT: vsrai.h $vr0, $vr0, 8 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_sext_8i8_to_8i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vilvl.b $vr0, $vr0, $vr0 +; LA32-NEXT: vslli.h $vr0, $vr0, 8 +; LA32-NEXT: vsrai.h $vr0, $vr0, 8 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_sext_8i8_to_8i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vilvl.b $vr0, $vr0, $vr0 +; LA64-NEXT: vslli.h $vr0, $vr0, 8 +; LA64-NEXT: vsrai.h $vr0, $vr0, 8 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret entry: %A = load <8 x i8>, ptr %ptr %B = sext <8 x i8> %A to <8 x i16> @@ -75,15 +87,27 @@ entry: } define void @load_sext_4i16_to_4i32(ptr %ptr, ptr %dst) { -; CHECK-LABEL: load_sext_4i16_to_4i32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vslli.w $vr0, $vr0, 16 -; CHECK-NEXT: vsrai.w $vr0, $vr0, 16 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_sext_4i16_to_4i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vilvl.h $vr0, $vr0, $vr0 +; LA32-NEXT: vslli.w $vr0, $vr0, 16 +; LA32-NEXT: vsrai.w $vr0, $vr0, 16 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_sext_4i16_to_4i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vilvl.h $vr0, $vr0, $vr0 +; LA64-NEXT: vslli.w $vr0, $vr0, 16 +; LA64-NEXT: vsrai.w $vr0, $vr0, 16 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret entry: %A = load <4 x i16>, ptr %ptr %B = sext <4 x i16> %A to <4 x i32> @@ -92,15 +116,26 @@ entry: } define void @load_sext_2i32_to_2i64(ptr %ptr, ptr %dst) { -; CHECK-LABEL: load_sext_2i32_to_2i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16 -; CHECK-NEXT: vslli.d $vr0, $vr0, 32 -; CHECK-NEXT: vsrai.d $vr0, $vr0, 32 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_sext_2i32_to_2i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 2 +; LA32-NEXT: vslli.d $vr0, $vr0, 32 +; LA32-NEXT: vsrai.d $vr0, $vr0, 32 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_sext_2i32_to_2i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.w $vr0, $vr0, 16 +; LA64-NEXT: vslli.d $vr0, $vr0, 32 +; LA64-NEXT: vsrai.d $vr0, $vr0, 32 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret entry: %A = load <2 x i32>, ptr %ptr %B = sext <2 x i32> %A to <2 x i64> diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll index 
9b1b584bd9c76..bb008ee5eb903 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @shuffle_any_ext_2i8_to_2i64(ptr %ptr, ptr %dst) nounwind { ; CHECK-LABEL: shuffle_any_ext_2i8_to_2i64: @@ -35,13 +36,22 @@ define void @shuffle_any_ext_2i16_to_2i64(ptr %ptr, ptr %dst) nounwind { } define void @shuffle_any_ext_2i32_to_2i64(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: shuffle_any_ext_2i32_to_2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: shuffle_any_ext_2i32_to_2i64: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 2 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: shuffle_any_ext_2i32_to_2i64: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.w $vr0, $vr0, 16 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %x = load <2 x i32>, ptr %ptr %y = shufflevector <2 x i32> %x, <2 x i32> poison, <4 x i32> %r = bitcast <4 x i32> %y to <2 x i64> @@ -66,13 +76,23 @@ define void @shuffle_any_ext_4i8_to_4i32(ptr %ptr, ptr %dst) nounwind { } define void @shuffle_any_ext_4i16_to_4i32(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: shuffle_any_ext_4i16_to_4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: shuffle_any_ext_4i16_to_4i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vilvl.h $vr0, $vr0, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: shuffle_any_ext_4i16_to_4i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vilvl.h $vr0, $vr0, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %x = load <4 x i16>, ptr %ptr %y = shufflevector <4 x i16> %x, <4 x i16> poison, <8 x i32> %r = bitcast <8 x i16> %y to <4 x i32> @@ -81,13 +101,23 @@ define void @shuffle_any_ext_4i16_to_4i32(ptr %ptr, ptr %dst) nounwind { } define void @shuffle_any_ext_8i8_to_8i16(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: shuffle_any_ext_8i8_to_8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: shuffle_any_ext_8i8_to_8i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vilvl.b $vr0, $vr0, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: shuffle_any_ext_8i8_to_8i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vilvl.b $vr0, $vr0, $vr0 
+; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %x = load <8 x i8>, ptr %ptr %y = shufflevector <8 x i8> %x, <8 x i8> poison, <16 x i32> %r = bitcast <16 x i8> %y to <8 x i16> diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll index b1e3f74cd1739..be241925a2788 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ;; TODO For these special shuffle mask, we can lower it to vbsll + vbsrl + vor. diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll index ff0f252ba2bdf..5275d5326f73a 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define <16 x i8> @shuffle_16i8_vbsll_v_1(<16 x i8> %a) nounwind { diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll index e056e7c38ddcd..314350acd23d6 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll @@ -1,13 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefix=LA64 define void @load_trunc_2i64_to_2i32(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i64_to_2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 8 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i64_to_2i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i64_to_2i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.w $vr0, $vr0, 8 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i64>, ptr %ptr %trunc = trunc <2 x i64> %a to <2 x i32> store <2 x i32> %trunc, ptr %dst @@ -15,14 +25,24 @@ define void @load_trunc_2i64_to_2i32(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i64_to_2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0) -; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0 -; CHECK-NEXT: vstelm.w $vr1, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i64_to_2i16: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0) +; LA32-NEXT: vshuf.h $vr1, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w 
$a0, $vr1, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i64_to_2i16: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0) +; LA64-NEXT: vshuf.h $vr1, $vr0, $vr0 +; LA64-NEXT: vstelm.w $vr1, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i64>, ptr %ptr %trunc = trunc <2 x i64> %a to <2 x i16> store <2 x i16> %trunc, ptr %dst @@ -30,14 +50,23 @@ define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i64_to_2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) -; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i64_to_2i8: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) +; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) +; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i64_to_2i8: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) +; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) +; LA64-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i64>, ptr %ptr %trunc = trunc <2 x i64> %a to <2 x i8> store <2 x i8> %trunc, ptr %dst @@ -45,12 +74,22 @@ define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_4i32_to_4i16(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_4i32_to_4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vpickev.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_4i32_to_4i16: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickev.h $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_4i32_to_4i16: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vpickev.h $vr0, $vr0, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <4 x i32>, ptr %ptr %trunc = trunc <4 x i32> %a to <4 x i16> store <4 x i16> %trunc, ptr %dst @@ -58,14 +97,24 @@ define void @load_trunc_4i32_to_4i16(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_4i32_to_4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0) -; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_4i32_to_4i8: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) +; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0) +; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_4i32_to_4i8: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) +; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0) +; LA64-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; 
LA64-NEXT: ret %a = load <4 x i32>, ptr %ptr %trunc = trunc <4 x i32> %a to <4 x i8> store <4 x i8> %trunc, ptr %dst @@ -73,12 +122,22 @@ define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_8i16_to_8i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_8i16_to_8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vpickev.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_8i16_to_8i8: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickev.b $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_8i16_to_8i8: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vpickev.b $vr0, $vr0, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <8 x i16>, ptr %ptr %trunc = trunc <8 x i16> %a to <8 x i8> store <8 x i8> %trunc, ptr %dst @@ -86,13 +145,24 @@ define void @load_trunc_8i16_to_8i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i32_to_2i16(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i32_to_2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 8 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i32_to_2i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vshuf4i.h $vr0, $vr0, 8 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i32_to_2i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.h $vr0, $vr0, 8 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i32>, ptr %ptr %trunc = trunc <2 x i32> %a to <2 x i16> store <2 x i16> %trunc, ptr %dst @@ -100,15 +170,27 @@ define void @load_trunc_2i32_to_2i16(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i32_to_2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) -; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI7_0) -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i32_to_2i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI7_0) +; LA32-NEXT: vld $vr0, $a3, %pc_lo12(.LCPI7_0) +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i32_to_2i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI7_0) +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i32>, ptr %ptr %trunc = trunc <2 x i32> %a to <2 x i8> store <2 x i8> %trunc, ptr %dst @@ -116,13 +198,24 @@ define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_4i16_to_4i8(ptr %ptr, ptr %dst) 
nounwind { -; CHECK-LABEL: load_trunc_4i16_to_4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vpickev.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_4i16_to_4i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vpickev.b $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_4i16_to_4i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vpickev.b $vr0, $vr0, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <4 x i16>, ptr %ptr %trunc = trunc <4 x i16> %a to <4 x i8> store <4 x i8> %trunc, ptr %dst @@ -130,17 +223,23 @@ define void @load_trunc_4i16_to_4i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i16_to_2i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i16_to_2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.w $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 8 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i16_to_2i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA32-NEXT: vshuf4i.b $vr0, $vr0, 8 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i16_to_2i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.b $vr0, $vr0, 8 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i16>, ptr %ptr %trunc = trunc <2 x i16> %a to <2 x i8> store <2 x i8> %trunc, ptr %dst ret void } - - diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll index 7fa591db5d1fa..8bdeebef13dd2 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx --verify-machineinstrs < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx --verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx --verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LA64 define i16 @vmsk_eq_allzeros_i8(<16 x i8 > %a) { ; CHECK-LABEL: vmsk_eq_allzeros_i8: @@ -605,17 +606,29 @@ define i4 @vmsk_eq_allzeros_v4i8(<4 x i8> %a) { } define i32 @vmsk2_eq_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_eq_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vseqi.b $vr0, $vr0, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vseqi.b $vr0, $vr1, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_eq_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vseqi.b $vr0, $vr0, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vseqi.b $vr0, $vr1, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_eq_allzeros_i8: +; LA64: # 
%bb.0: # %entry +; LA64-NEXT: vseqi.b $vr0, $vr0, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vseqi.b $vr0, $vr1, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp eq <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -623,18 +636,31 @@ entry: } define i32 @vmsk2_sgt_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sgt_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vrepli.b $vr2, 0 -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sgt_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vrepli.b $vr2, 0 +; LA32-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslt.b $vr0, $vr2, $vr1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sgt_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vrepli.b $vr2, 0 +; LA64-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslt.b $vr0, $vr2, $vr1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sgt <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -642,18 +668,31 @@ entry: } define i32 @vmsk2_sgt_allones_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sgt_allones_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vrepli.b $vr2, -1 -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sgt_allones_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vrepli.b $vr2, -1 +; LA32-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslt.b $vr0, $vr2, $vr1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sgt_allones_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vrepli.b $vr2, -1 +; LA64-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslt.b $vr0, $vr2, $vr1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sgt <32 x i8> %a, splat (i8 -1) %2 = bitcast <32 x i1> %1 to i32 @@ -661,18 +700,31 @@ entry: } define i32 @vmsk2_sge_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sge_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vrepli.b $vr2, 0 -; CHECK-NEXT: vsle.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vsle.b $vr0, $vr2, $vr1 -; CHECK-NEXT: 
vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sge_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vrepli.b $vr2, 0 +; LA32-NEXT: vsle.b $vr0, $vr2, $vr0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vsle.b $vr0, $vr2, $vr1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sge_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vrepli.b $vr2, 0 +; LA64-NEXT: vsle.b $vr0, $vr2, $vr0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vsle.b $vr0, $vr2, $vr1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sge <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -680,15 +732,25 @@ entry: } define i32 @vmsk2_slt_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_slt_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr1 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_slt_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr1 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_slt_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr1 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp slt <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -696,17 +758,29 @@ entry: } define i32 @vmsk2_sle_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sle_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vslei.b $vr0, $vr0, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslei.b $vr0, $vr1, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sle_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vslei.b $vr0, $vr0, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslei.b $vr0, $vr1, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sle_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vslei.b $vr0, $vr0, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslei.b $vr0, $vr1, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sle <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -714,17 +788,29 @@ entry: } define i32 @vmsk2_sle_allones_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sle_allones_i8: -; CHECK: # %bb.0: # 
%entry -; CHECK-NEXT: vslei.b $vr0, $vr0, -1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslei.b $vr0, $vr1, -1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sle_allones_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vslei.b $vr0, $vr0, -1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslei.b $vr0, $vr1, -1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sle_allones_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vslei.b $vr0, $vr0, -1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslei.b $vr0, $vr1, -1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sle <32 x i8> %a, splat (i8 -1) %2 = bitcast <32 x i1> %1 to i32 @@ -732,19 +818,33 @@ entry: } define i32 @vmsk2_ne_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_ne_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vseqi.b $vr0, $vr0, 0 -; CHECK-NEXT: vxori.b $vr0, $vr0, 255 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vseqi.b $vr0, $vr1, 0 -; CHECK-NEXT: vxori.b $vr0, $vr0, 255 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_ne_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vseqi.b $vr0, $vr0, 0 +; LA32-NEXT: vxori.b $vr0, $vr0, 255 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vseqi.b $vr0, $vr1, 0 +; LA32-NEXT: vxori.b $vr0, $vr0, 255 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_ne_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vseqi.b $vr0, $vr0, 0 +; LA64-NEXT: vxori.b $vr0, $vr0, 255 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vseqi.b $vr0, $vr1, 0 +; LA64-NEXT: vxori.b $vr0, $vr0, 255 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp ne <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -752,38 +852,66 @@ entry: } define i32 @vmsk2_sgt_v32i8(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: vmsk2_sgt_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslt.b $vr0, $vr3, $vr1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sgt_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslt.b $vr0, $vr3, $vr1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sgt_v32i8: +; LA64: # %bb.0: +; 
LA64-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslt.b $vr0, $vr3, $vr1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret %x = icmp sgt <32 x i8> %a, %b %res = bitcast <32 x i1> %x to i32 ret i32 %res } define i32 @vmsk2_sgt_and_sgt_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { -; CHECK-LABEL: vmsk2_sgt_and_sgt_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vslt.b $vr1, $vr3, $vr1 -; CHECK-NEXT: vslt.b $vr2, $vr6, $vr4 -; CHECK-NEXT: vslt.b $vr3, $vr7, $vr5 -; CHECK-NEXT: vand.v $vr1, $vr1, $vr3 -; CHECK-NEXT: vand.v $vr0, $vr0, $vr2 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr1 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sgt_and_sgt_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA32-NEXT: vslt.b $vr1, $vr3, $vr1 +; LA32-NEXT: vslt.b $vr2, $vr6, $vr4 +; LA32-NEXT: vslt.b $vr3, $vr7, $vr5 +; LA32-NEXT: vand.v $vr1, $vr1, $vr3 +; LA32-NEXT: vand.v $vr0, $vr0, $vr2 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr1 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sgt_and_sgt_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA64-NEXT: vslt.b $vr1, $vr3, $vr1 +; LA64-NEXT: vslt.b $vr2, $vr6, $vr4 +; LA64-NEXT: vslt.b $vr3, $vr7, $vr5 +; LA64-NEXT: vand.v $vr1, $vr1, $vr3 +; LA64-NEXT: vand.v $vr0, $vr0, $vr2 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr1 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret %x0 = icmp sgt <32 x i8> %a, %b %x1 = icmp sgt <32 x i8> %c, %d %y = and <32 x i1> %x0, %x1 @@ -792,17 +920,29 @@ define i32 @vmsk2_sgt_and_sgt_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3 } define i32 @vmsk2_trunc_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_trunc_i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vslli.b $vr0, $vr0, 7 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslli.b $vr0, $vr1, 7 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_trunc_i8: +; LA32: # %bb.0: +; LA32-NEXT: vslli.b $vr0, $vr0, 7 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslli.b $vr0, $vr1, 7 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_trunc_i8: +; LA64: # %bb.0: +; LA64-NEXT: vslli.b $vr0, $vr0, 7 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslli.b $vr0, $vr1, 7 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret %y = trunc <32 x i8> %a to <32 x i1> %res = bitcast <32 x i1> %y to i32 ret i32 %res diff --git a/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll 
b/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll index 54328260d9d14..42ef9133bf04d 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define <16 x i8> @widen_shuffle_mask_v16i8_to_v8i16(<16 x i8> %a, <16 x i8> %b) {