diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ac95ef5f30888..0c503bbc2f8c8 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -327,7 +327,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::SETCC, VT, Legal);
-      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
     }
@@ -415,7 +415,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
       setOperationAction(ISD::SETCC, VT, Legal);
-      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) {
@@ -611,10 +611,57 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
     return lowerVECREDUCE(Op, DAG);
   case ISD::ConstantFP:
     return lowerConstantFP(Op, DAG);
+  case ISD::VSELECT:
+    return lowerVSELECT(Op, DAG);
   }
   return SDValue();
 }
 
+SDValue LoongArchTargetLowering::lowerVSELECT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+
+  // Try to lower vselect to vector_shuffle. All cases with constant
+  // build_vector condition will be handled. NOTE: On 32-bit platform, vselect
+  // with v2i64/v4i64 type condition will never enter this because of the extra
+  // bitcast. It can be considered separately later.
+  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
+    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Cond.getNode());
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    // If the Cond is a BUILD_VECTOR with splat constants, using a vldi or
+    // constant broadcast instruction and a vbitsel is better.
+    if (BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                             HasAnyUndefs, /*MinSplatBits=*/8) &&
+        (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+         SplatBitSize == 64))
+      return Op;
+
+    SmallVector<int> Mask;
+    EVT CondVT = Cond.getValueType();
+    int NumElts = CondVT.getVectorNumElements();
+    Mask.resize(NumElts, -1);
+    for (int i = 0; i < NumElts; ++i) {
+      auto *CondElt = dyn_cast<ConstantSDNode>(Cond.getOperand(i));
+      if (CondElt->getZExtValue() == 0)
+        Mask[i] = i + NumElts;
+      else
+        Mask[i] = i;
+    }
+
+    return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+  }
+
+  // Fallback to match patterns in tablegen.
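A note on the shuffle mask that lowerVSELECT builds above: SelectionDAG shuffle masks index the concatenation of the two shuffle operands, so index i (with i < NumElts) selects lane i of the true operand, and index i + NumElts selects the corresponding lane of the false operand. The standalone C++ sketch below mirrors that loop outside of LLVM; the helper name buildSelectShuffleMask is invented for illustration and is not part of this patch.

#include <cstdio>
#include <vector>

// Mirrors the mask-building loop in lowerVSELECT: a true condition lane
// selects element i of the first (true) operand; a false lane selects
// element i + NumElts, i.e. the same lane of the second (false) operand.
static std::vector<int> buildSelectShuffleMask(const std::vector<bool> &Cond) {
  int NumElts = static_cast<int>(Cond.size());
  std::vector<int> Mask(NumElts, -1);
  for (int i = 0; i < NumElts; ++i)
    Mask[i] = Cond[i] ? i : i + NumElts;
  return Mask;
}

int main() {
  // Example: only the first lane takes the true operand. A mask of this
  // shape is what allows a single lane-insert instruction such as
  // xvinsve0.d in the tests below.
  std::vector<bool> Cond = {true, false, false, false};
  for (int Idx : buildSelectShuffleMask(Cond))
    std::printf("%d ", Idx); // prints: 0 5 6 7
  std::printf("\n");
  return 0;
}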
+  return Op;
+}
+
 SDValue LoongArchTargetLowering::lowerConstantFP(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 232ac6092149d..75b2db87e0cfa 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -239,6 +239,7 @@ class LoongArchTargetLowering : public TargetLowering {
   SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
index 559cc53062566..ccf4f57c24841 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64
 
 define void @select_v32i8_imm(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: select_v32i8_imm:
@@ -37,10 +37,8 @@ define void @select_v32i8_1(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
-; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    xvinsve0.w $xr1, $xr0, 0
+; CHECK-NEXT:    xvst $xr1, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <32 x i8>, ptr %a0
   %v1 = load <32 x i8>, ptr %a1
@@ -56,7 +54,7 @@ define void @select_v32i8_2(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI3_0)
 ; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT:    xvshuf.b $xr0, $xr1, $xr0, $xr2
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <32 x i8>, ptr %a0
@@ -88,9 +86,7 @@ define void @select_v16i16_1(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT:    xvinsve0.w $xr0, $xr1, 0
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i16>, ptr %a0
@@ -107,8 +103,8 @@ define void @select_v16i16_2(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI6_0)
 ; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
-; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    xvshuf.h $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr2, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i16>, ptr %a0
   %v1 = load <16 x i16>, ptr %a1
@@ -138,10 +134,8 @@ define void @select_v8i32_1(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
-; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    xvinsve0.d $xr1, $xr0, 0
+; CHECK-NEXT:    xvst $xr1, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x i32>, ptr %a0
   %v1 = load <8 x i32>, ptr %a1
@@ -157,8 +151,8 @@ define void @select_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI9_0)
 ; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
-; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    xvshuf.w $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr2, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x float>, ptr %a0
   %v1 = load <8 x float>, ptr %a1
@@ -168,15 +162,23 @@ define void @select_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind {
-; CHECK-LABEL: select_v4i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvld $xr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
-; CHECK-NEXT:    xvst $xr0, $a0, 0
-; CHECK-NEXT:    ret
+; LA32-LABEL: select_v4i64:
+; LA32:       # %bb.0:
+; LA32-NEXT:    xvld $xr0, $a1, 0
+; LA32-NEXT:    xvld $xr1, $a2, 0
+; LA32-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI10_0)
+; LA32-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI10_0)
+; LA32-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; LA32-NEXT:    xvst $xr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: select_v4i64:
+; LA64:       # %bb.0:
+; LA64-NEXT:    xvld $xr0, $a1, 0
+; LA64-NEXT:    xvld $xr1, $a2, 0
+; LA64-NEXT:    xvinsve0.d $xr1, $xr0, 0
+; LA64-NEXT:    xvst $xr1, $a0, 0
+; LA64-NEXT:    ret
   %v0 = load <4 x i64>, ptr %a0
   %v1 = load <4 x i64>, ptr %a1
   %sel = select <4 x i1> , <4 x i64> %v0, <4 x i64> %v1
@@ -185,15 +187,25 @@ define void @select_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind {
-; CHECK-LABEL: select_v4f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvld $xr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
-; CHECK-NEXT:    xvst $xr0, $a0, 0
-; CHECK-NEXT:    ret
+; LA32-LABEL: select_v4f64:
+; LA32:       # %bb.0:
+; LA32-NEXT:    xvld $xr0, $a1, 0
+; LA32-NEXT:    xvld $xr1, $a2, 0
+; LA32-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI11_0)
+; LA32-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI11_0)
+; LA32-NEXT:    xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; LA32-NEXT:    xvst $xr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: select_v4f64:
+; LA64:       # %bb.0:
+; LA64-NEXT:    xvld $xr0, $a1, 0
+; LA64-NEXT:    xvld $xr1, $a2, 0
+; LA64-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI11_0)
+; LA64-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI11_0)
+; LA64-NEXT:    xvshuf.d $xr2, $xr1, $xr0
+; LA64-NEXT:    xvst $xr2, $a0, 0
+; LA64-NEXT:    ret
   %v0 = load <4 x double>, ptr %a0
   %v1 = load <4 x double>, ptr %a1
   %sel = select <4 x i1> , <4 x double> %v0, <4 x double> %v1
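The rewritten checks in these tests rest on one identity: with a mask built as in lowerVSELECT, a vector shuffle over (LHS, RHS) reproduces the lane-wise select exactly. Below is a small self-contained C++ check of that identity; applyShuffle is an illustrative model of ISD::VECTOR_SHUFFLE's semantics (indices 0..N-1 read the first operand, N..2N-1 the second), not LLVM code.

#include <cassert>
#include <cstdint>
#include <vector>

// Model of vector_shuffle: index < N reads LHS, otherwise RHS.
static std::vector<int64_t> applyShuffle(const std::vector<int64_t> &LHS,
                                         const std::vector<int64_t> &RHS,
                                         const std::vector<int> &Mask) {
  std::vector<int64_t> Out;
  int N = static_cast<int>(LHS.size());
  for (int Idx : Mask)
    Out.push_back(Idx < N ? LHS[Idx] : RHS[Idx - N]);
  return Out;
}

int main() {
  std::vector<int64_t> LHS = {10, 11, 12, 13}, RHS = {20, 21, 22, 23};
  std::vector<bool> Cond = {true, false, false, true};

  // Mask construction as in lowerVSELECT: i for true lanes, i + N for false.
  std::vector<int> Mask;
  for (int i = 0; i < 4; ++i)
    Mask.push_back(Cond[i] ? i : i + 4);

  // The shuffle result equals the lane-wise select in every lane.
  std::vector<int64_t> Res = applyShuffle(LHS, RHS, Mask);
  for (int i = 0; i < 4; ++i)
    assert(Res[i] == (Cond[i] ? LHS[i] : RHS[i]));
  return 0;
}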
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
index 25c4f099099db..e944c1d7fc418 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64
 
 define void @select_v16i8_imm(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: select_v16i8_imm:
@@ -51,9 +51,7 @@ define void @select_v16i8_1(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT:    vshuf4i.d $vr0, $vr1, 6
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i8>, ptr %a0
@@ -70,7 +68,7 @@ define void @select_v16i8_2(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK-NEXT:    vld $vr1, $a2, 0
 ; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI4_0)
 ; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr0, $vr2
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i8>, ptr %a0
@@ -102,9 +100,7 @@ define void @select_v8i16_1(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT:    vshuf4i.d $vr0, $vr1, 12
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x i16>, ptr %a0
@@ -121,8 +117,8 @@ define void @select_v8i16_2(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK-NEXT:    vld $vr1, $a2, 0
 ; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI7_0)
 ; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
-; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vst $vr2, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x i16>, ptr %a0
   %v1 = load <8 x i16>, ptr %a1
@@ -152,9 +148,7 @@ define void @select_v4i32_1(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT:    vshuf4i.d $vr0, $vr1, 12
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <4 x i32>, ptr %a0
@@ -171,8 +165,8 @@ define void @select_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind {
 ; CHECK-NEXT:    vld $vr1, $a2, 0
 ; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI10_0)
 ; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
-; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vst $vr2, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <4 x float>, ptr %a0
   %v1 = load <4 x float>, ptr %a1
@@ -182,15 +176,23 @@ define void @select_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind {
-; CHECK-LABEL: select_v2i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
-; CHECK-NEXT:    vst $vr0, $a0, 0
-; CHECK-NEXT:    ret
+; LA32-LABEL: select_v2i64:
+; LA32:       # %bb.0:
+; LA32-NEXT:    vld $vr0, $a1, 0
+; LA32-NEXT:    vld $vr1, $a2, 0
+; LA32-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI11_0)
+; LA32-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI11_0)
+; LA32-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
+; LA32-NEXT:    vst $vr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: select_v2i64:
+; LA64:       # %bb.0:
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vshuf4i.d $vr0, $vr1, 12
+; LA64-NEXT:    vst $vr0, $a0, 0
+; LA64-NEXT:    ret
   %v0 = load <2 x i64>, ptr %a0
   %v1 = load <2 x i64>, ptr %a1
   %sel = select <2 x i1> , <2 x i64> %v0, <2 x i64> %v1
@@ -199,15 +201,23 @@ define void @select_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind {
-; CHECK-LABEL: select_v2f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vld $vr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI12_0)
-; CHECK-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI12_0)
-; CHECK-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
-; CHECK-NEXT:    vst $vr0, $a0, 0
-; CHECK-NEXT:    ret
+; LA32-LABEL: select_v2f64:
+; LA32:       # %bb.0:
+; LA32-NEXT:    vld $vr0, $a1, 0
+; LA32-NEXT:    vld $vr1, $a2, 0
+; LA32-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI12_0)
+; LA32-NEXT:    vld $vr2, $a1, %pc_lo12(.LCPI12_0)
+; LA32-NEXT:    vbitsel.v $vr0, $vr1, $vr0, $vr2
+; LA32-NEXT:    vst $vr0, $a0, 0
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: select_v2f64:
+; LA64:       # %bb.0:
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vld $vr1, $a2, 0
+; LA64-NEXT:    vshuf4i.d $vr0, $vr1, 6
+; LA64-NEXT:    vst $vr0, $a0, 0
+; LA64-NEXT:    ret
   %v0 = load <2 x double>, ptr %a0
   %v1 = load <2 x double>, ptr %a1
   %sel = select <2 x i1> , <2 x double> %v0, <2 x double> %v1
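One last note, on the splat early-out in lowerVSELECT: when the constant condition is a splat, materializing the mask with a single broadcast (vldi or a constant splat) and feeding it to one vbitsel.v is already cheap, so the vselect is returned unchanged to the existing tablegen patterns; that is likely why the select_v*_imm tests above did not change. The sketch below is a simplified stand-in for that check; the real BuildVectorSDNode::isConstantSplat additionally handles undef lanes and probes several splat widths (8/16/32/64 bits).

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// Simplified splat test: the condition counts as a splat if all lane
// constants are identical. lowerVSELECT keeps the vbitsel path for this
// case instead of forming a shuffle.
static bool conditionIsSplat(const std::vector<uint64_t> &CondBits) {
  return std::adjacent_find(CondBits.begin(), CondBits.end(),
                            std::not_equal_to<uint64_t>()) == CondBits.end();
}

int main() {
  std::printf("%d\n", conditionIsSplat({0xFF, 0xFF, 0xFF, 0xFF})); // 1: splat
  std::printf("%d\n", conditionIsSplat({0xFF, 0x00, 0xFF, 0x00})); // 0: mixed
  return 0;
}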