diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 94f53d5b85f10..d41f31105b09a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -594,6 +594,59 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
   return SDValue();
 }
 
+// Helper to attempt to return a cheaper, bit-inverted version of \p V.
+static SDValue isNOT(SDValue V, SelectionDAG &DAG) {
+  // TODO: don't always ignore oneuse constraints.
+  V = peekThroughBitcasts(V);
+  EVT VT = V.getValueType();
+
+  // Match not(xor X, -1) -> X.
+  if (V.getOpcode() == ISD::XOR &&
+      (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
+       isAllOnesConstant(V.getOperand(1))))
+    return V.getOperand(0);
+
+  // Match not(extract_subvector(not(X)) -> extract_subvector(X).
+  if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
+    if (SDValue Not = isNOT(V.getOperand(0), DAG)) {
+      Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
+                         V.getOperand(1));
+    }
+  }
+
+  // Match not(SplatVector(not(X)) -> SplatVector(X).
+  if (V.getOpcode() == ISD::BUILD_VECTOR) {
+    if (SDValue SplatValue =
+            cast<BuildVectorSDNode>(V.getNode())->getSplatValue()) {
+      if (!V->isOnlyUserOf(SplatValue.getNode()))
+        return SDValue();
+
+      if (SDValue Not = isNOT(SplatValue, DAG)) {
+        Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+        return DAG.getSplat(VT, SDLoc(Not), Not);
+      }
+    }
+  }
+
+  // Match not(or(not(X),not(Y))) -> and(X, Y).
+  if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
+    // TODO: Handle cases with single NOT operand -> VANDN
+    if (SDValue Op1 = isNOT(V.getOperand(1), DAG))
+      if (SDValue Op0 = isNOT(V.getOperand(0), DAG))
+        return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
+                           DAG.getBitcast(VT, Op1));
+  }
+
+  // TODO: Add more matching patterns. Such as,
+  //       not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
+  //       not(slt(C, X)) -> slt(X - 1, C)
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::lowerConstantFP(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -4939,6 +4992,33 @@ void LoongArchTargetLowering::ReplaceNodeResults(
   }
 }
 
+/// Try to fold: (and (xor X, -1), Y) -> (vandn X, Y).
+static SDValue combineAndNotIntoVANDN(SDNode *N, const SDLoc &DL,
+                                      SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDN");
+
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector() && !VT.is256BitVector())
+    return SDValue();
+
+  SDValue X, Y;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (SDValue Not = isNOT(N0, DAG)) {
+    X = Not;
+    Y = N1;
+  } else if (SDValue Not = isNOT(N1, DAG)) {
+    X = Not;
+    Y = N0;
+  } else
+    return SDValue();
+
+  X = DAG.getBitcast(VT, X);
+  Y = DAG.getBitcast(VT, Y);
+  return DAG.getNode(LoongArchISD::VANDN, DL, VT, X, Y);
+}
+
 static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const LoongArchSubtarget &Subtarget) {
@@ -4956,6 +5036,9 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
   SDValue NewOperand;
   MVT GRLenVT = Subtarget.getGRLenVT();
 
+  if (SDValue R = combineAndNotIntoVANDN(N, DL, DAG))
+    return R;
+
   // BSTRPICK requires the 32S feature.
   if (!Subtarget.has32S())
     return SDValue();
@@ -6628,6 +6711,69 @@ performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Do target-specific dag combines on LoongArchISD::VANDN nodes.
+static SDValue performVANDNCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const LoongArchSubtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  MVT VT = N->getSimpleValueType(0);
+  SDLoc DL(N);
+
+  // VANDN(undef, x) -> 0
+  // VANDN(x, undef) -> 0
+  if (N0.isUndef() || N1.isUndef())
+    return DAG.getConstant(0, DL, VT);
+
+  // VANDN(0, x) -> x
+  if (ISD::isBuildVectorAllZeros(N0.getNode()))
+    return N1;
+
+  // VANDN(x, 0) -> 0
+  if (ISD::isBuildVectorAllZeros(N1.getNode()))
+    return DAG.getConstant(0, DL, VT);
+
+  // VANDN(x, -1) -> NOT(x) -> XOR(x, -1)
+  if (ISD::isBuildVectorAllOnes(N1.getNode()))
+    return DAG.getNOT(DL, N0, VT);
+
+  // Turn VANDN back to AND if input is inverted.
+  if (SDValue Not = isNOT(N0, DAG))
+    return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
+
+  // Folds for better commutativity:
+  if (N1->hasOneUse()) {
+    // VANDN(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
+    if (SDValue Not = isNOT(N1, DAG))
+      return DAG.getNOT(
+          DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
+
+    // VANDN(x, SplatVector(Imm)) -> AND(NOT(x), NOT(SplatVector(~Imm)))
+    //                            -> NOT(OR(x, SplatVector(-Imm))
+    // Combination is performed only when VT is v16i8/v32i8, using `vnori.b` to
+    // gain benefits.
+    if (!DCI.isBeforeLegalizeOps() && (VT == MVT::v16i8 || VT == MVT::v32i8) &&
+        N1.getOpcode() == ISD::BUILD_VECTOR) {
+      if (SDValue SplatValue =
+              cast<BuildVectorSDNode>(N1.getNode())->getSplatValue()) {
+        if (!N1->isOnlyUserOf(SplatValue.getNode()))
+          return SDValue();
+
+        if (auto *C = dyn_cast<ConstantSDNode>(SplatValue)) {
+          uint8_t NCVal = static_cast<uint8_t>(~(C->getSExtValue()));
+          SDValue Not =
+              DAG.getSplat(VT, DL, DAG.getTargetConstant(NCVal, DL, MVT::i8));
+          return DAG.getNOT(
+              DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)),
+              VT);
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -6663,6 +6809,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget);
   case ISD::EXTRACT_VECTOR_ELT:
     return performEXTRACT_VECTOR_ELTCombine(N, DAG, DCI, Subtarget);
+  case LoongArchISD::VANDN:
+    return performVANDNCombine(N, DAG, DCI, Subtarget);
   }
   return SDValue();
 }
@@ -7454,6 +7602,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
+    NODE_NAME_CASE(VANDN)
     NODE_NAME_CASE(VALL_ZERO)
     NODE_NAME_CASE(VANY_ZERO)
     NODE_NAME_CASE(VALL_NONZERO)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 3c00296116ac2..ed4f618983014 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -174,6 +174,9 @@ enum NodeType : unsigned {
   VBSLL,
   VBSRL,
 
+  // Vector bit operation
+  VANDN,
+
   // Scalar load broadcast to vector
   VLDREPL,
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index adfe990ba1234..b7f5993103286 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1395,7 +1395,7 @@ def : Pat<(vnot (or (vt LASX256:$xj), (vt LASX256:$xk))),
           (XVNOR_V LASX256:$xj, LASX256:$xk)>;
 // XVANDN_V
 foreach vt = [v32i8, v16i16, v8i32, v4i64] in
-def : Pat<(and (vt (vnot LASX256:$xj)), (vt LASX256:$xk)),
+def : Pat<(loongarch_vandn (vt LASX256:$xj), (vt LASX256:$xk)),
           (XVANDN_V LASX256:$xj, LASX256:$xk)>;
 // XVORN_V
 foreach vt = [v32i8, v16i16, v8i32, v4i64] in
@@ -1449,25 +1449,25 @@ defm : PatXr;
 defm : PatXr;
 
 // XVBITCLR_{B/H/W/D}
-def : Pat<(and v32i8:$xj, (vnot (shl vsplat_imm_eq_1, v32i8:$xk))),
+def : Pat<(loongarch_vandn (v32i8 (shl vsplat_imm_eq_1, v32i8:$xk)), v32i8:$xj),
           (v32i8 (XVBITCLR_B v32i8:$xj, v32i8:$xk))>;
-def : Pat<(and v16i16:$xj, (vnot (shl vsplat_imm_eq_1, v16i16:$xk))),
+def : Pat<(loongarch_vandn (v16i16 (shl vsplat_imm_eq_1, v16i16:$xk)), v16i16:$xj),
           (v16i16 (XVBITCLR_H v16i16:$xj, v16i16:$xk))>;
-def : Pat<(and v8i32:$xj, (vnot (shl vsplat_imm_eq_1, v8i32:$xk))),
+def : Pat<(loongarch_vandn (v8i32 (shl vsplat_imm_eq_1, v8i32:$xk)), v8i32:$xj),
           (v8i32 (XVBITCLR_W v8i32:$xj, v8i32:$xk))>;
-def : Pat<(and v4i64:$xj, (vnot (shl vsplat_imm_eq_1, v4i64:$xk))),
+def : Pat<(loongarch_vandn (v4i64 (shl vsplat_imm_eq_1, v4i64:$xk)), v4i64:$xj),
           (v4i64 (XVBITCLR_D v4i64:$xj, v4i64:$xk))>;
-def : Pat<(and v32i8:$xj, (vnot (shl vsplat_imm_eq_1,
-                                     (vsplati8imm7 v32i8:$xk)))),
+def : Pat<(loongarch_vandn (v32i8 (shl vsplat_imm_eq_1,
+                                       (vsplati8imm7 v32i8:$xk))), v32i8:$xj),
           (v32i8 (XVBITCLR_B v32i8:$xj, v32i8:$xk))>;
-def : Pat<(and v16i16:$xj, (vnot (shl vsplat_imm_eq_1,
-                                      (vsplati16imm15 v16i16:$xk)))),
+def : Pat<(loongarch_vandn (v16i16 (shl vsplat_imm_eq_1,
+                                        (vsplati16imm15 v16i16:$xk))), v16i16:$xj),
           (v16i16 (XVBITCLR_H v16i16:$xj, v16i16:$xk))>;
-def : Pat<(and v8i32:$xj, (vnot (shl vsplat_imm_eq_1,
-                                     (vsplati32imm31 v8i32:$xk)))),
+def : Pat<(loongarch_vandn (v8i32 (shl vsplat_imm_eq_1,
+                                       (vsplati32imm31 v8i32:$xk))), v8i32:$xj),
           (v8i32 (XVBITCLR_W v8i32:$xj, v8i32:$xk))>;
-def : Pat<(and v4i64:$xj, (vnot (shl vsplat_imm_eq_1,
-                                     (vsplati64imm63 v4i64:$xk)))),
+def : Pat<(loongarch_vandn (v4i64 (shl vsplat_imm_eq_1,
+                                       (vsplati64imm63 v4i64:$xk))), v4i64:$xj),
           (v4i64 (XVBITCLR_D v4i64:$xj, v4i64:$xk))>;
 
 // XVBITCLRI_{B/H/W/D}
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 2c36099f8eb71..c5ce7b4e02678 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -56,6 +56,7 @@ def loongarch_vpackev: SDNode<"LoongArchISD::VPACKEV", SDT_LoongArchV2R>;
 def loongarch_vpackod: SDNode<"LoongArchISD::VPACKOD", SDT_LoongArchV2R>;
 def loongarch_vilvl: SDNode<"LoongArchISD::VILVL", SDT_LoongArchV2R>;
 def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>;
+def loongarch_vandn: SDNode<"LoongArchISD::VANDN", SDT_LoongArchV2R>;
 def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_LoongArchV1RUimm>;
 def loongarch_vshuf4i_d : SDNode<"LoongArchISD::VSHUF4I", SDT_LoongArchV2RUimm>;
 
@@ -1586,7 +1587,7 @@ def : Pat<(vnot (or (vt LSX128:$vj), (vt LSX128:$vk))),
           (VNOR_V LSX128:$vj, LSX128:$vk)>;
 // VANDN_V
 foreach vt = [v16i8, v8i16, v4i32, v2i64] in
-def : Pat<(and (vt (vnot LSX128:$vj)), (vt LSX128:$vk)),
+def : Pat<(loongarch_vandn (vt LSX128:$vj), (vt LSX128:$vk)),
           (VANDN_V LSX128:$vj, LSX128:$vk)>;
 // VORN_V
 foreach vt = [v16i8, v8i16, v4i32, v2i64] in
@@ -1640,25 +1641,25 @@ defm : PatVr;
 defm : PatVr;
 
 // VBITCLR_{B/H/W/D}
-def : Pat<(and v16i8:$vj, (vnot (shl vsplat_imm_eq_1, v16i8:$vk))),
+def : Pat<(loongarch_vandn (v16i8 (shl vsplat_imm_eq_1, v16i8:$vk)), v16i8:$vj),
           (v16i8 (VBITCLR_B v16i8:$vj, v16i8:$vk))>;
-def : Pat<(and v8i16:$vj, (vnot (shl vsplat_imm_eq_1, v8i16:$vk))),
+def : Pat<(loongarch_vandn (v8i16 (shl vsplat_imm_eq_1, v8i16:$vk)), v8i16:$vj),
           (v8i16 (VBITCLR_H v8i16:$vj, v8i16:$vk))>;
-def : Pat<(and v4i32:$vj, (vnot (shl vsplat_imm_eq_1, v4i32:$vk))),
+def : Pat<(loongarch_vandn (v4i32 (shl vsplat_imm_eq_1, v4i32:$vk)), v4i32:$vj),
           (v4i32 (VBITCLR_W v4i32:$vj, v4i32:$vk))>;
-def : Pat<(and v2i64:$vj, (vnot (shl vsplat_imm_eq_1, v2i64:$vk))),
+def : Pat<(loongarch_vandn (v2i64 (shl vsplat_imm_eq_1, v2i64:$vk)), v2i64:$vj),
           (v2i64 (VBITCLR_D v2i64:$vj, v2i64:$vk))>;
-def : Pat<(and v16i8:$vj, (vnot (shl vsplat_imm_eq_1,
-                                     (vsplati8imm7 v16i8:$vk)))),
+def : Pat<(loongarch_vandn (v16i8 (shl vsplat_imm_eq_1,
+                                       (vsplati8imm7 v16i8:$vk))), v16i8:$vj),
           (v16i8 (VBITCLR_B v16i8:$vj, v16i8:$vk))>;
-def : Pat<(and v8i16:$vj, (vnot (shl vsplat_imm_eq_1,
-                                     (vsplati16imm15 v8i16:$vk)))),
+def : Pat<(loongarch_vandn (v8i16 (shl vsplat_imm_eq_1,
+                                       (vsplati16imm15 v8i16:$vk))), v8i16:$vj),
           (v8i16 (VBITCLR_H v8i16:$vj, v8i16:$vk))>;
-def : Pat<(and v4i32:$vj, (vnot (shl vsplat_imm_eq_1,
-                                     (vsplati32imm31 v4i32:$vk)))),
+def : Pat<(loongarch_vandn (v4i32 (shl vsplat_imm_eq_1,
+                                       (vsplati32imm31 v4i32:$vk))), v4i32:$vj),
           (v4i32 (VBITCLR_W v4i32:$vj, v4i32:$vk))>;
-def : Pat<(and v2i64:$vj, (vnot (shl vsplat_imm_eq_1,
-                                     (vsplati64imm63 v2i64:$vk)))),
+def : Pat<(loongarch_vandn (v2i64 (shl vsplat_imm_eq_1,
+                                       (vsplati64imm63 v2i64:$vk))), v2i64:$vj),
           (v2i64 (VBITCLR_D v2i64:$vj, v2i64:$vk))>;
 
 // VBITCLRI_{B/H/W/D}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll b/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll
index aa67a20ab08a7..5ed49d959bf33 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll
@@ -90,9 +90,8 @@ define void @pre_not_and_not_combine_v32i8(ptr %res, ptr %a, i8 %b) nounwind {
 ; CHECK-LABEL: pre_not_and_not_combine_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    nor $a1, $a2, $zero
-; CHECK-NEXT:    xvreplgr2vr.b $xr1, $a1
-; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvreplgr2vr.b $xr1, $a2
+; CHECK-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <32 x i8>, ptr %a
@@ -110,8 +109,7 @@ define void @post_not_and_not_combine_v32i8(ptr %res, ptr %a, i8 %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvreplgr2vr.b $xr1, $a2
-; CHECK-NEXT:    xvxori.b $xr1, $xr1, 255
-; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <32 x i8>, ptr %a
@@ -128,9 +126,8 @@ define void @pre_not_and_not_combine_v16i16(ptr %res, ptr %a, i16 %b) nounwind {
 ; CHECK-LABEL: pre_not_and_not_combine_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    nor $a1, $a2, $zero
-; CHECK-NEXT:    xvreplgr2vr.h $xr1, $a1
-; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvreplgr2vr.h $xr1, $a2
+; CHECK-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i16>, ptr %a
@@ -148,9 +145,7 @@ define void @post_not_and_not_combine_v16i16(ptr %res, ptr %a, i16 %b) nounwind
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvreplgr2vr.h $xr1, $a2
-; CHECK-NEXT:    xvrepli.b $xr2, -1
-; CHECK-NEXT:    xvxor.v $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i16>, ptr %a
@@ -167,9 +162,8 @@ define void @pre_not_and_not_combine_v8i32(ptr %res, ptr %a, i32 %b) nounwind {
 ; CHECK-LABEL: pre_not_and_not_combine_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    nor $a1, $a2, $zero
-; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a1
-; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a2
+; CHECK-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x i32>, ptr %a
@@ -187,9 +181,7 @@ define void @post_not_and_not_combine_v8i32(ptr %res, ptr %a, i32 %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a2
-; CHECK-NEXT:    xvrepli.b $xr2, -1
-; CHECK-NEXT:    xvxor.v $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x i32>, ptr %a
@@ -218,9 +210,8 @@ define void @pre_not_and_not_combine_v4i64(ptr %res, ptr %a, i64 %b) nounwind {
 ; LA64-LABEL: pre_not_and_not_combine_v4i64:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    nor $a1, $a2, $zero
-; LA64-NEXT:    xvreplgr2vr.d $xr1, $a1
-; LA64-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; LA64-NEXT:    xvreplgr2vr.d $xr1, $a2
+; LA64-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; LA64-NEXT:    xvst $xr0, $a0, 0
 ; LA64-NEXT:    ret
   %v0 = load <4 x i64>, ptr %a
@@ -240,9 +231,7 @@ define void @post_not_and_not_combine_v4i64(ptr %res, ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 0
 ; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 1
 ; LA32-NEXT:    xvreplve0.d $xr1, $xr1
-; LA32-NEXT:    xvrepli.b $xr2, -1
-; LA32-NEXT:    xvxor.v $xr1, $xr1, $xr2
-; LA32-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; LA32-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; LA32-NEXT:    xvst $xr0, $a0, 0
 ; LA32-NEXT:    ret
 ;
@@ -250,9 +239,7 @@ define void @post_not_and_not_combine_v4i64(ptr %res, ptr %a, i64 %b) nounwind {
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    xvld $xr0, $a1, 0
 ; LA64-NEXT:    xvreplgr2vr.d $xr1, $a2
-; LA64-NEXT:    xvrepli.b $xr2, -1
-; LA64-NEXT:    xvxor.v $xr1, $xr1, $xr2
-; LA64-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; LA64-NEXT:    xvnor.v $xr0, $xr0, $xr1
 ; LA64-NEXT:    xvst $xr0, $a0, 0
 ; LA64-NEXT:    ret
   %v0 = load <4 x i64>, ptr %a
@@ -269,8 +256,7 @@ define void @and_not_combine_splatimm_v32i8(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: and_not_combine_splatimm_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvrepli.b $xr1, -4
-; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvnori.b $xr0, $xr0, 3
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <32 x i8>, ptr %a0
@@ -332,10 +318,9 @@ define void @and_or_not_combine_v32i8(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounw
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    xvld $xr2, $a1, 0
 ; CHECK-NEXT:    xvseq.b $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvxori.b $xr0, $xr0, 255
 ; CHECK-NEXT:    xvseq.b $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvorn.v $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvandi.b $xr0, $xr0, 4
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvnori.b $xr0, $xr0, 251
 ; CHECK-NEXT:    xvst $xr0, $a3, 0
 ; CHECK-NEXT:    ret
   %a = load <32 x i8>, ptr %pa
@@ -357,12 +342,10 @@ define void @and_or_not_combine_v16i16(ptr %pa, ptr %pb, ptr %pv, ptr %dst) noun
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    xvld $xr2, $a1, 0
 ; CHECK-NEXT:    xvseq.h $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvrepli.b $xr3, -1
-; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr3
 ; CHECK-NEXT:    xvseq.h $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvorn.v $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvrepli.h $xr1, 4
 ; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvrepli.h $xr1, 4
+; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a3, 0
 ; CHECK-NEXT:    ret
   %a = load <16 x i16>, ptr %pa
@@ -384,12 +367,10 @@ define void @and_or_not_combine_v8i32(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounw
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    xvld $xr2, $a1, 0
 ; CHECK-NEXT:    xvseq.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvrepli.b $xr3, -1
-; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr3
 ; CHECK-NEXT:    xvseq.w $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvorn.v $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvrepli.w $xr1, 4
 ; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvrepli.w $xr1, 4
+; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a3, 0
 ; CHECK-NEXT:    ret
   %a = load <8 x i32>, ptr %pa
@@ -411,12 +392,10 @@ define void @and_or_not_combine_v4i64(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounw
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    xvld $xr2, $a1, 0
 ; CHECK-NEXT:    xvseq.d $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvrepli.b $xr3, -1
-; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr3
 ; CHECK-NEXT:    xvseq.d $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvorn.v $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvrepli.d $xr1, 4
 ; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvrepli.d $xr1, 4
+; CHECK-NEXT:    xvandn.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a3, 0
 ; CHECK-NEXT:    ret
   %a = load <4 x i64>, ptr %pa
@@ -435,9 +414,8 @@ define void @and_extract_subvector_not_combine_v32i8(ptr %pa, ptr %dst) nounwind
 ; CHECK-LABEL: and_extract_subvector_not_combine_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvxori.b $xr0, $xr0, 255
 ; CHECK-NEXT:    xvpermi.q $xr0, $xr0, 1
-; CHECK-NEXT:    vandi.b $vr0, $vr0, 4
+; CHECK-NEXT:    vnori.b $vr0, $vr0, 251
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %a = load volatile <32 x i8>, ptr %pa
@@ -454,11 +432,9 @@ define void @and_extract_subvector_not_combine_v16i16(ptr %pa, ptr %dst) nounwin
 ; CHECK-LABEL: and_extract_subvector_not_combine_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvrepli.b $xr1, -1
-; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvpermi.q $xr0, $xr0, 1
 ; CHECK-NEXT:    vrepli.h $vr1, 4
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %a = load volatile <16 x i16>, ptr %pa
@@ -474,11 +450,9 @@ define void @and_extract_subvector_not_combine_v8i32(ptr %pa, ptr %dst) nounwind
 ; CHECK-LABEL: and_extract_subvector_not_combine_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvrepli.b $xr1, -1
-; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvpermi.q $xr0, $xr0, 1
 ; CHECK-NEXT:    vrepli.w $vr1, 4
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %a = load volatile <8 x i32>, ptr %pa
@@ -493,11 +467,9 @@ define void @and_extract_subvector_not_combine_v4i64(ptr %pa, ptr %dst) nounwind
 ; CHECK-LABEL: and_extract_subvector_not_combine_v4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvrepli.b $xr1, -1
-; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvpermi.q $xr0, $xr0, 1
 ; CHECK-NEXT:    vrepli.d $vr1, 4
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %a = load volatile <4 x i64>, ptr %pa
diff --git a/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll b/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll
index 960d8c4b156b5..f439a33230596 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll
@@ -90,9 +90,8 @@ define void @pre_not_and_not_combine_v16i8(ptr %res, ptr %a, i8 %b) nounwind {
 ; CHECK-LABEL: pre_not_and_not_combine_v16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    nor $a1, $a2, $zero
-; CHECK-NEXT:    vreplgr2vr.b $vr1, $a1
-; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vreplgr2vr.b $vr1, $a2
+; CHECK-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i8>, ptr %a
@@ -110,8 +109,7 @@ define void @post_not_and_not_combine_v16i8(ptr %res, ptr %a, i8 %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vreplgr2vr.b $vr1, $a2
-; CHECK-NEXT:    vxori.b $vr1, $vr1, 255
-; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i8>, ptr %a
@@ -128,9 +126,8 @@ define void @pre_not_and_not_combine_v8i16(ptr %res, ptr %a, i16 %b) nounwind {
 ; CHECK-LABEL: pre_not_and_not_combine_v8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    nor $a1, $a2, $zero
-; CHECK-NEXT:    vreplgr2vr.h $vr1, $a1
-; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vreplgr2vr.h $vr1, $a2
+; CHECK-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x i16>, ptr %a
@@ -148,9 +145,7 @@ define void @post_not_and_not_combine_v8i16(ptr %res, ptr %a, i16 %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vreplgr2vr.h $vr1, $a2
-; CHECK-NEXT:    vrepli.b $vr2, -1
-; CHECK-NEXT:    vxor.v $vr1, $vr1, $vr2
-; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x i16>, ptr %a
@@ -167,9 +162,8 @@ define void @pre_not_and_not_combine_v4i32(ptr %res, ptr %a, i32 %b) nounwind {
 ; CHECK-LABEL: pre_not_and_not_combine_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    nor $a1, $a2, $zero
-; CHECK-NEXT:    vreplgr2vr.w $vr1, $a1
-; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vreplgr2vr.w $vr1, $a2
+; CHECK-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <4 x i32>, ptr %a
@@ -187,9 +181,7 @@ define void @post_not_and_not_combine_v4i32(ptr %res, ptr %a, i32 %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
 ; CHECK-NEXT:    vreplgr2vr.w $vr1, $a2
-; CHECK-NEXT:    vrepli.b $vr2, -1
-; CHECK-NEXT:    vxor.v $vr1, $vr1, $vr2
-; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <4 x i32>, ptr %a
@@ -218,9 +210,8 @@ define void @pre_not_and_not_combine_v2i64(ptr %res, ptr %a, i64 %b) nounwind {
 ; LA64-LABEL: pre_not_and_not_combine_v2i64:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    vld $vr0, $a1, 0
-; LA64-NEXT:    nor $a1, $a2, $zero
-; LA64-NEXT:    vreplgr2vr.d $vr1, $a1
-; LA64-NEXT:    vandn.v $vr0, $vr0, $vr1
+; LA64-NEXT:    vreplgr2vr.d $vr1, $a2
+; LA64-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
   %v0 = load <2 x i64>, ptr %a
@@ -240,9 +231,7 @@ define void @post_not_and_not_combine_v2i64(ptr %res, ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 0
 ; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 1
 ; LA32-NEXT:    vreplvei.d $vr1, $vr1, 0
-; LA32-NEXT:    vrepli.b $vr2, -1
-; LA32-NEXT:    vxor.v $vr1, $vr1, $vr2
-; LA32-NEXT:    vandn.v $vr0, $vr0, $vr1
+; LA32-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; LA32-NEXT:    vst $vr0, $a0, 0
 ; LA32-NEXT:    ret
 ;
@@ -250,9 +239,7 @@ define void @post_not_and_not_combine_v2i64(ptr %res, ptr %a, i64 %b) nounwind {
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    vld $vr0, $a1, 0
 ; LA64-NEXT:    vreplgr2vr.d $vr1, $a2
-; LA64-NEXT:    vrepli.b $vr2, -1
-; LA64-NEXT:    vxor.v $vr1, $vr1, $vr2
-; LA64-NEXT:    vandn.v $vr0, $vr0, $vr1
+; LA64-NEXT:    vnor.v $vr0, $vr0, $vr1
 ; LA64-NEXT:    vst $vr0, $a0, 0
 ; LA64-NEXT:    ret
   %v0 = load <2 x i64>, ptr %a
@@ -269,8 +256,7 @@ define void @and_not_combine_splatimm_v16i8(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: and_not_combine_splatimm_v16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vrepli.b $vr1, -4
-; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vnori.b $vr0, $vr0, 3
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i8>, ptr %a0
@@ -332,10 +318,9 @@ define void @and_or_not_combine_v16i8(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounw
 ; CHECK-NEXT:    vld $vr1, $a2, 0
 ; CHECK-NEXT:    vld $vr2, $a1, 0
 ; CHECK-NEXT:    vseq.b $vr0, $vr1, $vr0
-; CHECK-NEXT:    vxori.b $vr0, $vr0, 255
 ; CHECK-NEXT:    vseq.b $vr1, $vr1, $vr2
-; CHECK-NEXT:    vorn.v $vr0, $vr0, $vr1
-; CHECK-NEXT:    vandi.b $vr0, $vr0, 4
+; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vnori.b $vr0, $vr0, 251
 ; CHECK-NEXT:    vst $vr0, $a3, 0
 ; CHECK-NEXT:    ret
   %a = load <16 x i8>, ptr %pa
@@ -357,12 +342,10 @@ define void @and_or_not_combine_v8i16(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounw
 ; CHECK-NEXT:    vld $vr1, $a2, 0
 ; CHECK-NEXT:    vld $vr2, $a1, 0
 ; CHECK-NEXT:    vseq.h $vr0, $vr1, $vr0
-; CHECK-NEXT:    vrepli.b $vr3, -1
-; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr3
 ; CHECK-NEXT:    vseq.h $vr1, $vr1, $vr2
-; CHECK-NEXT:    vorn.v $vr0, $vr0, $vr1
-; CHECK-NEXT:    vrepli.h $vr1, 4
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vrepli.h $vr1, 4
+; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a3, 0
 ; CHECK-NEXT:    ret
   %a = load <8 x i16>, ptr %pa
@@ -384,12 +367,10 @@ define void @and_or_not_combine_v4i32(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounw
 ; CHECK-NEXT:    vld $vr1, $a2, 0
 ; CHECK-NEXT:    vld $vr2, $a1, 0
 ; CHECK-NEXT:    vseq.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    vrepli.b $vr3, -1
-; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr3
 ; CHECK-NEXT:    vseq.w $vr1, $vr1, $vr2
-; CHECK-NEXT:    vorn.v $vr0, $vr0, $vr1
-; CHECK-NEXT:    vrepli.w $vr1, 4
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vrepli.w $vr1, 4
+; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a3, 0
 ; CHECK-NEXT:    ret
   %a = load <4 x i32>, ptr %pa
@@ -411,12 +392,10 @@ define void @and_or_not_combine_v2i64(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounw
 ; CHECK-NEXT:    vld $vr1, $a2, 0
 ; CHECK-NEXT:    vld $vr2, $a1, 0
 ; CHECK-NEXT:    vseq.d $vr0, $vr1, $vr0
-; CHECK-NEXT:    vrepli.b $vr3, -1
-; CHECK-NEXT:    vxor.v $vr0, $vr0, $vr3
 ; CHECK-NEXT:    vseq.d $vr1, $vr1, $vr2
-; CHECK-NEXT:    vorn.v $vr0, $vr0, $vr1
-; CHECK-NEXT:    vrepli.d $vr1, 4
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vrepli.d $vr1, 4
+; CHECK-NEXT:    vandn.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vst $vr0, $a3, 0
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %pa