From 5e31eef765246e51c849618516fbd6162b5b6260 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Mon, 1 Sep 2025 14:11:57 +0000 Subject: [PATCH 1/4] [AArch64] Add test coverage for some aba/abal cases (NFC) --- llvm/test/CodeGen/AArch64/neon-saba.ll | 262 +++++++++++++++++++++++++ 1 file changed, 262 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll index 19967bd1a69ec..2cb5053e07d6a 100644 --- a/llvm/test/CodeGen/AArch64/neon-saba.ll +++ b/llvm/test/CodeGen/AArch64/neon-saba.ll @@ -174,6 +174,268 @@ define <8 x i8> @saba_sabd_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { ret <8 x i8> %add } +; SABA from ADD(SABD(X, ZEROS)) + +define <4 x i32> @saba_sabd_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { +; CHECK-SD-LABEL: saba_sabd_zeros_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.4s, v1.4s +; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ret + %sabd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %b, <4 x i32> zeroinitializer) + %add = add <4 x i32> %sabd, %a + ret <4 x i32> %add +} + +define <2 x i32> @saba_sabd_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { +; CHECK-SD-LABEL: saba_sabd_zeros_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.2s, v1.2s +; CHECK-SD-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: ret + %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) + %add = add <2 x i32> %sabd, %a + ret <2 x i32> %add +} + +define <8 x i16> @saba_sabd_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { +; CHECK-SD-LABEL: saba_sabd_zeros_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.8h, v1.8h +; 
CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ret + %sabd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %b, <8 x i16> zeroinitializer) + %add = add <8 x i16> %sabd, %a + ret <8 x i16> %add +} + +define <4 x i16> @saba_sabd_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { +; CHECK-SD-LABEL: saba_sabd_zeros_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.4h, v1.4h +; CHECK-SD-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ret + %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) + %add = add <4 x i16> %sabd, %a + ret <4 x i16> %add +} + +define <16 x i8> @saba_sabd_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { +; CHECK-SD-LABEL: saba_sabd_zeros_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.16b, v1.16b +; CHECK-SD-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ret + %sabd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %b, <16 x i8> zeroinitializer) + %add = add <16 x i8> %sabd, %a + ret <16 x i8> %add +} + +define <8 x i8> @saba_sabd_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { +; CHECK-SD-LABEL: saba_sabd_zeros_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.8b, v1.8b +; CHECK-SD-NEXT: add v0.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ret + %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x 
i8> %b, <8 x i8> zeroinitializer) + %add = add <8 x i8> %sabd, %a + ret <8 x i8> %add +} + +define <4 x i32> @saba_abs_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %b, i1 true) + %add = add <4 x i32> %a, %abs + ret <4 x i32> %add +} + +define <2 x i32> @saba_abs_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.2s, v1.2s +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true) + %add = add <2 x i32> %a, %abs + ret <2 x i32> %add +} + +define <8 x i16> @saba_abs_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.8h, v1.8h +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %b, i1 true) + %add = add <8 x i16> %a, %abs + ret <8 x i16> %add +} + +define <4 x i16> @saba_abs_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.4h, v1.4h +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) + %add = add <4 x i16> %a, %abs + ret <4 x i16> %add +} + +define <16 x i8> @saba_abs_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.16b, v1.16b +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %b, i1 true) + %add = add <16 x i8> %a, %abs + ret <16 x i8> %add +} + +define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.8b, v1.8b +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %abs = 
call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) + %add = add <8 x i8> %a, %abs + ret <8 x i8> %add +} + +; SABAL from ADD(ZEXT(SABD(X, ZEROS))) + +define <2 x i64> @sabal_sabd_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { +; CHECK-SD-LABEL: sabal_sabd_zeros_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.2s, v1.2s +; CHECK-SD-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_sabd_zeros_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: sabal v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: ret + %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) + %sabd.zext = zext <2 x i32> %sabd to <2 x i64> + %add = add <2 x i64> %sabd.zext, %a + ret <2 x i64> %add +} + +define <4 x i32> @sabal_sabd_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { +; CHECK-SD-LABEL: sabal_sabd_zeros_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.4h, v1.4h +; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_sabd_zeros_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: sabal v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: ret + %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) + %sabd.zext = zext <4 x i16> %sabd to <4 x i32> + %add = add <4 x i32> %sabd.zext, %a + ret <4 x i32> %add +} + +define <8 x i16> @sabal_sabd_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { +; CHECK-SD-LABEL: sabal_sabd_zeros_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.8b, v1.8b +; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_sabd_zeros_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: sabal v0.8h, v1.8b, v2.8b +; CHECK-GI-NEXT: ret + %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) + %sabd.zext = zext <8 x i8> %sabd to <8 x i16> + %add = add <8 x i16> %sabd.zext, %a + ret 
<8 x i16> %add +} + +define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { +; CHECK-LABEL: sabal_abs_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.2s, v1.2s +; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-NEXT: ret + %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true) + %abs.zext = zext <2 x i32> %abs to <2 x i64> + %add = add <2 x i64> %a, %abs.zext + ret <2 x i64> %add +} + +define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { +; CHECK-LABEL: sabal_abs_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.4h, v1.4h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ret + %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) + %abs.zext = zext <4 x i16> %abs to <4 x i32> + %add = add <4 x i32> %a, %abs.zext + ret <4 x i32> %add +} + +define <8 x i16> @sabal_abs_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { +; CHECK-LABEL: sabal_abs_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.8b, v1.8b +; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: ret + %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) + %abs.zext = zext <8 x i8> %abs to <8 x i16> + %add = add <8 x i16> %a, %abs.zext + ret <8 x i16> %add +} + declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) From d50b01b2a88e875dd0502ec10c2bdd71bd4be339 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Tue, 2 Sep 2025 15:03:11 +0000 Subject: [PATCH 2/4] [AArch64] Transform add(x, abs(y)) -> saba(x, y, 0) --- .../Target/AArch64/AArch64ISelLowering.cpp | 54 ++++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 25 ++ llvm/test/CodeGen/AArch64/neon-saba.ll | 294 +++++++++--------- 3 files changed, 226 insertions(+), 147 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d3515cf81f443..1bbfcf9fe5206 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -50,6 +50,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -21914,6 +21915,56 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); } +// Transform the following: +// - add(x, abs(y)) -> saba(x, y, 0) +// - add(x, zext(abs(y))) -> sabal(x, y, 0) +static SDValue performAddSABACombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (N->getOpcode() != ISD::ADD) + return SDValue(); + + EVT VT = N->getValueType(0); + if (!VT.isFixedLengthVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + auto MatchAbsOrZExtAbs = [](SDValue V0, SDValue V1, SDValue &AbsOp, + SDValue &Other, bool &IsZExt) { + Other = V1; + if (sd_match(V0, m_Abs(SDPatternMatch::m_Value(AbsOp)))) { + IsZExt = false; + return true; + } + if (sd_match(V0, SDPatternMatch::m_ZExt( + m_Abs(SDPatternMatch::m_Value(AbsOp))))) { + IsZExt = true; + return true; + } + + return false; + }; + + SDValue AbsOp; + SDValue Other; + bool IsZExt; + if (!MatchAbsOrZExtAbs(N0, N1, AbsOp, Other, IsZExt) && + !MatchAbsOrZExtAbs(N1, N0, AbsOp, Other, IsZExt)) + return SDValue(); + + // Don't perform this on abs(sub), as this will become an ABD/ABA anyway. + if (AbsOp.getOpcode() == ISD::SUB) + return SDValue(); + + SDLoc DL(N); + SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64); + SDValue Zeros = DCI.DAG.getSplatVector(AbsOp.getValueType(), DL, Zero); + + unsigned Opcode = IsZExt ? 
AArch64ISD::SABAL : AArch64ISD::SABA; + return DCI.DAG.getNode(Opcode, DL, VT, Other, AbsOp, Zeros); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions. @@ -21939,6 +21990,9 @@ static SDValue performAddSubCombine(SDNode *N, if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) return Val; + if (SDValue Val = performAddSABACombine(N, DCI)) + return Val; + return performAddSubLongCombine(N, DCI); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 62b26b5239365..fdfde5ea1dc37 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1059,6 +1059,10 @@ def AArch64sdot : SDNode<"AArch64ISD::SDOT", SDT_AArch64Dot>; def AArch64udot : SDNode<"AArch64ISD::UDOT", SDT_AArch64Dot>; def AArch64usdot : SDNode<"AArch64ISD::USDOT", SDT_AArch64Dot>; +// saba/sabal +def AArch64neonsaba : SDNode<"AArch64ISD::SABA", SDT_AArch64trivec>; +def AArch64neonsabal : SDNode<"AArch64ISD::SABAL", SDT_AArch64Dot>; + // Vector across-lanes addition // Only the lower result lane is defined. 
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; @@ -6121,6 +6125,19 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", int_aarch64_neon_sqrdmlsh>; +def : Pat<(AArch64neonsaba (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (SABAv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64neonsaba (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (SABAv4i16 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64neonsaba (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (SABAv2i32 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64neonsaba (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (SABAv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64neonsaba (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (SABAv8i16 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64neonsaba (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (SABAv4i32 V128:$Rd, V128:$Rn, V128:$Rm)>; + defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; @@ -7008,6 +7025,14 @@ defm : AddSubHNPatterns; +// Patterns for SABAL +def : Pat<(AArch64neonsabal (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)), + (SABALv8i8_v8i16 V128:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64neonsabal (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)), + (SABALv4i16_v4i32 V128:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64neonsabal (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)), + (SABALv2i32_v2i64 V128:$Rd, V64:$Rn, V64:$Rm)>; + //---------------------------------------------------------------------------- // AdvSIMD bitwise extract from vector instruction. 
//---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll index 2cb5053e07d6a..c8de6b21e9764 100644 --- a/llvm/test/CodeGen/AArch64/neon-saba.ll +++ b/llvm/test/CodeGen/AArch64/neon-saba.ll @@ -177,168 +177,168 @@ define <8 x i8> @saba_sabd_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { ; SABA from ADD(SABD(X, ZEROS)) define <4 x i32> @saba_sabd_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_4s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.4s, v1.4s -; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_4s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ret %sabd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %b, <4 x i32> zeroinitializer) %add = add <4 x i32> %sabd, %a ret <4 x i32> %add } define <2 x i32> @saba_sabd_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_2s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.2s, v1.2s -; CHECK-SD-NEXT: add v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_2s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-NEXT: ret %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) %add = add <2 x i32> %sabd, %a ret <2 x i32> %add } define <8 x i16> @saba_sabd_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_8h: -; CHECK-SD: // %bb.0: -; 
CHECK-SD-NEXT: abs v1.8h, v1.8h -; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_8h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-NEXT: ret %sabd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %b, <8 x i16> zeroinitializer) %add = add <8 x i16> %sabd, %a ret <8 x i16> %add } define <4 x i16> @saba_sabd_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_4h: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.4h, v1.4h -; CHECK-SD-NEXT: add v0.4h, v1.4h, v0.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_4h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.4h, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-NEXT: ret %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) %add = add <4 x i16> %sabd, %a ret <4 x i16> %add } define <16 x i8> @saba_sabd_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_16b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.16b, v1.16b -; CHECK-SD-NEXT: add v0.16b, v1.16b, v0.16b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_16b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret %sabd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %b, <16 x i8> zeroinitializer) %add = add <16 x i8> 
%sabd, %a ret <16 x i8> %add } define <8 x i8> @saba_sabd_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_8b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.8b, v1.8b -; CHECK-SD-NEXT: add v0.8b, v1.8b, v0.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_8b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.8b, v1.8b, v2.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) %add = add <8 x i8> %sabd, %a ret <8 x i8> %add } define <4 x i32> @saba_abs_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { -; CHECK-LABEL: saba_abs_zeros_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_zeros_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_zeros_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ret %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %b, i1 true) %add = add <4 x i32> %a, %abs ret <4 x i32> %add } define <2 x i32> @saba_abs_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { -; CHECK-LABEL: saba_abs_zeros_2s: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.2s, v1.2s -; CHECK-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_zeros_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_zeros_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.2s, v1.2s +; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: ret %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> 
%b, i1 true) %add = add <2 x i32> %a, %abs ret <2 x i32> %add } define <8 x i16> @saba_abs_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { -; CHECK-LABEL: saba_abs_zeros_8h: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.8h, v1.8h -; CHECK-NEXT: add v0.8h, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_zeros_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_zeros_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.8h, v1.8h +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %b, i1 true) %add = add <8 x i16> %a, %abs ret <8 x i16> %add } define <4 x i16> @saba_abs_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { -; CHECK-LABEL: saba_abs_zeros_4h: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.4h, v1.4h -; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_zeros_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_zeros_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.4h, v1.4h +; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) %add = add <4 x i16> %a, %abs ret <4 x i16> %add } define <16 x i8> @saba_abs_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { -; CHECK-LABEL: saba_abs_zeros_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.16b, v1.16b -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_zeros_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_zeros_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.16b, v1.16b +; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret %abs = call <16 x i8> @llvm.abs.v16i8(<16 
x i8> %b, i1 true) %add = add <16 x i8> %a, %abs ret <16 x i8> %add } define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { -; CHECK-LABEL: saba_abs_zeros_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.8b, v1.8b -; CHECK-NEXT: add v0.8b, v0.8b, v1.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_abs_zeros_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_abs_zeros_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.8b, v1.8b +; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) %add = add <8 x i8> %a, %abs ret <8 x i8> %add @@ -347,65 +347,53 @@ define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { ; SABAL from ADD(ZEXT(SABD(X, ZEROS))) define <2 x i64> @sabal_sabd_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { -; CHECK-SD-LABEL: sabal_sabd_zeros_2s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.2s, v1.2s -; CHECK-SD-NEXT: uaddw v0.2d, v0.2d, v1.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_sabd_zeros_2s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: sabal v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_sabd_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s +; CHECK-NEXT: ret %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) %sabd.zext = zext <2 x i32> %sabd to <2 x i64> - %add = add <2 x i64> %sabd.zext, %a + %add = add <2 x i64> %a, %sabd.zext ret <2 x i64> %add } define <4 x i32> @sabal_sabd_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { -; CHECK-SD-LABEL: sabal_sabd_zeros_4h: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.4h, v1.4h -; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_sabd_zeros_4h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, 
#0000000000000000 -; CHECK-GI-NEXT: sabal v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_sabd_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h +; CHECK-NEXT: ret %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) %sabd.zext = zext <4 x i16> %sabd to <4 x i32> - %add = add <4 x i32> %sabd.zext, %a + %add = add <4 x i32> %a, %sabd.zext ret <4 x i32> %add } define <8 x i16> @sabal_sabd_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { -; CHECK-SD-LABEL: sabal_sabd_zeros_8b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.8b, v1.8b -; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_sabd_zeros_8b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: sabal v0.8h, v1.8b, v2.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_sabd_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b +; CHECK-NEXT: ret %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) %sabd.zext = zext <8 x i8> %sabd to <8 x i16> - %add = add <8 x i16> %sabd.zext, %a + %add = add <8 x i16> %a, %sabd.zext ret <8 x i16> %add } define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { -; CHECK-LABEL: sabal_abs_zeros_2s: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.2s, v1.2s -; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sabal_abs_zeros_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: sabal v0.2d, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_abs_zeros_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.2s, v1.2s +; CHECK-GI-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-GI-NEXT: ret %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true) %abs.zext = zext <2 x i32> %abs to <2 x i64> %add = add <2 x i64> %a, %abs.zext @@ 
-413,11 +401,17 @@ define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { } define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { -; CHECK-LABEL: sabal_abs_zeros_4h: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.4h, v1.4h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sabal_abs_zeros_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: sabal v0.4s, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_abs_zeros_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.4h, v1.4h +; CHECK-GI-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-GI-NEXT: ret %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) %abs.zext = zext <4 x i16> %abs to <4 x i32> %add = add <4 x i32> %a, %abs.zext @@ -425,11 +419,17 @@ define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { } define <8 x i16> @sabal_abs_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { -; CHECK-LABEL: sabal_abs_zeros_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.8b, v1.8b -; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sabal_abs_zeros_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: sabal v0.8h, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_abs_zeros_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: abs v1.8b, v1.8b +; CHECK-GI-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: ret %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) %abs.zext = zext <8 x i8> %abs to <8 x i16> %add = add <8 x i16> %a, %abs.zext From 2ac2716d6f37722268f5b921020aa9739f52a027 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Wed, 3 Sep 2025 22:29:41 +0000 Subject: [PATCH 3/4] Revert "[AArch64] Transform add(x, abs(y)) -> saba(x, y, 0)" This reverts commit 8ac71cda207b4334f5b6ea2f4ec5d4e0fb3606d1. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 54 ---- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 25 -- llvm/test/CodeGen/AArch64/neon-saba.ll | 294 +++++++++--------- 3 files changed, 147 insertions(+), 226 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1bbfcf9fe5206..d3515cf81f443 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -50,7 +50,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -21915,56 +21914,6 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); } -// Transform the following: -// - add(x, abs(y)) -> saba(x, y, 0) -// - add(x, zext(abs(y))) -> sabal(x, y, 0) -static SDValue performAddSABACombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - if (N->getOpcode() != ISD::ADD) - return SDValue(); - - EVT VT = N->getValueType(0); - if (!VT.isFixedLengthVector()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - auto MatchAbsOrZExtAbs = [](SDValue V0, SDValue V1, SDValue &AbsOp, - SDValue &Other, bool &IsZExt) { - Other = V1; - if (sd_match(V0, m_Abs(SDPatternMatch::m_Value(AbsOp)))) { - IsZExt = false; - return true; - } - if (sd_match(V0, SDPatternMatch::m_ZExt( - m_Abs(SDPatternMatch::m_Value(AbsOp))))) { - IsZExt = true; - return true; - } - - return false; - }; - - SDValue AbsOp; - SDValue Other; - bool IsZExt; - if (!MatchAbsOrZExtAbs(N0, N1, AbsOp, Other, IsZExt) && - !MatchAbsOrZExtAbs(N1, N0, AbsOp, Other, IsZExt)) - return SDValue(); - - // Don't perform this on abs(sub), as this will become an ABD/ABA 
anyway. - if (AbsOp.getOpcode() == ISD::SUB) - return SDValue(); - - SDLoc DL(N); - SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64); - SDValue Zeros = DCI.DAG.getSplatVector(AbsOp.getValueType(), DL, Zero); - - unsigned Opcode = IsZExt ? AArch64ISD::SABAL : AArch64ISD::SABA; - return DCI.DAG.getNode(Opcode, DL, VT, Other, AbsOp, Zeros); -} - static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions. @@ -21990,9 +21939,6 @@ static SDValue performAddSubCombine(SDNode *N, if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) return Val; - if (SDValue Val = performAddSABACombine(N, DCI)) - return Val; - return performAddSubLongCombine(N, DCI); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fdfde5ea1dc37..62b26b5239365 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1059,10 +1059,6 @@ def AArch64sdot : SDNode<"AArch64ISD::SDOT", SDT_AArch64Dot>; def AArch64udot : SDNode<"AArch64ISD::UDOT", SDT_AArch64Dot>; def AArch64usdot : SDNode<"AArch64ISD::USDOT", SDT_AArch64Dot>; -// saba/sabal -def AArch64neonsaba : SDNode<"AArch64ISD::SABA", SDT_AArch64trivec>; -def AArch64neonsabal : SDNode<"AArch64ISD::SABAL", SDT_AArch64Dot>; - // Vector across-lanes addition // Only the lower result lane is defined. 
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; @@ -6125,19 +6121,6 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", int_aarch64_neon_sqrdmlsh>; -def : Pat<(AArch64neonsaba (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (SABAv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64neonsaba (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (SABAv4i16 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64neonsaba (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (SABAv2i32 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64neonsaba (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (SABAv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64neonsaba (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (SABAv8i16 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64neonsaba (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (SABAv4i32 V128:$Rd, V128:$Rn, V128:$Rm)>; - defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; @@ -7025,14 +7008,6 @@ defm : AddSubHNPatterns; -// Patterns for SABAL -def : Pat<(AArch64neonsabal (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)), - (SABALv8i8_v8i16 V128:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64neonsabal (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)), - (SABALv4i16_v4i32 V128:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64neonsabal (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)), - (SABALv2i32_v2i64 V128:$Rd, V64:$Rn, V64:$Rm)>; - //---------------------------------------------------------------------------- // AdvSIMD bitwise extract from vector instruction. 
//---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll index c8de6b21e9764..2cb5053e07d6a 100644 --- a/llvm/test/CodeGen/AArch64/neon-saba.ll +++ b/llvm/test/CodeGen/AArch64/neon-saba.ll @@ -177,168 +177,168 @@ define <8 x i8> @saba_sabd_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { ; SABA from ADD(SABD(X, ZEROS)) define <4 x i32> @saba_sabd_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { -; CHECK-LABEL: saba_sabd_zeros_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_sabd_zeros_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.4s, v1.4s +; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ret %sabd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %b, <4 x i32> zeroinitializer) %add = add <4 x i32> %sabd, %a ret <4 x i32> %add } define <2 x i32> @saba_sabd_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { -; CHECK-LABEL: saba_sabd_zeros_2s: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_sabd_zeros_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.2s, v1.2s +; CHECK-SD-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: ret %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) %add = add <2 x i32> %sabd, %a ret <2 x i32> %add } define <8 x i16> @saba_sabd_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { -; CHECK-LABEL: saba_sabd_zeros_8h: -; CHECK: // %bb.0: -; 
CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_sabd_zeros_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.8h, v1.8h +; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ret %sabd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %b, <8 x i16> zeroinitializer) %add = add <8 x i16> %sabd, %a ret <8 x i16> %add } define <4 x i16> @saba_sabd_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { -; CHECK-LABEL: saba_sabd_zeros_4h: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_sabd_zeros_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.4h, v1.4h +; CHECK-SD-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ret %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) %add = add <4 x i16> %sabd, %a ret <4 x i16> %add } define <16 x i8> @saba_sabd_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { -; CHECK-LABEL: saba_sabd_zeros_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_sabd_zeros_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.16b, v1.16b +; CHECK-SD-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ret %sabd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %b, <16 x i8> zeroinitializer) %add = add <16 x i8> 
%sabd, %a ret <16 x i8> %add } define <8 x i8> @saba_sabd_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { -; CHECK-LABEL: saba_sabd_zeros_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: saba_sabd_zeros_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.8b, v1.8b +; CHECK-SD-NEXT: add v0.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saba_sabd_zeros_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ret %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) %add = add <8 x i8> %sabd, %a ret <8 x i8> %add } define <4 x i32> @saba_abs_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { -; CHECK-SD-LABEL: saba_abs_zeros_4s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: saba v0.4s, v1.4s, v2.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_abs_zeros_4s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.4s, v1.4s -; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_abs_zeros_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %b, i1 true) %add = add <4 x i32> %a, %abs ret <4 x i32> %add } define <2 x i32> @saba_abs_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { -; CHECK-SD-LABEL: saba_abs_zeros_2s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: saba v0.2s, v1.2s, v2.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_abs_zeros_2s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.2s, v1.2s -; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_abs_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.2s, v1.2s +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> 
%b, i1 true) %add = add <2 x i32> %a, %abs ret <2 x i32> %add } define <8 x i16> @saba_abs_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { -; CHECK-SD-LABEL: saba_abs_zeros_8h: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: saba v0.8h, v1.8h, v2.8h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_abs_zeros_8h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.8h, v1.8h -; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_abs_zeros_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.8h, v1.8h +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %b, i1 true) %add = add <8 x i16> %a, %abs ret <8 x i16> %add } define <4 x i16> @saba_abs_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { -; CHECK-SD-LABEL: saba_abs_zeros_4h: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: saba v0.4h, v1.4h, v2.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_abs_zeros_4h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.4h, v1.4h -; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_abs_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.4h, v1.4h +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) %add = add <4 x i16> %a, %abs ret <4 x i16> %add } define <16 x i8> @saba_abs_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { -; CHECK-SD-LABEL: saba_abs_zeros_16b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: saba v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_abs_zeros_16b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.16b, v1.16b -; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_abs_zeros_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.16b, v1.16b +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret %abs = call <16 x i8> @llvm.abs.v16i8(<16 
x i8> %b, i1 true) %add = add <16 x i8> %a, %abs ret <16 x i8> %add } define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { -; CHECK-SD-LABEL: saba_abs_zeros_8b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: saba v0.8b, v1.8b, v2.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_abs_zeros_8b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.8b, v1.8b -; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_abs_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.8b, v1.8b +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) %add = add <8 x i8> %a, %abs ret <8 x i8> %add @@ -347,53 +347,65 @@ define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { ; SABAL from ADD(ZEXT(SABD(X, ZEROS))) define <2 x i64> @sabal_sabd_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { -; CHECK-LABEL: sabal_sabd_zeros_2s: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sabal_sabd_zeros_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.2s, v1.2s +; CHECK-SD-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_sabd_zeros_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: sabal v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: ret %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) %sabd.zext = zext <2 x i32> %sabd to <2 x i64> - %add = add <2 x i64> %a, %sabd.zext + %add = add <2 x i64> %sabd.zext, %a ret <2 x i64> %add } define <4 x i32> @sabal_sabd_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { -; CHECK-LABEL: sabal_sabd_zeros_4h: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sabal_sabd_zeros_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.4h, v1.4h +; 
CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_sabd_zeros_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: sabal v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: ret %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) %sabd.zext = zext <4 x i16> %sabd to <4 x i32> - %add = add <4 x i32> %a, %sabd.zext + %add = add <4 x i32> %sabd.zext, %a ret <4 x i32> %add } define <8 x i16> @sabal_sabd_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { -; CHECK-LABEL: sabal_sabd_zeros_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sabal_sabd_zeros_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: abs v1.8b, v1.8b +; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sabal_sabd_zeros_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: sabal v0.8h, v1.8b, v2.8b +; CHECK-GI-NEXT: ret %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) %sabd.zext = zext <8 x i8> %sabd to <8 x i16> - %add = add <8 x i16> %a, %sabd.zext + %add = add <8 x i16> %sabd.zext, %a ret <8 x i16> %add } define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { -; CHECK-SD-LABEL: sabal_abs_zeros_2s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: sabal v0.2d, v1.2s, v2.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_abs_zeros_2s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.2s, v1.2s -; CHECK-GI-NEXT: uaddw v0.2d, v0.2d, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_abs_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.2s, v1.2s +; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-NEXT: ret %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true) %abs.zext = zext <2 x i32> %abs to <2 x i64> %add = add <2 x i64> %a, %abs.zext @@ 
-401,17 +413,11 @@ define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { } define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { -; CHECK-SD-LABEL: sabal_abs_zeros_4h: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: sabal v0.4s, v1.4h, v2.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_abs_zeros_4h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.4h, v1.4h -; CHECK-GI-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_abs_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.4h, v1.4h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ret %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) %abs.zext = zext <4 x i16> %abs to <4 x i32> %add = add <4 x i32> %a, %abs.zext @@ -419,17 +425,11 @@ define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { } define <8 x i16> @sabal_abs_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { -; CHECK-SD-LABEL: sabal_abs_zeros_8b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: sabal v0.8h, v1.8b, v2.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_abs_zeros_8b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: abs v1.8b, v1.8b -; CHECK-GI-NEXT: uaddw v0.8h, v0.8h, v1.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_abs_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: abs v1.8b, v1.8b +; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: ret %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) %abs.zext = zext <8 x i8> %abs to <8 x i16> %add = add <8 x i16> %a, %abs.zext From c3249de00177e43302c2149a189a4906e91d61ed Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Wed, 3 Sep 2025 22:50:10 +0000 Subject: [PATCH 4/4] [AArch64] Select SABA/SABAL for add(x, abs(y)) via TableGen patterns --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 23 +++ llvm/test/CodeGen/AArch64/neon-saba.ll | 204 +++++++------------- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 139 +++++++------ 3 files changed, 164 insertions(+), 202 deletions(-) diff
--git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 62b26b5239365..04b67cb3103a4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -8303,6 +8303,29 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; } +// SABA patterns for add(x, abs(y)) -> saba(x, y, 0) +def : Pat<(v8i8 (add V64:$Vn, (abs V64:$Vm))), + (SABAv8i8 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v4i16 (add V64:$Vn, (abs V64:$Vm))), + (SABAv4i16 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v2i32 (add V64:$Vn, (abs V64:$Vm))), + (SABAv2i32 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v16i8 (add V128:$Vn, (abs V128:$Vm))), + (SABAv16i8 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>; +def : Pat<(v8i16 (add V128:$Vn, (abs V128:$Vm))), + (SABAv8i16 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>; +def : Pat<(v4i32 (add V128:$Vn, (abs V128:$Vm))), + (SABAv4i32 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>; + +// SABAL patterns for add(x, zext(abs(y))) -> sabal(x, y, 0) +def : Pat<(v8i16 (add V128:$Vn, (zext (abs (v8i8 V64:$Vm))))), + (SABALv8i8_v8i16 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v4i32 (add V128:$Vn, (zext (abs (v4i16 V64:$Vm))))), + (SABALv4i16_v4i32 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v2i64 (add V128:$Vn, (zext (abs (v2i32 V64:$Vm))))), + (SABALv2i32_v2i64 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; + + //---------------------------------------------------------------------------- // AdvSIMD indexed element //---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll index 2cb5053e07d6a..ddb85d6dee03c 100644 --- 
a/llvm/test/CodeGen/AArch64/neon-saba.ll +++ b/llvm/test/CodeGen/AArch64/neon-saba.ll @@ -12,9 +12,9 @@ define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_4s: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: abs v1.4s, v1.4s -; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: saba v0.4s, v1.4s, v3.4s ; CHECK-GI-NEXT: ret %sub = sub nsw <4 x i32> %b, %c %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) @@ -30,9 +30,9 @@ define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_2s: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: abs v1.2s, v1.2s -; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: saba v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret %sub = sub nsw <2 x i32> %b, %c %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true) @@ -48,9 +48,9 @@ define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_8h: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: abs v1.8h, v1.8h -; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: saba v0.8h, v1.8h, v3.8h ; CHECK-GI-NEXT: ret %sub = sub nsw <8 x i16> %b, %c %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) @@ -66,9 +66,9 @@ define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_4h: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.4h, v1.4h, v2.4h -; CHECK-GI-NEXT: abs v1.4h, v1.4h -; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: saba v0.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: ret %sub = sub nsw <4 x i16> %b, %c %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true) @@ -84,9 
+84,9 @@ define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_16b: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: abs v1.16b, v1.16b -; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: saba v0.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: ret %sub = sub nsw <16 x i8> %b, %c %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true) @@ -102,9 +102,9 @@ define <8 x i8> @saba_abs_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_8b: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.8b, v1.8b, v2.8b -; CHECK-GI-NEXT: abs v1.8b, v1.8b -; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: saba v0.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: ret %sub = sub nsw <8 x i8> %b, %c %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %sub, i1 true) @@ -177,102 +177,66 @@ define <8 x i8> @saba_sabd_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { ; SABA from ADD(SABD(X, ZEROS)) define <4 x i32> @saba_sabd_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_4s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.4s, v1.4s -; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_4s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ret %sabd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %b, <4 x i32> zeroinitializer) %add = add <4 x i32> %sabd, %a ret <4 x i32> %add } define <2 x i32> @saba_sabd_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_2s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.2s, v1.2s -; CHECK-SD-NEXT: add v0.2s, v1.2s, 
v0.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_2s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-NEXT: ret %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) %add = add <2 x i32> %sabd, %a ret <2 x i32> %add } define <8 x i16> @saba_sabd_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_8h: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.8h, v1.8h -; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_8h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-NEXT: ret %sabd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %b, <8 x i16> zeroinitializer) %add = add <8 x i16> %sabd, %a ret <8 x i16> %add } define <4 x i16> @saba_sabd_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_4h: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.4h, v1.4h -; CHECK-SD-NEXT: add v0.4h, v1.4h, v0.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_4h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.4h, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-NEXT: ret %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) %add = add <4 x i16> %sabd, %a ret <4 x i16> %add } define <16 x i8> @saba_sabd_zeros_16b(<16 x i8> %a, <16 x 
i8> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_16b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.16b, v1.16b -; CHECK-SD-NEXT: add v0.16b, v1.16b, v0.16b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_16b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret %sabd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %b, <16 x i8> zeroinitializer) %add = add <16 x i8> %sabd, %a ret <16 x i8> %add } define <8 x i8> @saba_sabd_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { -; CHECK-SD-LABEL: saba_sabd_zeros_8b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.8b, v1.8b -; CHECK-SD-NEXT: add v0.8b, v1.8b, v0.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: saba_sabd_zeros_8b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: saba v0.8b, v1.8b, v2.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: saba_sabd_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) %add = add <8 x i8> %sabd, %a ret <8 x i8> %add @@ -281,8 +245,8 @@ define <8 x i8> @saba_sabd_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { define <4 x i32> @saba_abs_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { ; CHECK-LABEL: saba_abs_zeros_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s ; CHECK-NEXT: ret %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %b, i1 true) %add = add <4 x i32> %a, %abs @@ -292,8 +256,8 @@ define <4 x i32> @saba_abs_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { define <2 x i32> @saba_abs_zeros_2s(<2 x i32> %a, <2 x i32> 
%b) #0 { ; CHECK-LABEL: saba_abs_zeros_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.2s, v1.2s -; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s ; CHECK-NEXT: ret %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true) %add = add <2 x i32> %a, %abs @@ -303,8 +267,8 @@ define <2 x i32> @saba_abs_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { define <8 x i16> @saba_abs_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: saba_abs_zeros_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.8h, v1.8h -; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h ; CHECK-NEXT: ret %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %b, i1 true) %add = add <8 x i16> %a, %abs @@ -314,8 +278,8 @@ define <8 x i16> @saba_abs_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { define <4 x i16> @saba_abs_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { ; CHECK-LABEL: saba_abs_zeros_4h: ; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.4h, v1.4h -; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) %add = add <4 x i16> %a, %abs @@ -325,8 +289,8 @@ define <4 x i16> @saba_abs_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { define <16 x i8> @saba_abs_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { ; CHECK-LABEL: saba_abs_zeros_16b: ; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.16b, v1.16b -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %b, i1 true) %add = add <16 x i8> %a, %abs @@ -336,8 +300,8 @@ define <16 x i8> @saba_abs_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { ; CHECK-LABEL: saba_abs_zeros_8b: ; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.8b, 
v1.8b -; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) %add = add <8 x i8> %a, %abs @@ -347,17 +311,11 @@ define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { ; SABAL from ADD(ZEXT(SABD(X, ZEROS))) define <2 x i64> @sabal_sabd_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { -; CHECK-SD-LABEL: sabal_sabd_zeros_2s: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.2s, v1.2s -; CHECK-SD-NEXT: uaddw v0.2d, v0.2d, v1.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_sabd_zeros_2s: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: sabal v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_sabd_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s +; CHECK-NEXT: ret %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) %sabd.zext = zext <2 x i32> %sabd to <2 x i64> %add = add <2 x i64> %sabd.zext, %a @@ -365,17 +323,11 @@ define <2 x i64> @sabal_sabd_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { } define <4 x i32> @sabal_sabd_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { -; CHECK-SD-LABEL: sabal_sabd_zeros_4h: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.4h, v1.4h -; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_sabd_zeros_4h: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: sabal v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_sabd_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h +; CHECK-NEXT: ret %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) %sabd.zext = zext <4 x i16> %sabd to <4 x i32> %add = add <4 x i32> %sabd.zext, %a @@ -383,17 +335,11 @@ define <4 x 
i32> @sabal_sabd_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { } define <8 x i16> @sabal_sabd_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { -; CHECK-SD-LABEL: sabal_sabd_zeros_8b: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: abs v1.8b, v1.8b -; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sabal_sabd_zeros_8b: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: sabal v0.8h, v1.8b, v2.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sabal_sabd_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b +; CHECK-NEXT: ret %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) %sabd.zext = zext <8 x i8> %sabd to <8 x i16> %add = add <8 x i16> %sabd.zext, %a @@ -403,8 +349,8 @@ define <8 x i16> @sabal_sabd_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { ; CHECK-LABEL: sabal_abs_zeros_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.2s, v1.2s -; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: ret %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true) %abs.zext = zext <2 x i32> %abs to <2 x i64> @@ -415,8 +361,8 @@ define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { ; CHECK-LABEL: sabal_abs_zeros_4h: ; CHECK: // %bb.0: -; CHECK-NEXT: abs v1.4h, v1.4h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h ; CHECK-NEXT: ret %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) %abs.zext = zext <4 x i16> %abs to <4 x i32> @@ -427,8 +373,8 @@ define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { define <8 x i16> @sabal_abs_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { ; CHECK-LABEL: sabal_abs_zeros_8b: ; 
CHECK: // %bb.0: -; CHECK-NEXT: abs v1.8b, v1.8b -; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b ; CHECK-NEXT: ret %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) %abs.zext = zext <8 x i8> %abs to <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 74d1165d99b82..fb504028a161b 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -4535,96 +4535,89 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI-NEXT: ldr d1, [x2] ; CHECK-GI-NEXT: add x10, x0, x9 ; CHECK-GI-NEXT: add x11, x2, x8 -; CHECK-GI-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: usubl v2.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: ldr d1, [x10] -; CHECK-GI-NEXT: ldr d2, [x11] +; CHECK-GI-NEXT: ldr d3, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b -; CHECK-GI-NEXT: ldr d3, [x10] -; CHECK-GI-NEXT: ldr d4, [x11] -; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: add x10, x10, x9 -; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: ldr d2, [x10] -; CHECK-GI-NEXT: add x10, x10, x9 -; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ldr d6, [x11] -; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v3.8h, v3.8b, v4.8b -; CHECK-GI-NEXT: abs v5.4s, v5.4s -; CHECK-GI-NEXT: abs v0.4s, v0.4s +; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 +; CHECK-GI-NEXT: usubl v3.8h, v1.8b, v3.8b ; CHECK-GI-NEXT: ldr d4, [x10] -; CHECK-GI-NEXT: ldr d16, [x11] -; CHECK-GI-NEXT: abs v7.4s, v7.4s -; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: ldr d5, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v2.8h, v2.8b, v6.8b +; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: 
sshll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: ldr d6, [x10] -; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: ldr d7, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v16.8b -; CHECK-GI-NEXT: sshll v16.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-GI-NEXT: sshll2 v16.4s, v3.8h, #0 +; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v5.8b ; CHECK-GI-NEXT: ldr d5, [x10] -; CHECK-GI-NEXT: ldr d7, [x11] -; CHECK-GI-NEXT: sshll v18.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: usubl v6.8h, v6.8b, v17.8b -; CHECK-GI-NEXT: ldr d17, [x11, x8] -; CHECK-GI-NEXT: sshll v19.4s, v4.4h, #0 -; CHECK-GI-NEXT: usubl v5.8h, v5.8b, v7.8b -; CHECK-GI-NEXT: ldr d7, [x10, x9] -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: usubl v6.8h, v6.8b, v7.8b +; CHECK-GI-NEXT: ldr d7, [x10] +; CHECK-GI-NEXT: ldr d19, [x11] +; CHECK-GI-NEXT: abs v2.4s, v2.4s +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: abs v16.4s, v16.4s -; CHECK-GI-NEXT: abs v3.4s, v3.4s +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: usubl v5.8h, v5.8b, v17.8b +; CHECK-GI-NEXT: ldr d17, [x10] +; CHECK-GI-NEXT: ldr d20, [x11] +; CHECK-GI-NEXT: usubl v7.8h, v7.8b, v19.8b +; CHECK-GI-NEXT: ldr d19, [x10, x9] +; CHECK-GI-NEXT: ldr d21, [x11, x8] +; CHECK-GI-NEXT: sshll2 v18.4s, v4.8h, #0 +; CHECK-GI-NEXT: saba v2.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshll2 v1.4s, v6.8h, #0 +; CHECK-GI-NEXT: usubl v17.8h, v17.8b, v20.8b +; CHECK-GI-NEXT: saba v16.4s, v3.4s, v0.4s +; CHECK-GI-NEXT: sshll2 v3.4s, v5.8h, #0 +; CHECK-GI-NEXT: usubl v19.8h, v19.8b, v21.8b +; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 ; CHECK-GI-NEXT: abs v18.4s, v18.4s -; CHECK-GI-NEXT: abs v2.4s, v2.4s -; CHECK-GI-NEXT: usubl v7.8h, v7.8b, 
v17.8b -; CHECK-GI-NEXT: sshll v17.4s, v6.4h, #0 -; CHECK-GI-NEXT: sshll2 v6.4s, v6.8h, #0 -; CHECK-GI-NEXT: abs v19.4s, v19.4s -; CHECK-GI-NEXT: abs v4.4s, v4.4s -; CHECK-GI-NEXT: add v3.4s, v16.4s, v3.4s -; CHECK-GI-NEXT: sshll v16.4s, v5.4h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v5.8h, #0 -; CHECK-GI-NEXT: add v2.4s, v18.4s, v2.4s -; CHECK-GI-NEXT: abs v17.4s, v17.4s +; CHECK-GI-NEXT: sshll2 v20.4s, v7.8h, #0 +; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: sshll2 v21.4s, v17.8h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0 +; CHECK-GI-NEXT: abs v3.4s, v3.4s +; CHECK-GI-NEXT: sshll2 v22.4s, v19.8h, #0 +; CHECK-GI-NEXT: saba v18.4s, v4.4s, v0.4s +; CHECK-GI-NEXT: sshll v4.4s, v7.4h, #0 +; CHECK-GI-NEXT: abs v7.4s, v20.4s +; CHECK-GI-NEXT: saba v1.4s, v6.4s, v0.4s +; CHECK-GI-NEXT: sshll v6.4s, v17.4h, #0 +; CHECK-GI-NEXT: abs v17.4s, v21.4s +; CHECK-GI-NEXT: saba v3.4s, v5.4s, v0.4s +; CHECK-GI-NEXT: sshll v5.4s, v19.4h, #0 +; CHECK-GI-NEXT: abs v19.4s, v22.4s +; CHECK-GI-NEXT: saba v7.4s, v4.4s, v0.4s +; CHECK-GI-NEXT: saba v17.4s, v6.4s, v0.4s +; CHECK-GI-NEXT: saba v19.4s, v5.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v2.4s +; CHECK-GI-NEXT: addv s2, v16.4s +; CHECK-GI-NEXT: addv s4, v18.4s ; CHECK-GI-NEXT: addv s1, v1.4s -; CHECK-GI-NEXT: abs v6.4s, v6.4s -; CHECK-GI-NEXT: addv s0, v0.4s -; CHECK-GI-NEXT: add v4.4s, v19.4s, v4.4s -; CHECK-GI-NEXT: addv s3, v3.4s -; CHECK-GI-NEXT: sshll v18.4s, v7.4h, #0 -; CHECK-GI-NEXT: sshll2 v7.4s, v7.8h, #0 -; CHECK-GI-NEXT: abs v16.4s, v16.4s -; CHECK-GI-NEXT: abs v5.4s, v5.4s -; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: add v6.4s, v17.4s, v6.4s -; CHECK-GI-NEXT: addv s2, v2.4s +; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: addv s4, v4.4s -; CHECK-GI-NEXT: fmov w10, s3 -; CHECK-GI-NEXT: abs v18.4s, v18.4s -; CHECK-GI-NEXT: abs v7.4s, v7.4s -; CHECK-GI-NEXT: add v1.4s, v16.4s, v5.4s +; CHECK-GI-NEXT: addv s0, v3.4s +; CHECK-GI-NEXT: fmov w10, s4 +; 
CHECK-GI-NEXT: addv s2, v7.4s ; CHECK-GI-NEXT: add w8, w8, w9 -; CHECK-GI-NEXT: addv s3, v6.4s -; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: addv s1, v17.4s ; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: fmov w10, s4 -; CHECK-GI-NEXT: add v0.4s, v18.4s, v7.4s -; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: addv s0, v19.4s +; CHECK-GI-NEXT: add w8, w9, w8 +; CHECK-GI-NEXT: fmov w9, s2 ; CHECK-GI-NEXT: add w8, w9, w8 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w9, w8