[AArch64][SVE] Instcombine uzp1/reinterpret svbool to use vector.insert #81069

Merged
merged 2 commits into llvm:main
Feb 15, 2024

Conversation

UsmanNadeem
Contributor

Concatenating two predicates using uzp1 after converting to double length
using sve.convert.to/from.svbool is optimized poorly in the backend,
resulting in additional `and` instructions to zero the lanes. See
#78623

Combine this pattern to use `llvm.vector.insert` to concatenate and get
rid of the convert to/from svbool calls.

Change-Id: Ieb38055f1f2304ead1ab2bead03ebd046b3bd36f
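
For reference, a minimal before/after sketch of the fold, adapted from the reinterpt_uzp1_1 test added in this patch (the values %a and %b are illustrative <vscale x 4 x i1> predicates):

  ; Before: the concatenation goes through svbool conversions and uzp1.
  %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
  %2 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
  %3 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b)
  %4 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
  %r = call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)

  ; After: the two halves are concatenated directly with llvm.vector.insert.
  %1 = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> %a, i64 0)
  %r = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> %1, <vscale x 4 x i1> %b, i64 4)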

@llvmbot
Collaborator

llvmbot commented Feb 8, 2024

@llvm/pr-subscribers-backend-aarch64

@llvm/pr-subscribers-llvm-transforms

Author: Usman Nadeem (UsmanNadeem)

Changes

Concatenating two predicates using uzp1 after converting to double length
using sve.convert.to/from.svbool is optimized poorly in the backend,
resulting in additional `and` instructions to zero the lanes. See
#78623

Combine this pattern to use `llvm.vector.insert` to concatenate and get
rid of the convert to/from svbool calls.

Change-Id: Ieb38055f1f2304ead1ab2bead03ebd046b3bd36f


Full diff: https://github.com/llvm/llvm-project/pull/81069.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+32)
  • (added) llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-uzp1.ll (+177)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cdd2750521d2c9..d583b38d9ff750 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1630,6 +1630,36 @@ static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
   return IC.replaceInstUsesWith(II, VectorSplat);
 }
 
+static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
+                                                       IntrinsicInst &II) {
+  Value *A, *B;
+  Type *RetTy = II.getType();
+  constexpr Intrinsic::ID From_SV = Intrinsic::aarch64_sve_convert_from_svbool;
+  constexpr Intrinsic::ID To_SV = Intrinsic::aarch64_sve_convert_to_svbool;
+
+  // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
+  // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
+  if ((match(II.getArgOperand(0),
+             m_Intrinsic<From_SV>(m_Intrinsic<To_SV>(m_Value(A)))) &&
+       match(II.getArgOperand(1),
+             m_Intrinsic<From_SV>(m_Intrinsic<To_SV>(m_Value(B))))) ||
+      (match(II.getArgOperand(0), m_Intrinsic<To_SV>(m_Value(A))) &&
+       match(II.getArgOperand(1), m_Intrinsic<To_SV>(m_Value(B))))) {
+    auto *TyA = cast<ScalableVectorType>(A->getType());
+    if (TyA == B->getType() &&
+        RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
+      auto *SubVec = IC.Builder.CreateInsertVector(
+          RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
+      auto *ConcatVec = IC.Builder.CreateInsertVector(
+          RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
+      ConcatVec->takeName(&II);
+      return IC.replaceInstUsesWith(II, ConcatVec);
+    }
+  }
+
+  return std::nullopt;
+}
+
 static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
                                                       IntrinsicInst &II) {
   // zip1(uzp1(A, B), uzp2(A, B)) --> A
@@ -2012,6 +2042,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_sve_sunpkhi:
   case Intrinsic::aarch64_sve_sunpklo:
     return instCombineSVEUnpack(IC, II);
+  case Intrinsic::aarch64_sve_uzp1:
+    return instCombineSVEUzp1(IC, II);
   case Intrinsic::aarch64_sve_zip1:
   case Intrinsic::aarch64_sve_zip2:
     return instCombineSVEZip(IC, II);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-uzp1.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-uzp1.ll
new file mode 100644
index 00000000000000..7fc59af6edd458
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-uzp1.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instcombine -mtriple=aarch64 < %s | FileCheck %s
+
+; Code that concatenates two predicates using uzp1 after converting to
+; double length using sve.convert.to/from.svbool is optimized poorly
+; in the backend, resulting in additional `and` instructions to zero
+; the lanes. Test that we get rid of the convert to/from calls and
+; generate a concatenation using vector.insert instead.
+
+
+define <vscale x 8 x i1> @reinterpt_uzp1_1(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: define <vscale x 8 x i1> @reinterpt_uzp1_1(
+; CHECK-SAME: <vscale x 4 x i32> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp ult <vscale x 4 x i32> [[V0]], [[X]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> [[CMP0]], i64 0)
+; CHECK-NEXT:    [[UZ1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> [[TMP1]], <vscale x 4 x i1> [[CMP1]], i64 4)
+; CHECK-NEXT:    ret <vscale x 8 x i1> [[UZ1]]
+;
+  %cmp0 = icmp ult <vscale x 4 x i32> %v0, %x
+  %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp0)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp1)
+  %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+  %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)
+  ret <vscale x 8 x i1> %uz1
+}
+
+define <vscale x 8 x i1> @reinterpt_uzp1_2(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %x) {
+; CHECK-LABEL: define <vscale x 8 x i1> @reinterpt_uzp1_2(
+; CHECK-SAME: <vscale x 2 x i64> [[V0:%.*]], <vscale x 2 x i64> [[V1:%.*]], <vscale x 2 x i64> [[X:%.*]]) {
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp ult <vscale x 2 x i64> [[V0]], [[X]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <vscale x 2 x i64> [[V1]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[UZ1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> [[TMP5]], <vscale x 4 x i1> [[TMP4]], i64 4)
+; CHECK-NEXT:    ret <vscale x 8 x i1> [[UZ1]]
+;
+  %cmp0 = icmp ult <vscale x 2 x i64> %v0, %x
+  %cmp1 = icmp ult <vscale x 2 x i64> %v1, %x
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp0)
+  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp1)
+  %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %3)
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+  %6 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %5)
+  %7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %4)
+  %8 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %7)
+  %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %6, <vscale x 8 x i1> %8)
+  ret <vscale x 8 x i1> %uz1
+}
+
+define <vscale x 16 x i1> @reinterpt_uzp1_3(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i32> %v3, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: define <vscale x 16 x i1> @reinterpt_uzp1_3(
+; CHECK-SAME: <vscale x 4 x i32> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[V2:%.*]], <vscale x 4 x i32> [[V3:%.*]], <vscale x 4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp ult <vscale x 4 x i32> [[V0]], [[X]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult <vscale x 4 x i32> [[V2]], [[X]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult <vscale x 4 x i32> [[V3]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> [[CMP0]], i64 0)
+; CHECK-NEXT:    [[UZ1_1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> [[TMP1]], <vscale x 4 x i1> [[CMP1]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> [[CMP2]], i64 0)
+; CHECK-NEXT:    [[UZ1_2:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> [[TMP2]], <vscale x 4 x i1> [[CMP3]], i64 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> poison, <vscale x 8 x i1> [[UZ1_1]], i64 0)
+; CHECK-NEXT:    [[UZ3:%.*]] = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> [[TMP3]], <vscale x 8 x i1> [[UZ1_2]], i64 8)
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[UZ3]]
+;
+  %cmp0 = icmp ult <vscale x 4 x i32> %v0, %x
+  %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+  %cmp2 = icmp ult <vscale x 4 x i32> %v2, %x
+  %cmp3 = icmp ult <vscale x 4 x i32> %v3, %x
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp0)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp1)
+  %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+  %uz1_1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp2)
+  %6 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %5)
+  %7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp3)
+  %8 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %7)
+  %uz1_2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %6, <vscale x 8 x i1> %8)
+  %9 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %uz1_1)
+  %10 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %uz1_2)
+  %uz3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.uzp1.nxv16i1(<vscale x 16 x i1> %9, <vscale x 16 x i1> %10)
+  ret <vscale x 16 x i1> %uz3
+}
+
+define <vscale x 4 x i1> @neg1(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: define <vscale x 4 x i1> @neg1(
+; CHECK-SAME: <vscale x 4 x i32> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp ult <vscale x 4 x i32> [[V0]], [[X]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT:    [[UZ1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.uzp1.nxv4i1(<vscale x 4 x i1> [[CMP0]], <vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[UZ1]]
+;
+  %cmp0 = icmp ult <vscale x 4 x i32> %v0, %x
+  %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+  %uz1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.uzp1.nxv4i1(<vscale x 4 x i1> %cmp0, <vscale x 4 x i1> %cmp1)
+  ret <vscale x 4 x i1> %uz1
+}
+
+define <vscale x 8 x i1> @neg2(<vscale x 2 x i64> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: define <vscale x 8 x i1> @neg2(
+; CHECK-SAME: <vscale x 2 x i64> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]]) {
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp ult <vscale x 2 x i64> [[V0]], [[Y]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT:    [[UZ1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> [[TMP2]], <vscale x 8 x i1> [[TMP4]])
+; CHECK-NEXT:    ret <vscale x 8 x i1> [[UZ1]]
+;
+  %cmp0 = icmp ult <vscale x 2 x i64> %v0, %y
+  %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp0)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp1)
+  %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+  %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)
+  ret <vscale x 8 x i1> %uz1
+}
+
+define <vscale x 8 x i1> @neg3(<vscale x 8 x i16> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %x, <vscale x 8 x i16> %y) {
+; CHECK-LABEL: define <vscale x 8 x i1> @neg3(
+; CHECK-SAME: <vscale x 8 x i16> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[X:%.*]], <vscale x 8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp ult <vscale x 8 x i16> [[V0]], [[Y]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT:    [[UZ1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> [[CMP0]], <vscale x 8 x i1> [[TMP2]])
+; CHECK-NEXT:    ret <vscale x 8 x i1> [[UZ1]]
+;
+  %cmp0 = icmp ult <vscale x 8 x i16> %v0, %y
+  %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp1)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %cmp0, <vscale x 8 x i1> %2)
+  ret <vscale x 8 x i1> %uz1
+}
+
+define <vscale x 8 x i1> @neg4(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %x) {
+; CHECK-LABEL: define <vscale x 8 x i1> @neg4(
+; CHECK-SAME: <vscale x 2 x i64> [[V0:%.*]], <vscale x 2 x i64> [[V1:%.*]], <vscale x 2 x i64> [[X:%.*]]) {
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp ult <vscale x 2 x i64> [[V0]], [[X]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <vscale x 2 x i64> [[V1]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT:    [[UZ1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> [[TMP2]], <vscale x 8 x i1> [[TMP4]])
+; CHECK-NEXT:    ret <vscale x 8 x i1> [[UZ1]]
+;
+  %cmp0 = icmp ult <vscale x 2 x i64> %v0, %x
+  %cmp1 = icmp ult <vscale x 2 x i64> %v1, %x
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp0)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp1)
+  %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+  %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)
+  ret <vscale x 8 x i1> %uz1
+}
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.uzp1.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.uzp1.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+

Collaborator

@paulwalker-arm paulwalker-arm left a comment

A few minor comments but otherwise looks good.

Change-Id: I35daab19bacbb020dff0448b6f4e61b53c2f119f
@UsmanNadeem UsmanNadeem merged commit 267d6b5 into llvm:main Feb 15, 2024
3 of 4 checks passed