[AArch64][SVE] Instcombine uzp1/reinterpret svbool to use vector.insert #81069
Conversation
Concatenating two predicates using uzp1 after converting to double length using sve.convert.to/from.svbool is optimized poorly in the backend, resulting in additional `and` instructions to zero the lanes. See llvm#78623. Combine this pattern to use `llvm.vector.insert` to perform the concatenation and get rid of the convert to/from svbool calls.

Change-Id: Ieb38055f1f2304ead1ab2bead03ebd046b3bd36f
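For illustration, here is a minimal sketch of the rewrite (the IR mirrors the tests below; the value names are illustrative only, not taken from the patch):

```llvm
; Before: two nxv4i1 predicates are widened through svbool round-trips,
; then concatenated with uzp1.
%a.sv = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
%a.w  = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a.sv)
%b.sv = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b)
%b.w  = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %b.sv)
%res  = call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %a.w, <vscale x 8 x i1> %b.w)

; After: a plain concatenation via llvm.vector.insert, with no svbool round-trips.
%lo   = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> %a, i64 0)
%res2 = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> %lo, <vscale x 4 x i1> %b, i64 4)
```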
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-transforms

Author: Usman Nadeem (UsmanNadeem)

Full diff: https://github.com/llvm/llvm-project/pull/81069.diff (2 files affected):
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cdd2750521d2c9..d583b38d9ff750 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1630,6 +1630,36 @@ static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
return IC.replaceInstUsesWith(II, VectorSplat);
}
+static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *A, *B;
+ Type *RetTy = II.getType();
+ constexpr Intrinsic::ID From_SV = Intrinsic::aarch64_sve_convert_from_svbool;
+ constexpr Intrinsic::ID To_SV = Intrinsic::aarch64_sve_convert_to_svbool;
+
+ // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
+ // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
+ if ((match(II.getArgOperand(0),
+ m_Intrinsic<From_SV>(m_Intrinsic<To_SV>(m_Value(A)))) &&
+ match(II.getArgOperand(1),
+ m_Intrinsic<From_SV>(m_Intrinsic<To_SV>(m_Value(B))))) ||
+ (match(II.getArgOperand(0), m_Intrinsic<To_SV>(m_Value(A))) &&
+ match(II.getArgOperand(1), m_Intrinsic<To_SV>(m_Value(B))))) {
+ auto *TyA = cast<ScalableVectorType>(A->getType());
+ if (TyA == B->getType() &&
+ RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
+ auto *SubVec = IC.Builder.CreateInsertVector(
+ RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
+ auto *ConcatVec = IC.Builder.CreateInsertVector(
+ RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
+ ConcatVec->takeName(&II);
+ return IC.replaceInstUsesWith(II, ConcatVec);
+ }
+ }
+
+ return std::nullopt;
+}
+
static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
IntrinsicInst &II) {
// zip1(uzp1(A, B), uzp2(A, B)) --> A
@@ -2012,6 +2042,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_sunpkhi:
case Intrinsic::aarch64_sve_sunpklo:
return instCombineSVEUnpack(IC, II);
+ case Intrinsic::aarch64_sve_uzp1:
+ return instCombineSVEUzp1(IC, II);
case Intrinsic::aarch64_sve_zip1:
case Intrinsic::aarch64_sve_zip2:
return instCombineSVEZip(IC, II);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-uzp1.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-uzp1.ll
new file mode 100644
index 00000000000000..7fc59af6edd458
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-uzp1.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instcombine -mtriple=aarch64 < %s | FileCheck %s
+
+; Code that concatenates two predicates using uzp1 after converting to
+; double length using sve.convert.to/from.svbool is optimized poorly
+; in the backend, resulting in additional `and` instructions to zero
+; the lanes. Test that we get rid of the convert to/from calls and
+; generate a concatenation using vector insert instead.
+
+
+define <vscale x 8 x i1> @reinterpt_uzp1_1(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: define <vscale x 8 x i1> @reinterpt_uzp1_1(
+; CHECK-SAME: <vscale x 4 x i32> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[CMP0:%.*]] = icmp ult <vscale x 4 x i32> [[V0]], [[X]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> [[CMP0]], i64 0)
+; CHECK-NEXT: [[UZ1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> [[TMP1]], <vscale x 4 x i1> [[CMP1]], i64 4)
+; CHECK-NEXT: ret <vscale x 8 x i1> [[UZ1]]
+;
+ %cmp0 = icmp ult <vscale x 4 x i32> %v0, %x
+ %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+ %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp0)
+ %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+ %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp1)
+ %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+ %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)
+ ret <vscale x 8 x i1> %uz1
+}
+
+define <vscale x 8 x i1> @reinterpt_uzp1_2(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %x) {
+; CHECK-LABEL: define <vscale x 8 x i1> @reinterpt_uzp1_2(
+; CHECK-SAME: <vscale x 2 x i64> [[V0:%.*]], <vscale x 2 x i64> [[V1:%.*]], <vscale x 2 x i64> [[X:%.*]]) {
+; CHECK-NEXT: [[CMP0:%.*]] = icmp ult <vscale x 2 x i64> [[V0]], [[X]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 2 x i64> [[V1]], [[X]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP1]])
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> [[TMP2]], i64 0)
+; CHECK-NEXT: [[UZ1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> [[TMP5]], <vscale x 4 x i1> [[TMP4]], i64 4)
+; CHECK-NEXT: ret <vscale x 8 x i1> [[UZ1]]
+;
+ %cmp0 = icmp ult <vscale x 2 x i64> %v0, %x
+ %cmp1 = icmp ult <vscale x 2 x i64> %v1, %x
+ %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp0)
+ %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+ %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp1)
+ %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %3)
+ %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+ %6 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %5)
+ %7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %4)
+ %8 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %7)
+ %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %6, <vscale x 8 x i1> %8)
+ ret <vscale x 8 x i1> %uz1
+}
+
+define <vscale x 16 x i1> @reinterpt_uzp1_3(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i32> %v3, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: define <vscale x 16 x i1> @reinterpt_uzp1_3(
+; CHECK-SAME: <vscale x 4 x i32> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[V2:%.*]], <vscale x 4 x i32> [[V3:%.*]], <vscale x 4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[CMP0:%.*]] = icmp ult <vscale x 4 x i32> [[V0]], [[X]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult <vscale x 4 x i32> [[V2]], [[X]]
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ult <vscale x 4 x i32> [[V3]], [[X]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> [[CMP0]], i64 0)
+; CHECK-NEXT: [[UZ1_1:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> [[TMP1]], <vscale x 4 x i1> [[CMP1]], i64 4)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> poison, <vscale x 4 x i1> [[CMP2]], i64 0)
+; CHECK-NEXT: [[UZ1_2:%.*]] = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv4i1(<vscale x 8 x i1> [[TMP2]], <vscale x 4 x i1> [[CMP3]], i64 4)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> poison, <vscale x 8 x i1> [[UZ1_1]], i64 0)
+; CHECK-NEXT: [[UZ3:%.*]] = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> [[TMP3]], <vscale x 8 x i1> [[UZ1_2]], i64 8)
+; CHECK-NEXT: ret <vscale x 16 x i1> [[UZ3]]
+;
+ %cmp0 = icmp ult <vscale x 4 x i32> %v0, %x
+ %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+ %cmp2 = icmp ult <vscale x 4 x i32> %v2, %x
+ %cmp3 = icmp ult <vscale x 4 x i32> %v3, %x
+ %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp0)
+ %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+ %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp1)
+ %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+ %uz1_1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)
+ %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp2)
+ %6 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %5)
+ %7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp3)
+ %8 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %7)
+ %uz1_2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %6, <vscale x 8 x i1> %8)
+ %9 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %uz1_1)
+ %10 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %uz1_2)
+ %uz3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.uzp1.nxv16i1(<vscale x 16 x i1> %9, <vscale x 16 x i1> %10)
+ ret <vscale x 16 x i1> %uz3
+}
+
+define <vscale x 4 x i1> @neg1(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: define <vscale x 4 x i1> @neg1(
+; CHECK-SAME: <vscale x 4 x i32> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[CMP0:%.*]] = icmp ult <vscale x 4 x i32> [[V0]], [[X]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT: [[UZ1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.uzp1.nxv4i1(<vscale x 4 x i1> [[CMP0]], <vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: ret <vscale x 4 x i1> [[UZ1]]
+;
+ %cmp0 = icmp ult <vscale x 4 x i32> %v0, %x
+ %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+ %uz1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.uzp1.nxv4i1(<vscale x 4 x i1> %cmp0, <vscale x 4 x i1> %cmp1)
+ ret <vscale x 4 x i1> %uz1
+}
+
+define <vscale x 8 x i1> @neg2(<vscale x 2 x i64> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: define <vscale x 8 x i1> @neg2(
+; CHECK-SAME: <vscale x 2 x i64> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]]) {
+; CHECK-NEXT: [[CMP0:%.*]] = icmp ult <vscale x 2 x i64> [[V0]], [[Y]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT: [[UZ1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> [[TMP2]], <vscale x 8 x i1> [[TMP4]])
+; CHECK-NEXT: ret <vscale x 8 x i1> [[UZ1]]
+;
+ %cmp0 = icmp ult <vscale x 2 x i64> %v0, %y
+ %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+ %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp0)
+ %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+ %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp1)
+ %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+ %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)
+ ret <vscale x 8 x i1> %uz1
+}
+
+define <vscale x 8 x i1> @neg3(<vscale x 8 x i16> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %x, <vscale x 8 x i16> %y) {
+; CHECK-LABEL: define <vscale x 8 x i1> @neg3(
+; CHECK-SAME: <vscale x 8 x i16> [[V0:%.*]], <vscale x 4 x i32> [[V1:%.*]], <vscale x 4 x i32> [[X:%.*]], <vscale x 8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT: [[CMP0:%.*]] = icmp ult <vscale x 8 x i16> [[V0]], [[Y]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[V1]], [[X]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT: [[UZ1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> [[CMP0]], <vscale x 8 x i1> [[TMP2]])
+; CHECK-NEXT: ret <vscale x 8 x i1> [[UZ1]]
+;
+ %cmp0 = icmp ult <vscale x 8 x i16> %v0, %y
+ %cmp1 = icmp ult <vscale x 4 x i32> %v1, %x
+ %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %cmp1)
+ %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+ %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %cmp0, <vscale x 8 x i1> %2)
+ ret <vscale x 8 x i1> %uz1
+}
+
+define <vscale x 8 x i1> @neg4(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %x) {
+; CHECK-LABEL: define <vscale x 8 x i1> @neg4(
+; CHECK-SAME: <vscale x 2 x i64> [[V0:%.*]], <vscale x 2 x i64> [[V1:%.*]], <vscale x 2 x i64> [[X:%.*]]) {
+; CHECK-NEXT: [[CMP0:%.*]] = icmp ult <vscale x 2 x i64> [[V0]], [[X]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 2 x i64> [[V1]], [[X]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[CMP1]])
+; CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP3]])
+; CHECK-NEXT: [[UZ1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> [[TMP2]], <vscale x 8 x i1> [[TMP4]])
+; CHECK-NEXT: ret <vscale x 8 x i1> [[UZ1]]
+;
+ %cmp0 = icmp ult <vscale x 2 x i64> %v0, %x
+ %cmp1 = icmp ult <vscale x 2 x i64> %v1, %x
+ %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp0)
+ %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+ %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %cmp1)
+ %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+ %uz1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1> %2, <vscale x 8 x i1> %4)
+ ret <vscale x 8 x i1> %uz1
+}
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.uzp1.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.uzp1.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.uzp1.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+
A few minor comments but otherwise looks good.