[AArch64][SVE] Add optimisation for SVE intrinsics with no active lanes #73964
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-transforms

Author: Mark Harley (MarkAHarley)

Changes

Closes #56573

This patch introduces optimisations for SVE intrinsic function calls which have all false predicates.

Patch is 179.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73964.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b5b8b68291786dc..a0e42a183f1d90d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1406,9 +1406,30 @@ static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
return &II;
}
+// Optimize operations that take an all false predicate or send them for
+// canonicalization.
+static std::optional<Instruction *>
+instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
+ Intrinsic::ID IID) {
+ if (match(II.getOperand(0), m_ZeroInt())) {
+ if (II.getIntrinsicID() != IID)
+ // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
+ // inactive for sv[func]_m or sv[func]_z
+ return IC.replaceInstUsesWith(II, II.getOperand(1));
+ else
+ // llvm_ir_u, pred(0), op1, op2 - Spec says to return undef when all lanes
+ // are inactive for sv[func]_x
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+ if (II.getIntrinsicID() != IID)
+ return instCombineSVEAllActive(II, IID);
+ return std::nullopt;
+}
+
static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
IntrinsicInst &II) {
- if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_add_u))
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
return II_U;
if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mla>(
@@ -1421,9 +1442,22 @@ static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
return std::nullopt;
}
+static std::optional<Instruction *>
+instCombineSVEVectorAddU(InstCombiner &IC, IntrinsicInst &II) {
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
+ return II_U;
+ else {
+ return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
+ Intrinsic::aarch64_sve_mla_u>(
+ IC, II, true);
+ }
+}
+
static std::optional<Instruction *>
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
- if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fadd_u))
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
return II_U;
if (auto FMLA =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
@@ -1445,6 +1479,9 @@ instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
static std::optional<Instruction *>
instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
+ return II_U;
if (auto FMLA =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fmla>(IC, II,
@@ -1465,7 +1502,8 @@ instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
static std::optional<Instruction *>
instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
- if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fsub_u))
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
return II_U;
if (auto FMLS =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
@@ -1487,6 +1525,9 @@ instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
static std::optional<Instruction *>
instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
+ return II_U;
if (auto FMLS =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fmls>(IC, II,
@@ -1507,7 +1548,8 @@ instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
IntrinsicInst &II) {
- if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sub_u))
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
return II_U;
if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mls>(
@@ -1516,6 +1558,18 @@ static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
return std::nullopt;
}
+static std::optional<Instruction *>
+instCombineSVEVectorSubU(InstCombiner &IC, IntrinsicInst &II) {
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
+ return II_U;
+ else {
+ return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
+ Intrinsic::aarch64_sve_mls_u>(
+ IC, II, true);
+ }
+}
+
static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
IntrinsicInst &II,
Intrinsic::ID IID) {
@@ -1523,10 +1577,8 @@ static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
auto *OpMultiplicand = II.getOperand(1);
auto *OpMultiplier = II.getOperand(2);
- // Canonicalise a non _u intrinsic only.
- if (II.getIntrinsicID() != IID)
- if (auto II_U = instCombineSVEAllActive(II, IID))
- return II_U;
+ if (auto II_U = instCombineSVEAllOrNoActive(IC, II, IID))
+ return II_U;
// Return true if a given instruction is a unit splat value, false otherwise.
auto IsUnitSplat = [](auto *I) {
@@ -1891,34 +1943,45 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_ptest_last:
return instCombineSVEPTest(IC, II);
case Intrinsic::aarch64_sve_fabd:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fabd_u);
+ case Intrinsic::aarch64_sve_fabd_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
case Intrinsic::aarch64_sve_fadd:
return instCombineSVEVectorFAdd(IC, II);
case Intrinsic::aarch64_sve_fadd_u:
return instCombineSVEVectorFAddU(IC, II);
case Intrinsic::aarch64_sve_fdiv:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fdiv_u);
+ case Intrinsic::aarch64_sve_fdiv_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
case Intrinsic::aarch64_sve_fmax:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmax_u);
+ case Intrinsic::aarch64_sve_fmax_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
case Intrinsic::aarch64_sve_fmaxnm:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmaxnm_u);
+ case Intrinsic::aarch64_sve_fmaxnm_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
case Intrinsic::aarch64_sve_fmin:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmin_u);
+ case Intrinsic::aarch64_sve_fmin_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
case Intrinsic::aarch64_sve_fminnm:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fminnm_u);
+ case Intrinsic::aarch64_sve_fminnm_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
case Intrinsic::aarch64_sve_fmla:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmla_u);
+ case Intrinsic::aarch64_sve_fmla_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
case Intrinsic::aarch64_sve_fmls:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmls_u);
+ case Intrinsic::aarch64_sve_fmls_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
case Intrinsic::aarch64_sve_fmul:
case Intrinsic::aarch64_sve_fmul_u:
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
case Intrinsic::aarch64_sve_fmulx:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmulx_u);
+ case Intrinsic::aarch64_sve_fmulx_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
case Intrinsic::aarch64_sve_fnmla:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmla_u);
+ case Intrinsic::aarch64_sve_fnmla_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
case Intrinsic::aarch64_sve_fnmls:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmls_u);
+ case Intrinsic::aarch64_sve_fnmls_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
case Intrinsic::aarch64_sve_fsub:
return instCombineSVEVectorFSub(IC, II);
case Intrinsic::aarch64_sve_fsub_u:
@@ -1926,56 +1989,71 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_add:
return instCombineSVEVectorAdd(IC, II);
case Intrinsic::aarch64_sve_add_u:
- return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
- Intrinsic::aarch64_sve_mla_u>(
- IC, II, true);
+ return instCombineSVEVectorAddU(IC, II);
case Intrinsic::aarch64_sve_mla:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mla_u);
+ case Intrinsic::aarch64_sve_mla_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
case Intrinsic::aarch64_sve_mls:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mls_u);
+ case Intrinsic::aarch64_sve_mls_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
case Intrinsic::aarch64_sve_mul:
case Intrinsic::aarch64_sve_mul_u:
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
case Intrinsic::aarch64_sve_sabd:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sabd_u);
+ case Intrinsic::aarch64_sve_sabd_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
case Intrinsic::aarch64_sve_smax:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smax_u);
+ case Intrinsic::aarch64_sve_smax_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
case Intrinsic::aarch64_sve_smin:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smin_u);
+ case Intrinsic::aarch64_sve_smin_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
case Intrinsic::aarch64_sve_smulh:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smulh_u);
+ case Intrinsic::aarch64_sve_smulh_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
- return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
- Intrinsic::aarch64_sve_mls_u>(
- IC, II, true);
+ return instCombineSVEVectorSubU(IC, II);
case Intrinsic::aarch64_sve_uabd:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uabd_u);
+ case Intrinsic::aarch64_sve_uabd_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
case Intrinsic::aarch64_sve_umax:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umax_u);
+ case Intrinsic::aarch64_sve_umax_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
case Intrinsic::aarch64_sve_umin:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umin_u);
+ case Intrinsic::aarch64_sve_umin_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
case Intrinsic::aarch64_sve_umulh:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umulh_u);
+ case Intrinsic::aarch64_sve_umulh_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
case Intrinsic::aarch64_sve_asr:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_asr_u);
+ case Intrinsic::aarch64_sve_asr_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
case Intrinsic::aarch64_sve_lsl:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsl_u);
+ case Intrinsic::aarch64_sve_lsl_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
case Intrinsic::aarch64_sve_lsr:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsr_u);
+ case Intrinsic::aarch64_sve_lsr_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
case Intrinsic::aarch64_sve_and:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_and_u);
+ case Intrinsic::aarch64_sve_and_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
case Intrinsic::aarch64_sve_bic:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_bic_u);
+ case Intrinsic::aarch64_sve_bic_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
case Intrinsic::aarch64_sve_eor:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_eor_u);
+ case Intrinsic::aarch64_sve_eor_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
case Intrinsic::aarch64_sve_orr:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_orr_u);
+ case Intrinsic::aarch64_sve_orr_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
case Intrinsic::aarch64_sve_sqsub:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sqsub_u);
+ case Intrinsic::aarch64_sve_sqsub_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
case Intrinsic::aarch64_sve_uqsub:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uqsub_u);
+ case Intrinsic::aarch64_sve_uqsub_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
case Intrinsic::aarch64_sve_tbl:
return instCombineSVETBL(IC, II);
case Intrinsic::aarch64_sve_uunpkhi:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-m-forms-no-active-lanes.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-m-forms-no-active-lanes.ll
new file mode 100644
index 000000000000000..463a5f5d2cfb5c8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-m-forms-no-active-lanes.ll
@@ -0,0 +1,1324 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Replace SVE _m intrinsics with their first operand when the predicate is all false.
+
+; Float arithmetic
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fabd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fabd_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
+;
+ %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fabd_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fabd_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
+;
+ %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fabd_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fabd_intrinsic_double
+; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[A]]
+;
+ %1 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %1
+}
+
+; aarch64_sve_fadd intrinsic combines to a LLVM instruction fadd.
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fadd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fadd_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
+;
+ %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fadd_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fadd_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
+;
+ %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fadd_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fadd_intrinsic_double
+; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[A]]
+;
+ %1 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %1
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fdiv_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fdiv_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
+;
+ %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fdiv_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fdiv_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
+;
+ %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4...
[truncated]
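To summarise the combine this patch introduces, here is a minimal before/after sketch mirroring the autogenerated tests above (the function name @all_false_fadd is illustrative, not part of the patch): a merging intrinsic whose governing predicate is all false yields its first data operand, so InstCombine can fold the call away.

; Before the combine: a predicated fadd whose predicate is all false.
declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)

define <vscale x 4 x float> @all_false_fadd(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
  %r = call <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
  ret <vscale x 4 x float> %r
}

; After -passes=instcombine the call is replaced by its first data operand,
; so the function body reduces to:
;   ret <vscale x 4 x float> %a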
instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
                            Intrinsic::ID IID) {
  if (match(II.getOperand(0), m_ZeroInt())) {
    if (II.getIntrinsicID() != IID)
nit: maybe the if and else could have braces, given the comments on their bodies?
Fixed
@@ -1406,9 +1406,30 @@ static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
  return &II;
}

// Optimize operations that take an all false predicate or send them for
Hi @MarkAHarley,
This comment is misleading: it misses the point that the canonicalization applied here only happens for the all-true (ptrue all) predicate.
I would suggest rephrasing it, e.g.:
"Simplify operations where the predicate has all inactive lanes, or try to replace with the _u form when all lanes are active".
case Intrinsic::aarch64_sve_uabd_u:
  return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
I don't like the idea of handling the two intrinsic types together, because I recall we only recently split similar instances apart since it was awkward to implement _u- and non-_u-specific optimisations. You can already see this in this patch, where only half of instCombineSVEAllOrNoActive is applicable. I'd rather have dedicated functions for the specific combines, so by all means have instCombineSVEAllOrNoActive in place of instCombineSVEAllActive, but I think the _u forms should have a dedicated function.
That said, I do wonder how useful the combines that produce undef are in a practical sense. I would think they are a sign the source material is likely bogus, rather than a legitimate optimisation opportunity.
@@ -1523,9 +1540,8 @@ static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
  auto *OpMultiplicand = II.getOperand(1);
  auto *OpMultiplier = II.getOperand(2);

  // Canonicalise a non _u intrinsic only.
  if (II.getIntrinsicID() != IID)
This check can be moved outside of this function if preferred.
A couple of minor things but otherwise looks good. Do you plan to implement a similar optimisation for the unary instructions?
// llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
// inactive for sv[func]_m or sv[func]_z
The sv[func]_z part should be removed, because those cases should return zero for inactive lanes.
; RUN: opt -S -passes=instcombine < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

; Replace SVE _u intrinsics with undef if the predicate is all false.

; Float arithmetic

declare <vscale x 8 x half> @llvm.aarch64.sve.fabd.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
define <vscale x 8 x half> @replace_fabd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
This file is from a previous version of the patch and no longer applies, so it can be removed?
The optimisation for unary instructions will be in a separate patch.
Thanks for the final changes, LGTM.
This patch removes the optimisations for _u intrinsics with no active lanes, as this should never occur.
Force-pushed from 3433ceb to 09f0cd9
…es (llvm#73964) This patch introduces optimisations for SVE intrinsic function calls which have all false predicates.