New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][SVE] Teach compiler to use information that there are no ac… #68698
[AArch64][SVE] Teach compiler to use information that there are no ac… #68698
Conversation
…tive lanes This patch optimizes SVE intrinsic calls whose predicate is all false.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: None (JolantaJensen) Changes…tive lanes This patch optimizes SVE intrinsic calls whose predicate is all false. Patch is 177.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68698.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cded28054f59259..85e7eb0f211af8c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1169,6 +1169,8 @@ instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
AddendOp = II.getOperand(2);
Mul = II.getOperand(1);
}
+ if (match(P, m_ZeroInt()))
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
m_Value(MulOp1))))
@@ -1301,9 +1303,26 @@ static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
return &II;
}
+// Optimize operations that take an all false predicate or send them for
+// canonicalization.
+static std::optional<Instruction *>
+instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
+ Intrinsic::ID IID) {
+ if (match(II.getOperand(0), m_ZeroInt())) {
+ if (II.getIntrinsicID() != IID)
+ return IC.replaceInstUsesWith(II, II.getOperand(1));
+ else
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+ if (II.getIntrinsicID() != IID)
+ return instCombineSVEAllActive(II, IID);
+ return std::nullopt;
+}
+
static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
IntrinsicInst &II) {
- if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_add_u))
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
return II_U;
if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mla>(
@@ -1318,7 +1337,8 @@ static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
static std::optional<Instruction *>
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
- if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fadd_u))
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
return II_U;
if (auto FMLA =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
@@ -1360,7 +1380,8 @@ instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
static std::optional<Instruction *>
instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
- if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fsub_u))
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
return II_U;
if (auto FMLS =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
@@ -1402,7 +1423,8 @@ instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
IntrinsicInst &II) {
- if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sub_u))
+ if (auto II_U =
+ instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
return II_U;
if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mls>(
@@ -1418,10 +1440,8 @@ static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
auto *OpMultiplicand = II.getOperand(1);
auto *OpMultiplier = II.getOperand(2);
- // Canonicalise a non _u intrinsic only.
- if (II.getIntrinsicID() != IID)
- if (auto II_U = instCombineSVEAllActive(II, IID))
- return II_U;
+ if (auto II_U = instCombineSVEAllOrNoActive(IC, II, IID))
+ return II_U;
// Return true if a given instruction is a unit splat value, false otherwise.
auto IsUnitSplat = [](auto *I) {
@@ -1786,34 +1806,45 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_ptest_last:
return instCombineSVEPTest(IC, II);
case Intrinsic::aarch64_sve_fabd:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fabd_u);
+ case Intrinsic::aarch64_sve_fabd_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
case Intrinsic::aarch64_sve_fadd:
return instCombineSVEVectorFAdd(IC, II);
case Intrinsic::aarch64_sve_fadd_u:
return instCombineSVEVectorFAddU(IC, II);
case Intrinsic::aarch64_sve_fdiv:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fdiv_u);
+ case Intrinsic::aarch64_sve_fdiv_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
case Intrinsic::aarch64_sve_fmax:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmax_u);
+ case Intrinsic::aarch64_sve_fmax_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
case Intrinsic::aarch64_sve_fmaxnm:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmaxnm_u);
+ case Intrinsic::aarch64_sve_fmaxnm_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
case Intrinsic::aarch64_sve_fmin:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmin_u);
+ case Intrinsic::aarch64_sve_fmin_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
case Intrinsic::aarch64_sve_fminnm:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fminnm_u);
+ case Intrinsic::aarch64_sve_fminnm_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
case Intrinsic::aarch64_sve_fmla:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmla_u);
+ case Intrinsic::aarch64_sve_fmla_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
case Intrinsic::aarch64_sve_fmls:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmls_u);
+ case Intrinsic::aarch64_sve_fmls_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
case Intrinsic::aarch64_sve_fmul:
case Intrinsic::aarch64_sve_fmul_u:
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
case Intrinsic::aarch64_sve_fmulx:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmulx_u);
+ case Intrinsic::aarch64_sve_fmulx_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
case Intrinsic::aarch64_sve_fnmla:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmla_u);
+ case Intrinsic::aarch64_sve_fnmla_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
case Intrinsic::aarch64_sve_fnmls:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmls_u);
+ case Intrinsic::aarch64_sve_fnmls_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
case Intrinsic::aarch64_sve_fsub:
return instCombineSVEVectorFSub(IC, II);
case Intrinsic::aarch64_sve_fsub_u:
@@ -1825,20 +1856,26 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
case Intrinsic::aarch64_sve_mla:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mla_u);
+ case Intrinsic::aarch64_sve_mla_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
case Intrinsic::aarch64_sve_mls:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mls_u);
+ case Intrinsic::aarch64_sve_mls_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
case Intrinsic::aarch64_sve_mul:
case Intrinsic::aarch64_sve_mul_u:
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
case Intrinsic::aarch64_sve_sabd:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sabd_u);
+ case Intrinsic::aarch64_sve_sabd_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
case Intrinsic::aarch64_sve_smax:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smax_u);
+ case Intrinsic::aarch64_sve_smax_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
case Intrinsic::aarch64_sve_smin:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smin_u);
+ case Intrinsic::aarch64_sve_smin_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
case Intrinsic::aarch64_sve_smulh:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smulh_u);
+ case Intrinsic::aarch64_sve_smulh_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
@@ -1846,31 +1883,44 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
Intrinsic::aarch64_sve_mls_u>(
IC, II, true);
case Intrinsic::aarch64_sve_uabd:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uabd_u);
+ case Intrinsic::aarch64_sve_uabd_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
case Intrinsic::aarch64_sve_umax:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umax_u);
+ case Intrinsic::aarch64_sve_umax_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
case Intrinsic::aarch64_sve_umin:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umin_u);
+ case Intrinsic::aarch64_sve_umin_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
case Intrinsic::aarch64_sve_umulh:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umulh_u);
+ case Intrinsic::aarch64_sve_umulh_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
case Intrinsic::aarch64_sve_asr:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_asr_u);
+ case Intrinsic::aarch64_sve_asr_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
case Intrinsic::aarch64_sve_lsl:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsl_u);
+ case Intrinsic::aarch64_sve_lsl_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
case Intrinsic::aarch64_sve_lsr:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsr_u);
+ case Intrinsic::aarch64_sve_lsr_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
case Intrinsic::aarch64_sve_and:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_and_u);
+ case Intrinsic::aarch64_sve_and_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
case Intrinsic::aarch64_sve_bic:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_bic_u);
+ case Intrinsic::aarch64_sve_bic_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
case Intrinsic::aarch64_sve_eor:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_eor_u);
+ case Intrinsic::aarch64_sve_eor_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
case Intrinsic::aarch64_sve_orr:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_orr_u);
+ case Intrinsic::aarch64_sve_orr_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
case Intrinsic::aarch64_sve_sqsub:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sqsub_u);
+ case Intrinsic::aarch64_sve_sqsub_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
case Intrinsic::aarch64_sve_uqsub:
- return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uqsub_u);
+ case Intrinsic::aarch64_sve_uqsub_u:
+ return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
case Intrinsic::aarch64_sve_tbl:
return instCombineSVETBL(IC, II);
case Intrinsic::aarch64_sve_uunpkhi:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intr-comb-m-forms-no-active-lanes.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intr-comb-m-forms-no-active-lanes.ll
new file mode 100644
index 000000000000000..463a5f5d2cfb5c8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intr-comb-m-forms-no-active-lanes.ll
@@ -0,0 +1,1324 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Replace SVE _m intrinsics with their first operand when the predicate is all false.
+
+; Float arithmetic
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fabd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fabd_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
+;
+ %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fabd_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fabd_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
+;
+ %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fabd_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fabd_intrinsic_double
+; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[A]]
+;
+ %1 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %1
+}
+
+; aarch64_sve_fadd intrinsic combines to a LLVM instruction fadd.
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fadd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fadd_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
+;
+ %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fadd_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fadd_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
+;
+ %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fadd_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fadd_intrinsic_double
+; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[A]]
+;
+ %1 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %1
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fdiv_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fdiv_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
+;
+ %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fdiv_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fdiv_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
+;
+ %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fdiv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fdiv_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fdiv_intrinsic_double
+; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[A]]
+;
+ %1 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fdiv.nxv2f64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %1
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmax.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fmax_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fmax_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
+;
+ %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmax.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmax.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fmax_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fmax_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
+;
+ %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmax.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmax.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fmax_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fmax_intrinsic_double
+; CHECK-SAME: (<...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll look over the tests in more detail once my query has been resolved but otherwise you're nearly there.
if (match(P, m_ZeroInt())) | ||
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this safe? I can see instCombineSVEVectorFuseMulAddSub
is called by instCombineSVEVectorFAdd
whose result for an all inactive predicate is defined.
As with calling instCombineSVEAllActive
you might need to move this functionality into the relevant places that call this function.
@@ -1169,6 +1169,8 @@ instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, | |||
AddendOp = II.getOperand(2); | |||
Mul = II.getOperand(1); | |||
} | |||
if (match(P, m_ZeroInt())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we sure that this is going to have the LLVM intrinsic as [llvm-ir-intrinsic]_u?
Because only for _u LLVM IR intrinsics can the compiler be sure it is allowed to return undefined values.
I agree with Paul and maybe here is not the best place to do this check.
instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, | ||
Intrinsic::ID IID) { | ||
if (match(II.getOperand(0), m_ZeroInt())) { | ||
if (II.getIntrinsicID() != IID) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: maybe add a comments
// llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are inactive for sv[func]_m or sv[func]_z
if (II.getIntrinsicID() != IID) | ||
return IC.replaceInstUsesWith(II, II.getOperand(1)); | ||
else | ||
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: maybe add a comments
// llvm_ir_u, pred(0), op1, op2 - Spec says to return undef when all lanes are inactive for sv[func]_x
#73964 replaces this pull request. |
…tive lanes
This patch optimizes SVE intrinsic calls whose predicate is all false.