[AArch64][SVE] Teach compiler to use information that there are no active lanes #68698

Conversation

JolantaJensen
Contributor

This patch optimizes SVE intrinsic calls whose predicate is all false.
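
As a minimal illustration (based on the shape of the tests added below; the function name here is just for exposition), a merging-form SVE intrinsic whose governing predicate is zeroinitializer updates no lanes, so the call folds to its first data operand:

declare <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)

define <vscale x 4 x float> @example_no_active_lanes(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
  ; All-false predicate: no lane is updated, so the result is just %a.
  %r = call <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
  ret <vscale x 4 x float> %r
}
; After this patch, instcombine folds the body to: ret <vscale x 4 x float> %a
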
@llvmbot
Collaborator

llvmbot commented Oct 10, 2023

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-aarch64

Author: None (JolantaJensen)

Changes

This patch optimizes SVE intrinsic calls whose predicate is all false.


Patch is 177.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68698.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+88-38)
  • (added) llvm/test/Transforms/InstCombine/AArch64/sve-intr-comb-m-forms-no-active-lanes.ll (+1324)
  • (added) llvm/test/Transforms/InstCombine/AArch64/sve-intr-comb-u-forms-no-active-lanes.ll (+1318)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cded28054f59259..85e7eb0f211af8c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1169,6 +1169,8 @@ instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
     AddendOp = II.getOperand(2);
     Mul = II.getOperand(1);
   }
+  if (match(P, m_ZeroInt()))
+    return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
 
   if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
                                       m_Value(MulOp1))))
@@ -1301,9 +1303,26 @@ static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
   return &II;
 }
 
+// Optimize operations that take an all false predicate or send them for
+// canonicalization.
+static std::optional<Instruction *>
+instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
+                            Intrinsic::ID IID) {
+  if (match(II.getOperand(0), m_ZeroInt())) {
+    if (II.getIntrinsicID() != IID)
+      return IC.replaceInstUsesWith(II, II.getOperand(1));
+    else
+      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+  }
+  if (II.getIntrinsicID() != IID)
+    return instCombineSVEAllActive(II, IID);
+  return std::nullopt;
+}
+
 static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
                                                             IntrinsicInst &II) {
-  if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_add_u))
+  if (auto II_U =
+          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
     return II_U;
   if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                    Intrinsic::aarch64_sve_mla>(
@@ -1318,7 +1337,8 @@ static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
 
 static std::optional<Instruction *>
 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
-  if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fadd_u))
+  if (auto II_U =
+          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
     return II_U;
   if (auto FMLA =
           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
@@ -1360,7 +1380,8 @@ instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
 
 static std::optional<Instruction *>
 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
-  if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fsub_u))
+  if (auto II_U =
+          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
     return II_U;
   if (auto FMLS =
           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
@@ -1402,7 +1423,8 @@ instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
 
 static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
                                                             IntrinsicInst &II) {
-  if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sub_u))
+  if (auto II_U =
+          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
     return II_U;
   if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                    Intrinsic::aarch64_sve_mls>(
@@ -1418,10 +1440,8 @@ static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
   auto *OpMultiplicand = II.getOperand(1);
   auto *OpMultiplier = II.getOperand(2);
 
-  // Canonicalise a non _u intrinsic only.
-  if (II.getIntrinsicID() != IID)
-    if (auto II_U = instCombineSVEAllActive(II, IID))
-      return II_U;
+  if (auto II_U = instCombineSVEAllOrNoActive(IC, II, IID))
+    return II_U;
 
   // Return true if a given instruction is a unit splat value, false otherwise.
   auto IsUnitSplat = [](auto *I) {
@@ -1786,34 +1806,45 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_sve_ptest_last:
     return instCombineSVEPTest(IC, II);
   case Intrinsic::aarch64_sve_fabd:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fabd_u);
+  case Intrinsic::aarch64_sve_fabd_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
   case Intrinsic::aarch64_sve_fadd:
     return instCombineSVEVectorFAdd(IC, II);
   case Intrinsic::aarch64_sve_fadd_u:
     return instCombineSVEVectorFAddU(IC, II);
   case Intrinsic::aarch64_sve_fdiv:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fdiv_u);
+  case Intrinsic::aarch64_sve_fdiv_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
   case Intrinsic::aarch64_sve_fmax:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmax_u);
+  case Intrinsic::aarch64_sve_fmax_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
   case Intrinsic::aarch64_sve_fmaxnm:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmaxnm_u);
+  case Intrinsic::aarch64_sve_fmaxnm_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
   case Intrinsic::aarch64_sve_fmin:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmin_u);
+  case Intrinsic::aarch64_sve_fmin_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
   case Intrinsic::aarch64_sve_fminnm:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fminnm_u);
+  case Intrinsic::aarch64_sve_fminnm_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
   case Intrinsic::aarch64_sve_fmla:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmla_u);
+  case Intrinsic::aarch64_sve_fmla_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
   case Intrinsic::aarch64_sve_fmls:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmls_u);
+  case Intrinsic::aarch64_sve_fmls_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
   case Intrinsic::aarch64_sve_fmul:
   case Intrinsic::aarch64_sve_fmul_u:
     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
   case Intrinsic::aarch64_sve_fmulx:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmulx_u);
+  case Intrinsic::aarch64_sve_fmulx_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
   case Intrinsic::aarch64_sve_fnmla:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmla_u);
+  case Intrinsic::aarch64_sve_fnmla_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
   case Intrinsic::aarch64_sve_fnmls:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmls_u);
+  case Intrinsic::aarch64_sve_fnmls_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
   case Intrinsic::aarch64_sve_fsub:
     return instCombineSVEVectorFSub(IC, II);
   case Intrinsic::aarch64_sve_fsub_u:
@@ -1825,20 +1856,26 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                              Intrinsic::aarch64_sve_mla_u>(
         IC, II, true);
   case Intrinsic::aarch64_sve_mla:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mla_u);
+  case Intrinsic::aarch64_sve_mla_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
   case Intrinsic::aarch64_sve_mls:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mls_u);
+  case Intrinsic::aarch64_sve_mls_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
   case Intrinsic::aarch64_sve_mul:
   case Intrinsic::aarch64_sve_mul_u:
     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
   case Intrinsic::aarch64_sve_sabd:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sabd_u);
+  case Intrinsic::aarch64_sve_sabd_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
   case Intrinsic::aarch64_sve_smax:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smax_u);
+  case Intrinsic::aarch64_sve_smax_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
   case Intrinsic::aarch64_sve_smin:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smin_u);
+  case Intrinsic::aarch64_sve_smin_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
   case Intrinsic::aarch64_sve_smulh:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smulh_u);
+  case Intrinsic::aarch64_sve_smulh_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
   case Intrinsic::aarch64_sve_sub:
     return instCombineSVEVectorSub(IC, II);
   case Intrinsic::aarch64_sve_sub_u:
@@ -1846,31 +1883,44 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                              Intrinsic::aarch64_sve_mls_u>(
         IC, II, true);
   case Intrinsic::aarch64_sve_uabd:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uabd_u);
+  case Intrinsic::aarch64_sve_uabd_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
   case Intrinsic::aarch64_sve_umax:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umax_u);
+  case Intrinsic::aarch64_sve_umax_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
   case Intrinsic::aarch64_sve_umin:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umin_u);
+  case Intrinsic::aarch64_sve_umin_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
   case Intrinsic::aarch64_sve_umulh:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umulh_u);
+  case Intrinsic::aarch64_sve_umulh_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
   case Intrinsic::aarch64_sve_asr:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_asr_u);
+  case Intrinsic::aarch64_sve_asr_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
   case Intrinsic::aarch64_sve_lsl:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsl_u);
+  case Intrinsic::aarch64_sve_lsl_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
   case Intrinsic::aarch64_sve_lsr:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsr_u);
+  case Intrinsic::aarch64_sve_lsr_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
   case Intrinsic::aarch64_sve_and:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_and_u);
+  case Intrinsic::aarch64_sve_and_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
   case Intrinsic::aarch64_sve_bic:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_bic_u);
+  case Intrinsic::aarch64_sve_bic_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
   case Intrinsic::aarch64_sve_eor:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_eor_u);
+  case Intrinsic::aarch64_sve_eor_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
   case Intrinsic::aarch64_sve_orr:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_orr_u);
+  case Intrinsic::aarch64_sve_orr_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
   case Intrinsic::aarch64_sve_sqsub:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sqsub_u);
+  case Intrinsic::aarch64_sve_sqsub_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
   case Intrinsic::aarch64_sve_uqsub:
-    return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uqsub_u);
+  case Intrinsic::aarch64_sve_uqsub_u:
+    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
   case Intrinsic::aarch64_sve_tbl:
     return instCombineSVETBL(IC, II);
   case Intrinsic::aarch64_sve_uunpkhi:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intr-comb-m-forms-no-active-lanes.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intr-comb-m-forms-no-active-lanes.ll
new file mode 100644
index 000000000000000..463a5f5d2cfb5c8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intr-comb-m-forms-no-active-lanes.ll
@@ -0,0 +1,1324 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Replace SVE _m intrinsics with their first operand when the predicate is all false.
+
+; Float arithmetic
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fabd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fabd_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fabd_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fabd_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fabd_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fabd_intrinsic_double
+; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
+;
+  %1 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %1
+}
+
+; aarch64_sve_fadd intrinsic combines to a LLVM instruction fadd.
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fadd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fadd_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fadd_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fadd_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fadd_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fadd_intrinsic_double
+; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
+;
+  %1 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %1
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fdiv_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fdiv_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fdiv_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fdiv_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fdiv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fdiv_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fdiv_intrinsic_double
+; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
+;
+  %1 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fdiv.nxv2f64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %1
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmax.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fmax_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: define <vscale x 8 x half> @replace_fmax_intrinsic_half
+; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
+;
+  %1 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmax.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %1
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmax.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fmax_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @replace_fmax_intrinsic_float
+; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+  %1 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmax.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmax.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fmax_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @replace_fmax_intrinsic_double
+; CHECK-SAME: (<...
[truncated]

Collaborator

@paulwalker-arm left a comment


I'll look over the tests in more detail once my query has been resolved, but otherwise you're nearly there.

Comment on lines +1172 to +1173
if (match(P, m_ZeroInt()))
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
Collaborator

Is this safe? I can see instCombineSVEVectorFuseMulAddSub is called by instCombineSVEVectorFAdd, whose result for an all-inactive predicate is defined.

As with calling instCombineSVEAllActive, you might need to move this functionality into the relevant places that call this function.
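
My reading of the concern, as a hypothetical IR case (not part of the patch): aarch64_sve_fadd defines its inactive lanes to come from its first operand, so a zero-predicate fadd that reaches this helper has to fold to %a rather than to undef:

; fadd over a fusable fmul, both with an all-false predicate. The fadd result
; is fully defined (every lane is inactive and takes its value from %a), so
; replacing the call with undef inside the fusing helper would be unsound for
; this caller.
%mul = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %b, <vscale x 4 x float> %c)
%res = call <vscale x 4 x float> @llvm.aarch64.sve.fadd.nxv4f32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> %a, <vscale x 4 x float> %mul)
; Correct fold: %res is %a, not undef.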

@@ -1169,6 +1169,8 @@ instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
AddendOp = II.getOperand(2);
Mul = II.getOperand(1);
}
if (match(P, m_ZeroInt()))
Contributor

Are we sure that this is going to have the llvm intrinsic as [llvm-ir-intrinsic]_u?
Only for _u llvm-ir intrinsics can the compiler be sure it is allowed to return undefined values.
I agree with Paul; maybe this is not the best place to do this check.
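
For contrast, a sketch using the fabd pair from this patch's tests (the _u declaration is assumed to follow the usual llvm.aarch64.sve.*.u naming): the merging form must keep its first operand for inactive lanes, while the _u form leaves inactive lanes unspecified, so only the _u form may fold to undef under an all-false predicate.

; Merging form: with a zeroinitializer predicate every lane is inactive and
; takes its value from %a, so the defined result is %a.
%m = call <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)

; Undef ("_u") form: inactive lanes are unspecified, so with a zeroinitializer
; predicate the whole result is unspecified and may be folded to undef.
%u = call <vscale x 8 x half> @llvm.aarch64.sve.fabd.u.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)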

instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
Intrinsic::ID IID) {
if (match(II.getOperand(0), m_ZeroInt())) {
if (II.getIntrinsicID() != IID)
Contributor

nit: maybe add a comment
// llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are inactive for sv[func]_m or sv[func]_z

if (II.getIntrinsicID() != IID)
return IC.replaceInstUsesWith(II, II.getOperand(1));
else
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
Contributor

nit: maybe add a comment
// llvm_ir_u, pred(0), op1, op2 - Spec says to return undef when all lanes are inactive for sv[func]_x

@paulwalker-arm
Collaborator

#73964 replaces this pull request.
