Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64][SVE] Add optimisation for SVE intrinsics with no active lanes #73964

Merged
merged 5 commits into from Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
98 changes: 59 additions & 39 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Expand Up @@ -1406,9 +1406,23 @@ static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
return &II;
}

// Fold a predicated SVE intrinsic based on its governing predicate (operand 0):
//  * predicate is an all-zero constant (no active lanes): every use of the
//    call is replaced with operand 1, because the spec says sv[func]_m
//    returns op1 when all lanes are inactive;
//  * otherwise: defer to instCombineSVEAllActive, which tries to switch to the
//    _u form identified by IID when all lanes are active.
static std::optional<Instruction *>
instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
                            Intrinsic::ID IID) {
  auto *OpPredicate = II.getOperand(0);

  // Anything other than an all-inactive predicate takes the all-active path.
  if (!match(OpPredicate, m_ZeroInt()))
    return instCombineSVEAllActive(II, IID);

  // Operand layout is (pred, op1, op2): with no active lanes the result is
  // simply op1.
  auto *OpFirst = II.getOperand(1);
  return IC.replaceInstUsesWith(II, OpFirst);
}

static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
IntrinsicInst &II) {
if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_add_u))
if (auto II_U =
instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
return II_U;
if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mla>(
Expand All @@ -1423,7 +1437,8 @@ static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,

static std::optional<Instruction *>
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fadd_u))
if (auto II_U =
instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
return II_U;
if (auto FMLA =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Expand Down Expand Up @@ -1465,7 +1480,8 @@ instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {

static std::optional<Instruction *>
instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fsub_u))
if (auto II_U =
instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
return II_U;
if (auto FMLS =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Expand Down Expand Up @@ -1507,7 +1523,8 @@ instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {

static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
IntrinsicInst &II) {
if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sub_u))
if (auto II_U =
instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
return II_U;
if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mls>(
Expand All @@ -1523,11 +1540,6 @@ static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
auto *OpMultiplicand = II.getOperand(1);
auto *OpMultiplier = II.getOperand(2);

// Canonicalise a non _u intrinsic only.
if (II.getIntrinsicID() != IID)
if (auto II_U = instCombineSVEAllActive(II, IID))
return II_U;

// Return true if a given instruction is a unit splat value, false otherwise.
auto IsUnitSplat = [](auto *I) {
auto *SplatValue = getSplatValue(I);
Expand Down Expand Up @@ -1891,34 +1903,38 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_ptest_last:
return instCombineSVEPTest(IC, II);
case Intrinsic::aarch64_sve_fabd:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fabd_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
case Intrinsic::aarch64_sve_fadd:
return instCombineSVEVectorFAdd(IC, II);
case Intrinsic::aarch64_sve_fadd_u:
return instCombineSVEVectorFAddU(IC, II);
case Intrinsic::aarch64_sve_fdiv:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fdiv_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
case Intrinsic::aarch64_sve_fmax:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmax_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
case Intrinsic::aarch64_sve_fmaxnm:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmaxnm_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
case Intrinsic::aarch64_sve_fmin:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmin_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
case Intrinsic::aarch64_sve_fminnm:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fminnm_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
case Intrinsic::aarch64_sve_fmla:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmla_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
case Intrinsic::aarch64_sve_fmls:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmls_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
case Intrinsic::aarch64_sve_fmul:
if (auto II_U =
instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
return II_U;
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
case Intrinsic::aarch64_sve_fmul_u:
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
case Intrinsic::aarch64_sve_fmulx:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmulx_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
case Intrinsic::aarch64_sve_fnmla:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmla_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
case Intrinsic::aarch64_sve_fnmls:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmls_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
case Intrinsic::aarch64_sve_fsub:
return instCombineSVEVectorFSub(IC, II);
case Intrinsic::aarch64_sve_fsub_u:
Expand All @@ -1930,52 +1946,56 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
case Intrinsic::aarch64_sve_mla:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mla_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
case Intrinsic::aarch64_sve_mls:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mls_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
case Intrinsic::aarch64_sve_mul:
if (auto II_U =
instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
return II_U;
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
case Intrinsic::aarch64_sve_mul_u:
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
case Intrinsic::aarch64_sve_sabd:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sabd_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
case Intrinsic::aarch64_sve_smax:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smax_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
case Intrinsic::aarch64_sve_smin:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smin_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
case Intrinsic::aarch64_sve_smulh:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smulh_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
Intrinsic::aarch64_sve_mls_u>(
IC, II, true);
case Intrinsic::aarch64_sve_uabd:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uabd_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
case Intrinsic::aarch64_sve_umax:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umax_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
case Intrinsic::aarch64_sve_umin:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umin_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
case Intrinsic::aarch64_sve_umulh:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umulh_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
case Intrinsic::aarch64_sve_asr:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_asr_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
case Intrinsic::aarch64_sve_lsl:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsl_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
case Intrinsic::aarch64_sve_lsr:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsr_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
case Intrinsic::aarch64_sve_and:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_and_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
case Intrinsic::aarch64_sve_bic:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_bic_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
case Intrinsic::aarch64_sve_eor:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_eor_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
case Intrinsic::aarch64_sve_orr:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_orr_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
case Intrinsic::aarch64_sve_sqsub:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sqsub_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
case Intrinsic::aarch64_sve_uqsub:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uqsub_u);
return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
case Intrinsic::aarch64_sve_tbl:
return instCombineSVETBL(IC, II);
case Intrinsic::aarch64_sve_uunpkhi:
Expand Down