diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 66272988889a2..532247c8f3b40 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2277,6 +2277,39 @@ static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
   }
 }
 
+static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
+                                                            IntrinsicInst &II) {
+  Value *PG = II.getOperand(0);
+  Value *Op1 = II.getOperand(1);
+  Value *Op2 = II.getOperand(2);
+
+  // Return true if a given instruction is a negative unit splat value, false
+  // otherwise.
+  auto IsNegUnitSplat = [](auto *I) {
+    auto *SplatValue = getSplatValue(I);
+    ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
+    if (!SplatConstantInt)
+      return false;
+    APInt SCIV = SplatConstantInt->getValue();
+    const int64_t IntValue = SCIV.getSExtValue();
+    return IntValue == -1;
+  };
+
+  if (IsNegUnitSplat(Op1)) {
+    auto *NEG = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
+                                           {II.getType()}, {Op2, PG, Op2});
+    return IC.replaceInstUsesWith(II, NEG);
+  }
+
+  if (IsNegUnitSplat(Op2)) {
+    auto *NEG = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
+                                           {II.getType()}, {Op1, PG, Op1});
+    return IC.replaceInstUsesWith(II, NEG);
+  }
+
+  return std::nullopt;
+}
+
 static std::optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
                                                               IntrinsicInst &II) {
   // Bail due to missing support for ISD::STRICT_ scalable vector operations.
@@ -2852,6 +2885,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mla_u>(
         IC, II, true);
+  case Intrinsic::aarch64_sve_mul:
+  case Intrinsic::aarch64_sve_mul_u:
+    return instCombineSVEVectorMul(IC, II);
   case Intrinsic::aarch64_sve_sub:
     return instCombineSVEVectorSub(IC, II);
   case Intrinsic::aarch64_sve_sub_u:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
new file mode 100644
index 0000000000000..a620fee7222ab
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 8 x i16> @mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> [[A]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @mul_neg_fold_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
+;
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+  ; Edge case -- make sure that the case where we're multiplying two dups
+  ; together is sane.
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1), <vscale x 8 x i16> splat (i16 -1)
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+  ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+  ; Different argument order to the above tests.
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %2
+}
+
+; Non foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @no_mul_neg_fold_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> splat (i16 -2))
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @no_mul_neg_fold_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 -2))
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @no_mul_neg_fold_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 -2))
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
new file mode 100644
index 0000000000000..ee179a57a0cae
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 8 x i16> @mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> [[A]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @mul_neg_fold_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
+;
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+  ; Edge case -- make sure that the case where we're multiplying two dups
+  ; together is sane.
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 8 x i16> splat (i16 1)
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+  ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+  ; Different argument order to the above tests.
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %2
+}
+
+; Non foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> splat (i16 -2))
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+;
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 -2))
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 -2))
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+;
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
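
For reference, a minimal before/after sketch of the rewrite the new combine performs, written against the nxv8i16 intrinsics exercised by the tests above; the function name @example and the value names %splat and %mul are illustrative only and not part of the patch:

  define <vscale x 8 x i16> @example(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
    ; Multiply %a by a splat of -1 under predicate %pg.
    %splat = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
    %mul = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %splat)
    ret <vscale x 8 x i16> %mul
  }

  ; Running `opt -passes=instcombine` on the above is expected to replace the
  ; multiply with a predicated negate of %a:
  ;   %mul = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a)

The same replacement is applied when the -1 splat is the first multiplicand, and to the aarch64_sve_mul_u form of the intrinsic, as covered by the second test file.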