diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 94bc2ee51975a..ead1bf95d25d2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -366,6 +366,30 @@ static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
   return IC.replaceInstUsesWith(II, EarliestReplacement);
 }
 
+static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
+                                                 IntrinsicInst &II) {
+  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+  if (!Pg)
+    return None;
+
+  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+    return None;
+
+  const auto PTruePattern =
+      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
+  if (PTruePattern != AArch64SVEPredPattern::vl1)
+    return None;
+
+  // The intrinsic is inserting into lane zero so use an insert instead.
+  auto *IdxTy = Type::getInt64Ty(II.getContext());
+  auto *Insert = InsertElementInst::Create(
+      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
+  Insert->insertBefore(&II);
+  Insert->takeName(&II);
+
+  return IC.replaceInstUsesWith(II, Insert);
+}
+
 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                   IntrinsicInst &II) {
   Value *Pg = II.getArgOperand(0);
@@ -455,6 +479,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     break;
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return instCombineConvertFromSVBool(IC, II);
+  case Intrinsic::aarch64_sve_dup:
+    return instCombineSVEDup(IC, II);
   case Intrinsic::aarch64_sve_lasta:
   case Intrinsic::aarch64_sve_lastb:
     return instCombineSVELast(IC, II);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
new file mode 100644
index 0000000000000..add9cfb56f1e4
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @dup_insertelement_0(<vscale x 16 x i8> %v, i8 %s) #0 {
+; CHECK-LABEL: @dup_insertelement_0(
+; CHECK: %insert = insertelement <vscale x 16 x i8> %v, i8 %s, i64 0
+; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+  ret <vscale x 16 x i8> %insert
+}
+
+define <vscale x 16 x i8> @dup_insertelement_1(<vscale x 16 x i8> %v, i8 %s) #0 {
+; CHECK-LABEL: @dup_insertelement_1(
+; CHECK: %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+; CHECK-NEXT: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+  ret <vscale x 16 x i8> %insert
+}
+
+define <vscale x 16 x i8> @dup_insertelement_x(<vscale x 16 x i8> %v, i8 %s, <vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: @dup_insertelement_x(
+; CHECK: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+  %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+  ret <vscale x 16 x i8> %insert
+}
+
+define <vscale x 8 x i16> @dup_insertelement_0_convert(<vscale x 8 x i16> %v, i16 %s) #0 {
+; CHECK-LABEL: @dup_insertelement_0_convert(
+; CHECK: %insert = insertelement <vscale x 8 x i16> %v, i16 %s, i64 0
+; CHECK-NEXT: ret <vscale x 8 x i16> %insert
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %insert = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> %v, <vscale x 8 x i1> %2, i16 %s)
+  ret <vscale x 8 x i16> %insert
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+
+attributes #0 = { "target-features"="+sve" }