-
Notifications
You must be signed in to change notification settings - Fork 11.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][SVE] optimisation for unary SVE store intrinsics with no active lanes #95793
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: None (Lukacma) ChangesThis patch extends #73964 and adds optimisation of store SVE intrinsics when predicate is zero. Full diff: https://github.com/llvm/llvm-project/pull/95793.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f5756fc7e401..d2a31a6ce4dc6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -985,6 +985,16 @@ static bool isAllActivePredicate(Value *Pred) {
m_ConstantInt<AArch64SVEPredPattern::all>()));
}
+// Erase unary operation where predicate has all inactive lanes
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
+ int PredPos) {
+ if (match(II.getOperand(PredPos), m_ZeroInt())) {
+ return IC.eraseInstFromFunction(II);
+ }
+ return std::nullopt;
+}
+
static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
IntrinsicInst &II) {
// svsel(ptrue, x, y) => x
@@ -1417,6 +1427,10 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Value *Pred = II.getOperand(1);
Value *PtrOp = II.getOperand(2);
+ // Remove when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryErase(IC, II, 0))
+ return II_NA;
+
if (isAllActivePredicate(Pred)) {
StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
Store->copyMetadata(II);
@@ -1775,6 +1789,10 @@ instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
Value *Index = II.getOperand(3);
Type *Ty = Val->getType();
+ // Remove when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryErase(IC, II, 0))
+ return II_NA;
+
// Contiguous scatter => masked store.
// (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
// => (masked.store Value (gep BasePtr IndexBase) Align Mask)
@@ -1971,6 +1989,33 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
switch (IID) {
default:
break;
+
+ case Intrinsic::aarch64_sve_st1_scatter:
+ case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw:
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw:
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
+ case Intrinsic::aarch64_sve_st1dq:
+ case Intrinsic::aarch64_sve_st1q_scatter_index:
+ case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
+ case Intrinsic::aarch64_sve_st1wq:
+ case Intrinsic::aarch64_sve_stnt1:
+ case Intrinsic::aarch64_sve_stnt1_scatter:
+ case Intrinsic::aarch64_sve_stnt1_scatter_index:
+ case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
+ return instCombineSVENoActiveUnaryErase(IC, II, 1);
+ case Intrinsic::aarch64_sve_st2:
+ case Intrinsic::aarch64_sve_st2q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 2);
+ case Intrinsic::aarch64_sve_st3:
+ case Intrinsic::aarch64_sve_st3q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 3);
+ case Intrinsic::aarch64_sve_st4:
+ case Intrinsic::aarch64_sve_st4q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 4);
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
return instCombineMaxMinNM(IC, II);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll
new file mode 100644
index 0000000000000..00681dee3fcb5
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll
@@ -0,0 +1,310 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+;RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_st1(ptr %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: define void @test_st1(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define void @test_st1_scatter(
+; CHECK-SAME: <vscale x 2 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @test_st1_scatter_index(<vscale x 2 x i32> %data_trunc, ptr %base, <vscale x 2 x i64> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_index(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32> [[DATA_TRUNC]], <vscale x 2 x i1> zeroinitializer, ptr [[BASE]], <vscale x 2 x i64> [[OFFSETS]])
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_scalar_offset(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i32> %base) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], <vscale x 4 x i32> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @test_st1_scatter_sxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_sxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_sxtw_index(<vscale x 4 x i16> %data_trunc, ptr %base, <vscale x 4 x i32> %indices) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_sxtw_index(
+; CHECK-SAME: <vscale x 4 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @test_st1_scatter_uxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_uxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_uxtw_index(<vscale x 4 x i16> %data_trunc, ptr %base, <vscale x 4 x i32> %indices) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_uxtw_index(
+; CHECK-SAME: <vscale x 4 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @test_st1dq(<vscale x 2 x i64> %zt, ptr %gep1) #0 {
+; CHECK-LABEL: define void @test_st1dq(
+; CHECK-SAME: <vscale x 2 x i64> [[ZT:%.*]], ptr [[GEP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> zeroinitializer, ptr %gep1)
+ ret void
+}
+
+define void @test_st1q_scatter_index(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_index(
+; CHECK-SAME: <vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[PG:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_st1q_scatter_scalar_offset(<vscale x 2 x i64> %data, <vscale x 2 x i64> %base) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i64> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> zeroinitializer, <vscale x 2 x i64> %base, i64 0)
+ ret void
+}
+
+define void @test_st1q_scatter_vector_offset(<vscale x 8 x i16> %data, ptr %base, <vscale x 2 x i64> %off) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_vector_offset(
+; CHECK-SAME: <vscale x 8 x i16> [[DATA:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_st1wq(ptr %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st1wq(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %b, <vscale x 1 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+
+define void @test_st2(ptr %a, <vscale x 8 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st2(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 8 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 4)
+ tail call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st2q(ptr %a, <vscale x 8 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st2q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 8 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 4)
+ tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st3(ptr %a, <vscale x 12 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st3(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 12 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 8)
+ tail call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st3q(ptr %a, <vscale x 12 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st3q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 12 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 8)
+ tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st4(ptr %a, <vscale x 16 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st4(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 8)
+ %3 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 12)
+ tail call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st4q(ptr %a, <vscale x 16 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st4q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 8)
+ %3 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 12)
+ tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_stnt1(ptr %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: define void @test_stnt1(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_stnt1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter(
+; CHECK-SAME: <vscale x 2 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @test_stnt1_scatter_index(<vscale x 2 x i32> %data_trunc, ptr %base, <vscale x 2 x i64> %offsets) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_index(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @test_stnt1_scatter_scalar_offset(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i32> %base) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], <vscale x 4 x i32> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @test_stnt1_scatter_uxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_uxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
|
@llvm/pr-subscribers-llvm-transforms Author: None (Lukacma) ChangesThis patch extends #73964 and adds optimisation of store SVE intrinsics when predicate is zero. Full diff: https://github.com/llvm/llvm-project/pull/95793.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f5756fc7e401..d2a31a6ce4dc6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -985,6 +985,16 @@ static bool isAllActivePredicate(Value *Pred) {
m_ConstantInt<AArch64SVEPredPattern::all>()));
}
+// Erase unary operation where predicate has all inactive lanes
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
+ int PredPos) {
+ if (match(II.getOperand(PredPos), m_ZeroInt())) {
+ return IC.eraseInstFromFunction(II);
+ }
+ return std::nullopt;
+}
+
static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
IntrinsicInst &II) {
// svsel(ptrue, x, y) => x
@@ -1417,6 +1427,10 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Value *Pred = II.getOperand(1);
Value *PtrOp = II.getOperand(2);
+ // Remove when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryErase(IC, II, 0))
+ return II_NA;
+
if (isAllActivePredicate(Pred)) {
StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
Store->copyMetadata(II);
@@ -1775,6 +1789,10 @@ instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
Value *Index = II.getOperand(3);
Type *Ty = Val->getType();
+ // Remove when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryErase(IC, II, 0))
+ return II_NA;
+
// Contiguous scatter => masked store.
// (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
// => (masked.store Value (gep BasePtr IndexBase) Align Mask)
@@ -1971,6 +1989,33 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
switch (IID) {
default:
break;
+
+ case Intrinsic::aarch64_sve_st1_scatter:
+ case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw:
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw:
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
+ case Intrinsic::aarch64_sve_st1dq:
+ case Intrinsic::aarch64_sve_st1q_scatter_index:
+ case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
+ case Intrinsic::aarch64_sve_st1wq:
+ case Intrinsic::aarch64_sve_stnt1:
+ case Intrinsic::aarch64_sve_stnt1_scatter:
+ case Intrinsic::aarch64_sve_stnt1_scatter_index:
+ case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
+ case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
+ return instCombineSVENoActiveUnaryErase(IC, II, 1);
+ case Intrinsic::aarch64_sve_st2:
+ case Intrinsic::aarch64_sve_st2q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 2);
+ case Intrinsic::aarch64_sve_st3:
+ case Intrinsic::aarch64_sve_st3q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 3);
+ case Intrinsic::aarch64_sve_st4:
+ case Intrinsic::aarch64_sve_st4q:
+ return instCombineSVENoActiveUnaryErase(IC, II, 4);
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
return instCombineMaxMinNM(IC, II);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll
new file mode 100644
index 0000000000000..00681dee3fcb5
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-stores.ll
@@ -0,0 +1,310 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+;RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_st1(ptr %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: define void @test_st1(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define void @test_st1_scatter(
+; CHECK-SAME: <vscale x 2 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @test_st1_scatter_index(<vscale x 2 x i32> %data_trunc, ptr %base, <vscale x 2 x i64> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_index(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32> [[DATA_TRUNC]], <vscale x 2 x i1> zeroinitializer, ptr [[BASE]], <vscale x 2 x i64> [[OFFSETS]])
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_scalar_offset(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i32> %base) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], <vscale x 4 x i32> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @test_st1_scatter_sxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_sxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_sxtw_index(<vscale x 4 x i16> %data_trunc, ptr %base, <vscale x 4 x i32> %indices) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_sxtw_index(
+; CHECK-SAME: <vscale x 4 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @test_st1_scatter_uxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_uxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @test_st1_scatter_uxtw_index(<vscale x 4 x i16> %data_trunc, ptr %base, <vscale x 4 x i32> %indices) #0 {
+; CHECK-LABEL: define void @test_st1_scatter_uxtw_index(
+; CHECK-SAME: <vscale x 4 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @test_st1dq(<vscale x 2 x i64> %zt, ptr %gep1) #0 {
+; CHECK-LABEL: define void @test_st1dq(
+; CHECK-SAME: <vscale x 2 x i64> [[ZT:%.*]], ptr [[GEP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> zeroinitializer, ptr %gep1)
+ ret void
+}
+
+define void @test_st1q_scatter_index(<vscale x 8 x i16> %data, <vscale x 1 x i1> %pg, ptr %base, <vscale x 2 x i64> %idx) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_index(
+; CHECK-SAME: <vscale x 8 x i16> [[DATA:%.*]], <vscale x 1 x i1> [[PG:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.index.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %idx)
+ ret void
+}
+
+define void @test_st1q_scatter_scalar_offset(<vscale x 2 x i64> %data, <vscale x 2 x i64> %base) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 2 x i64> [[DATA:%.*]], <vscale x 2 x i64> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 1 x i1> zeroinitializer, <vscale x 2 x i64> %base, i64 0)
+ ret void
+}
+
+define void @test_st1q_scatter_vector_offset(<vscale x 8 x i16> %data, ptr %base, <vscale x 2 x i64> %off) #0 {
+; CHECK-LABEL: define void @test_st1q_scatter_vector_offset(
+; CHECK-SAME: <vscale x 8 x i16> [[DATA:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFF:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1q.scatter.vector.offset.nxv8i16(<vscale x 8 x i16> %data, <vscale x 1 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %off)
+ ret void
+}
+
+define void @test_st1wq(ptr %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st1wq(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %b, <vscale x 1 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+
+define void @test_st2(ptr %a, <vscale x 8 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st2(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 8 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 4)
+ tail call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st2q(ptr %a, <vscale x 8 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st2q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 8 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %b, i64 4)
+ tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st3(ptr %a, <vscale x 12 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st3(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 12 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 8)
+ tail call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st3q(ptr %a, <vscale x 12 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st3q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 12 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> %b, i64 8)
+ tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st4(ptr %a, <vscale x 16 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st4(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 8)
+ %3 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 12)
+ tail call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_st4q(ptr %a, <vscale x 16 x i32> %b) #0 {
+; CHECK-LABEL: define void @test_st4q(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i32> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 0)
+ %1 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 4)
+ %2 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 8)
+ %3 = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %b, i64 12)
+ tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, <vscale x 4 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_stnt1(ptr %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: define void @test_stnt1(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret void
+}
+
+define void @test_stnt1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter(
+; CHECK-SAME: <vscale x 2 x i16> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @test_stnt1_scatter_index(<vscale x 2 x i32> %data_trunc, ptr %base, <vscale x 2 x i64> %offsets) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_index(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 2 x i64> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @test_stnt1_scatter_scalar_offset(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i32> %base) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], <vscale x 4 x i32> [[BASE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @test_stnt1_scatter_uxtw(<vscale x 4 x i8> %data_trunc, ptr %base, <vscale x 4 x i32> %offsets) #0 {
+; CHECK-LABEL: define void @test_stnt1_scatter_uxtw(
+; CHECK-SAME: <vscale x 4 x i8> [[DATA_TRUNC:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[OFFSETS:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> zeroinitializer,
+ ptr %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
|
@@ -1417,6 +1427,10 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { | |||
Value *Pred = II.getOperand(1); | |||
Value *PtrOp = II.getOperand(2); | |||
|
|||
// Remove when all lanes are inactive | |||
if (auto II_NA = instCombineSVENoActiveUnaryErase(IC, II, 0)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would expect this to be:
auto II_NA = instCombineSVENoActiveUnaryErase(IC, II, 1)), because in line 1427 is shows the predicate as 1
Value *Pred = II.getOperand(1);
Same for line 1793.
Am I missing something here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right this was incorrect. But I also noticed this is unnecessary as other optimizations already optimize this so I removed it altogether
ret void | ||
} | ||
|
||
define void @test_st1_scatter(<vscale x 2 x i16> %data_trunc, ptr %base, <vscale x 2 x i64> %b) #0 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we add
attributes #0 = { "target-features"="+sve" }
or remove #0
?
Just in case.
Because it does not need to lower to SVE instructions I think it works fine.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you Marian!
…tive lanes (llvm#95793) This patch extends llvm#73964 and adds optimisation of store SVE intrinsics when predicate is zero.
…tive lanes (llvm#95793) This patch extends llvm#73964 and adds optimisation of store SVE intrinsics when predicate is zero.
This patch extends #73964 and adds optimisation of store SVE intrinsics when predicate is zero.