[AArch64][SVE] Select non-temporal instructions for unpredicated loads/stores with the nontemporal flag #171261
Conversation
Select SVE non-temporal load/store instructions for unpredicated vector loads/stores with the `nontemporal` flag, for which regular instructions were previously used.

Also, disable the transformation of predicated loads/stores with an all-true mask into `ldr`/`str` when the `nontemporal` flag is present, ensuring non-temporal instructions are used.

Fixes llvm#169034
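For context, a minimal IR sketch of the case this patch targets (illustrative, not part of the patch; the before/after output is inferred from the tests in the diff below):

define void @nt_store(<vscale x 4 x float> %v, ptr %p) {
  ; Before this patch (little-endian, misaligned accesses allowed):
  ;   str z0, [x0]              ; the non-temporal hint was silently dropped
  ; After this patch:
  ;   ptrue p0.s
  ;   stnt1w { z0.s }, p0, [x0] ; SVE non-temporal store
  store <vscale x 4 x float> %v, ptr %p, !nontemporal !0
  ret void
}
!0 = !{i32 1}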
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64

Author: Yuta Mukai (ytmukai)

Changes

Select SVE non-temporal load/store instructions for unpredicated vector loads/stores with the `nontemporal` flag, for which regular instructions were previously used.

Also, disable the transformation of predicated loads/stores with an all-true mask into `ldr`/`str` when the `nontemporal` flag is present, ensuring non-temporal instructions are used.

Fixes #169034

Full diff: https://github.com/llvm/llvm-project/pull/171261.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 64017d7cafca3..97287c2d9e389 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -639,6 +639,12 @@ def non_temporal_load :
cast<MaskedLoadSDNode>(N)->isNonTemporal();
}]>;
+def temporal_load :
+ PatFrag<(ops node:$ptr),
+ (load node:$ptr), [{
+ return !cast<LoadSDNode>(N)->isNonTemporal();
+}]>;
+
// non-truncating masked store fragment.
def nontrunc_masked_store :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
@@ -684,6 +690,12 @@ def non_temporal_store :
!cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
+def temporal_store :
+ PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return !cast<StoreSDNode>(N)->isNonTemporal();
+}]>;
+
multiclass masked_gather_scatter<PatFrags GatherScatterOp> {
// offsets = (signed)Index << sizeof(elt)
def NAME#_signed_scaled :
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bfa4ce6da212b..9e00f4b944b38 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3075,6 +3075,15 @@ let Predicates = [HasSVE_or_SME] in {
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
}
+ defm : unpred_store<nontemporalstore, nxv16i8, STNT1B_ZRR, STNT1B_ZRI, PTRUE_B, am_sve_regreg_lsl0>;
+ defm : unpred_store<nontemporalstore, nxv8i16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store<nontemporalstore, nxv4i32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_store<nontemporalstore, nxv2i64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
+ defm : unpred_store<nontemporalstore, nxv8f16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store<nontemporalstore, nxv8bf16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store<nontemporalstore, nxv4f32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_store<nontemporalstore, nxv2f64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
+
defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
@@ -3111,6 +3120,15 @@ let Predicates = [HasSVE_or_SME] in {
(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
}
+ defm : unpred_load<nontemporalload, nxv16i8, LDNT1B_ZRR, LDNT1B_ZRI, PTRUE_B, am_sve_regreg_lsl0>;
+ defm : unpred_load<nontemporalload, nxv8i16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_load<nontemporalload, nxv4i32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_load<nontemporalload, nxv2i64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
+ defm : unpred_load<nontemporalload, nxv8f16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_load<nontemporalload, nxv8bf16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_load<nontemporalload, nxv4f32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_load<nontemporalload, nxv2f64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
+
defm : unpred_load< load, nxv16i8, LD1B, LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
@@ -3164,18 +3182,19 @@ let Predicates = [HasSVE_or_SME] in {
}
// Allow using LDR/STR to avoid the predicate dependence.
+ // Not applied if the nontemporal flag is set.
let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in
foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in {
let AddedComplexity = 2 in {
- def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
+ def : Pat<(Ty (temporal_load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
(LDR_ZXI GPR64sp:$base, simm9:$offset)>;
- def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
+ def : Pat<(temporal_store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
(STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>;
}
- def : Pat<(Ty (load GPR64sp:$base)),
+ def : Pat<(Ty (temporal_load GPR64sp:$base)),
(LDR_ZXI GPR64sp:$base, (i64 0))>;
- def : Pat<(store Ty:$val, GPR64sp:$base),
+ def : Pat<(temporal_store Ty:$val, GPR64sp:$base),
(STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>;
}
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index ffafe69b29266..ad92530eabf08 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -612,21 +612,22 @@ define <16 x double> @test_ldnp_v16f64(ptr %A) {
define <vscale x 20 x float> @test_ldnp_v20f32_vscale(ptr %A) {
; CHECK-LABEL: test_ldnp_v20f32_vscale:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: ldr z1, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z2, [x0, #2, mul vl]
-; CHECK-NEXT: ldr z3, [x0, #3, mul vl]
-; CHECK-NEXT: ldr z4, [x0, #4, mul vl]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_ldnp_v20f32_vscale:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: ptrue p0.s
-; CHECK-BE-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-BE-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
-; CHECK-BE-NEXT: ld1w { z2.s }, p0/z, [x0, #2, mul vl]
-; CHECK-BE-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl]
-; CHECK-BE-NEXT: ld1w { z4.s }, p0/z, [x0, #4, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-BE-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
; CHECK-BE-NEXT: ret
%lv = load <vscale x 20 x float>, ptr %A, align 8, !nontemporal !0
ret <vscale x 20 x float> %lv
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
new file mode 100644
index 0000000000000..e631d7cbe711d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @load_nxv16i8(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 16 x i8>, ptr %a, !nontemporal !0
+ ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 8 x i16> @load_nxv8i16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 8 x i16>, ptr %a, !nontemporal !0
+ ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 4 x i32> @load_nxv4i32(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 4 x i32>, ptr %a, !nontemporal !0
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @load_nxv2i64(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 2 x i64>, ptr %a, !nontemporal !0
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 8 x half> @load_nxv8f16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 8 x half>, ptr %a, !nontemporal !0
+ ret <vscale x 8 x half> %load
+}
+
+define <vscale x 8 x bfloat> @load_nxv8bf16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 8 x bfloat>, ptr %a, !nontemporal !0
+ ret <vscale x 8 x bfloat> %load
+}
+
+define <vscale x 4 x float> @load_nxv4f32(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 4 x float>, ptr %a, !nontemporal !0
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x double> @load_nxv2f64(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 2 x double>, ptr %a, !nontemporal !0
+ ret <vscale x 2 x double> %load
+}
+
+define void @store_nxv16i8(<vscale x 16 x i8> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: stnt1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 16 x i8> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv8i16(<vscale x 8 x i16> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 8 x i16> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 4 x i32> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv2i64(<vscale x 2 x i64> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 2 x i64> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv8f16(<vscale x 8 x half> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 8 x half> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv8bf16(<vscale x 8 x bfloat> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 8 x bfloat> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv4f32(<vscale x 4 x float> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 4 x float> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv2f64(<vscale x 2 x double> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 2 x double> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
index 36df5e5deadfc..bb016f840411a 100644
--- a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
@@ -66,9 +66,32 @@ define void @masked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a, <vscale x 4 x i
ret void
}
+define <vscale x 4 x i32> @unmasked_load_nxv4i32(ptr %a) nounwind {
+; CHECK-LABEL: unmasked_load_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison), !nontemporal !0
+ ret <vscale x 4 x i32> %load
+}
+
+define void @unmasked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: unmasked_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> %mask), !nontemporal !0
+ ret void
+}
+
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32(ptr, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
!0 = !{i32 1}
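To make the second part of the change concrete: when a masked load/store has an all-true mask, it is normally lowered to a plain access and then folded to `ldr`/`str`; the new `temporal_load`/`temporal_store` guards keep the non-temporal form instead. A standalone IR sketch mirroring the new test above (the "before" output is inferred from the guarded LDR/STR patterns):

define <vscale x 4 x i32> @all_true_nt_load(ptr %p) {
  ; Previously (inferred):  ldr z0, [x0]
  ; With this patch:        ptrue p0.s
  ;                         ldnt1w { z0.s }, p0/z, [x0]
  %m = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %l = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %p, i32 1, <vscale x 4 x i1> %m, <vscale x 4 x i32> poison), !nontemporal !0
  ret <vscale x 4 x i32> %l
}
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
!0 = !{i32 1}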
rj-jesus left a comment:
LGTM, cheers!
@@ -0,0 +1,252 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
Would you mind adding a run line for BE?
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefixes=CHECK,CHECK-LE %s
; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve < %s | FileCheck --check-prefixes=CHECK,CHECK-BE %s
Add patterns to select SVE non-temporal load/store instructions for unpredicated vector loads/stores with the `nontemporal` flag. Previously, regular instructions were used for these cases. Additionally, the `AddedComplexity` value was increased to prioritize this selection over the `ldr`/`str` selection (#127837).

Fixes #169034
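How `AddedComplexity` achieves this, in a minimal TableGen sketch (hypothetical pattern; operand order mirrored from the `unpred_store` expansion quoted in the diff): when several patterns match the same node, instruction selection tries the one with the higher complexity first.

// Hypothetical sketch: a value above the LDR/STR patterns'
// AddedComplexity = 2 makes the non-temporal pattern win whenever
// both could match a non-temporal store.
let AddedComplexity = 3 in
def : Pat<(nontemporalstore nxv4f32:$val, GPR64sp:$base),
          (STNT1W_ZRI ZPR:$val, (PTRUE_S 31), GPR64sp:$base, (i64 0))>;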
nontemporalflag. Previously, regular instructions were used for these cases. Additionally, theAddedComplexityvalue was increased to prioritize this selection over theldr/strselection (#127837).Fixes #169034