Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 30 additions & 8 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -3062,14 +3062,14 @@ let Predicates = [HasSVE_or_SME] in {
multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegRegInst,
Instruction RegImmInst, Instruction PTrue,
ComplexPattern AddrCP> {
let AddedComplexity = 1 in {
def _reg : Pat<(Store Ty:$val, (AddrCP GPR64sp:$base, GPR64:$offset)),
(RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
}
let AddedComplexity = 2 in {
def _imm : Pat<(Store Ty:$val, (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
let AddedComplexity = 1 in {
def _reg : Pat<(Store Ty:$val, (AddrCP GPR64sp:$base, GPR64:$offset)),
(RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
}

def : Pat<(Store Ty:$val, GPR64:$base),
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
Expand All @@ -3095,17 +3095,28 @@ let Predicates = [HasSVE_or_SME] in {
defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;

// Unpredicated nontemporal stores: instantiate unpred_store with the
// nontemporalstore fragment for each listed vector type, pairing it with the
// STNT1{B,H,W,D} reg+reg (_ZRR) and reg+imm (_ZRI) instructions, the PTRUE of
// the matching element size, and the reg+reg addressing pattern whose shift
// matches that element size.
// NOTE(review): AddedComplexity = 3 presumably makes these win over the plain
// `store` instantiations of the same multiclass; confirm how this outer value
// interacts with the AddedComplexity values set inside unpred_store.
let AddedComplexity = 3 in {
defm : unpred_store<nontemporalstore, nxv16i8, STNT1B_ZRR, STNT1B_ZRI, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_store<nontemporalstore, nxv8i16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_store<nontemporalstore, nxv4i32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_store<nontemporalstore, nxv2i64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
defm : unpred_store<nontemporalstore, nxv8f16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_store<nontemporalstore, nxv8bf16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_store<nontemporalstore, nxv4f32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_store<nontemporalstore, nxv2f64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
}

multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
Instruction RegImmInst, Instruction PTrue,
ComplexPattern AddrCP> {
let AddedComplexity = 1 in {
def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
(RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
}
let AddedComplexity = 2 in {
def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
let AddedComplexity = 1 in {
def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
(RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
}

def : Pat<(Ty (Load GPR64:$base)),
(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
Expand Down Expand Up @@ -3143,6 +3154,17 @@ let Predicates = [HasSVE_or_SME] in {
defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;

// Unpredicated nontemporal loads: instantiate unpred_load with the
// nontemporalload fragment for each listed vector type, pairing it with the
// LDNT1{B,H,W,D} reg+reg (_ZRR) and reg+imm (_ZRI) instructions, the PTRUE of
// the matching element size, and the reg+reg addressing pattern whose shift
// matches that element size.
// NOTE(review): AddedComplexity = 3 presumably makes these win over the plain
// `load` instantiations of the same multiclass; confirm how this outer value
// interacts with the AddedComplexity values set inside unpred_load.
let AddedComplexity = 3 in {
defm : unpred_load<nontemporalload, nxv16i8, LDNT1B_ZRR, LDNT1B_ZRI, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_load<nontemporalload, nxv8i16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_load<nontemporalload, nxv4i32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_load<nontemporalload, nxv2i64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
defm : unpred_load<nontemporalload, nxv8f16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_load<nontemporalload, nxv8bf16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_load<nontemporalload, nxv4f32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_load<nontemporalload, nxv2f64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
}

let Predicates = [HasSVE_or_SME, IsLE] in {
// Allow using the reg+reg form of ld1b/st1b for memory accesses with the
// same width as nxv16i8. This saves an add in cases where we would
Expand Down
21 changes: 11 additions & 10 deletions llvm/test/CodeGen/AArch64/nontemporal-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -612,21 +612,22 @@ define <16 x double> @test_ldnp_v16f64(ptr %A) {
define <vscale x 20 x float> @test_ldnp_v20f32_vscale(ptr %A) {
; CHECK-LABEL: test_ldnp_v20f32_vscale:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x0, #1, mul vl]
; CHECK-NEXT: ldr z2, [x0, #2, mul vl]
; CHECK-NEXT: ldr z3, [x0, #3, mul vl]
; CHECK-NEXT: ldr z4, [x0, #4, mul vl]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_ldnp_v20f32_vscale:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: ptrue p0.s
; CHECK-BE-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-BE-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-BE-NEXT: ld1w { z2.s }, p0/z, [x0, #2, mul vl]
; CHECK-BE-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl]
; CHECK-BE-NEXT: ld1w { z4.s }, p0/z, [x0, #4, mul vl]
; CHECK-BE-NEXT: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-BE-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-BE-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
; CHECK-BE-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
; CHECK-BE-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
; CHECK-BE-NEXT: ret
%lv = load<vscale x 20 x float>, ptr %A, align 8, !nontemporal !0
ret <vscale x 20 x float> %lv
Expand Down
252 changes: 252 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you mind adding a run line for BE?

; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefixes=CHECK,CHECK-LE %s
; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve < %s | FileCheck --check-prefixes=CHECK,CHECK-BE %s


; Nontemporal load of a <vscale x 16 x i8> vector directly from %a.
; Expected codegen: ptrue p0.b followed by ldnt1b with a base-only address.
define <vscale x 16 x i8> @load_nxv16i8(ptr %a) nounwind {
; CHECK-LABEL: load_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ret
%load = load <vscale x 16 x i8>, ptr %a, !nontemporal !0
ret <vscale x 16 x i8> %load
}

; Nontemporal load of a <vscale x 8 x i16> vector directly from %a.
; Expected codegen: ptrue p0.h followed by ldnt1h with a base-only address.
define <vscale x 8 x i16> @load_nxv8i16(ptr %a) nounwind {
; CHECK-LABEL: load_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
%load = load <vscale x 8 x i16>, ptr %a, !nontemporal !0
ret <vscale x 8 x i16> %load
}

; Nontemporal load of a <vscale x 4 x i32> vector directly from %a.
; Expected codegen: ptrue p0.s followed by ldnt1w with a base-only address.
define <vscale x 4 x i32> @load_nxv4i32(ptr %a) nounwind {
; CHECK-LABEL: load_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = load <vscale x 4 x i32>, ptr %a, !nontemporal !0
ret <vscale x 4 x i32> %load
}

; Nontemporal load of a <vscale x 2 x i64> vector directly from %a.
; Expected codegen: ptrue p0.d followed by ldnt1d with a base-only address.
define <vscale x 2 x i64> @load_nxv2i64(ptr %a) nounwind {
; CHECK-LABEL: load_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = load <vscale x 2 x i64>, ptr %a, !nontemporal !0
ret <vscale x 2 x i64> %load
}

; Nontemporal load of a <vscale x 8 x half> vector directly from %a.
; Expected codegen: ptrue p0.h followed by ldnt1h with a base-only address.
define <vscale x 8 x half> @load_nxv8f16(ptr %a) nounwind {
; CHECK-LABEL: load_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
%load = load <vscale x 8 x half>, ptr %a, !nontemporal !0
ret <vscale x 8 x half> %load
}

; Nontemporal load of a <vscale x 8 x bfloat> vector directly from %a.
; Expected codegen: ptrue p0.h followed by ldnt1h with a base-only address.
define <vscale x 8 x bfloat> @load_nxv8bf16(ptr %a) nounwind {
; CHECK-LABEL: load_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
%load = load <vscale x 8 x bfloat>, ptr %a, !nontemporal !0
ret <vscale x 8 x bfloat> %load
}

; Nontemporal load of a <vscale x 4 x float> vector directly from %a.
; Expected codegen: ptrue p0.s followed by ldnt1w with a base-only address.
define <vscale x 4 x float> @load_nxv4f32(ptr %a) nounwind {
; CHECK-LABEL: load_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = load <vscale x 4 x float>, ptr %a, !nontemporal !0
ret <vscale x 4 x float> %load
}

; Nontemporal load of a <vscale x 2 x double> vector directly from %a.
; Expected codegen: ptrue p0.d followed by ldnt1d with a base-only address.
define <vscale x 2 x double> @load_nxv2f64(ptr %a) nounwind {
; CHECK-LABEL: load_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = load <vscale x 2 x double>, ptr %a, !nontemporal !0
ret <vscale x 2 x double> %load
}

; Nontemporal <vscale x 16 x i8> load from %a indexed by %off i8 elements.
; Expected codegen: the offset folds into ldnt1b's reg+reg form [x0, x1].
define <vscale x 16 x i8> @load_nxv16i8_reg(ptr %a, i64 %off) nounwind {
; CHECK-LABEL: load_nxv16i8_reg:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr i8, ptr %a, i64 %off
%load = load <vscale x 16 x i8>, ptr %ptr, !nontemporal !0
ret <vscale x 16 x i8> %load
}

; Nontemporal <vscale x 16 x i8> load from one whole vector past %a.
; Expected codegen: the offset folds into ldnt1b's immediate form [x0, #1, mul vl].
define <vscale x 16 x i8> @load_nxv16i8_imm(ptr %a) nounwind {
; CHECK-LABEL: load_nxv16i8_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
%ptr = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
%load = load <vscale x 16 x i8>, ptr %ptr, !nontemporal !0
ret <vscale x 16 x i8> %load
}

; Nontemporal <vscale x 2 x double> load from %a indexed by %off double elements.
; Expected codegen: the scaled offset folds into ldnt1d's reg+reg form [x0, x1, lsl #3].
define <vscale x 2 x double> @load_nxv2f64_reg(ptr %a, i64 %off) nounwind {
; CHECK-LABEL: load_nxv2f64_reg:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%ptr = getelementptr double, ptr %a, i64 %off
%load = load <vscale x 2 x double>, ptr %ptr, !nontemporal !0
ret <vscale x 2 x double> %load
}

; Nontemporal <vscale x 2 x double> load from one whole vector past %a.
; Expected codegen: the offset folds into ldnt1d's immediate form [x0, #1, mul vl].
define <vscale x 2 x double> @load_nxv2f64_imm(ptr %a) nounwind {
; CHECK-LABEL: load_nxv2f64_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
%ptr = getelementptr <vscale x 2 x double>, ptr %a, i64 1
%load = load <vscale x 2 x double>, ptr %ptr, !nontemporal !0
ret <vscale x 2 x double> %load
}

; Nontemporal store of a <vscale x 16 x i8> vector directly to %a.
; Expected codegen: ptrue p0.b followed by stnt1b with a base-only address.
define void @store_nxv16i8(<vscale x 16 x i8> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: stnt1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
store <vscale x 16 x i8> %x, ptr %a, !nontemporal !0
ret void
}

; Nontemporal store of a <vscale x 8 x i16> vector directly to %a.
; Expected codegen: ptrue p0.h followed by stnt1h with a base-only address.
define void @store_nxv8i16(<vscale x 8 x i16> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
store <vscale x 8 x i16> %x, ptr %a, !nontemporal !0
ret void
}

; Nontemporal store of a <vscale x 4 x i32> vector directly to %a.
; Expected codegen: ptrue p0.s followed by stnt1w with a base-only address.
define void @store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
store <vscale x 4 x i32> %x, ptr %a, !nontemporal !0
ret void
}

; Nontemporal store of a <vscale x 2 x i64> vector directly to %a.
; Expected codegen: ptrue p0.d followed by stnt1d with a base-only address.
define void @store_nxv2i64(<vscale x 2 x i64> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
store <vscale x 2 x i64> %x, ptr %a, !nontemporal !0
ret void
}

; Nontemporal store of a <vscale x 8 x half> vector directly to %a.
; Expected codegen: ptrue p0.h followed by stnt1h with a base-only address.
define void @store_nxv8f16(<vscale x 8 x half> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
store <vscale x 8 x half> %x, ptr %a, !nontemporal !0
ret void
}

; Nontemporal store of a <vscale x 8 x bfloat> vector directly to %a.
; Expected codegen: ptrue p0.h followed by stnt1h with a base-only address.
define void @store_nxv8bf16(<vscale x 8 x bfloat> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
store <vscale x 8 x bfloat> %x, ptr %a, !nontemporal !0
ret void
}

; Nontemporal store of a <vscale x 4 x float> vector directly to %a.
; Expected codegen: ptrue p0.s followed by stnt1w with a base-only address.
define void @store_nxv4f32(<vscale x 4 x float> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
store <vscale x 4 x float> %x, ptr %a, !nontemporal !0
ret void
}

; Nontemporal store of a <vscale x 2 x double> vector directly to %a.
; Expected codegen: ptrue p0.d followed by stnt1d with a base-only address.
define void @store_nxv2f64(<vscale x 2 x double> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
store <vscale x 2 x double> %x, ptr %a, !nontemporal !0
ret void
}

; Nontemporal <vscale x 16 x i8> store to %a indexed by %off i8 elements.
; Expected codegen: the offset folds into stnt1b's reg+reg form [x0, x1].
define void @store_nxv16i8_reg(<vscale x 16 x i8> %x, ptr %a, i64 %off) nounwind {
; CHECK-LABEL: store_nxv16i8_reg:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: stnt1b { z0.b }, p0, [x0, x1]
; CHECK-NEXT: ret
%ptr = getelementptr i8, ptr %a, i64 %off
store <vscale x 16 x i8> %x, ptr %ptr, !nontemporal !0
ret void
}

; Nontemporal <vscale x 16 x i8> store to one whole vector past %a.
; Expected codegen: the offset folds into stnt1b's immediate form [x0, #1, mul vl].
define void @store_nxv16i8_imm(<vscale x 16 x i8> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv16i8_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: stnt1b { z0.b }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
%ptr = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
store <vscale x 16 x i8> %x, ptr %ptr, !nontemporal !0
ret void
}

; Nontemporal <vscale x 2 x double> store to %a indexed by %off double elements.
; Expected codegen: the scaled offset folds into stnt1d's reg+reg form [x0, x1, lsl #3].
define void @store_nxv2f64_reg(<vscale x 2 x double> %x, ptr %a, i64 %off) nounwind {
; CHECK-LABEL: store_nxv2f64_reg:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: stnt1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%ptr = getelementptr double, ptr %a, i64 %off
store <vscale x 2 x double> %x, ptr %ptr, !nontemporal !0
ret void
}

; Nontemporal <vscale x 2 x double> store to one whole vector past %a.
; Expected codegen: the offset folds into stnt1d's immediate form [x0, #1, mul vl].
define void @store_nxv2f64_imm(<vscale x 2 x double> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv2f64_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: stnt1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
%ptr = getelementptr <vscale x 2 x double>, ptr %a, i64 1
store <vscale x 2 x double> %x, ptr %ptr, !nontemporal !0
ret void
}

!0 = !{i32 1}
20 changes: 20 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,26 @@ define void @masked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a, <vscale x 4 x i
ret void
}

; Nontemporal llvm.masked.load of <vscale x 4 x i32> whose mask is splat (i1 true)
; and whose passthru is poison.
; Expected codegen: ptrue p0.s followed by ldnt1w with a base-only address.
define <vscale x 4 x i32> @all_active_load_nxv4i32(ptr %a) nounwind {
; CHECK-LABEL: all_active_load_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !nontemporal !0
ret <vscale x 4 x i32> %load
}

; Nontemporal llvm.masked.store of <vscale x 4 x i32> whose mask is splat (i1 true).
; Expected codegen: ptrue p0.s followed by stnt1w with a base-only address.
define void @all_active_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
; CHECK-LABEL: all_active_store_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> splat (i1 true)), !nontemporal !0
ret void
}

declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32(ptr, i32, <4 x i1>, <4 x i32>)
Expand Down