diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bfa4ce6da212b..2e96ae39a1ec8 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3062,14 +3062,14 @@ let Predicates = [HasSVE_or_SME] in {
   multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegRegInst,
                           Instruction RegImmInst, Instruction PTrue,
                           ComplexPattern AddrCP> {
-    let AddedComplexity = 1 in {
-      def _reg : Pat<(Store Ty:$val, (AddrCP GPR64sp:$base, GPR64:$offset)),
-                     (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
-    }
     let AddedComplexity = 2 in {
       def _imm : Pat<(Store Ty:$val, (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
                      (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
+    let AddedComplexity = 1 in {
+      def _reg : Pat<(Store Ty:$val, (AddrCP GPR64sp:$base, GPR64:$offset)),
+                     (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
+    }

     def : Pat<(Store Ty:$val, GPR64:$base),
               (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
@@ -3095,17 +3095,28 @@ let Predicates = [HasSVE_or_SME] in {
   defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
   defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;

+  let AddedComplexity = 3 in {
+    defm : unpred_store;
+    defm : unpred_store;
+    defm : unpred_store;
+    defm : unpred_store;
+    defm : unpred_store;
+    defm : unpred_store;
+    defm : unpred_store;
+    defm : unpred_store;
+  }
+
   multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
                          Instruction RegImmInst, Instruction PTrue,
                          ComplexPattern AddrCP> {
-    let AddedComplexity = 1 in {
-      def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
-                    (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
-    }
     let AddedComplexity = 2 in {
       def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
                     (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
+    let AddedComplexity = 1 in {
+      def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
+                    (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
+    }

     def : Pat<(Ty (Load GPR64:$base)),
               (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
@@ -3143,6 +3154,17 @@ let Predicates = [HasSVE_or_SME] in {
   defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
   defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;

+  let AddedComplexity = 3 in {
+    defm : unpred_load;
+    defm : unpred_load;
+    defm : unpred_load;
+    defm : unpred_load;
+    defm : unpred_load;
+    defm : unpred_load;
+    defm : unpred_load;
+    defm : unpred_load;
+  }
+
   let Predicates = [HasSVE_or_SME, IsLE] in {
     // Allow using the reg+reg form of ld1b/st1b for memory accesses with the
     // same width as nxv16i8.  This saves an add in cases where we would
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index ffafe69b29266..ad92530eabf08 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -612,21 +612,22 @@ define <16 x double> @test_ldnp_v16f64(ptr %A) {
 define <vscale x 20 x float> @test_ldnp_v20f32_vscale(ptr %A) {
 ; CHECK-LABEL: test_ldnp_v20f32_vscale:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: ldr z1, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z2, [x0, #2, mul vl]
-; CHECK-NEXT: ldr z3, [x0, #3, mul vl]
-; CHECK-NEXT: ldr z4, [x0, #4, mul vl]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
 ; CHECK-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v20f32_vscale:
 ; CHECK-BE: // %bb.0:
 ; CHECK-BE-NEXT: ptrue p0.s
-; CHECK-BE-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-BE-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
-; CHECK-BE-NEXT: ld1w { z2.s }, p0/z, [x0, #2, mul vl]
-; CHECK-BE-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl]
-; CHECK-BE-NEXT: ld1w { z4.s }, p0/z, [x0, #4, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-BE-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
 ; CHECK-BE-NEXT: ret
   %lv = load <vscale x 20 x float>, ptr %A, align 8, !nontemporal !0
   ret <vscale x 20 x float> %lv
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
new file mode 100644
index 0000000000000..67035559d72da
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
@@ -0,0 +1,252 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @load_nxv16i8(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = load <vscale x 16 x i8>, ptr %a, !nontemporal !0
+  ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 8 x i16> @load_nxv8i16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = load <vscale x 8 x i16>, ptr %a, !nontemporal !0
+  ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 4 x i32> @load_nxv4i32(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = load <vscale x 4 x i32>, ptr %a, !nontemporal !0
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @load_nxv2i64(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = load <vscale x 2 x i64>, ptr %a, !nontemporal !0
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 8 x half> @load_nxv8f16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = load <vscale x 8 x half>, ptr %a, !nontemporal !0
+  ret <vscale x 8 x half> %load
+}
+
+define <vscale x 8 x bfloat> @load_nxv8bf16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = load <vscale x 8 x bfloat>, ptr %a, !nontemporal !0
+  ret <vscale x 8 x bfloat> %load
+}
+
+define <vscale x 4 x float> @load_nxv4f32(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = load <vscale x 4 x float>, ptr %a, !nontemporal !0
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x double> @load_nxv2f64(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = load <vscale x 2 x double>, ptr %a, !nontemporal !0
+  ret <vscale x 2 x double> %load
+}
+
+define <vscale x 16 x i8> @load_nxv16i8_reg(ptr %a, i64 %off) nounwind {
+; CHECK-LABEL: load_nxv16i8_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr i8, ptr %a, i64 %off
+  %load = load <vscale x 16 x i8>, ptr %ptr, !nontemporal !0
+  ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 16 x i8> @load_nxv16i8_imm(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv16i8_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+  %ptr = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
+  %load = load <vscale x 16 x i8>, ptr %ptr, !nontemporal !0
+  ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 2 x double> @load_nxv2f64_reg(ptr %a, i64 %off) nounwind {
+; CHECK-LABEL: load_nxv2f64_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+  %ptr = getelementptr double, ptr %a, i64 %off
+  %load = load <vscale x 2 x double>, ptr %ptr, !nontemporal !0
+  ret <vscale x 2 x double> %load
+}
+
+define <vscale x 2 x double> @load_nxv2f64_imm(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv2f64_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+  %ptr = getelementptr <vscale x 2 x double>, ptr %a, i64 1
+  %load = load <vscale x 2 x double>, ptr %ptr, !nontemporal !0
+  ret <vscale x 2 x double> %load
+}
+
+define void @store_nxv16i8(<vscale x 16 x i8> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: stnt1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+  store <vscale x 16 x i8> %x, ptr %a, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv8i16(<vscale x 8 x i16> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  store <vscale x 8 x i16> %x, ptr %a, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  store <vscale x 4 x i32> %x, ptr %a, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv2i64(<vscale x 2 x i64> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+  store <vscale x 2 x i64> %x, ptr %a, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv8f16(<vscale x 8 x half> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  store <vscale x 8 x half> %x, ptr %a, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv8bf16(<vscale x 8 x bfloat> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  store <vscale x 8 x bfloat> %x, ptr %a, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv4f32(<vscale x 4 x float> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  store <vscale x 4 x float> %x, ptr %a, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv2f64(<vscale x 2 x double> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+  store <vscale x 2 x double> %x, ptr %a, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv16i8_reg(<vscale x 16 x i8> %x, ptr %a, i64 %off) nounwind {
+; CHECK-LABEL: store_nxv16i8_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: stnt1b { z0.b }, p0, [x0, x1]
+; CHECK-NEXT: ret
+  %ptr = getelementptr i8, ptr %a, i64 %off
+  store <vscale x 16 x i8> %x, ptr %ptr, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv16i8_imm(<vscale x 16 x i8> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv16i8_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: stnt1b { z0.b }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+  %ptr = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
+  store <vscale x 16 x i8> %x, ptr %ptr, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv2f64_reg(<vscale x 2 x double> %x, ptr %a, i64 %off) nounwind {
+; CHECK-LABEL: store_nxv2f64_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+  %ptr = getelementptr double, ptr %a, i64 %off
+  store <vscale x 2 x double> %x, ptr %ptr, !nontemporal !0
+  ret void
+}
+
+define void @store_nxv2f64_imm(<vscale x 2 x double> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv2f64_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+  %ptr = getelementptr <vscale x 2 x double>, ptr %a, i64 1
+  store <vscale x 2 x double> %x, ptr %ptr, !nontemporal !0
+  ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
index 36df5e5deadfc..f097d874cd11b 100644
--- a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
@@ -66,6 +66,26 @@ define void @masked_store_nxv4i32( %x, ptr %a,
   ret void
 }
 
+define <vscale x 4 x i32> @all_active_load_nxv4i32(ptr %a) nounwind {
+; CHECK-LABEL: all_active_load_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !nontemporal !0
+  ret <vscale x 4 x i32> %load
+}
+
+define void @all_active_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: all_active_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> splat (i1 true)), !nontemporal !0
+  ret void
+}
+
 declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
 declare <4 x i32> @llvm.masked.load.v4i32(ptr, i32, <4 x i1>, <4 x i32>)