From ef3c36be695ee68dda5df9e36da6407772dac21e Mon Sep 17 00:00:00 2001
From: Rajveer
Date: Wed, 8 Oct 2025 18:00:53 +0530
Subject: [PATCH] [AArch64][SVE] Allow factors other than 2/4 for
 load+deinterleave3+store patterns for codegen

Resolves #159801 and #162068
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   4 +-
 .../AArch64/sve-vector-load+deinterleave.ll   |  74 ++++++++++++
 .../scalable-deinterleave-intrinsics.ll       | 110 ++++++++++++++++++
 3 files changed, 186 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 29d65d5d1db64..a41e3f73fd5b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17973,7 +17973,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
-  if (Factor != 2 && Factor != 4) {
+  if (Factor != 2 && Factor != 3 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
     return false;
   }
@@ -18052,7 +18052,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
     Instruction *Store, Value *Mask,
     ArrayRef<Value *> InterleavedValues) const {
   unsigned Factor = InterleavedValues.size();
-  if (Factor != 2 && Factor != 4) {
+  if (Factor != 2 && Factor != 3 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
     return false;
   }
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll
new file mode 100644
index 0000000000000..0d41dc9113978
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-load+deinterleave.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by
utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-elf -mattr=+sve | FileCheck %s -check-prefixes=SVE
+
+define void @load_factor2(i32* %ptr, <vscale x 4 x i32>* %s1, <vscale x 4 x i32>* %s2) {
+; SVE-LABEL: load_factor2:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    ld2w { z0.s, z1.s }, p0/z, [x0]
+; SVE-NEXT:    str z0, [x1]
+; SVE-NEXT:    str z1, [x2]
+; SVE-NEXT:    ret
+  %wide.vec = load <vscale x 8 x i32>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
+
+  %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+
+  store <vscale x 4 x i32> %3, <vscale x 4 x i32>* %s1
+  store <vscale x 4 x i32> %4, <vscale x 4 x i32>* %s2
+  ret void
+}
+
+define void @load_factor3(i32* %ptr, <vscale x 4 x i32>* %s1, <vscale x 4 x i32>* %s2, <vscale x 4 x i32>* %s3) {
+; SVE-LABEL: load_factor3:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    ld3w { z0.s - z2.s }, p0/z, [x0]
+; SVE-NEXT:    str z0, [x1]
+; SVE-NEXT:    str z1, [x2]
+; SVE-NEXT:    str z2, [x3]
+; SVE-NEXT:    ret
+  %wide.vec = load <vscale x 12 x i32>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %wide.vec)
+
+  %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+  %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
+
+  store <vscale x 4 x i32> %3, <vscale x 4 x i32>* %s1
+  store <vscale x 4 x i32> %4, <vscale x 4 x i32>* %s2
+  store <vscale x 4 x i32> %5, <vscale x 4 x i32>* %s3
+  ret void
+}
+
+define void @load_factor4(i32* %ptr, <vscale x 4 x i32>* %s1, <vscale x 4 x i32>* %s2, <vscale x 4 x i32>* %s3, <vscale x 4 x i32>* %s4) {
+; SVE-LABEL: load_factor4:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    ld4w { z0.s - z3.s }, p0/z, [x0]
+; SVE-NEXT:    str z0, [x1]
+; SVE-NEXT:    str z1, [x2]
+; SVE-NEXT:    str z2, [x3]
+; SVE-NEXT:    str z3, [x4]
+; SVE-NEXT:    ret
+  %wide.vec = load <vscale x 16 x i32>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %wide.vec)
+
+  %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 0
+  %4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 1
+  %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 2
+  %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %ldN, 3
+
+  store <vscale x 4 x i32> %3, <vscale x 4 x i32>* %s1
+  store <vscale x 4 x i32> %4, <vscale x 4 x i32>* %s2
+  store <vscale x 4 x i32> %5, <vscale x 4 x i32>* %s3
+  store <vscale x 4 x i32> %6, <vscale x 4 x i32>* %s4
+  ret void
+}
+
+
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32>)
+
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index ed9fba3a01965..e7b94173ac4d2 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -289,6 +289,110 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l
   ret void
 }
 
+define void @deinterleave_nxi64_factor3(i32* %ptr, <vscale x 1 x i64>* %s1, <vscale x 1 x i64>* %s2, <vscale x 1 x i64>* %s3) {
+; CHECK-LABEL: define void @deinterleave_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) {
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 3 x i64>, ptr [[PTR]], align 8
+; CHECK-NEXT:    [[LDN:%.*]] = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 2
+; CHECK-NEXT:    store <vscale x 1 x i64> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT:    store <vscale x 1 x i64> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT:    store <vscale x 1 x i64> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT:    ret void
+;
+  %wide.vec = load <vscale x 3 x i64>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> %wide.vec)
+
+  %3 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 0
+  %4 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 1
+  %5 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 2
+
+  store <vscale x 1 x i64> %3, <vscale x 1 x i64>* %s1
+  store <vscale x 1 x i64> %4, <vscale x 1 x i64>* %s2
+  store <vscale x 1 x i64> %5, <vscale x 1 x i64>* %s3
+  ret void
+}
+
+define void @deinterleave_nxi32_factor3(i32* %ptr, <vscale x 2 x i32>* %s1, <vscale x 2 x i32>* %s2, <vscale x 2 x i32>* %s3) {
+; CHECK-LABEL: define void @deinterleave_nxi32_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) {
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i32>, ptr [[PTR]], align 8
+; CHECK-NEXT:    [[LDN:%.*]] = tail call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3.nxv6i32(<vscale x 6 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [[LDN]], 2
+; CHECK-NEXT:    store <vscale x 2 x i32> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT:    store <vscale x 2 x i32> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT:    store <vscale x 2 x i32> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT:    ret void
+;
+  %wide.vec = load <vscale x 6 x i32>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3.nxv6i32(<vscale x 6 x i32> %wide.vec)
+
+  %3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %ldN, 0
+  %4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %ldN, 1
+  %5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %ldN, 2
+
+  store <vscale x 2 x i32> %3, <vscale x 2 x i32>* %s1
+  store <vscale x 2 x i32> %4, <vscale x 2 x i32>* %s2
+  store <vscale x 2 x i32> %5, <vscale x 2 x i32>* %s3
+  ret void
+}
+
+define void @deinterleave_nxi16_factor3(i32* %ptr, <vscale x 4 x i16>* %s1, <vscale x 4 x i16>* %s2, <vscale x 4 x i16>* %s3) {
+; CHECK-LABEL: define void @deinterleave_nxi16_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) {
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 12 x i16>, ptr [[PTR]], align 8
+; CHECK-NEXT:    [[LDN:%.*]] = tail call { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave3.nxv12i16(<vscale x 12 x i16> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } [[LDN]], 2
+; CHECK-NEXT:    store <vscale x 4 x i16> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT:    store <vscale x 4 x i16> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT:    store <vscale x 4 x i16> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT:    ret void
+;
+  %wide.vec = load <vscale x 12 x i16>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave3.nxv12i16(<vscale x 12 x i16> %wide.vec)
+
+  %3 = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } %ldN, 0
+  %4 = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } %ldN, 1
+  %5 = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } %ldN, 2
+
+  store <vscale x 4 x i16> %3, <vscale x 4 x i16>* %s1
+  store <vscale x 4 x i16> %4, <vscale x 4 x i16>* %s2
+  store <vscale x 4 x i16> %5, <vscale x 4 x i16>* %s3
+  ret void
+}
+
+define void @deinterleave_nxi8_factor3(i32* %ptr, <vscale x 8 x i8>* %s1, <vscale x 8 x i8>* %s2, <vscale x 8 x i8>* %s3) {
+; CHECK-LABEL: define void @deinterleave_nxi8_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) {
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 24 x i8>, ptr [[PTR]], align 8
+; CHECK-NEXT:    [[LDN:%.*]] = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 2
+; CHECK-NEXT:    store <vscale x 8 x i8> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT:    store <vscale x 8 x i8> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT:    store <vscale x 8 x i8> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT:    ret void
+;
+  %wide.vec = load <vscale x 24 x i8>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> %wide.vec)
+
+  %3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 0
+  %4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 1
+  %5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 2
+
+  store <vscale x 8 x i8> %3, <vscale x 8 x i8>* %s1
+  store <vscale x 8 x i8> %4, <vscale x 8 x i8>* %s2
+  store <vscale x 8 x i8> %5, <vscale x 8 x i8>* %s3
+  ret void
+}
+
 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
@@ -312,4 +416,10 @@ declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
 
 ; Larger interleaves to test 'legalization'
 declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
 
+; Deinterleaves with Factor=3
+declare { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64>)
+declare { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3.nxv6i32(<vscale x 6 x i32>)
+declare { <vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave3.nxv12i16(<vscale x 12 x i16>)
+declare { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8>)
+
 attributes #0 = { vscale_range(1,16) "target-features"="+sve" }