[AArch64] Avoid vector interleave instructions when NEON and SVE are unavailable #90723

Merged
2 commits merged into llvm:main on May 2, 2024

Conversation

sdesmalen-arm (Collaborator)

As the summary suggests, the code incorrectly assumes that it can use NEON or SVE instructions to implement an interleaved load/store operation, even when both features are unavailable in the selected runtime mode.

@llvmbot (Collaborator) commented May 1, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Sander de Smalen (sdesmalen-arm)

Changes

As the summary suggests, the code incorrectly assumes that it can use NEON or SVE instructions to implement an interleaved load/store operation, even when both features are unavailable in the selected runtime mode.
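
For readers skimming the diff, here is a minimal, standalone C++ sketch of the legality decision the patch implements in isLegalInterleavedAccessType. It models the subtarget and type queries as plain booleans and integers rather than the real LLVM APIs, so the parameter names (NeonAvailable, SVEForFixedVectors, MinSVEBits, VecSizeBits, PowerOf2MinElts) are illustrative stand-ins, not part of the patch.

#include <algorithm>

// Illustrative stand-in for the patched fixed-length-vector path of
// isLegalInterleavedAccessType; parameters model the Subtarget/DataLayout
// queries and are NOT the actual LLVM API.
bool isLegalFixedInterleavedAccess(bool NeonAvailable, bool SVEForFixedVectors,
                                   unsigned MinSVEBits, unsigned VecSizeBits,
                                   bool PowerOf2MinElts, bool &UseScalable) {
  UseScalable = false;

  // The key fix: with neither NEON nor fixed-length SVE usable (e.g. in
  // streaming-compatible code without SVE), there is no interleave
  // instruction to select, so the access is not legal.
  if (!NeonAvailable && !SVEForFixedVectors)
    return false;

  // Prefer SVE structured accesses when fixed-length SVE is in use and the
  // vector size lines up with the minimum SVE register size, clamped to at
  // least 128 bits.
  if (SVEForFixedVectors) {
    unsigned MinSVEVectorSize = std::max(MinSVEBits, 128u);
    if (VecSizeBits % MinSVEVectorSize == 0 ||
        (VecSizeBits < MinSVEVectorSize && PowerOf2MinElts &&
         VecSizeBits > 128)) {
      UseScalable = true;
      return true;
    }
  }

  // Otherwise fall back to NEON ld2/st2 only when NEON is actually available;
  // 64-bit vectors or multiples of 128 bits map onto (possibly split) accesses.
  return NeonAvailable && (VecSizeBits == 64 || VecSizeBits % 128 == 0);
}

Under the new NONEON-NOSVE RUN lines below, this check now returns false, so no ld2/st2 or SVE structured load/store is emitted and the interleaving shuffles go through generic lowering instead, as the added CHECK lines show.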


Full diff: https://github.com/llvm/llvm-project/pull/90723.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+14-13)
  • (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll (+89-25)
  • (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll (+29)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2af679e0755b54..1daa472ee58592 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15979,7 +15979,8 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
 
   UseScalable = false;
 
-  if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
+  if (!VecTy->isScalableTy() && !Subtarget->isNeonAvailable() &&
+      !Subtarget->useSVEForFixedLengthVectors())
     return false;
 
   if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
@@ -16003,18 +16004,20 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
   }
 
   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
-  if (!Subtarget->isNeonAvailable() ||
-      (Subtarget->useSVEForFixedLengthVectors() &&
-       (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
-        (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
-         isPowerOf2_32(MinElts) && VecSize > 128)))) {
-    UseScalable = true;
-    return true;
+  if (Subtarget->useSVEForFixedLengthVectors()) {
+    unsigned MinSVEVectorSize =
+        std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
+    if (VecSize % MinSVEVectorSize == 0 ||
+        (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
+         VecSize > 128)) {
+      UseScalable = true;
+      return true;
+    }
   }
 
   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
   // 128 will be split into multiple interleaved accesses.
-  return VecSize == 64 || VecSize % 128 == 0;
+  return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
 }
 
 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
@@ -16105,8 +16108,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
   bool UseScalable;
-  if (!Subtarget->hasNEON() ||
-      !isLegalInterleavedAccessType(VTy, DL, UseScalable))
+  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
 
   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
@@ -16283,8 +16285,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
-  if (!Subtarget->hasNEON() ||
-      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+  if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
     return false;
 
   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index efe9066f2c835f..6f4ac7f80328f3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -15,19 +17,36 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    add x20, sp, #28
 ; CHECK-NEXT:    bl def
-; CHECK-NEXT:    ptrue p0.b, vl2
-; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x20]
-; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mov z2.b, z0.b[1]
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x20]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    mov z1.h, z0.h[2]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    fmov w9, s1
 ; CHECK-NEXT:    stp w8, w9, [sp, #8]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x19]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: alloc_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    mov x19, x0
+; NONEON-NOSVE-NEXT:    add x0, sp, #12
+; NONEON-NOSVE-NEXT:    bl def
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    umov w8, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[0]
+; NONEON-NOSVE-NEXT:    strb w8, [x19, #1]
+; NONEON-NOSVE-NEXT:    strb w9, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [4 x i8]
   call void @def(ptr %alloc)
   %load = load <4 x i8>, ptr %alloc
@@ -40,38 +59,51 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ; CHECK-LABEL: alloc_v6i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    add x0, sp, #24
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    add x20, sp, #24
 ; CHECK-NEXT:    bl def
-; CHECK-NEXT:    ptrue p0.b, vl3
-; CHECK-NEXT:    ptrue p1.s, vl2
-; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x20]
+; CHECK-NEXT:    ldr d0, [sp, #24]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    mov z2.b, z1.b[3]
+; CHECK-NEXT:    ptrue p1.s, vl2
+; CHECK-NEXT:    mov z1.b, z0.b[3]
+; CHECK-NEXT:    mov z2.b, z0.b[5]
+; CHECK-NEXT:    mov z0.b, z0.b[1]
 ; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z3.b, z1.b[2]
-; CHECK-NEXT:    mov z4.b, z1.b[1]
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w9, [sp, #4]
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    add x8, sp, #12
-; CHECK-NEXT:    ldr d0, [sp]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    add x8, sp, #20
+; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x8]
 ; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x8]
 ; CHECK-NEXT:    strb w9, [x19, #2]
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strh w8, [x19]
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: alloc_v6i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    mov x19, x0
+; NONEON-NOSVE-NEXT:    add x0, sp, #8
+; NONEON-NOSVE-NEXT:    bl def
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x9, x19, #2
+; NONEON-NOSVE-NEXT:    rev16 v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    xtn v1.8b, v1.8h
+; NONEON-NOSVE-NEXT:    str s1, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    st1 { v0.b }[5], [x9]
+; NONEON-NOSVE-NEXT:    strh w8, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [6 x i8]
   call void @def(ptr %alloc)
   %load = load <6 x i8>, ptr %alloc
@@ -100,6 +132,22 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: alloc_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    mov x19, x0
+; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    bl def
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    add x8, x19, #8
+; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    st1 { v1.b }[0], [x8]
+; NONEON-NOSVE-NEXT:    str d0, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [32 x i8]
   call void @def(ptr %alloc)
   %load = load <32 x i8>, ptr %alloc
@@ -128,6 +176,22 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: alloc_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    mov x19, x0
+; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    bl def
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp]
+; NONEON-NOSVE-NEXT:    zip1 v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [8 x double]
   call void @def(ptr %alloc)
   %load = load <8 x double>, ptr %alloc
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index ad0d4ef0afef36..6f82c97f3b872d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,13 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) {
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    st2w { z0.s, z1.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: hang_when_merging_stores_after_legalisation:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer
   %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x i32> %interleaved.vec, ptr %a, align 4
@@ -28,6 +36,13 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    st2w { z0.s, z1.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: interleave_store_without_splat:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %interleaved = shufflevector <8 x i32> %shuffle, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x i32> %interleaved, ptr %a, align 1
@@ -46,6 +61,16 @@ define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2)
 ; CHECK-NEXT:    st2w { z4.s, z5.s }, p0, [x0]
 ; CHECK-NEXT:    st2w { z2.s, z3.s }, p0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: interleave_store_legalization:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip2 v4.4s, v1.4s, v3.4s
+; NONEON-NOSVE-NEXT:    zip1 v1.4s, v1.4s, v3.4s
+; NONEON-NOSVE-NEXT:    zip2 v3.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %interleaved.vec = shufflevector <8 x i32> %v1, <8 x i32> %v2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
                                                                              i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   store <16 x i32> %interleaved.vec, ptr %a, align 4
@@ -57,6 +82,10 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) {
 ; CHECK-LABEL: crash_when_lowering_extract_shuffle:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: crash_when_lowering_extract_shuffle:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
   br i1 %cond, label %exit, label %vector.body
 

@sdesmalen-arm sdesmalen-arm merged commit eb3a671 into llvm:main May 2, 2024
3 of 4 checks passed