[AArch64][SME] Enable split SVE for hazard padding in SVE CC functions #166561
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

This patch enables `aarch64-split-sve-objects` to handle hazard padding in functions that use the SVE CC even when there are no predicate spills/locals.

This improves the codegen over the base hazard padding implementation, as rather than placing the padding in the callee-save area, it is placed at the start of the ZPR area. E.g.,

Current lowering:

```
sub sp, sp, #1040
str x29, [sp, #1024] // 8-byte Folded Spill
addvl sp, sp, #-1
str z8, [sp] // 16-byte Folded Spill
sub sp, sp, #1040
```

New lowering:

```
str x29, [sp, #-16]! // 8-byte Folded Spill
sub sp, sp, #1024
addvl sp, sp, #-1
str z8, [sp] // 16-byte Folded Spill
sub sp, sp, #1040
```

This also re-enables paired stores for GPRs (as the offsets no longer include the hazard padding).

Patch is 43.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166561.diff

3 Files Affected:
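As a quick sanity check on the two lowerings above, here is a toy model of the stack-pointer arithmetic (a standalone sketch; the function names and the VL loop bounds are illustrative, not part of the patch). It checks that both layouts allocate the same total frame for any vector length; only the placement of the hazard padding changes.

```cpp
#include <cassert>
#include <cstdint>

// SP deltas for the "Current lowering" above: one GPR CSR (x29), one ZPR CSR
// (z8), and 1024 bytes of hazard padding. VL is the SVE vector length in bytes.
int64_t currentPrologue(int64_t sp, int64_t VL) {
  sp -= 1040; // sub sp, sp, #1040    GPR callee-save + hazard padding together
  sp -= VL;   // addvl sp, sp, #-1    ZPR callee-save slot; z8 stored at [sp]
  sp -= 1040; // sub sp, sp, #1040    remaining frame (locals + padding)
  return sp;
}

// SP deltas for the "New lowering": the padding moves out of the callee-save
// area to the start of the ZPR area, so x29 can use a plain pre-indexed store.
int64_t newPrologue(int64_t sp, int64_t VL) {
  sp -= 16;   // str x29, [sp, #-16]!  GPR callee-save alone
  sp -= 1024; // sub sp, sp, #1024     hazard padding, start of the ZPR area
  sp -= VL;   // addvl sp, sp, #-1     ZPR callee-save slot; z8 stored at [sp]
  sp -= 1040; // sub sp, sp, #1040     remaining frame (locals + padding)
  return sp;
}

int main() {
  for (int64_t VL = 16; VL <= 256; VL += 16) // legal SVE vector lengths
    assert(currentPrologue(0, VL) == newPrologue(0, VL)); // same total frame
  return 0;
}
```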
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3ee4d58ca892c..ced61106532a3 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2364,9 +2364,31 @@ void AArch64FrameLowering::determineStackHazardSlot(
AFI->setStackHazardSlotIndex(ID);
}
+ if (!AFI->hasStackHazardSlotIndex())
+ return;
+
// Determine if we should use SplitSVEObjects. This should only be used if
// there's a possibility of a stack hazard between PPRs and ZPRs or FPRs.
if (SplitSVEObjects) {
+ CallingConv::ID CC = MF.getFunction().getCallingConv();
+ if (AFI->isSVECC() || CC == CallingConv::AArch64_SVE_VectorCall) {
+ AFI->setSplitSVEObjects(true);
+ LLVM_DEBUG(dbgs() << "Using SplitSVEObjects for SVE CC function\n");
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Determining if SplitSVEObjects should be used in "
+ "non-SVE CC function...\n");
+
+ // If another calling convention is explicitly set FPRs can't be promoted to
+ // ZPR callee-saves.
+ if (!is_contained({CallingConv::C, CallingConv::Fast}, CC)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Calling convention is not supported with SplitSVEObjects\n");
+ return;
+ }
+
if (!HasPPRCSRs && !HasPPRStackObjects) {
LLVM_DEBUG(
dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n");
@@ -2380,16 +2402,6 @@ void AArch64FrameLowering::determineStackHazardSlot(
return;
}
- // If another calling convention is explicitly set FPRs can't be promoted to
- // ZPR callee-saves.
- if (!is_contained({CallingConv::C, CallingConv::Fast,
- CallingConv::AArch64_SVE_VectorCall},
- MF.getFunction().getCallingConv())) {
- LLVM_DEBUG(
- dbgs() << "Calling convention is not supported with SplitSVEObjects");
- return;
- }
-
[[maybe_unused]] const AArch64Subtarget &Subtarget =
MF.getSubtarget<AArch64Subtarget>();
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
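For readers piecing the two hunks above back together, here is a condensed, standalone sketch of the resulting decision flow. The enum, struct, and function names are simplified stand-ins for the LLVM types, the `aarch64-split-sve-objects` option is assumed enabled, and checks from the truncated remainder of the function are elided.

```cpp
// Simplified model of the SplitSVEObjects gating in
// AArch64FrameLowering::determineStackHazardSlot after this patch
// (names are illustrative stand-ins, not LLVM's actual types).
enum class CallConv { C, Fast, AArch64_SVE_VectorCall, Other };

struct FrameInfo {
  bool hasStackHazardSlot;  // AFI->hasStackHazardSlotIndex()
  bool isSVECC;             // AFI->isSVECC()
  CallConv cc;              // MF.getFunction().getCallingConv()
  bool hasPPRCSRs;
  bool hasPPRStackObjects;
};

bool shouldUseSplitSVEObjects(const FrameInfo &fi) {
  // New early exit: nothing to do without a hazard slot.
  if (!fi.hasStackHazardSlot)
    return false;

  // New: SVE CC functions always opt in, even with no predicate
  // spills/locals, placing the hazard padding at the start of the ZPR area.
  if (fi.isSVECC || fi.cc == CallConv::AArch64_SVE_VectorCall)
    return true;

  // Moved earlier: with any other explicit calling convention, FPRs can't be
  // promoted to ZPR callee-saves. AArch64_SVE_VectorCall no longer needs to
  // be listed here because it is handled above.
  if (fi.cc != CallConv::C && fi.cc != CallConv::Fast)
    return false;

  // Unchanged: no PPRs on the stack means there is nothing to split.
  if (!fi.hasPPRCSRs && !fi.hasPPRStackObjects)
    return false;

  // ...further checks from the truncated portion of the function elided...
  return true;
}
```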
diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
index f65aec6665cec..9d8b077e9268e 100644
--- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
@@ -839,11 +839,10 @@ define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) {
define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) {
; CHECK-LABEL: only_zpr_csr_vla:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #1056
-; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK-NEXT: add x29, sp, #1024
-; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #1040] // 8-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
@@ -870,11 +869,9 @@ define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) {
; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: sub sp, x29, #1024
-; CHECK-NEXT: ldr x19, [sp, #1040] // 8-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%alloc = alloca i8, i64 %n, align 1
call void (...) @llvm.fake.use(ptr %alloc)
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 70874761b82ab..05450468f87a7 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -975,8 +975,8 @@ define i32 @svecc_csr_d8(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_psta
;
; CHECK64-LABEL: svecc_csr_d8:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #80
-; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK64-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #-1
; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill
; CHECK64-NEXT: sub sp, sp, #64
@@ -988,30 +988,50 @@ define i32 @svecc_csr_d8(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_psta
; CHECK64-NEXT: //NO_APP
; CHECK64-NEXT: add sp, sp, #64
; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #1
-; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #80
+; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_csr_d8:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-1
-; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG
-; CHECK1024-NEXT: .cfi_offset w29, -16
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
-; CHECK1024-NEXT: mov w0, wzr
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #1
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1040
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
+; CHECK1024-SPLITSVE-NEXT: mov w0, wzr
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1
+; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
tail call void asm sideeffect "", "~{d8}"() #1
ret i32 0
@@ -1039,8 +1059,8 @@ define i32 @svecc_csr_d8d9(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_ps
;
; CHECK64-LABEL: svecc_csr_d8d9:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #80
-; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK64-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #-2
; CHECK64-NEXT: str z9, [sp] // 16-byte Folded Spill
; CHECK64-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
@@ -1055,33 +1075,56 @@ define i32 @svecc_csr_d8d9(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_ps
; CHECK64-NEXT: add sp, sp, #64
; CHECK64-NEXT: ldr z9, [sp] // 16-byte Folded Reload
; CHECK64-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #2
-; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #80
+; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_csr_d8d9:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-2
-; CHECK1024-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG
-; CHECK1024-NEXT: .cfi_offset w29, -16
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040
-; CHECK1024-NEXT: mov w0, wzr
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #2
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1040
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8d9:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #2
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8d9:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040
+; CHECK1024-SPLITSVE-NEXT: mov w0, wzr
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2
+; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
tail call void asm sideeffect "", "~{d8},~{d9}"() #1
ret i32 0
@@ -1108,8 +1151,8 @@ define i32 @svecc_csr_d8_allocd(double %d, <vscale x 4 x i32> %vs) "aarch64_psta
;
; CHECK64-LABEL: svecc_csr_d8_allocd:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #80
-; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK64-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #-1
; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill
; CHECK64-NEXT: sub sp, sp, #80
@@ -1122,31 +1165,52 @@ define i32 @svecc_csr_d8_allocd(double %d, <vscale x 4 x i32> %vs) "aarch64_psta
; CHECK64-NEXT: str d0, [sp, #72]
; CHECK64-NEXT: add sp, sp, #80
; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #1
-; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #80
+; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_csr_d8_allocd:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-1
-; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
-; CHECK1024-NEXT: .cfi_offset w29, -16
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
-; CHECK1024-NEXT: mov w0, wzr
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: str d0, [sp, #1032]
-; CHECK1024-NEXT: add sp, sp, #1040
-; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #1
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1040
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8_allocd:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: str d0, [sp, #1032]
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8_allocd:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1040
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
+; CHECK1024-SPLITSVE-NEXT: mov w0, wzr
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: str d0, [sp, #1032]
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1040
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1
+; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
%a = alloca double
tail call void asm sideeffect "", "~{d8}"() #1
@@ -1176,8 +1240,8 @@ define i32 @svecc_csr_d8_alloci64(i64 %d, <vscale x 4 x i32> %vs) "aarch64_pstat
;
; CHECK64-LABEL: svecc_csr_d8_alloci64:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #80
-; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK64-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #-1
; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill
; CHECK64-NEXT: sub sp, sp, #80
@@ -1191,32 +1255,54 @@ define i32 @svecc_csr_d8_alloci64(i64 %d, <vscale x 4 x i32> %vs) "aarch64_pstat
; CHECK64-NEXT: str x8, [sp, #8]
; CHECK64-NEXT: add sp, sp, #80
; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #1
-; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #80
+; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_csr_d8_alloci64:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-1
-; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
-; CHECK1024-NEXT: .cfi_offset w29, -16
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040
-; CHEC...
[truncated]
SamTebbs33 left a comment:

LGTM
Force-pushed from b72210f to 411df38.